internal/mode/advanced/indexer/encoding.go (60 lines of code) (raw):
package indexer
import (
"fmt"
logkit "gitlab.com/gitlab-org/labkit/log"
"gitlab.com/gitlab-org/go/icu"
)
// Encoder bundles an ICU charset detector with an ICU charset converter.
// Its methods use the detector to guess the encoding of their input and the
// converter to transcode that input to UTF-8.
type Encoder struct {
// detector proposes candidate charsets for a byte slice (GuessCharset).
detector *icu.CharsetDetector
// converter transcodes bytes from a named charset to UTF-8 (ConvertToUtf8).
converter *icu.CharsetConverter
}
// NewEncoder builds an Encoder backed by a fresh ICU charset detector and a
// charset converter created with limitFileSize (truncated to int).
//
// NOTE(review): limitFileSize presumably bounds the size of input the
// converter will handle; confirm against icu.NewCharsetConverter.
//
// It panics if the charset detector cannot be created.
func NewEncoder(limitFileSize int64) *Encoder {
	det, err := icu.NewCharsetDetector()
	if err != nil {
		panic(err)
	}

	return &Encoder{
		detector:  det,
		converter: icu.NewCharsetConverter(int(limitFileSize)),
	}
}
// tryEncodeString is the best-effort counterpart of encodeString: it never
// returns an error.
//
// On success it returns the converted string. On failure the error is logged
// at error level and s is returned with every invalid UTF-8 byte replaced by
// U+FFFD, so the fallback value is valid UTF-8 rather than the raw input.
func (e *Encoder) tryEncodeString(s string) string {
	encoded, err := e.encodeString(s)
	if err != nil {
		logkit.WithError(err).Error("Encode string failed")

		// string -> []rune decodes each invalid byte as U+FFFD, and
		// []rune -> string re-encodes it. Well-formed input round-trips
		// unchanged. This only runs on the failure path.
		return string([]rune(s))
	}

	return encoded
}
// tryEncodeBytes is the best-effort counterpart of encodeBytes: it never
// returns an error.
//
// On success it returns the converted string. On failure the error is logged
// at warning level and b is returned as a string with every invalid UTF-8
// byte replaced by U+FFFD, so the fallback value is valid UTF-8 rather than
// the raw bytes.
func (e *Encoder) tryEncodeBytes(b []byte) string {
	encoded, err := e.encodeBytes(b)
	if err != nil {
		logkit.WithError(err).Warn("Encode bytes failed")

		// string -> []rune decodes each invalid byte as U+FFFD, and
		// []rune -> string re-encodes it. Well-formed input round-trips
		// unchanged. This only runs on the failure path.
		return string([]rune(string(b)))
	}

	return encoded
}
// encodeString converts s to UTF-8 by handing its bytes to encodeBytes and
// returning that result unchanged.
func (e *Encoder) encodeString(s string) (string, error) {
	raw := []byte(s)
	return e.encodeBytes(raw)
}
// encodeBytes converts b from an arbitrary, auto-detected encoding to UTF-8.
//
// Empty input yields "" and a nil error without consulting the detector.
// Otherwise the detector's charset guesses are tried in the order returned,
// and the first successful conversion wins.
//
// An error is returned when the charset cannot be guessed, when the detector
// returns no candidates, or when conversion fails for every candidate. In the
// last case the error names the first candidate and wraps the conversion
// error observed for it, so the underlying cause is not lost.
func (e *Encoder) encodeBytes(b []byte) (string, error) {
	if len(b) == 0 {
		return "", nil
	}

	matches, err := e.detector.GuessCharset(b)
	if err != nil {
		return "", fmt.Errorf("Couldn't guess charset: %w", err)
	}

	// Try encoding for each match, returning the first that succeeds.
	// Only the failure for the first match is kept: it belongs to the
	// charset that is reported in the error below.
	var firstErr error
	for _, match := range matches {
		converted, convErr := e.converter.ConvertToUtf8(b, match.Charset)
		if convErr == nil {
			return string(converted), nil
		}
		if firstErr == nil {
			firstErr = convErr
		}
	}

	// `detector.GuessCharset` may return err == nil && len(matches) == 0,
	// in which case there is no charset to name and no cause to wrap.
	bestGuess := "unknown"
	if len(matches) > 0 {
		bestGuess = matches[0].Charset
	}
	if firstErr == nil {
		return "", fmt.Errorf("Failed to convert from %s to UTF-8", bestGuess)
	}

	return "", fmt.Errorf("Failed to convert from %s to UTF-8: %w", bestGuess, firstErr)
}