processors/dissect/trim.go

// Licensed to Elasticsearch B.V. under one or more contributor // license agreements. See the NOTICE file distributed with // this work for additional information regarding copyright // ownership. Elasticsearch B.V. licenses this file to you under // the Apache License, Version 2.0 (the "License"); you may // not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. package dissect import ( "errors" "strings" "unicode/utf8" ) const asciiLimit = 128 type trimmer interface { Trim(s string, start, end int) (int, int) } func newTrimmer(trimChars string, trimLeft, trimRight bool) (t trimmer, err error) { if t, err = newASCIITrimmer(trimChars, trimLeft, trimRight); errors.Is(err, errOnlyASCII) { t = newUTF8Trimmer(trimChars, trimLeft, trimRight) err = nil } return t, err } type asciiTrimmer struct { chars [asciiLimit]byte left, right bool } var errOnlyASCII = errors.New("only trimming of ASCII characters is supported") func newASCIITrimmer(trimChars string, trimLeft, trimRight bool) (trimmer, error) { t := asciiTrimmer{ left: trimLeft, right: trimRight, } for _, chr := range []byte(trimChars) { if chr >= asciiLimit { return t, errOnlyASCII } t.chars[chr] = 1 } return t, nil } func (t asciiTrimmer) Trim(s string, start, end int) (int, int) { if t.left { for ; start < end && s[start] < asciiLimit && t.chars[s[start]] != 0; start++ { } } if t.right { for ; start < end && s[end-1] < asciiLimit && t.chars[s[end-1]] != 0; end-- { } } return start, end } type utf8trimmer struct { fn func(rune) bool left, right bool } func newUTF8Trimmer(trimChars string, trimLeft, trimRight bool) trimmer { return utf8trimmer{ // Function that returns true when the rune is not in trimChars. fn: func(r rune) bool { return !strings.ContainsRune(trimChars, r) }, left: trimLeft, right: trimRight, } } func (t utf8trimmer) Trim(s string, start, end int) (int, int) { if t.left { // Find first character not in trimChars. pos := strings.IndexFunc(s[start:end], t.fn) if pos == -1 { return end, end } start += pos } if t.right { // Find last character not in trimChars. pos := strings.LastIndexFunc(s[start:end], t.fn) if pos == -1 { return start, start } // End must point to the following character, need to take into account // that the last character can be more than 1-byte wide. _, width := utf8.DecodeRuneInString(s[start+pos:]) end = start + pos + width } return start, end }

processors/dissect/trim.go (77 lines of code) (raw):