in pkg/linguist/tokenizer/tokenizer.go [89:181]
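// Tokenize scans at most the first ByteLimit bytes of input and returns the
// whitespace-separated words that survive filtering: comment lines, comment
// tokens, string literals, and numeric literals are skipped rather than emitted.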
func Tokenize(input []byte) (tokens []string) {
	if len(input) == 0 {
		return tokens
	}
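	// consider at most the first ByteLimit bytes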
	if len(input) >= ByteLimit {
		input = input[:ByteLimit]
	}
	var (
		mlStart     = false        // in a multi-line comment
		mlEnd       *regexp.Regexp // regexp for the comment's closing token
		stringStart = false        // in a string literal
		stringEnd   byte           // closing quote byte, as captured by stringRegexp
	)
	buf := bytes.NewBuffer(input)
	scanlines := bufio.NewScanner(buf)
	scanlines.Split(bufio.ScanLines)
	// NOTE(tso): the goto statements here are probably interchangeable with
	// labeled continue statements.
line:
	for scanlines.Scan() {
		ln := scanlines.Bytes()
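		// skip the entire line when it matches a line-comment pattern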
		for _, re := range startLineComment {
			if re.Match(ln) {
				goto line
			}
		}
		// NOTE(tso): bufio.Scanner.Split(bufio.ScanWords) seems to split on
		// whitespace only. This may yield inaccurate results where there is
		// insufficient whitespace for the approach taken here, i.e. jumping
		// straight to the next word/line boundary.
		lnBuffer := bytes.NewBuffer(ln)
		scanwords := bufio.NewScanner(lnBuffer)
		scanwords.Split(bufio.ScanWords)
	word:
		for scanwords.Scan() {
			tokenBytes := scanwords.Bytes()
			tokenString := scanwords.Text()
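			// mlStart/stringStart state persists across words and lines:
			// while either is set, tokens are consumed and discarded until
			// the corresponding closing token is found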
			// find end of multi-line comment
			if mlStart {
				if mlEnd.Match(tokenBytes) {
					mlStart = false
					mlEnd = nil
				}
				goto word
			}
			// find end of string literal
			if stringStart {
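				// s[1] is the quote character captured by stringRegexp; the
				// literal ends only on a quote matching the opening one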
				s := stringRegexp.FindSubmatch(tokenBytes)
				if s != nil && s[1][0] == stringEnd {
					stringStart = false
					stringEnd = 0
				}
				goto word
			}
			// find single-line comment
			for _, re := range beginSingleLineComment {
				if re.Match(tokenBytes) {
					goto line
				}
			}
			// find start of multi-line comment
			if matched, terminator := FindMultiLineComment(tokenBytes); matched {
				mlStart = true
				mlEnd = terminator
				goto word
			}
			// find start of string literal
			if s := stringRegexp.FindSubmatch(tokenBytes); s != nil {
				stringStart = true
				stringEnd = s[1][0]
				goto word
			}
			// find numeric literal and skip it
			if n := numberRegexp.Find(tokenBytes); n != nil {
				goto word
			}
			// add valid tokens to result set
			tokens = append(tokens, tokenString)
		}
	}
	return tokens
}
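
A minimal usage sketch, not part of tokenizer.go: the import path below is
hypothetical, and the exact tokens returned depend on the package-level
definitions (ByteLimit, startLineComment, beginSingleLineComment,
stringRegexp, numberRegexp, FindMultiLineComment) that live elsewhere in the
package.

package main

import (
	"fmt"

	"example.com/pkg/linguist/tokenizer" // hypothetical import path
)

func main() {
	src := []byte("x := 1 // increment x\ns := \"hello\"\n")
	// comment text, string literals, and numeric literals are dropped;
	// the surviving words come back in source order
	for _, tok := range tokenizer.Tokenize(src) {
		fmt.Printf("%q\n", tok)
	}
}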