func Tokenize()

in pkg/linguist/tokenizer/tokenizer.go [89:181]


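// Tokenize splits input into a slice of tokens, dropping comments,
// string literals, and numeric literals along the way. Only the first
// ByteLimit bytes of input are considered.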
func Tokenize(input []byte) (tokens []string) {
	if len(input) == 0 {
		return tokens
	}
	// consider at most ByteLimit bytes of input
	if len(input) >= ByteLimit {
		input = input[:ByteLimit]
	}

	var (
		mlStart     = false        // currently inside a multi-line comment
		mlEnd       *regexp.Regexp // regexp matching the comment's closing token
		stringStart = false        // currently inside a string literal
		stringEnd   byte           // closing quote byte, as captured by stringRegexp
	)

	buf := bytes.NewBuffer(input)
	scanlines := bufio.NewScanner(buf)
	scanlines.Split(bufio.ScanLines)

	// NOTE(tso): the gotos below are interchangeable with labeled continue
	// (continue line / continue word)
line:
	for scanlines.Scan() {
		ln := scanlines.Bytes()

		// skip lines that open with a line comment
		for _, re := range startLineComment {
			if re.Match(ln) {
				goto line
			}
		}

		// NOTE(tso): bufio.Scanner.Split(bufio.ScanWords) splits on whitespace
		// only, so where whitespace is sparse (e.g. foo;/*comment*/bar arrives
		// as a single token) the word/line skipping below may be inaccurate,
		// since it jumps straight to the next word or line boundary.
		lnBuffer := bytes.NewBuffer(ln)
		scanwords := bufio.NewScanner(lnBuffer)
		scanwords.Split(bufio.ScanWords)
	word:
		for scanwords.Scan() {
			tokenBytes := scanwords.Bytes()
			tokenString := scanwords.Text()

			// find end of multi-line comment
			if mlStart {
				if mlEnd.Match(tokenBytes) {
					mlStart = false
					mlEnd = nil
				}
				goto word
			}

			// find end of string literal; group 1 of stringRegexp captures
			// the quote character, compared against the opening quote
			if stringStart {
				s := stringRegexp.FindSubmatch(tokenBytes)
				if s != nil && s[1][0] == stringEnd {
					stringStart = false
					stringEnd = 0
				}
				goto word
			}

			// find single-line comment
			for _, re := range beginSingleLineComment {
				if re.Match(tokenBytes) {
					goto line
				}
			}

			// find start of multi-line comment
			if matched, terminator := FindMultiLineComment(tokenBytes); matched {
				mlStart = true
				mlEnd = terminator
				goto word
			}

			// find start of string literal
			if s := stringRegexp.FindSubmatch(tokenBytes); s != nil {
				stringStart = true
				stringEnd = s[1][0]
				goto word
			}

			// skip numeric literals
			if numberRegexp.Match(tokenBytes) {
				goto word
			}

			// add valid tokens to result set
			tokens = append(tokens, tokenString)
		}
	}
	return tokens
}
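
The listing refers to several package-level names defined elsewhere in tokenizer.go: ByteLimit, startLineComment, beginSingleLineComment, stringRegexp, numberRegexp, and FindMultiLineComment. Their signatures can be inferred from the call sites above, but the sketch below is only an assumption about their shapes; every pattern and value in it is hypothetical, not the repo's actual definitions.

package tokenizer

import (
	"bytes"
	"regexp"
)

// ByteLimit caps how much of the input Tokenize examines (value assumed).
var ByteLimit = 100000

var (
	// a line matching any of these is skipped outright (patterns assumed)
	startLineComment = []*regexp.Regexp{regexp.MustCompile(`^\s*#`)}
	// a token matching any of these discards the rest of the line (patterns assumed)
	beginSingleLineComment = []*regexp.Regexp{regexp.MustCompile(`^//`)}
	// group 1 captures the quote byte that Tokenize stores in stringEnd (pattern assumed)
	stringRegexp = regexp.MustCompile("([\"'`])")
	// numeric literals to drop from the token stream (pattern assumed)
	numberRegexp = regexp.MustCompile(`^(0[xX][0-9a-fA-F]+|[0-9]+)`)
)

// FindMultiLineComment reports whether token opens a multi-line comment
// and, if so, returns a regexp that matches its closing token. The
// signature comes from the call site; this body is an assumption that
// covers C-style /* ... */ comments only.
func FindMultiLineComment(token []byte) (bool, *regexp.Regexp) {
	if bytes.Contains(token, []byte("/*")) {
		return true, regexp.MustCompile(regexp.QuoteMeta("*/"))
	}
	return false, nil
}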
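
A minimal usage sketch; the module prefix in the import path is an assumption (only pkg/linguist/tokenizer is known from the header above):

package main

import (
	"fmt"

	"example.com/mymodule/pkg/linguist/tokenizer" // module prefix assumed
)

func main() {
	src := []byte(`package main

// a line comment, dropped by Tokenize
func add(a, b int) int {
	s := "a string literal, also dropped"
	n := 42 // 42 is a numeric literal, also dropped
	_, _ = s, n
	return a + b
}`)
	// comments, string literals, and numeric literals are filtered out;
	// the remaining whitespace-delimited tokens are printed one per line
	for _, tok := range tokenizer.Tokenize(src) {
		fmt.Println(tok)
	}
}

As the NOTE in the listing warns, bufio.ScanWords splits on whitespace only, so punctuation stays glued to its neighbors: "int" and ")" in the example above arrive as the single token "int)".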