private _myTokenize()

in patched-vscode/src/vs/editor/standalone/common/monarch/monarchLexer.ts [573:873]
229 lines of code
75 McCabe index (conditional complexity)

	private _myTokenize(lineWithoutLF: string, hasEOL: boolean, lineState: MonarchLineState, offsetDelta: number, tokensCollector: IMonarchTokensCollector): MonarchLineState {
		tokensCollector.enterLanguage(this._languageId);

		const lineWithoutLFLength = lineWithoutLF.length;
		const line = (hasEOL && this._lexer.includeLF ? lineWithoutLF + '\n' : lineWithoutLF);
		const lineLength = line.length;

		let embeddedLanguageData = lineState.embeddedLanguageData;
		let stack = lineState.stack;
		let pos = 0;

		// regular expression group matching
		// these never need cloning or equality since they are only used within a line match
		interface GroupMatching {
			matches: string[];
			rule: monarchCommon.IRule | null;
			groups: { action: monarchCommon.FuzzyAction; matched: string }[];
		}
		let groupMatching: GroupMatching | null = null;

		// See https://github.com/microsoft/monaco-editor/issues/1235
		// Evaluate rules at least once for an empty line
		let forceEvaluation = true;

		while (forceEvaluation || pos < lineLength) {

			const pos0 = pos;
			const stackLen0 = stack.depth;
			const groupLen0 = groupMatching ? groupMatching.groups.length : 0;
			const state = stack.state;

			let matches: string[] | null = null;
			let matched: string | null = null;
			let action: monarchCommon.FuzzyAction | monarchCommon.FuzzyAction[] | null = null;
			let rule: monarchCommon.IRule | null = null;

			let enteringEmbeddedLanguage: string | null = null;

			// check if we need to process group matches first
			if (groupMatching) {
				matches = groupMatching.matches;
				const groupEntry = groupMatching.groups.shift()!;
				matched = groupEntry.matched;
				action = groupEntry.action;
				rule = groupMatching.rule;

				// cleanup if necessary
				if (groupMatching.groups.length === 0) {
					groupMatching = null;
				}
			} else {
				// otherwise we match on the token stream

				if (!forceEvaluation && pos >= lineLength) {
					// nothing to do
					break;
				}

				forceEvaluation = false;

				// get the rules for this state
				let rules: monarchCommon.IRule[] | null = this._lexer.tokenizer[state];
				if (!rules) {
					rules = monarchCommon.findRules(this._lexer, state); // do parent matching
					if (!rules) {
						throw monarchCommon.createError(this._lexer, 'tokenizer state is not defined: ' + state);
					}
				}

				// try each rule until we match
				const restOfLine = line.substr(pos);
				for (const rule of rules) {
					if (pos === 0 || !rule.matchOnlyAtLineStart) {
						matches = restOfLine.match(rule.resolveRegex(state));
						if (matches) {
							matched = matches[0];
							action = rule.action;
							break;
						}
					}
				}
			}

			// We matched 'rule' with 'matches' and 'action'
			if (!matches) {
				matches = [''];
				matched = '';
			}

			if (!action) {
				// bad: we didn't match anything, and there is no action to take
				// we need to advance the stream or we get progress trouble
				if (pos < lineLength) {
					matches = [line.charAt(pos)];
					matched = matches[0];
				}
				action = this._lexer.defaultToken;
			}

			if (matched === null) {
				// should never happen, needed for strict null checking
				break;
			}

			// advance stream
			pos += matched.length;

			// maybe call action function (used for 'cases')
			while (monarchCommon.isFuzzyAction(action) && monarchCommon.isIAction(action) && action.test) {
				action = action.test(matched, matches, state, pos === lineLength);
			}

			let result: monarchCommon.FuzzyAction | monarchCommon.FuzzyAction[] | null = null;
			// set the result: either a string or an array of actions
			if (typeof action === 'string' || Array.isArray(action)) {
				result = action;
			} else if (action.group) {
				result = action.group;
			} else if (action.token !== null && action.token !== undefined) {

				// do $n replacements?
				if (action.tokenSubst) {
					result = monarchCommon.substituteMatches(this._lexer, action.token, matched, matches, state);
				} else {
					result = action.token;
				}

				// enter embedded language?
				if (action.nextEmbedded) {
					if (action.nextEmbedded === '@pop') {
						if (!embeddedLanguageData) {
							throw monarchCommon.createError(this._lexer, 'cannot pop embedded language if not inside one');
						}
						embeddedLanguageData = null;
					} else if (embeddedLanguageData) {
						throw monarchCommon.createError(this._lexer, 'cannot enter embedded language from within an embedded language');
					} else {
						enteringEmbeddedLanguage = monarchCommon.substituteMatches(this._lexer, action.nextEmbedded, matched, matches, state);
					}
				}

				// state transformations
				if (action.goBack) { // back up the stream..
					pos = Math.max(0, pos - action.goBack);
				}

				if (action.switchTo && typeof action.switchTo === 'string') {
					let nextState = monarchCommon.substituteMatches(this._lexer, action.switchTo, matched, matches, state);  // switch state without a push...
					if (nextState[0] === '@') {
						nextState = nextState.substr(1); // peel off starting '@'
					}
					if (!monarchCommon.findRules(this._lexer, nextState)) {
						throw monarchCommon.createError(this._lexer, 'trying to switch to a state \'' + nextState + '\' that is undefined in rule: ' + this._safeRuleName(rule));
					} else {
						stack = stack.switchTo(nextState);
					}
				} else if (action.transform && typeof action.transform === 'function') {
					throw monarchCommon.createError(this._lexer, 'action.transform not supported');
				} else if (action.next) {
					if (action.next === '@push') {
						if (stack.depth >= this._lexer.maxStack) {
							throw monarchCommon.createError(this._lexer, 'maximum tokenizer stack size reached: [' +
								stack.state + ',' + stack.parent!.state + ',...]');
						} else {
							stack = stack.push(state);
						}
					} else if (action.next === '@pop') {
						if (stack.depth <= 1) {
							throw monarchCommon.createError(this._lexer, 'trying to pop an empty stack in rule: ' + this._safeRuleName(rule));
						} else {
							stack = stack.pop()!;
						}
					} else if (action.next === '@popall') {
						stack = stack.popall();
					} else {
						let nextState = monarchCommon.substituteMatches(this._lexer, action.next, matched, matches, state);
						if (nextState[0] === '@') {
							nextState = nextState.substr(1); // peel off starting '@'
						}

						if (!monarchCommon.findRules(this._lexer, nextState)) {
							throw monarchCommon.createError(this._lexer, 'trying to set a next state \'' + nextState + '\' that is undefined in rule: ' + this._safeRuleName(rule));
						} else {
							stack = stack.push(nextState);
						}
					}
				}

				if (action.log && typeof (action.log) === 'string') {
					monarchCommon.log(this._lexer, this._lexer.languageId + ': ' + monarchCommon.substituteMatches(this._lexer, action.log, matched, matches, state));
				}
			}

			// check result
			if (result === null) {
				throw monarchCommon.createError(this._lexer, 'lexer rule has no well-defined action in rule: ' + this._safeRuleName(rule));
			}

			const computeNewStateForEmbeddedLanguage = (enteringEmbeddedLanguage: string) => {
				// support language names, mime types, and language ids
				const languageId = (
					this._languageService.getLanguageIdByLanguageName(enteringEmbeddedLanguage)
					|| this._languageService.getLanguageIdByMimeType(enteringEmbeddedLanguage)
					|| enteringEmbeddedLanguage
				);

				const embeddedLanguageData = this._getNestedEmbeddedLanguageData(languageId);

				if (pos < lineLength) {
					// there is content from the embedded language on this line
					const restOfLine = lineWithoutLF.substr(pos);
					return this._nestedTokenize(restOfLine, hasEOL, MonarchLineStateFactory.create(stack, embeddedLanguageData), offsetDelta + pos, tokensCollector);
				} else {
					return MonarchLineStateFactory.create(stack, embeddedLanguageData);
				}
			};

			// is the result a group match?
			if (Array.isArray(result)) {
				if (groupMatching && groupMatching.groups.length > 0) {
					throw monarchCommon.createError(this._lexer, 'groups cannot be nested: ' + this._safeRuleName(rule));
				}
				if (matches.length !== result.length + 1) {
					throw monarchCommon.createError(this._lexer, 'matched number of groups does not match the number of actions in rule: ' + this._safeRuleName(rule));
				}
				let totalLen = 0;
				for (let i = 1; i < matches.length; i++) {
					totalLen += matches[i].length;
				}
				if (totalLen !== matched.length) {
					throw monarchCommon.createError(this._lexer, 'with groups, all characters should be matched in consecutive groups in rule: ' + this._safeRuleName(rule));
				}

				groupMatching = {
					rule: rule,
					matches: matches,
					groups: []
				};
				for (let i = 0; i < result.length; i++) {
					groupMatching.groups[i] = {
						action: result[i],
						matched: matches[i + 1]
					};
				}

				pos -= matched.length;
				// call recursively to initiate first result match
				continue;
			} else {
				// regular result

				// check for '@rematch'
				if (result === '@rematch') {
					pos -= matched.length;
					matched = '';  // better set the next state too..
					matches = null;
					result = '';

					// Even though `@rematch` was specified, if `nextEmbedded` also specified,
					// a state transition should occur.
					if (enteringEmbeddedLanguage !== null) {
						return computeNewStateForEmbeddedLanguage(enteringEmbeddedLanguage);
					}
				}

				// check progress
				if (matched.length === 0) {
					if (lineLength === 0 || stackLen0 !== stack.depth || state !== stack.state || (!groupMatching ? 0 : groupMatching.groups.length) !== groupLen0) {
						continue;
					} else {
						throw monarchCommon.createError(this._lexer, 'no progress in tokenizer in rule: ' + this._safeRuleName(rule));
					}
				}

				// return the result (and check for brace matching)
				// todo: for efficiency we could pre-sanitize tokenPostfix and substitutions
				let tokenType: string | null = null;
				if (monarchCommon.isString(result) && result.indexOf('@brackets') === 0) {
					const rest = result.substr('@brackets'.length);
					const bracket = findBracket(this._lexer, matched);
					if (!bracket) {
						throw monarchCommon.createError(this._lexer, '@brackets token returned but no bracket defined as: ' + matched);
					}
					tokenType = monarchCommon.sanitize(bracket.token + rest);
				} else {
					const token = (result === '' ? '' : result + this._lexer.tokenPostfix);
					tokenType = monarchCommon.sanitize(token);
				}

				if (pos0 < lineWithoutLFLength) {
					tokensCollector.emit(pos0 + offsetDelta, tokenType);
				}
			}

			if (enteringEmbeddedLanguage !== null) {
				return computeNewStateForEmbeddedLanguage(enteringEmbeddedLanguage);
			}
		}

		return MonarchLineStateFactory.create(stack, embeddedLanguageData);
	}