in Runtime/Tokenizers/Tokenizers/Tokenizers.cs [218:289]
public List<string> Encode(List<string> tokens)
{
    // Splits every input token into vocabulary pieces with a greedy
    // longest-match-first search:
    //   * starting at the beginning of the token, the longest substring that is a
    //     key of TokensToIds is emitted, then the search restarts right after it;
    //   * every piece that does not start at index 0 is looked up (and emitted)
    //     with ContinuingSubwordPrefix prepended;
    //   * if some remainder has no matching piece at all, the pieces found so far
    //     are discarded and the whole token is replaced by a single UnkToken;
    //   * an empty token contributes nothing to the output.
    // Returns a new list holding the pieces of all tokens in input order.
    // A null `tokens` list or a null element is not handled here and surfaces as
    // a NullReferenceException.
    var outputTokens = new List<string>();

    foreach (var token in tokens)
    {
        var subTokens = new List<string>();
        var isUnknown = false;
        var start = 0;

        while (start < token.Length)
        {
            // Longest vocabulary entry starting at `start`, or null if none exists.
            string match = null;

            // Shrink the window from the right until a vocabulary entry is found.
            // `end` is declared outside the for-header because it marks where the
            // next search has to start once a match is found.
            var end = token.Length;
            for (; end > start; --end)
            {
                // Substring copies the range directly; no need to go through
                // LINQ Skip/Take and a temporary char[] for every candidate.
                var candidate = token.Substring(start, end - start);
                if (start > 0)
                {
                    // Pieces that continue a word are stored with the prefix.
                    candidate = ContinuingSubwordPrefix + candidate;
                }

                if (TokensToIds.ContainsKey(candidate))
                {
                    match = candidate;
                    break;
                }
            }

            if (match == null)
            {
                // Not even a single character at `start` is in the vocabulary,
                // so the token as a whole cannot be represented.
                isUnknown = true;
                break;
            }

            subTokens.Add(match);
            start = end;
        }

        if (isUnknown)
        {
            outputTokens.Add(UnkToken);
        }
        else
        {
            outputTokens.AddRange(subTokens);
        }
    }

    return outputTokens;
}