public List<string> Encode(List<string> tokens)

in Runtime/Tokenizers/Tokenizers/Tokenizers.cs [218:289]


        /// <summary>
        /// Splits each input token into vocabulary sub-tokens using a greedy,
        /// longest-match-first search.
        /// </summary>
        /// <remarks>
        /// For every token the longest leading piece found in <c>TokensToIds</c> is taken
        /// first, then the search repeats on the remainder of the token. Candidates that
        /// do not start at the beginning of the token are looked up with
        /// <c>ContinuingSubwordPrefix</c> prepended. If any position has no matching
        /// piece, the whole token is replaced by a single <c>UnkToken</c>. An empty token
        /// contributes nothing to the result.
        /// </remarks>
        /// <param name="tokens">
        /// Tokens to encode. Neither the list nor its elements are validated here, so
        /// they must not be null.
        /// </param>
        /// <returns>
        /// A new list holding, in input order, the sub-tokens of every token (or
        /// <c>UnkToken</c> for tokens that could not be fully matched).
        /// </returns>
        public List<string> Encode(List<string> tokens)
        {
            var outputTokens = new List<string>();

            foreach (var token in tokens)
            {
                // Pieces are buffered per token because a failure at any position
                // discards everything matched so far for that token.
                var subTokens = new List<string>();
                var isUnknown = false;
                var start = 0;

                while (start < token.Length)
                {
                    var end = token.Length;
                    string match = null;

                    // Try the longest candidate first and shrink it from the right
                    // until one is present in the vocabulary.
                    while (start < end)
                    {
                        // Substring slices the token directly; no intermediate char[]
                        // or LINQ iterator is needed to build each candidate.
                        var candidate = token.Substring(start, end - start);

                        if (start > 0)
                        {
                            // Pieces that continue a word are stored with the prefix.
                            candidate = ContinuingSubwordPrefix + candidate;
                        }

                        if (TokensToIds.ContainsKey(candidate))
                        {
                            match = candidate;
                            break;
                        }

                        --end;
                    }

                    if (match == null)
                    {
                        // Not even a single character matched at this position.
                        isUnknown = true;
                        break;
                    }

                    subTokens.Add(match);

                    // Resume right after the piece that was just consumed.
                    start = end;
                }

                if (isUnknown)
                {
                    outputTokens.Add(UnkToken);
                }
                else
                {
                    outputTokens.AddRange(subTokens);
                }
            }

            return outputTokens;
        }