Sources/Tokenizers/ByteEncoder.swift (260 lines of code) (raw):
//
// ByteEncoder.swift
// CoreMLBert
//
// Created by Julien Chaumond on 18/07/2019.
// Copyright © 2019 Hugging Face. All rights reserved.
//
import Foundation
/// Lookup table that assigns every possible byte value (0...255) a distinct
/// single-character string.
///
/// - Bytes in `33...126`, `161...172` and `174...255` are represented by the
///   Unicode scalar that has the same numeric value as the byte.
/// - Every other byte (`0...32`, `127...160` and `173`) is represented by the
///   next unused scalar counting up from U+0100, taken in ascending byte order,
///   so byte 0 becomes U+0100 and byte 173 becomes U+0143.
///
/// NOTE(review): this layout appears to mirror the byte-to-unicode table used by
/// GPT-2 style byte-level BPE tokenizers — confirm against the reference vocabulary.
let byteEncoder: [UTF8.CodeUnit: String] = {
    // Byte ranges whose character has the same code point as the byte itself.
    let selfMapped: [ClosedRange<UTF8.CodeUnit>] = [33...126, 161...172, 174...255]

    var table = [UTF8.CodeUnit: String](minimumCapacity: 256)

    // Code point handed to the next byte that falls outside `selfMapped`.
    var nextRemapped: UInt16 = 0x0100

    for byte in UTF8.CodeUnit.min...UTF8.CodeUnit.max {
        let codeUnit: UInt16
        if selfMapped.contains(where: { $0.contains(byte) }) {
            codeUnit = UInt16(byte)
        } else {
            codeUnit = nextRemapped
            nextRemapped += 1
        }
        // Every value produced here lies in U+0021...U+0143, i.e. a single
        // non-surrogate UTF-16 code unit, so decoding yields exactly one scalar.
        table[byte] = String(decoding: CollectionOfOne(codeUnit), as: UTF16.self)
    }

    return table
}()
/// Reverse lookup table built by passing `byteEncoder` through `Utils.invert`.
// NOTE(review): presumably maps each encoded character string back to its byte
// value (i.e. `[String: UTF8.CodeUnit]`); `Utils.invert` is defined outside this
// file — confirm its return type and how it treats duplicate values.
let byteDecoder = Utils.invert(byteEncoder)