Sources/Tokenizers/ByteEncoder.swift (260 lines of code) (raw):

//
//  ByteEncoder.swift
//  CoreMLBert
//
//  Created by Julien Chaumond on 18/07/2019.
//  Copyright © 2019 Hugging Face. All rights reserved.
//

import Foundation

/// Maps every possible byte value (0...255) to a distinct single-character string.
///
/// - Bytes in `33...126`, `161...172` and `174...255` map to the character whose
///   Unicode code point equals the byte value.
/// - Every other byte (`0...32`, `127...160`, `173`) is assigned, in ascending byte
///   order, the next unused code point starting at U+0100 (ending at U+0143).
///
/// NOTE(review): this looks like the byte-to-unicode table used by GPT-2 style
/// byte-level BPE tokenizers — confirm against the tokenizer that consumes it.
let byteEncoder: [UTF8.CodeUnit: String] = {
    // Byte ranges that keep their own code point.
    let selfMappedRanges: [ClosedRange<UTF8.CodeUnit>] = [33...126, 161...172, 174...255]

    var table: [UTF8.CodeUnit: String] = [:]
    table.reserveCapacity(256)

    // Next code point to hand out to a byte that is not self-mapped.
    var nextRemappedCodePoint: UInt32 = 0x0100

    for byte in UTF8.CodeUnit.min...UTF8.CodeUnit.max {
        let codePoint: UInt32
        if selfMappedRanges.contains(where: { $0.contains(byte) }) {
            codePoint = UInt32(byte)
        } else {
            codePoint = nextRemappedCodePoint
            nextRemappedCodePoint += 1
        }

        // Every code point produced here lies in U+0021...U+0143, so the scalar
        // initializer cannot fail; the guard only satisfies the optional API.
        guard let scalar = Unicode.Scalar(codePoint) else {
            preconditionFailure("Invalid Unicode scalar value \(codePoint)")
        }
        table[byte] = String(Character(scalar))
    }

    return table
}()

/// Reverse lookup table derived from `byteEncoder` via `Utils.invert`.
///
/// Presumably maps each encoded single-character string back to its original
/// byte — verify against the implementation of `Utils.invert`.
let byteDecoder = Utils.invert(byteEncoder)