neuron_explainer/activations/derived_scalars/least_common_tokens.py (598 lines of code) (raw):

# Note: A handful of strings are commented out because they encode to two tokens. We could recreate # this list as a list of integers to avoid this issue. Doing so would presumably also fix the weird # unrenderable characters ("�"). LEAST_COMMON_GPT2_TOKEN_STRS = [ "�", " �", "��", " Tradable", "galitarian", # "の�", "aughtered", "ciating", " resil", "ruciating", " unsus", " antioxid", " ├──", "66666666", "quished", "osponsors", " surpr", '"},"', " perspect", " Cosponsors", "Orderable", " glim", " Takeru", "hedon", " streng", "sembly", "strous", "ngth", "alkyrie", "usra", " rebell", " disadvant", "EMOTE", " ancest", "TextColor", "paio", " convol", " volunte", " reperto", "Reloaded", " Reincarnated", "untled", "���", " simultane", "lished", " Magicka", " disapp", "=-=-", " Adamant", " ``(", " simulac", "reddits", "landish", " ALEC", " unemploy", " assum", "enfranch", " Sorce", "ifference", " linem", " Awoken", "7601", "!'\"", "sofar", "livious", "everal", "dyl", "ournal", " whistlebl", " ende", "ンジ", "anamo", " murd", " besie", ' …"', "udeb", "Footnote", "clinton", "Interstitial", " incent", " Cth", "FINEST", " cryst", " reperc", " prosec", "accompan", "aucuses", "oldown", "iseum", " confir", "zbollah", " behav", "ulhu", " acknow", "iberal", " dilig", "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\", " juven", "glomer", " nodd", " Yanuk", " Abyssal", " UNCLASSIFIED", " defe", " comprom", " taxp", "ecause", " NASL", " rall", "=-=-=-=-", "displayText", "efficients", " therap", " opio", "ometimes", "acerb", " artific", '":-', "DonaldTrump", "lyak", "..................", " <@", "querque", " contrad", " Jagu", " hemor", "aepernick", " Azerb", " vulner", " subparagraph", "iscons", "imaru", "Depths", " constitu", " Carbuncle", " )))", "ドラゴン", "ヤ", "ゼ", "acly", " pione", " +#", " Sorceress", "atever", " Norn", " Pengu", "repre", " Unloaded", " redes", "qqa", " sovere", " paran", " challeng", "regor", " intrins", "rawdownloadcloneembedreportprint", "龍", ";;;;;;;;;;;;", " anecd", "emort", " satell", "ensical", " Primordial", "emale", " Pastebin", "ocally", " pse", " ..............", " unbeliev", " tradem", " VIDE", " describ", " sophistic", " insurg", " distingu", " preval", " cannabin", " Ezek", " withd", ")=(", "hement", "iversal", "jriwal", " filib", " subdu", "ailability", "andestine", # "龍�", "acebook", " Shards", " Mercenary", " ());", " partName", " prolet", " metic", "ichick", "STDOUT", " lapt", " Beir", " ⓘ", "catentry", "ヴ", "ーティ", "ensional", "bably", " unpop", " behavi", " horm", " 裏", "*=-", " veter", " contrace", "ocobo", "-+-+", " newsp", " Vaugh", " livest", "merga", " sshd", "¯¯", "blance", "````", " proletariat", "ptives", " encount", " egreg", " Frieza", " oun", " mosqu", " advoc", "VIDIA", " quir", " traged", " AoE", " conduc", " Bolshe", " BaseType", "uberty", " Protoss", " proletarian", "steamapps", " nostalg", "76561", "ItemImage", " Metatron", "umbledore", "@@@@@@@@", "WithNo", "uliffe", " horizont", " perpend", "ㅋ", " concess", "pmwiki", "ヘ", " comr", " derog", "エル", "udicrous", "CONCLUS", " discrep", " snipp", "appropriately", " Leban", " tremend", " ≡", "ilater", " GOODMAN", "aterasu", " Flavoring", '"]=>', "GGGGGGGG", "\\\\\\\\\\\\\\\\", " msec", " dracon", " blat", " enthusi", " neighb", "umenthal", " myster", "rarily", "olicy", "DeliveryDate", " GamerGate", " Sakuya", "70710", " defic", " councill", " WATCHED", "ovych", " miscon", " warr", " Archdemon", "gdala", "@#&", " suscept", "ドラ", " Moroc", " Ibid", "soType", "Nitrome", " Canaver", "umbn", " condem", " <+", " CLSID", "iosyn", "groupon", "chwitz", "戦", # "�士", " predec", " Palestin", "apego", "awaru", "ntil", "apons", "noxious", "aeper", " caut", "akeru", " Chocobo", " Shinra", " weap", " Citiz", " Swordsman", "PDATE", "munition", "vertisement", "iggurat", "utonium", " notor", "Sov", "risome", "iannopoulos", " exha", " Ukrain", "conservancy", " ├", " millenn", "itially", "luaj", " Kislyak", " detrim", " misunder", " Pyrrha", " Tsukuyomi", " reluct", " Strongh", " Marketable", '…."', "izoph", " Skydragon", "oldemort", "Marginal", "ThumbnailImage", " cumbers", "00007", "senal", "bleacher", "interstitial", "\u0000", " psychiat", " Gamergate", " explan", " Nanto", '":""},{"', " conclud", " eleph", " citiz", " pestic", " gobl", "hillary", " corrid", "ーテ", "ccording", "hovah", "Poké", "rontal", "Û", "\\/\\/", " srf", "anwhile", "\u0004", "\b", "\u000f", "\u000b", "\u0019", "\u001c", ".''.", " undermin", " [|", "ゴン", " adolesc", " proport", " earthqu", " showc", " welf", " destro", "Redditor", " Okawaru", '},{"', " nomine", "oreAnd", " shenan", "actionDate", "NetMessage", "milo", " looph", "ikuman", " teasp", " occas", '":"/', " subur", " helicop", " obser", '"}],"', "tnc", " Rohing", "aution", " mathemat", " dissatisf", "arnaev", " seiz", "wcsstore", " 4090", # "��極", " practition", " Kinnikuman", ">>\\", "░░", "████████", " entreprene", "ワン", "ウス", "Vaults", " skelet", ".」", " unlaw", "icter", "etheless", " (−", "\u0002", "\u0003", "\u0005", "\u0007", "\u000e", "\u0010", "\u0011", "\u0012", "\u0013", "\u0014", "\u0015", "\u0016", "\u0017", "\u0018", "\u001a", "\u001d", "\u001e", "\u001f", " conflic", "ortunately", "ÃÂ", "ÃÂÃÂ", "¯¯¯¯", "ÃÂÃÂÃÂÃÂ", "taboola", " unnecess", " suspic", "ÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂ", "byss", "¯¯¯¯¯¯¯¯", " lineback", " Dragonbound", " guiActive", " carbohyd", # " 裏�", " newcom", # "�醒", "ÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂ", " davidjl", "覚醒", " coerc", "ForgeModLoader", " exting", "?????-", " 裏覚醒", " unden", "¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯", "aditional", " antidepress", "PsyNetMessage", " guiActiveUn", " guiName", " externalTo", " unfocusedRange", " guiActiveUnfocused", " guiIcon", " externalToEVA", " externalToEVAOnly", " unintention", "reportprint", "embedreportprint", "cloneembedreportprint", "rawdownload", "SpaceEngineers", "ActionCode", "externalActionCode", "?????-?????-", "cffff", "MpServer", " gmaxwell", "cffffcc", ' "$:/', " Smartstocks", "bryce", # "��士", "龍喚士", "=~=~", "shapeshifter", " actionGroup", "#$#$", " ItemLevel", "ヘラ", "の魔", " Izan", " attRot", "ÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂ", " Mechdragon", " PsyNet", " RandomRedditor", " RandomRedditorWithNo", "ertodd", "00200000", " attm", " sidx", " dstg", " sqor", " istg", " petertodd", "StreamerBot", "TPPStreamerBot", "FactoryReloaded", "iHUD", "ItemThumbnailImage", " UCHIJ", " SetFontSize", "Buyable", "isSpecial", "quickShip", "quickShipAvailable", "isSpecialOrderable", "inventoryQuantity", "channelAvailability", "soDeliveryDate", "龍契士", "EStream", "uyomi", "姫", "oreAndOnline", "InstoreAndOnline", "BuyableInstoreAndOnline", "*/(", " +---", "govtrack", " TAMADRA", "natureconservancy", "assetsadobe", "farious", "Magikarp", "Downloadha", " TheNitrome", " TheNitromeFan", "GoldMagikarp", "闘", "DragonMagazine", " Seym", " srfN", " largeDownload", "?」", " srfAttach", "EStreamFrame", "ゼウス", " SolidGoldMagikarp", "ÍÍ", "黒", "ヴァ", " SetTextColor", " fixme", "dayName", " サーティ", " サーティワン", " Adinida", "ItemTracker", " DevOnline", "20439", "◼", "ÛÛ", " EntityItem", "EngineDebug", "ahime", " strutConnector", " Leilan", ]