misc/precision_filtering/count_common.py [7:27]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
url = 'https://raw.githubusercontent.com/huggingface/datatrove/main/src/datatrove/assets/tokenizer_assignment.csv'
df = pd.read_csv(url)

# Function to generate a tokenizer identifier
def generate_tokenizer_identifier(row):
    return f"{row['type']}-{row['tok_code']}"

df = df.dropna(subset=['type'])  # keep only rows with a tokenizer type assigned

# Build identifier columns: "{type}-{tok_code}" per tokenizer and "{code_3}_{script}" per language
df['tokenizer_identifier'] = df.apply(generate_tokenizer_identifier, axis=1)
df['lang_identifier'] = df.apply(lambda x: f"{x['code_3']}_{x['script']}", axis=1)


# Map each tokenizer identifier to the deduplicated list of language labels it covers
tokenizer_to_languages = df.groupby('tokenizer_identifier')['lang_identifier'].apply(set).apply(list).to_dict()


root_path = "/fsx/user_dir/language_tf/"
save_path = "/fsx/user_dir/common_freq/"
corpus_path = "/fsx/user_dir/glotlid-corpus/v3.1/"
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
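
For reference, tokenizer_to_languages maps each "{type}-{tok_code}" identifier to the list of "{code_3}_{script}" labels that share that tokenizer. Below is a minimal sketch of one way the mapping could be consumed together with root_path; the "{lang}.json" file naming under root_path is an assumption for illustration, not taken from the script.

import os

# Hedged sketch: report, per tokenizer, how many of its languages already have a
# term-frequency file under root_path. The "{lang}.json" naming is assumed for
# illustration only; the real layout of root_path may differ.
for tokenizer_id, languages in tokenizer_to_languages.items():
    present = [lang for lang in languages
               if os.path.exists(os.path.join(root_path, f"{lang}.json"))]
    print(f"{tokenizer_id}: {len(present)}/{len(languages)} languages have term-frequency files")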



misc/precision_filtering/wordlist_gen.py [66:86]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
url = 'https://raw.githubusercontent.com/huggingface/datatrove/main/src/datatrove/assets/tokenizer_assignment.csv'
df = pd.read_csv(url)

# Function to generate a tokenizer identifier
def generate_tokenizer_identifier(row):
    return f"{row['type']}-{row['tok_code']}"

df = df.dropna(subset=['type'])  # keep only rows with a tokenizer type assigned

# Build identifier columns: "{type}-{tok_code}" per tokenizer and "{code_3}_{script}" per language
df['tokenizer_identifier'] = df.apply(generate_tokenizer_identifier, axis=1)
df['lang_identifier'] = df.apply(lambda x: f"{x['code_3']}_{x['script']}", axis=1)


# Map each tokenizer identifier to the deduplicated list of language labels it covers
tokenizer_to_languages = df.groupby('tokenizer_identifier')['lang_identifier'].apply(set).apply(list).to_dict()


root_path = "/fsx/user_dir/language_tf/"
save_path = "/fsx/user_dir/common_freq/"
corpus_path = "/fsx/user_dir/glotlid-corpus/v3.1/"
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
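
In wordlist_gen.py the same mapping is paired with corpus_path, which points at the GlotLID corpus v3.1. A minimal sketch of how per-language word counts could be gathered from it is shown next; the one-directory-per-"{code_3}_{script}"-label layout, the whitespace tokenization, and the count_words_for_language helper are all assumptions for illustration, not the script's actual logic.

import os
from collections import Counter

def count_words_for_language(lang: str) -> Counter:
    """Count whitespace-separated tokens for one {code_3}_{script} label.

    Assumes one sub-directory of plain-text files per label under corpus_path;
    that layout (and this helper) is illustrative, not taken from the script.
    """
    counts = Counter()
    lang_dir = os.path.join(corpus_path, lang)
    if not os.path.isdir(lang_dir):
        return counts
    for name in sorted(os.listdir(lang_dir)):
        with open(os.path.join(lang_dir, name), encoding="utf-8") as f:
            for line in f:
                counts.update(line.split())
    return counts

The most frequent tokens per language could then be written out under save_path; that step is omitted from this sketch.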



