misc/precision_filtering/count_common.py [7:18]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
import pandas as pd

# Load the per-language tokenizer assignment table shipped with datatrove
url = 'https://raw.githubusercontent.com/huggingface/datatrove/main/src/datatrove/assets/tokenizer_assignment.csv'
df = pd.read_csv(url)

# Function to generate a tokenizer identifier
def generate_tokenizer_identifier(row):
    return f"{row['type']}-{row['tok_code']}"

# Drop rows that have no assigned tokenizer type
df = df.dropna(subset=['type'])

# Apply the function to create a tokenizer column, and build a language
# identifier from the three-letter language code and script
df['tokenizer_identifier'] = df.apply(generate_tokenizer_identifier, axis=1)
df['lang_identifier'] = df.apply(lambda x: f"{x['code_3']}_{x['script']}", axis=1)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
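
For reference, a minimal sketch of what the two derived columns look like. The
rows below are made up for illustration; only the column names ('type',
'tok_code', 'code_3', 'script') are taken from the snippet above.

import pandas as pd

# Hypothetical rows mimicking the tokenizer_assignment.csv schema
toy = pd.DataFrame([
    {'type': 'SpaCyTokenizer', 'tok_code': 'en', 'code_3': 'eng', 'script': 'Latn'},
    {'type': 'StanzaTokenizer', 'tok_code': 'ar', 'code_3': 'arb', 'script': 'Arab'},
])

toy['tokenizer_identifier'] = toy.apply(lambda r: f"{r['type']}-{r['tok_code']}", axis=1)
toy['lang_identifier'] = toy.apply(lambda r: f"{r['code_3']}_{r['script']}", axis=1)

print(toy[['tokenizer_identifier', 'lang_identifier']].to_string(index=False))
# tokenizer_identifier lang_identifier
#    SpaCyTokenizer-en        eng_Latn
#   StanzaTokenizer-ar        arb_Arab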



misc/precision_filtering/slurm_count_word.py [4:17]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
import pandas as pd

# Load the per-language tokenizer assignment table shipped with datatrove
url = 'https://raw.githubusercontent.com/huggingface/datatrove/main/src/datatrove/assets/tokenizer_assignment.csv'
df = pd.read_csv(url)


# Function to generate a tokenizer identifier
def generate_tokenizer_identifier(row):
    return f"{row['type']}-{row['tok_code']}"

# Drop rows that have no assigned tokenizer type
df = df.dropna(subset=['type'])
# Optionally also exclude languages whose assigned tokenizer is Stanza:
# df = df[df['type'] != 'StanzaTokenizer']

# Apply the function to create a tokenizer column, and build a language
# identifier from the three-letter language code and script
df['tokenizer_identifier'] = df.apply(generate_tokenizer_identifier, axis=1)
df['lang_identifier'] = df.apply(lambda x: f"{x['code_3']}_{x['script']}", axis=1)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
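
The commented-out line above would additionally drop every language whose
assigned tokenizer is Stanza. A minimal, self-contained sketch of the two
filters together (rows are made up for illustration):

import pandas as pd

# Hypothetical rows; only the 'type' column matters for these filters
toy = pd.DataFrame({'type': ['SpaCyTokenizer', 'StanzaTokenizer', None]})

toy = toy.dropna(subset=['type'])            # removes the row with no tokenizer
toy = toy[toy['type'] != 'StanzaTokenizer']  # removes the Stanza row

print(toy)  # only the SpaCyTokenizer row remains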



