scripts/translation.py

import os import sys from huggingface_hub import InferenceClient from dotenv import load_dotenv load_dotenv() hf_token = os.environ.get("HF_TOKEN") if not hf_token: raise ValueError("HF_TOKEN not found in environment variables. Please set it in a .env file.") # Get the directory containing the current script script_dir = os.path.dirname(os.path.abspath(__file__)) default_inp_dir = os.path.join(script_dir, '..', 'units/en') default_model = "deepseek-ai/DeepSeek-R1" default_client = InferenceClient( provider="together", # api_key is read from the environment ) def auto_translate( output_lang: str, prompt: callable, inp_dir: str = default_inp_dir, model: str = default_model, client: InferenceClient = default_client ): get_output_path = lambda x: x.replace('/en', f'/{output_lang}') escape_special_tokens = lambda x: x.replace('<think>', '<%%think%%>').replace('</think>', '<%%/think%%>') unescape_special_tokens = lambda x: x.replace('<%%think%%>', '<think>').replace('<%%/think%%>', '</think>') # Get the list of all files in the directory, recursively inp_files: list[str] = [] print('Collecting files...') for root, dirs, files in os.walk(inp_dir): for file in files: if file.endswith('.mdx') or file == "_toctree.yml": fname = os.path.join(root, file) print(' +', fname) inp_files.append(fname) def write_out_file(fpath: str, content: str): base_path = os.path.dirname(fpath) os.makedirs(base_path, exist_ok=True) with open(fpath, 'w', encoding='utf-8') as f: f.write(content) # Read the content of the file and process for i, inp_file in enumerate(inp_files): out_file = get_output_path(inp_file) if os.path.exists(out_file): print(f'[{i+1}/{len(inp_files)}] Skipping file: {inp_file}') continue with open(inp_file, 'r', encoding='utf-8') as f: content: str = f.read() content = escape_special_tokens(content) if content.strip() == "": print(f'[{i+1}/{len(inp_files)}] Skipping empty file: {inp_file}') write_out_file(out_file, "") continue print(f'[{i+1}/{len(inp_files)}] Processing file: {inp_file}') stream = client.chat.completions.create( model=model, temperature=0.0, messages=[ {"role": "user", "content": prompt(content)}, ], stream=True, ) final_text = "" for chunk in stream: content_chunk = chunk.choices[0].delta.content print(content_chunk, end="", flush=True) final_text += content_chunk # Optionally filter <think>...</think> reasoning process final_text = final_text.split('</think>').pop().strip() # Write the output to the file final_text = unescape_special_tokens(final_text) write_out_file(out_file, final_text) print() print(f' -> Translated to: {out_file}') print("--" * 20) #break

scripts/translation.py (69 lines of code) (raw):