# Setup

In [None]:
# Notebook-wide configuration. Both placeholders must be replaced before the
# following cells are run; LANGUAGE is also interpolated into the Argilla
# dataset name further down.
HF_ORG_NAME = None # update with the ID of the org you just created
LANGUAGE = None # update this with the language you will work on

In [None]:
# Fail fast when the configuration placeholders have not been filled in.
assert (
    HF_ORG_NAME is not None
), "Please set HF_ORG_NAME to the ID of the Hugging Face org you just created"
assert (
    LANGUAGE is not None
), "Please set LANGUAGE to the language your effort focuses on"

In [None]:
import argilla as rg

# Connect the Argilla client to the Space that hosts the translation dataset.
OWNER_API_KEY = "owner.apikey" # if you haven't set up the secret, this is the default owner API key
assert OWNER_API_KEY is not None, "Please set OWNER_API_KEY to the API token you just set in the Space settings"

# `homepage_url` is not assigned by any of the cells shown in this notebook, so
# a fresh-kernel run would otherwise stop here with a bare NameError. Raise the
# same exception type with an actionable message instead.
if "homepage_url" not in globals():
    raise NameError(
        "homepage_url is not defined: set it to the URL of your Argilla Space "
        "before running this cell"
    )

rg.init(api_url=homepage_url, api_key=OWNER_API_KEY)

In [None]:
from openai import OpenAI
from google.colab import userdata

from distilabel.llm.openai import OpenAILLM
from distilabel.tasks import TextGenerationTask
from distilabel.pipeline import Pipeline

# Get original dataset and translate it

This assumes you have already pushed the untranslated dataset to your Argilla Space.

In [None]:
# Fetch the dataset from Argilla, then convert it to the `datasets` format and
# rename the `source` column to `input` so distilabel can consume it.
dataset_name = f"DIBT Translation for {LANGUAGE}"
argilla_ds = rg.FeedbackDataset.from_argilla(dataset_name, workspace="admin")
hf_ds = argilla_ds.format_as("datasets")
hf_ds = hf_ds.rename_columns({"source": "input"})

In [None]:
# Read the OpenAI key from the Colab secret store instead of hard-coding it.
api_key = userdata.get("OPENAI_API_KEY")

target_lang = "Spanish" # change this with your target language name

# Translation LLM: the system prompt asks for an English -> target_lang
# translation that leaves code untouched and translates only comments/explanations.
llm = OpenAILLM(
    model="gpt-4-0613",  # standard GPT-4 snapshot (not a Turbo model)
    api_key=api_key,
    task=TextGenerationTask(system_prompt=f"You will be provided with a text in English, and your task is to translate it into {target_lang}. If it's code please don't translate the actual code, only the comments and the explanation."),
    num_threads=8,
    # gpt-4-0613 has an 8,192-token context window shared between the prompt and
    # the completion, so requesting 8,192 new tokens leaves no room for the
    # prompt and the request is rejected. 4,096 keeps half the window for input.
    max_new_tokens=4096,
)

pipe = Pipeline(generator=llm)

In [None]:
# Smoke test: run the pipeline on the first 10 rows only.
smoke_rows = hf_ds.select(range(10))
ds = pipe.generate(
    dataset=smoke_rows,
    batch_size=4,
    display_progress_bar=True,
)
# Inspect a few translations before launching the full pipeline.
ds.to_pandas().head(5)

In [None]:
# The sample looked fine, so translate the whole dataset.
# NOTE: this rebinds `ds`, replacing the results of the 10-row sample run.
full_run_options = dict(batch_size=4, display_progress_bar=True)
ds = pipe.generate(dataset=hf_ds, **full_run_options)

# Update the translations in the Argilla Space


In [None]:
# Keep only the first generation returned for each row.
translations = list(map(lambda candidates: candidates[0], ds["generations"]))
len(translations)

In [None]:
# Attach each translation to its Argilla record as a suggestion for the
# "target" question.
records = list(argilla_ds.records)

# zip() silently stops at the shorter input, which would leave some records
# without a suggestion and hide the mismatch. Check the counts first.
assert len(records) == len(translations), (
    f"Expected one translation per record, got {len(translations)} "
    f"translations for {len(records)} records"
)

altered_records = []
for rec, translation in zip(records, translations):
    rec.suggestions = [{"question_name": "target", "value": translation}]
    altered_records.append(rec)

# Show the first updated record as a quick sanity check.
altered_records[0]

In [None]:
# Send the records carrying the new "target" suggestions to the Argilla dataset.
argilla_ds.update_records(altered_records)