# Basic dataset preparation with chunking


## 1. Concatenate multiple datasets

---


In [None]:
import os
import shutil

# Output directory for every JSONL file this notebook writes. It is wiped and
# recreated on each run so stale files from a previous run never linger.
DATA_DIR = "dataset"
# shutil.rmtree is the portable equivalent of `rm -rf`: it works without a
# shell and is not affected by spaces in the path. ignore_errors=True keeps
# the first run (directory not there yet) from failing.
shutil.rmtree(DATA_DIR, ignore_errors=True)
os.makedirs(DATA_DIR, exist_ok=True)

In [None]:
import json
import random
from datasets import load_dataset, concatenate_datasets

def formatting_en_func(examples):
    """Render a batch of conversations to plain text via the chat template.

    Reads the "conversations" entry of ``examples`` and returns
    ``{"text": [...]}`` with one rendered string per conversation.

    NOTE(review): relies on a ``tokenizer`` name that must already be in
    scope when this is called; it is not defined in this section.
    """
    def render(conversation):
        # tokenize=False -> get the formatted string back, not token ids.
        return tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False,
        )

    return {"text": [render(conversation) for conversation in examples["conversations"]]}


def formatting_ko_func(example):
    """Render one instruction/input/output row as a single chat-style prompt.

    The row's "input" becomes a ``<|system|>`` turn unless it is the empty
    string, in which case the system turn is omitted entirely. Returns
    ``{"text": prompt}``.
    """
    if example["input"] == "":
        system_turn = ""
    else:
        system_turn = f"<|system|>\n{example['input']}<|end|>\n"

    user_turn = f"<|user|>\n{example['instruction']}<|end|>\n"
    # The final turn intentionally has no trailing newline after <|end|>.
    assistant_turn = f"<|assistant|>\n{example['output']}<|end|>"

    return {"text": f"<s>{system_turn}{user_turn}{assistant_turn}"}

def formatting_guanaco_func(examples):
    """Convert one "### Human: ... ### Assistant: ..." row to chat-tag format.

    The row's "text" is split on "### "; everything before the first marker
    is discarded. In each remaining piece the speaker labels are replaced by
    ``<|user|>`` / ``<|assistant|>`` tags and ``<|end|>`` plus a newline is
    appended. Returns ``{"text": "<s>" + joined pieces}``.
    """
    # Index 0 is whatever precedes the first "### " marker; skip it.
    turns = examples["text"].split("### ")[1:]
    rendered = [
        turn.replace('Human: ', '<|user|>\n').replace('Assistant: ', '<|assistant|>\n')
        + '<|end|>\n'
        for turn in turns
    ]
    return {"text": "<s>" + "".join(rendered)}


# Load the first 1% of each source dataset's train split and render every row
# into a "text" column. For the two instruction-style sets all original
# columns are dropped, so only the formatter's "text" output remains.
dataset_ko1 = load_dataset("kyujinpy/KOR-OpenOrca-Platypus-v3", split="train[:1%]")
dataset_ko1 = dataset_ko1.map(
    formatting_ko_func,
    batched=False,
    remove_columns=dataset_ko1.features,
)

dataset_ko2 = load_dataset("kyujinpy/KOR-gugugu-platypus-set", split="train[:1%]")
dataset_ko2 = dataset_ko2.map(
    formatting_ko_func,
    batched=False,
    remove_columns=dataset_ko2.features,
)

# The guanaco formatter rewrites the existing "text" value; only 'id' is dropped.
dataset_ko3 = load_dataset("nlpai-lab/openassistant-guanaco-ko", split="train[:1%]")
dataset_ko3 = dataset_ko3.map(
    formatting_guanaco_func,
    batched=False,
    remove_columns=['id'],
)

# Merge the three sources and shuffle with a fixed seed.
dataset = concatenate_datasets([dataset_ko1, dataset_ko2, dataset_ko3]).shuffle(seed=42)

In [None]:
# Hold out 20% of the rows for evaluation. The split is seeded (like the
# shuffle that produced `dataset`) so the same rows land in the same file on
# every run; an unseeded split would reshuffle and make the files
# non-reproducible.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset['train']
test_dataset = dataset['test']

# Write each half as JSON Lines into DATA_DIR.
train_dataset.to_json(f"{DATA_DIR}/train_example1.jsonl")
test_dataset.to_json(f"{DATA_DIR}/eval_example1.jsonl")

<br>

## 2. Convert to OpenAI chat format

---


In [None]:
def convert_to_oai_format(data):
    """Turn instruction/output rows into shuffled chat-format records.

    Each row of ``data`` must provide "instruction" and "output". It becomes
    ``{"messages": [system, user, assistant]}``, where the system message is
    a fixed prompt, the user message is the instruction and the assistant
    message is the output. The resulting list is shuffled in place with the
    global ``random`` state before being returned.
    """
    system_prompt = """You are an AI assistant. Please reply users' answer using polite,clear and respectful language in Korean."""

    def to_chat(record):
        return {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": record["instruction"]},
                {"role": "assistant", "content": record["output"]},
            ]
        }

    chats = [to_chat(record) for record in data]
    random.shuffle(chats)
    return chats

def save_jsonl(dictionary_data, file_name):
    """Write each entry of ``dictionary_data`` as one JSON line to ``file_name``.

    The file is overwritten. It is encoded with the UTF-8-sig codec, so it
    starts with a UTF-8 byte-order mark; non-ASCII characters are written
    as-is rather than as \\u escapes.
    """
    with open(file_name, mode='w', encoding='UTF-8-sig') as sink:
        for record in dictionary_data:
            sink.write(json.dumps(record, ensure_ascii=False) + '\n')

In [None]:
from datasets import load_dataset
# Load the first 1% of the KOR-OpenOrca-Platypus-v3 train split for the
# chat-format example. This rebinds `dataset`, replacing the value assigned
# in the previous section.
dataset = load_dataset("kyujinpy/KOR-OpenOrca-Platypus-v3", split="train[:1%]")

In [None]:
# Split 80/20, convert each half to chat-format records, and write each half
# to its own JSONL file in DATA_DIR.
dataset = dataset.train_test_split(test_size=0.2)
formatted_train_data = convert_to_oai_format(dataset['train'])
formatted_valid_data = convert_to_oai_format(dataset['test'])
save_jsonl(formatted_train_data, f"{DATA_DIR}/train_example2.jsonl")
# The validation file must receive the held-out split; writing the training
# records here would make validation a copy of the training data.
save_jsonl(formatted_valid_data, f"{DATA_DIR}/valid_example2.jsonl")