mlebench/competitions/tensorflow2-question-answering/prepare.py
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from mlebench.utils import get_logger

logger = get_logger(__name__)


def extract_string(document_text: str, start_token_idx: int, end_token_idx: int) -> str:
    """Return the whitespace-tokenised span [start_token_idx, end_token_idx) of document_text."""
    document_tokens = document_text.split(" ")
    extract_tokens = document_tokens[start_token_idx:end_token_idx]
    return " ".join(extract_tokens)


def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.
    """
    # Create our train and test sets from the competition's original train split
    train_file = "simplified-nq-train.jsonl"

    logger.info("Counting lines in train file...")
    with open(raw / train_file, "r") as f:
        n_lines = sum(1 for _ in f)
    logger.info(f"Found {n_lines} lines in train file.")

    # Read data in chunks to avoid memory issues
    train_ids, test_ids = [], []
    lightweight_test = []  # We'll use this to create a gold submission later
    with tqdm(total=n_lines, desc="Splitting data") as pbar:
        for df in pd.read_json(raw / train_file, orient="records", lines=True, chunksize=1_000):
            # Convert IDs to strings; Kaggle.com is inconsistent about this, but strings make more sense
            df["example_id"] = df["example_id"].astype(str)

            new_train, new_test = train_test_split(df, test_size=0.1, random_state=0)

            keys_to_keep = [
                "example_id",
                "question_text",
                "document_text",
                "long_answer_candidates",
            ]
            new_test_without_labels = new_test.copy()[keys_to_keep]

            # Append lines to the new train and test files
            with open(public / "simplified-nq-train.jsonl", "a") as f:
                f.write(new_train.to_json(orient="records", lines=True))
            with open(private / "test.jsonl", "a") as f:
                f.write(new_test.to_json(orient="records", lines=True))
            with open(public / "simplified-nq-test.jsonl", "a") as f:
                f.write(new_test_without_labels.to_json(orient="records", lines=True))

            train_ids.extend(new_train["example_id"].tolist())
            test_ids.extend(new_test["example_id"].tolist())
            lightweight_test.append(
                new_test.copy()[["example_id", "question_text", "annotations"]]
            )  # For the gold submission

            pbar.update(len(df))

    lightweight_test = pd.concat(lightweight_test, ignore_index=True)

    assert len(train_ids) + len(test_ids) == n_lines
    assert len(lightweight_test) == len(test_ids)

    # Create a gold submission with columns "example_id", "PredictionString"
    gold_rows = []
    for idx, sample in tqdm(
        lightweight_test.iterrows(), total=len(lightweight_test), desc="Creating gold submission"
    ):
        sample = sample.to_dict()
        assert len(sample["annotations"]) == 1
        annotation = sample["annotations"][0]

        # Create short answer
        # Multiple answers are possible: yes_no_answer or one of short_answers
        # We just take the first one
        if annotation["yes_no_answer"] != "NONE":
            answer = annotation["yes_no_answer"]
        elif len(annotation["short_answers"]) > 0:
            start_token = annotation["short_answers"][0]["start_token"]
            end_token = annotation["short_answers"][0]["end_token"]
            answer = f"{start_token}:{end_token}"
        else:
            answer = ""
        logger.debug(f"q: {sample['question_text']}")
        logger.debug(f"a: {answer}")
        logger.debug("")
        gold_rows.append(
            {"example_id": f"{sample['example_id']}_short", "PredictionString": answer}
        )

        # Create long answer
        if annotation["long_answer"]["start_token"] != -1:
            start_token = annotation["long_answer"]["start_token"]
            end_token = annotation["long_answer"]["end_token"]
            answer = f"{start_token}:{end_token}"
        else:
            answer = ""
        logger.debug(f"q: {sample['question_text']}")
        logger.debug(f"a: {answer}")
        logger.debug("")
        gold_rows.append({"example_id": f"{sample['example_id']}_long", "PredictionString": answer})
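
    # Each test example therefore yields two rows, e.g. (illustrative values only):
    #   {"example_id": "123456_short", "PredictionString": "YES"}
    #   {"example_id": "123456_long",  "PredictionString": "42:87"}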
    gold_submission = pd.DataFrame(gold_rows)
    gold_submission.to_csv(private / "gold_submission.csv", index=False)

    # Sample submission
    sample_submission = gold_submission.copy()
    sample_submission["PredictionString"] = ""
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    assert len(gold_submission) == 2 * len(test_ids)
    assert len(sample_submission) == 2 * len(test_ids)
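

# Minimal standalone usage sketch (hypothetical paths, for illustration only; the
# directory names below are assumptions, not part of the original module):
if __name__ == "__main__":
    base = Path("data/tensorflow2-question-answering")  # hypothetical location of the competition data
    public_dir = base / "prepared" / "public"
    private_dir = base / "prepared" / "private"
    public_dir.mkdir(parents=True, exist_ok=True)
    private_dir.mkdir(parents=True, exist_ok=True)
    # base / "raw" is assumed to already contain simplified-nq-train.jsonl
    prepare(raw=base / "raw", public=public_dir, private=private_dir)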