in mlebench/competitions/billion-word-imputation/prepare.py [0:0]
def prepare(raw: Path, public: Path, private: Path):
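    """
    Split the extracted billion-word training file into the public and private
    competition files.

    Produces:
      - public/train_v2.txt.zip: ~99% of the sentences, unchanged
      - public/test_v2.txt.zip: ~1% of the sentences, each with one word removed ("id","sentence")
      - private/test.csv: the same test sentences with the word still present (the answer key)
      - private/sample_submission.csv: a copy of the uncompressed public test file
    """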
logger.info("Extracting raw / train_v2.txt.zip")
extract(raw / "train_v2.txt.zip", raw)
# computed this ahead of time
total_lines = 30301028
    with (
        open(raw / "train_v2.txt", "r") as old_train,
        open(public / "train_v2.txt", "w") as public_train,
        open(public / "test_v2.txt", "w") as public_test,
        open(private / "test.csv", "w") as private_test,
    ):
        public_test.write('"id","sentence"\n')
        private_test.write('"id","sentence"\n')

        line_count = 0
        test_count = 0
        train_count = 0

        # there is one sentence per line
        for sentence in tqdm(old_train, desc="Processing data", total=total_lines):
            # put ~1% of the sentences in test and the rest in train, matching Kaggle's original split;
            # sentences with only 2 words can't have a word removed, so keep them in train
            if np_rng.uniform() <= 0.01 and len(sentence.strip().split()) > 2:
                # strip the trailing linebreak and escape double quotes for CSV
                sentence = sentence.strip().replace('"', '""')
                removed_word_sentence = remove_random_word(sentence)
                private_test.write(f'{test_count},"{sentence}"\n')
                public_test.write(f'{test_count},"{removed_word_sentence}"\n')
                test_count += 1
            else:
                public_train.write(sentence)
                train_count += 1
            line_count += 1
            if line_count >= total_lines:
                break

    # the public files get compressed below (to match what's on kaggle.com), so copy
    # the uncompressed public test file to private as our sample submission while we
    # still have access to it
    shutil.copy(public / "test_v2.txt", private / "sample_submission.csv")

    # compress the public files
    logger.info("Compressing train_v2.txt")
    compress_file_to_zip(public / "train_v2.txt", public / "train_v2.txt.zip")
    logger.info("Compressing test_v2.txt")
    compress_file_to_zip(public / "test_v2.txt", public / "test_v2.txt.zip")

    # remove the original files
    (public / "train_v2.txt").unlink()
    (public / "test_v2.txt").unlink()

    # Checks
    assert not (public / "train_v2.txt").exists(), "public / 'train_v2.txt' should not exist"
    assert (public / "train_v2.txt.zip").exists(), "public / 'train_v2.txt.zip' should exist"
    assert not (public / "test_v2.txt").exists(), "public / 'test_v2.txt' should not exist"
    assert (public / "test_v2.txt.zip").exists(), "public / 'test_v2.txt.zip' should exist"

    private_test_line_count = count_lines_in_file(private / "test.csv")
    assert (
        # minus 1 to exclude the header row
        private_test_line_count - 1
        == test_count
    ), "private / 'test.csv' has incorrect number of lines"
    assert (
        count_lines_in_file(private / "sample_submission.csv") == private_test_line_count
    ), "private / 'sample_submission.csv' has incorrect number of lines"
    assert (
        test_count + train_count == total_lines
    ), "Expected the number of test and train samples to sum to the total number of lines in the original train file"