mlebench/competitions/text-normalization-challenge-english-language/prepare.py [44:93]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # The test set is the answers frame with the target ("after") and the
    # "class" column removed; .copy() keeps it independent of `answers`.
    new_test = answers.drop(["after", "class"], axis=1).copy()

    # Reformat answers to match sample submission format:
    # keep only the key columns and the target, then collapse the two keys into
    # a single string id of the form "<sentence_id>_<token_id>".
    answers = answers[["sentence_id", "token_id", "after"]].copy()
    answers["id"] = answers["sentence_id"].astype(str) + "_" + answers["token_id"].astype(str)
    answers = answers[["id", "after"]]

    # Create sample submission: same "<sentence_id>_<token_id>" ids as the
    # answers, built from the test rows.
    sample_submission = new_test[["sentence_id", "token_id", "before"]].copy()
    sample_submission["id"] = (
        sample_submission["sentence_id"].astype(str)
        + "_"
        + sample_submission["token_id"].astype(str)
    )
    # The placeholder prediction is the input token itself, copied unchanged
    # from "before" into "after".
    sample_submission["after"] = sample_submission["before"]
    sample_submission = sample_submission[["id", "after"]]

    # Checks: every output frame must have exactly the expected columns, in
    # this order. NOTE(review): these are plain `assert`s, so they are skipped
    # when Python runs with -O.
    assert new_train.columns.tolist() == [
        "sentence_id",
        "token_id",
        "class",
        "before",
        "after",
    ], f"new_train.columns.tolist() == {new_train.columns.tolist()}"
    assert new_test.columns.tolist() == [
        "sentence_id",
        "token_id",
        "before",
    ], f"new_test.columns.tolist() == {new_test.columns.tolist()}"
    assert sample_submission.columns.tolist() == [
        "id",
        "after",
    ], f"sample_submission.columns.tolist() == {sample_submission.columns.tolist()}"
    assert answers.columns.tolist() == [
        "id",
        "after",
    ], f"answers.columns.tolist() == {answers.columns.tolist()}"
    # The train and test splits together must have exactly as many rows as
    # old_train.
    assert len(new_test) + len(new_train) == len(
        old_train
    ), f"New train and test sets do not sum to old train set, got {len(new_test) + len(new_train)} and {len(old_train)}"

    # Write CSVs without the DataFrame index. QUOTE_NONNUMERIC wraps every
    # non-numeric field in double quotes. Both files below are written under
    # the `private` directory.
    answers.to_csv(
        private / "answers.csv", index=False, quotechar='"', quoting=csv.QUOTE_NONNUMERIC
    )
    sample_submission.to_csv(
        private / "sample_submission.csv", index=False, quotechar='"', quoting=csv.QUOTE_NONNUMERIC
    )
    new_train.to_csv(
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


mlebench/competitions/text-normalization-challenge-russian-language/prepare.py [42:91]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # The test set is the answers frame with the target ("after") and the
    # "class" column removed; .copy() keeps it independent of `answers`.
    new_test = answers.drop(["after", "class"], axis=1).copy()

    # Reformat answers to match sample submission format:
    # keep only the key columns and the target, then collapse the two keys into
    # a single string id of the form "<sentence_id>_<token_id>".
    answers = answers[["sentence_id", "token_id", "after"]].copy()
    answers["id"] = answers["sentence_id"].astype(str) + "_" + answers["token_id"].astype(str)
    answers = answers[["id", "after"]]

    # Create sample submission: same "<sentence_id>_<token_id>" ids as the
    # answers, built from the test rows.
    sample_submission = new_test[["sentence_id", "token_id", "before"]].copy()
    sample_submission["id"] = (
        sample_submission["sentence_id"].astype(str)
        + "_"
        + sample_submission["token_id"].astype(str)
    )
    # The placeholder prediction is the input token itself, copied unchanged
    # from "before" into "after".
    sample_submission["after"] = sample_submission["before"]
    sample_submission = sample_submission[["id", "after"]]

    # Checks: every output frame must have exactly the expected columns, in
    # this order. NOTE(review): these are plain `assert`s, so they are skipped
    # when Python runs with -O.
    assert new_train.columns.tolist() == [
        "sentence_id",
        "token_id",
        "class",
        "before",
        "after",
    ], f"new_train.columns.tolist() == {new_train.columns.tolist()}"
    assert new_test.columns.tolist() == [
        "sentence_id",
        "token_id",
        "before",
    ], f"new_test.columns.tolist() == {new_test.columns.tolist()}"
    assert sample_submission.columns.tolist() == [
        "id",
        "after",
    ], f"sample_submission.columns.tolist() == {sample_submission.columns.tolist()}"
    assert answers.columns.tolist() == [
        "id",
        "after",
    ], f"answers.columns.tolist() == {answers.columns.tolist()}"
    # The train and test splits together must have exactly as many rows as
    # old_train.
    assert len(new_test) + len(new_train) == len(
        old_train
    ), f"New train and test sets do not sum to old train set, got {len(new_test) + len(new_train)} and {len(old_train)}"

    # Write CSVs without the DataFrame index. QUOTE_NONNUMERIC wraps every
    # non-numeric field in double quotes. Both files below are written under
    # the `private` directory.
    answers.to_csv(
        private / "answers.csv", index=False, quotechar='"', quoting=csv.QUOTE_NONNUMERIC
    )
    sample_submission.to_csv(
        private / "sample_submission.csv", index=False, quotechar='"', quoting=csv.QUOTE_NONNUMERIC
    )
    new_train.to_csv(
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -