mlebench/competitions/text-normalization-challenge-english-language/prepare.py [23:36]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Hold out 10% of the data at the sentence level: splitting on sentence_id
    # (rather than token rows) guarantees no sentence is partially in both sets.
    # random_state=0 pins the split for reproducibility.
    unique_sentence_ids = old_train["sentence_id"].unique()
    train_sentence_ids, test_sentence_ids = train_test_split(
        unique_sentence_ids, test_size=0.1, random_state=0
    )
    new_train = old_train[old_train["sentence_id"].isin(train_sentence_ids)]
    answers = old_train[old_train["sentence_id"].isin(test_sentence_ids)]
    # Sanity check: the two halves must not share any sentence.
    assert set(new_train["sentence_id"]).isdisjoint(
        set(answers["sentence_id"])
    ), "sentence_id is not disjoint between train and test sets"

    # "sentence_id" counts need to be reset for new_train and answers
    new_train_id_mapping = {
        old_id: new_id for new_id, old_id in enumerate(new_train["sentence_id"].unique())
    }
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



mlebench/competitions/text-normalization-challenge-russian-language/prepare.py [21:34]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Hold out 10% of the data at the sentence level: splitting on sentence_id
    # (rather than token rows) guarantees no sentence is partially in both sets.
    # random_state=0 pins the split for reproducibility.
    unique_sentence_ids = old_train["sentence_id"].unique()
    train_sentence_ids, test_sentence_ids = train_test_split(
        unique_sentence_ids, test_size=0.1, random_state=0
    )
    new_train = old_train[old_train["sentence_id"].isin(train_sentence_ids)]
    answers = old_train[old_train["sentence_id"].isin(test_sentence_ids)]
    # Sanity check: the two halves must not share any sentence.
    assert set(new_train["sentence_id"]).isdisjoint(
        set(answers["sentence_id"])
    ), "sentence_id is not disjoint between train and test sets"

    # "sentence_id" counts need to be reset for new_train and answers
    new_train_id_mapping = {
        old_id: new_id for new_id, old_id in enumerate(new_train["sentence_id"].unique())
    }
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



