in muss/mining/training.py [0:0]
def get_all_baseline_rows():
    """Gather the baseline rows of every registered evaluation set.

    For each (dataset, phase) pair, ``get_baseline_rows`` is called with the
    path of the original sentences, the reference paths converted to a tuple,
    and the language code. Every returned row is then tagged in place with
    ``'dataset'`` and ``'phase'`` entries.

    Returns:
        list: The tagged rows of all evaluation sets, concatenated in the
        order the sets are registered below.
    """
    phases = ('test', 'valid')
    eval_sets = {}

    # English sets: their paths are looked up in TEST_SETS_PATHS, whose keys
    # use a prefix that is not always the dataset name used in the rows.
    english_sets = (
        ('asset', 'asset'),
        ('turkcorpus_detokenized', 'turkcorpus'),
    )
    for dataset, prefix in english_sets:
        for phase in phases:
            test_set = prefix + '_' + phase
            eval_sets[(dataset, phase)] = (
                'en',
                TEST_SETS_PATHS[(test_set, 'orig')],
                TEST_SETS_PATHS[(test_set, 'refs')],
            )

    # Remaining sets: a single reference file each, resolved through
    # get_data_filepath and wrapped in a one-element list.
    single_reference_sets = (
        ('alector', 'fr'),
        # Old dataset with problems
        ('simplext_corpus_all', 'es'),
        ('simplext_corpus_all_fixed', 'es'),
        ('simpitiki', 'it'),
    )
    for dataset, lang in single_reference_sets:
        for phase in phases:
            complex_path = get_data_filepath(dataset, phase, 'complex')
            simple_path = get_data_filepath(dataset, phase, 'simple')
            eval_sets[(dataset, phase)] = (lang, complex_path, [simple_path])

    all_rows = []
    for (dataset, phase), (lang, complex_path, ref_paths) in tqdm(eval_sets.items()):
        # The reference paths are handed over as a tuple rather than a list.
        tagged_rows = get_baseline_rows(complex_path, tuple(ref_paths), lang)
        for row in tagged_rows:
            row['dataset'] = dataset
            row['phase'] = phase
        all_rows.extend(tagged_rows)
    return all_rows