def collect_data()

in preprocessing.py [0:0]


def collect_data(nl2bash, dev_dir):
    """Build one training table of (invocation, cmd) pairs from all sources.

    Sources, in the order they are concatenated:

    * ``original``  -- ``nl2bash``, read with ``pd.read_json`` and transposed,
      so each top-level JSON key becomes one row.
    * ``augmented`` -- the JSON-lines files listed below, under ``dev_dir``.
    * ``generated`` -- ``{dev_dir}/generated.csv`` (``query`` is renamed to
      ``invocation``).
    * ``manpage``   -- ``{dev_dir}/manpage_examples.csv`` (``context`` and
      ``command`` are renamed to ``invocation`` and ``cmd``).

    Args:
        nl2bash: Path to the original JSON dataset.
        dev_dir: Directory holding the augmented / generated / manpage files.

    Returns:
        A ``pandas.DataFrame`` with a fresh 0..n-1 index and (at least) the
        columns ``invocation`` (lower-cased, stripped), ``cmd`` (stripped)
        and ``origin``. Exact duplicate rows are removed; because ``origin``
        takes part in the comparison, the same pair coming from two
        different sources is kept once per source.
    """
    pair_columns = ['invocation', 'cmd']
    frames = []

    # original
    df = pd.read_json(nl2bash).T
    # Drop incomplete pairs here as for every other source: a missing value
    # would make the str.lower / str.strip normalisation below raise.
    # ``subset`` keeps any extra columns of the original dataset intact.
    df = df.dropna(subset=pair_columns)
    df['origin'] = 'original'
    frames.append(df)

    # augmented
    augmented_files = (
        # 'en-de-en-temp-sampling.json',
        'en-de-en-ru-en.json',
        'en-de-en.json',
        'en-ru-en-de-en.json',
        'en-ru-en.json',
    )
    df = pd.concat([
        pd.read_json(f'{dev_dir}/{name}', lines=True)
        for name in augmented_files
    ])
    df = df[pair_columns].dropna()
    df['origin'] = 'augmented'
    frames.append(df)

    # generated and manpage examples: CSV files that only differ in their
    # column names and in the origin label attached to them.
    csv_sources = (
        ('generated.csv', {'query': 'invocation'}, 'generated'),
        ('manpage_examples.csv',
         {'context': 'invocation', 'command': 'cmd'},
         'manpage'),
    )
    for filename, renames, origin in csv_sources:
        df = pd.read_csv(f'{dev_dir}/{filename}')
        df = df.rename(columns=renames)
        df = df[pair_columns].dropna()
        df['origin'] = origin
        frames.append(df)

    train_data = pd.concat(frames)
    # Normalise the text before de-duplicating so that pairs differing only
    # in case or surrounding whitespace collapse into one row.
    train_data['invocation'] = train_data['invocation'].apply(str.lower).apply(str.strip)
    train_data['cmd'] = train_data['cmd'].apply(str.strip)
    train_data = train_data.drop_duplicates().reset_index(drop=True)

    return train_data