in preprocessing.py [0:0]
def collect_data(nl2bash, dev_dir):
    """Build one de-duplicated table of (invocation, cmd) training pairs.

    Four sources are read and stacked in this order, each tagged through
    an ``origin`` column:

    * ``'original'``  -- the JSON at ``nl2bash``, transposed so that each
      top-level key becomes a row (all of its columns are kept as-is; it
      is expected to provide ``invocation`` and ``cmd``);
    * ``'augmented'`` -- the JSON-lines files listed below in ``dev_dir``;
    * ``'generated'`` -- ``generated.csv`` (its ``query`` column is
      renamed to ``invocation``);
    * ``'manpage'``   -- ``manpage_examples.csv`` (``context`` and
      ``command`` are renamed to ``invocation`` and ``cmd``).

    For every source except the original one, only the ``invocation`` and
    ``cmd`` columns are kept and rows with a missing value are discarded.

    Args:
        nl2bash: Location of the original dataset, passed to
            ``pd.read_json``.
        dev_dir: Directory holding the derived dataset files.

    Returns:
        A ``DataFrame`` with a fresh 0..n-1 index in which ``invocation``
        is lower-cased and stripped, ``cmd`` is stripped, and fully
        identical rows (``origin`` included) appear only once.
    """
    pair_columns = ['invocation', 'cmd']
    augmented_files = (
        # 'en-de-en-temp-sampling.json',
        'en-de-en-ru-en.json',
        'en-de-en.json',
        'en-ru-en-de-en.json',
        'en-ru-en.json',
    )

    def tagged_pairs(frame, origin):
        # Keep complete (invocation, cmd) rows only and record their source.
        return frame[pair_columns].dropna().assign(origin=origin)

    # Original dataset: every column is kept, no rows are dropped.
    original = pd.read_json(nl2bash).T.assign(origin='original')

    # Augmented datasets: one JSON-lines file per entry in the tuple above.
    augmented_parts = [
        pd.read_json(f'{dev_dir}/{name}', lines=True)
        for name in augmented_files
    ]
    augmented = tagged_pairs(pd.concat(augmented_parts), 'augmented')

    # Generated dataset.
    generated_raw = pd.read_csv(f'{dev_dir}/generated.csv')
    generated = tagged_pairs(
        generated_raw.rename(columns={'query': 'invocation'}),
        'generated',
    )

    # Manpage examples.
    manpage_raw = pd.read_csv(f'{dev_dir}/manpage_examples.csv')
    manpage = tagged_pairs(
        manpage_raw.rename(columns={'context': 'invocation', 'command': 'cmd'}),
        'manpage',
    )

    # Stacking order matters: drop_duplicates() keeps the first occurrence.
    combined = pd.concat([original, augmented, generated, manpage])

    lowered = combined['invocation'].apply(str.lower)
    combined['invocation'] = lowered.apply(str.strip)
    combined['cmd'] = combined['cmd'].apply(str.strip)

    return combined.drop_duplicates().reset_index(drop=True)