in preprocessing.py [0:0]
def main(nl2bash, dev_dir, cmd_options):
    """Build the cleaned training tables and plain-text corpora under ``dev_dir``.

    Steps, in order:

    1. Collect the raw pairs with ``collect_data(nl2bash, dev_dir)``, clean the
       ``cmd`` and ``invocation`` columns into ``cmd_cleaned`` and
       ``text_cleaned``, de-duplicate, drop commands starting with ``root``,
       strip arguments with ``remove_args`` and de-duplicate / filter again.
       The result is written to ``{dev_dir}/train.csv``.
    2. Read the CSV at ``cmd_options``, drop rows whose ``options`` is missing,
       parse and clean each row's options into ``cleaned_options``, and write
       ``{dev_dir}/man.csv``.
    3. Write three line-oriented files into ``dev_dir``: ``text`` (lower-cased
       descriptions followed by lower-cased option texts), ``cmd`` (cleaned
       commands) and ``all`` (the lines of ``text`` followed by those of
       ``cmd``).

    Args:
        nl2bash: Passed through unchanged to ``collect_data``.
        dev_dir: Directory prefix that receives every output file.
        cmd_options: Path of the CSV that holds an ``options`` column.

    Returns:
        None. The function is used for its file-writing side effects.
    """
    df = collect_data(nl2bash, dev_dir)
    df['cmd_cleaned'] = df['cmd'].apply(clean_cmd)
    df['text_cleaned'] = df['invocation'].apply(clean_text)

    # Take an explicit copy: the filtered frame gets a column assigned right
    # below, and assigning into the result of a boolean selection is what
    # triggers pandas' SettingWithCopyWarning.
    df = _dedupe_and_drop_root(df).copy()
    df['cmd_cleaned'] = df['cmd_cleaned'].apply(remove_args)
    # Removing arguments can make previously distinct rows identical, so the
    # same de-duplication and filter are applied a second time.
    df = _dedupe_and_drop_root(df)
    df.to_csv(f'{dev_dir}/train.csv')

    mandf = pd.read_csv(cmd_options)
    mandf = mandf.dropna(subset=['options']).reset_index(drop=True)
    mandf['cleaned_options'] = mandf['options'].apply(_clean_options)
    mandf.to_csv(f'{dev_dir}/man.csv')

    # Natural-language side: descriptions first, then every option text.
    text_lines = [text.lower() for text in df['text_cleaned']]
    text_lines.extend(
        opt['text'].lower()
        for row_options in mandf['cleaned_options']
        for opt in row_options
    )
    # Command side is written verbatim (no lower-casing).
    cmd_lines = list(df['cmd_cleaned'])

    _write_lines(f'{dev_dir}/text', text_lines)
    _write_lines(f'{dev_dir}/cmd', cmd_lines)
    _write_lines(f'{dev_dir}/all', text_lines + cmd_lines)


def _dedupe_and_drop_root(df):
    """Drop duplicate (cmd_cleaned, text_cleaned) rows and 'root'-prefixed commands."""
    df = df.drop_duplicates(subset=['cmd_cleaned', 'text_cleaned'])
    keep = df['cmd_cleaned'].apply(lambda cmd: not cmd.startswith('root'))
    return df.loc[keep]


def _clean_options(options):
    """Parse one ``options`` cell and return its entries with cleaned text.

    Each parsed entry must provide ``short``, ``long`` and ``text``. ``text``
    may be a single string or an iterable of strings; in the latter case the
    cleaned pieces are joined with single spaces.
    """
    # NOTE(review): eval() executes whatever expression the CSV cell contains.
    # If the column only ever holds Python literals, ast.literal_eval is the
    # safe replacement -- confirm against the data before switching.
    parsed = eval(options)
    cleaned = []
    for opt in parsed:
        text = opt['text']
        if isinstance(text, str):
            text = clean_text(text)
        else:
            text = ' '.join(clean_text(part) for part in text)
        cleaned.append({
            'short': opt['short'],
            'long': opt['long'],
            'text': text,
        })
    return cleaned


def _write_lines(path, lines):
    """Write every item of ``lines`` to ``path``, one per line."""
    # Explicit UTF-8 keeps the output independent of the platform locale and
    # consistent with the CSV files, which pandas writes as UTF-8 by default.
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in lines)