preprocessing.py (129 lines of code) (raw):
import re
from submission_code.preprocessing import clean_text
import argparse
import pandas as pd
import numpy as np
import sys
sys.path.append('../clai/utils')
from bashlint.data_tools import bash_parser
from metric.metric_utils import compute_metric
def collect_data(nl2bash, dev_dir):
    """Gather every training source into one de-duplicated DataFrame.

    Four sources are read, in this order, and tagged through an ``origin``
    column:

    * ``original``  -- the JSON file at ``nl2bash`` (transposed after load,
      so presumably keyed by example id at the top level -- TODO confirm).
    * ``augmented`` -- the JSON-lines files listed below, inside ``dev_dir``.
    * ``generated`` -- ``{dev_dir}/generated.csv``; its ``query`` column is
      renamed to ``invocation``.
    * ``manpage``   -- ``{dev_dir}/manpage_examples.csv``; ``context`` and
      ``command`` are renamed to ``invocation`` and ``cmd``.

    Args:
        nl2bash: Path to the original NL2Bash JSON file.
        dev_dir: Directory holding the augmented/generated/manpage files.

    Returns:
        A DataFrame with a fresh 0..n-1 index in which ``invocation`` is
        lower-cased and stripped, ``cmd`` is stripped, and fully identical
        rows (``origin`` included) have been dropped.
    """
    pair_columns = ['invocation', 'cmd']

    # Original corpus: kept as-is (no column selection, no dropna).
    original = pd.read_json(nl2bash).T.assign(origin='original')

    # Augmented corpora, one JSON-lines file per entry.
    augmented_names = (
        # 'en-de-en-temp-sampling.json',
        'en-de-en-ru-en.json',
        'en-de-en.json',
        'en-ru-en-de-en.json',
        'en-ru-en.json',
    )
    augmented_frames = [
        pd.read_json(f'{dev_dir}/{name}', lines=True)
        for name in augmented_names
    ]
    augmented = pd.concat(augmented_frames)[pair_columns].dropna()
    augmented = augmented.assign(origin='augmented')

    # Generated pairs: the natural-language side lives in a `query` column.
    generated = pd.read_csv(f'{dev_dir}/generated.csv')
    generated = generated.rename(columns={'query': 'invocation'})
    generated = generated[pair_columns].dropna().assign(origin='generated')

    # Manpage examples: both columns need renaming to the common schema.
    manpage = pd.read_csv(f'{dev_dir}/manpage_examples.csv')
    manpage = manpage.rename(columns={'context': 'invocation', 'command': 'cmd'})
    manpage = manpage[pair_columns].dropna().assign(origin='manpage')

    combined = pd.concat([original, augmented, generated, manpage])

    # Normalise text before de-duplicating so trivially different rows merge.
    lowered = combined['invocation'].apply(str.lower)
    combined['invocation'] = lowered.apply(str.strip)
    combined['cmd'] = combined['cmd'].apply(str.strip)

    return combined.drop_duplicates().reset_index(drop=True)
def _clean_cmd(node):
if node.kind.upper() == 'COMMANDSUBSTITUTION':
r = ' $(' + node.value
r += ' '.join([_clean_cmd(child) for child in node.children])
r += ')'
return r
if node.kind.upper() == 'PROCESSSUBSTITUTION':
r = ' ' + node.value + '('
r += ' '.join([_clean_cmd(child) for child in node.children])
r += ')'
return r
if node.kind.upper() == 'PIPELINE':
r = '|'.join([_clean_cmd(child) for child in node.children])
return r
r = ' ' + node.value
if node.kind.upper() == 'ARGUMENT':
r = ' ARG'
r += ' '.join([_clean_cmd(child) for child in node.children])
if '::;' in node.value:
r += ' \;'
if '::+' in node.value:
r += ' \+'
return r
def clean_cmd(cmd):
    """Parse a bash command and return its normalised template string.

    The command is parsed with ``bash_parser``, rendered by ``_clean_cmd``,
    stripped of the internal ``::;`` / ``::+`` markers, trimmed, and has
    every whitespace run collapsed to a single space.

    Args:
        cmd: Raw bash command line.

    Returns:
        The cleaned command string.
    """
    rendered = _clean_cmd(bash_parser(cmd))
    # The markers only signal the `\;` / `\+` terminators to _clean_cmd.
    rendered = rendered.replace('::;', '').replace('::+', '')
    rendered = rendered.strip()
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', rendered)
def remove_args(cmd):
    """Drop every ``ARG`` placeholder whose removal keeps the metric at 1.

    Each pass tries the ``ARG`` occurrences left to right, replacing one
    with a single space. The first candidate for which
    ``compute_metric(candidate, 1, cmd) == 1`` is adopted and the scan
    restarts; the loop ends when a full pass removes nothing.
    (``compute_metric`` is defined elsewhere; a result of 1 presumably
    means the candidate scores the same as ``cmd`` -- TODO confirm.)

    Args:
        cmd: Cleaned command string, possibly containing ``ARG`` tokens.

    Returns:
        The reduced command, trimmed, with whitespace runs collapsed to a
        single space.
    """
    removed = True
    while removed:
        removed = False
        parts = cmd.split('ARG')
        for i in range(1, len(parts)):
            # Re-join with a space in place of the i-th 'ARG' only.
            candidate = 'ARG'.join(parts[:i]) + ' ' + 'ARG'.join(parts[i:])
            if compute_metric(candidate, 1, cmd) == 1:
                cmd = candidate
                removed = True
                break
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    cmd = re.sub(r'\s+', ' ', cmd)
    return cmd.strip()
def _clean_man_options(options):
    """Turn one serialised ``options`` cell into a list of cleaned dicts.

    Args:
        options: String holding a Python literal -- a sequence of dicts
            with ``short``, ``long`` and ``text`` keys, where ``text`` is
            either a string or an iterable of strings.

    Returns:
        A list of ``{'short', 'long', 'text'}`` dicts whose ``text`` has
        been passed through ``clean_text`` (pieces joined with a space
        when ``text`` was not a single string).
    """
    # SECURITY NOTE(review): eval() executes whatever the CSV cell contains.
    # Only run this on trusted files; consider ast.literal_eval if the
    # cells are guaranteed to be plain literals.
    parsed = eval(options)
    return [
        {
            'short': opt['short'],
            'long': opt['long'],
            'text': clean_text(opt['text']) if isinstance(opt['text'], str)
            else ' '.join([clean_text(piece) for piece in opt['text']]),
        }
        for opt in parsed
    ]


def main(nl2bash, dev_dir, cmd_options):
    """Build the training table and the plain-text corpora in ``dev_dir``.

    Files written (all inside ``dev_dir``):

    * ``train.csv`` -- collected pairs with ``cmd_cleaned`` / ``text_cleaned``.
    * ``man.csv``   -- the ``cmd_options`` table plus ``cleaned_options``.
    * ``text``      -- lower-cased cleaned invocations, then option texts.
    * ``cmd``       -- cleaned commands, one per line.
    * ``all``       -- the contents of ``text`` followed by those of ``cmd``.

    Args:
        nl2bash: Path to the original NL2Bash JSON file.
        dev_dir: Directory with the auxiliary inputs; also the output dir.
        cmd_options: Path to a CSV file with an ``options`` column.
    """
    df = collect_data(nl2bash, dev_dir)
    df['cmd_cleaned'] = df['cmd'].apply(clean_cmd)
    df['text_cleaned'] = df['invocation'].apply(clean_text)

    # De-duplicate and drop commands rendered as 'root...' both before and
    # after placeholder removal (presumably commands the parser could not
    # expand beyond the root node -- TODO confirm).
    df = df.drop_duplicates(subset=('cmd_cleaned', 'text_cleaned'))
    df = df.loc[df.cmd_cleaned.apply(lambda x: not x.startswith('root'))]
    df['cmd_cleaned'] = df['cmd_cleaned'].apply(remove_args)
    df = df.drop_duplicates(subset=('cmd_cleaned', 'text_cleaned'))
    df = df.loc[df.cmd_cleaned.apply(lambda x: not x.startswith('root'))]
    df.to_csv(f'{dev_dir}/train.csv')

    mandf = pd.read_csv(cmd_options)
    mandf = mandf.dropna(subset=['options']).reset_index(drop=True)
    mandf['cleaned_options'] = mandf.options.apply(_clean_man_options)
    mandf.to_csv(f'{dev_dir}/man.csv')

    # Build each corpus once; 'all' is simply text lines followed by cmds.
    text_lines = [text.lower() + '\n' for text in df.text_cleaned]
    text_lines += [
        opt['text'].lower() + '\n'
        for opts in mandf.cleaned_options
        for opt in opts
    ]
    cmd_lines = [cmd + '\n' for cmd in df.cmd_cleaned]

    with open(f'{dev_dir}/text', 'w') as f:
        f.writelines(text_lines)
    with open(f'{dev_dir}/cmd', 'w') as f:
        f.writelines(cmd_lines)
    with open(f'{dev_dir}/all', 'w') as f:
        f.writelines(text_lines)
        f.writelines(cmd_lines)
if __name__ == '__main__':
    # Command-line entry point: three positional string arguments, passed
    # straight through to main() in the same order.
    cli = argparse.ArgumentParser()
    for positional in ('nl2bash', "dev_dir", "cmd_options"):
        cli.add_argument(positional, type=str)
    options = cli.parse_args()
    main(options.nl2bash, options.dev_dir, options.cmd_options)