def main()

in preprocessing.py [0:0]
41 lines of code
12 McCabe index (conditional complexity)

def _parse_man_options(options):
    """Parse one serialized option list and clean each option's description.

    Args:
        options: String holding a Python literal: a sequence of dicts, each
            with the keys ``'short'``, ``'long'`` and ``'text'``. ``'text'``
            is either a single string or an iterable of strings.

    Returns:
        A list of dicts with the same three keys. ``'short'`` and ``'long'``
        are passed through unchanged; ``'text'`` is the cleaned description,
        with multi-part descriptions cleaned piecewise and joined by spaces.

    Raises:
        ValueError, SyntaxError: If ``options`` is not a valid Python literal.
    """
    import ast

    # SECURITY: this value comes from a CSV file on disk. It used to be passed
    # to eval(), which would execute arbitrary code embedded in the file.
    # literal_eval only accepts plain literals (lists, dicts, strings,
    # numbers, None, ...) and raises on anything else.
    parsed = ast.literal_eval(options)

    cleaned = []
    for opt in parsed:
        text = opt['text']
        if isinstance(text, str):
            text = clean_text(text)
        else:
            text = ' '.join(clean_text(part) for part in text)
        cleaned.append({
            'short': opt['short'],
            'long': opt['long'],
            'text': text,
        })
    return cleaned


def _write_lines(path, lines):
    """Write ``lines`` (each already newline-terminated) to ``path`` as UTF-8."""
    # Explicit encoding so the output does not depend on the platform locale.
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(lines)


def main(nl2bash, dev_dir, cmd_options):
    """Build the cleaned training tables and plain-text corpora in ``dev_dir``.

    Args:
        nl2bash: Forwarded as the first argument to ``collect_data``.
        dev_dir: Forwarded as the second argument to ``collect_data`` and used
            as the directory into which every output file is written.
        cmd_options: Path (or anything ``pd.read_csv`` accepts) of a CSV file
            with an ``options`` column holding serialized option lists.

    Outputs written under ``dev_dir``:
        train.csv: The frame returned by ``collect_data`` (which must provide
            ``cmd`` and ``invocation`` columns) plus ``cmd_cleaned`` and
            ``text_cleaned``, de-duplicated and filtered.
        man.csv: The ``cmd_options`` table with a ``cleaned_options`` column.
        text: Lower-cased cleaned invocations followed by lower-cased option
            descriptions, one per line.
        cmd: Cleaned commands, one per line.
        all: The contents of ``text`` followed by the contents of ``cmd``.
    """
    df = collect_data(nl2bash, dev_dir)

    df['cmd_cleaned'] = df['cmd'].apply(clean_cmd)
    df['text_cleaned'] = df['invocation'].apply(clean_text)
    df = df.drop_duplicates(subset=('cmd_cleaned', 'text_cleaned'))
    # Explicit copy: the filtered frame gets a column reassigned right below.
    df = df.loc[df.cmd_cleaned.apply(lambda x: not x.startswith('root'))].copy()

    # remove_args rewrites the command strings, so the de-duplication and the
    # 'root' prefix filter are applied a second time on the rewritten values.
    df['cmd_cleaned'] = df['cmd_cleaned'].apply(remove_args)
    df = df.drop_duplicates(subset=('cmd_cleaned', 'text_cleaned'))
    df = df.loc[df.cmd_cleaned.apply(lambda x: not x.startswith('root'))]
    df.to_csv(f'{dev_dir}/train.csv')

    mandf = pd.read_csv(cmd_options)
    # Rows without an options value cannot be parsed; drop them first.
    mandf = mandf.dropna(subset=['options']).reset_index(drop=True)
    mandf['cleaned_options'] = mandf.options.apply(_parse_man_options)
    mandf.to_csv(f'{dev_dir}/man.csv')

    # Build each corpus once; 'all' is the text corpus followed by the
    # command corpus.
    text_lines = [text.lower() + '\n' for text in df.text_cleaned]
    text_lines.extend(
        opt['text'].lower() + '\n'
        for opts in mandf.cleaned_options
        for opt in opts
    )
    cmd_lines = [cmd + '\n' for cmd in df.cmd_cleaned]

    _write_lines(f'{dev_dir}/text', text_lines)
    _write_lines(f'{dev_dir}/cmd', cmd_lines)
    _write_lines(f'{dev_dir}/all', text_lines + cmd_lines)