in clutrr/main.py [0:0]
def store(self, train_data, test_data, args):
    """
    Write the generated dataset to a fresh folder and archive it.

    Steps performed:
    - Split the generated rows into train / test frames using the last
      value of every row ('train' or 'test')
    - Derive the train file name and one test file name per test task
      through ``self.assign_name``
    - Create a new ``data_<8 chars of a uuid4>`` folder under
      ``<parent of cwd>/<args.output_dir>`` and write the CSV files there
    - Write ``config.json`` (file names and generation arguments) and,
      if ``args.store_full_puzzles`` is set, ``puzzles.pkl``
    - Zip the folder, run ``self.analyze_data`` on it and, if
      ``args.mturk`` is set, ``self.keep_unique``

    :param train_data: iterable of ``(rows_and_puzzles, task_args)`` pairs,
        where ``rows_and_puzzles`` holds exactly three items: the column
        names, the list of rows and the puzzles. The ``task_args`` of the
        last pair are used for naming files and for the stored config.
    :param test_data: not used by this method; kept so the signature
        stays unchanged
    :param args: namespace providing ``train_tasks``, ``test_tasks``
        (comma separated), ``output_dir``, ``store_full_puzzles`` and
        ``mturk``
    :return: None
    :raises ValueError: if ``train_data`` yields no entries
    """
    # NOTE(review): nothing in this method ever fills this dict, so
    # puzzles.pkl always contains an empty dict.
    all_puzzles = {}

    train_frames = []
    test_frames = []
    train_args = None
    for rows_and_puzzles, train_args in train_data:
        assert len(rows_and_puzzles) == 3
        columns, rows = rows_and_puzzles[0], rows_and_puzzles[1]
        # the last value of every row says which split it belongs to
        train_frames.append(pd.DataFrame(
            columns=columns, data=[r for r in rows if r[-1] == 'train']))
        test_frames.append(pd.DataFrame(
            columns=columns, data=[r for r in rows if r[-1] == 'test']))

    if not train_frames:
        # pd.concat would raise an opaque ValueError on an empty list;
        # fail with the same exception type but a clear message.
        raise ValueError("train_data is empty: there is nothing to store")

    train_df = pd.concat(train_frames)
    test_df = pd.concat(test_frames)
    logger.info("Training rows : {}".format(len(train_df)))
    logger.info("Testing rows : {}".format(len(test_df)))

    # prepare configs
    train_fl_name = self.assign_name(train_args, args.train_tasks)
    all_config = {
        'train_task': {args.train_tasks: train_fl_name},
        # NOTE(review): never populated in this method.
        'test_tasks': {},
        # vars() returns the live __dict__ of train_args; store a copy so
        # that flipping data_type to 'test' below does not rewrite the
        # train entry as well.
        'args': {train_fl_name: dict(vars(train_args))},
    }

    test_fl_names = []
    test_dfs = []
    for test_task in args.test_tasks.split(','):
        # test files are named from the same args, flagged as test data
        train_args.data_type = 'test'
        test_fl_name = self.assign_name(train_args, test_task)
        all_config['args'][test_fl_name] = dict(vars(train_args))
        test_fl_names.append(test_fl_name)
        test_dfs.append(test_df[test_df.task_name == 'task_' + test_task])

    base_path = os.path.abspath(os.pardir)
    # derive folder name as a random selection of characters, retrying
    # until an unused name is found
    while True:
        folder_name = 'data_{}'.format(str(uuid.uuid4())[:8])
        directory = os.path.join(base_path, args.output_dir, folder_name)
        if not os.path.exists(directory):
            os.makedirs(directory)
            break

    train_df.to_csv(os.path.join(directory, train_fl_name))
    for test_fl_name, task_df in zip(test_fl_names, test_dfs):
        task_df.to_csv(os.path.join(directory, test_fl_name))

    # dump config; the file must be closed (and therefore flushed)
    # before the folder is archived below
    with open(os.path.join(directory, 'config.json'), 'w') as config_file:
        json.dump(all_config, config_file)
    if args.store_full_puzzles:
        # dump all puzzles
        with open(os.path.join(directory, 'puzzles.pkl'), 'wb') as puzzles_file:
            pkl.dump(all_puzzles, puzzles_file, protocol=-1)
    shutil.make_archive(directory, 'zip', directory)

    logger.info("Created dataset in {}".format(directory))
    self.analyze_data(directory)
    if args.mturk:
        self.keep_unique(directory)