def store()

in clutrr/main.py [0:0]


    def store(self, train_data, test_data, args):
        """
        Write the generated dataset to a fresh folder and archive it.

        Take the dataset and do the following:
        - Split the generated rows into a train and a test frame, using the
          last field of every row ('train' / 'test')
        - Create a name for the train file and for one file per test task
        - Create a uniquely named ``data_<8 hex chars>`` folder and put the
          CSV files in
        - Write the config in ``config.json`` and put it in the folder
        - Optionally pickle the puzzles, then zip the folder
        - Run ``analyze_data`` on the folder (and ``keep_unique`` when
          ``args.mturk`` is set)

        Side effect: ``data_type`` of the last task's args object is set to
        ``'test'`` and left that way.

        :param train_data: non-empty iterable of
            ``((columns, rows, puzzles), task_args)`` tuples
        :param test_data: not used by this method; the test rows are taken
            from ``train_data``
        :param args: run arguments; ``train_tasks``, ``test_tasks``,
            ``output_dir``, ``store_full_puzzles`` and ``mturk`` are read
        :raises ValueError: if ``train_data`` yields no tasks
        :return: None
        """
        # NOTE(review): nothing below ever adds to all_puzzles (the puzzles
        # element of each task is ignored), so puzzles.pkl always holds an
        # empty dict -- confirm whether the puzzles were meant to be merged.
        all_puzzles = {}
        train_frames = []
        test_frames = []
        for train_rows_puzzles, train_args in train_data:
            assert len(train_rows_puzzles) == 3
            columns, rows, _puzzles = train_rows_puzzles
            train_frames.append(pd.DataFrame(
                columns=columns, data=[r for r in rows if r[-1] == 'train']))
            test_frames.append(pd.DataFrame(
                columns=columns, data=[r for r in rows if r[-1] == 'test']))

        if not train_frames:
            # pd.concat would reject the empty list with a far less helpful
            # message, and train_args would be unbound further down.
            raise ValueError("train_data must contain at least one task")

        train_df = pd.concat(train_frames)
        test_df = pd.concat(test_frames)
        logger.info("Training rows : {}".format(len(train_df)))
        logger.info("Testing rows : {}".format(len(test_df)))

        # prepare configs (train_args is the args object of the last task)
        train_fl_name = self.assign_name(train_args, args.train_tasks)
        all_config = {
            'train_task': {args.train_tasks: train_fl_name},
            # NOTE(review): never populated below -- confirm whether the
            # task -> file name mapping should be recorded here.
            'test_tasks': {},
            # vars() returns the live attribute dict; copy it so that the
            # data_type switch below does not rewrite the train entry.
            'args': {train_fl_name: dict(vars(train_args))},
        }
        test_fl_names = []
        test_dfs = []
        for test_task in args.test_tasks.split(','):
            train_args.data_type = 'test'
            test_fl_name = self.assign_name(train_args, test_task)
            all_config['args'][test_fl_name] = dict(vars(train_args))
            test_fl_names.append(test_fl_name)
            test_dfs.append(test_df[test_df.task_name == 'task_' + test_task])

        base_path = os.path.abspath(os.pardir)
        # derive folder name as a random selection of characters; retry on
        # the (unlikely) collision instead of checking for existence first
        while True:
            folder_name = 'data_{}'.format(str(uuid.uuid4())[:8])
            directory = os.path.join(base_path, args.output_dir, folder_name)
            try:
                os.makedirs(directory)
            except FileExistsError:
                continue
            break

        train_df.to_csv(os.path.join(directory, train_fl_name))
        for test_fl_name, task_df in zip(test_fl_names, test_dfs):
            task_df.to_csv(os.path.join(directory, test_fl_name))
        # dump config; the file must be closed before the folder is zipped
        with open(os.path.join(directory, 'config.json'), 'w') as config_file:
            json.dump(all_config, config_file)
        if args.store_full_puzzles:
            # dump all puzzles
            with open(os.path.join(directory, 'puzzles.pkl'), 'wb') as puzzle_file:
                pkl.dump(all_puzzles, puzzle_file, protocol=-1)
        # writes <directory>.zip next to the folder
        shutil.make_archive(directory, 'zip', directory)

        logger.info("Created dataset in {}".format(directory))
        self.analyze_data(directory)
        if args.mturk:
            self.keep_unique(directory)