in clutrr/utils/data_backend.py [0:0]
def upload(self, data_path, db='gold'):
    """
    Given a csv file, upload its rows into the particular db, skipping
    the rows whose ``id`` is already stored there.

    Before insertion every row is augmented with:

    * a zeroed counter: ``used`` when ``db`` is ``'gold'``, ``reviewed``
      for any other value of ``db``
    * ``relation_length``: the number of sentences that
      ``nltk.sent_tokenize`` finds in the row's ``story`` field

    :param data_path: path of the csv file, loaded with ``self._read_csv``
    :param db: name of the attribute on ``self`` that holds the target
        collection (presumably a pymongo collection -- it must provide
        ``find_one`` and ``insert_many``)
    :return: None
    """
    print("Reading {}".format(data_path))
    data = self._read_csv(data_path)
    records = data.to_dict(orient='records')
    num_records = len(records)
    print("Number of records found : {}".format(num_records))

    # add used counter if gold and test
    # add reviewed counter if mturk
    # NOTE(review): only 'gold' actually receives the `used` counter here;
    # a db named 'test' would get `reviewed` -- confirm which is intended.
    counter_key = 'used' if db == 'gold' else 'reviewed'
    for rec in records:
        rec[counter_key] = 0
        rec['relation_length'] = len(nltk.sent_tokenize(rec['story']))

    mdb = getattr(self, db)

    # prune the records which are already present in the database.
    # find_one() is used as the existence check because Cursor.count()
    # was removed in PyMongo 4 and would raise AttributeError there.
    records = [rec for rec in records
               if mdb.find_one({'id': rec['id']}) is None]
    num_kept = len(records)
    print("Number of records already in db : {}".format(num_records - num_kept))

    # insert_many() rejects an empty list, so only call it when needed
    if records:
        mdb.insert_many(records)
        print("Inserted {} records in db {}".format(num_kept, db))