in mdr/retrieval/data/encode_datasets.py [0:0]
def __init__(self,
             tokenizer,
             data_path,
             max_q_len,
             max_c_len,
             is_query_embed,
             save_path
             ):
    """Load queries or corpus documents to be encoded into embeddings.

    Args:
        tokenizer: tokenizer used downstream when encoding examples.
        data_path: input file. JSONL (one JSON object per line) by
            default; TSV with an ``id``/``text``/``title`` header when
            the filename ends with "tsv"; FEVER corpora are detected by
            "fever" appearing in the path (also JSONL).
        max_q_len: maximum sequence length used when embedding queries.
        max_c_len: maximum sequence length used when embedding documents.
        is_query_embed: True to load queries; False to load documents,
            in which case an index -> (title, text, intro) mapping is
            dumped to ``save_path/id2doc.json``.
        save_path: directory for ``id2doc.json`` (created if missing).
    """
    super().__init__()
    self.is_query_embed = is_query_embed
    self.tokenizer = tokenizer
    self.max_c_len = max_c_len

    # makedirs with exist_ok avoids the exists()/mkdir race and also
    # handles nested directories.
    os.makedirs(save_path, exist_ok=True)
    save_path = os.path.join(save_path, "id2doc.json")  # ID to doc mapping

    print(f"Loading data from {data_path}")
    if self.is_query_embed:
        # readlines() gives tqdm a known total for its progress bar.
        with open(data_path) as f:
            self.data = [json.loads(line.strip())
                         for line in tqdm(f.readlines())]
    else:
        if data_path.endswith("tsv"):
            self.data = []
            with open(data_path) as tsvfile:
                reader = csv.reader(tsvfile, delimiter='\t')
                for row in reader:
                    # Skip the header row ("id", "text", "title").
                    if row[0] != 'id':
                        id_, text, title = row[0], row[1], row[2]
                        self.data.append(
                            {"id": id_, "text": text, "title": title})
        elif "fever" in data_path:
            # FEVER corpora are plain JSONL. Title/text normalization
            # (normalize / convert_brc) used to happen here but was
            # disabled; the records are loaded unchanged.
            with open(data_path) as f:
                self.data = [json.loads(line)
                             for line in tqdm(f.readlines())]
        else:
            with open(data_path) as f:
                self.data = [json.loads(line) for line in f]
        print(f"load {len(self.data)} documents...")

        # Persist index -> (title, text, intro-flag) so embedding ids
        # can be mapped back to documents at retrieval time. json.dump
        # stringifies the integer keys; consumers must expect str keys.
        id2doc = {}
        for idx, doc in enumerate(self.data):
            id2doc[idx] = (doc["title"], doc["text"], doc.get("intro", False))
        with open(save_path, "w") as g:
            json.dump(id2doc, g)

    # Queries and contexts use different length budgets.
    self.max_len = max_q_len if is_query_embed else max_c_len
    print(f"Max sequence length: {self.max_len}")