def __init__()

in mdr/retrieval/data/encode_datasets.py [0:0]


    def __init__(self,
                 tokenizer,
                 data_path,
                 max_q_len,
                 max_c_len,
                 is_query_embed,
                 save_path
                 ):
        """Load the records to encode and, in document mode, dump an id->doc map.

        Args:
            tokenizer: Tokenizer object; stored on the instance as-is.
            data_path: Path of the input file. In query mode it is read as
                JSON lines. In document mode a path ending in ``"tsv"`` is
                read as a tab-separated file with columns ``id, text, title``;
                any other path is read as JSON lines (with a progress bar when
                the path contains ``"fever"``).
            max_q_len: Maximum sequence length used when ``is_query_embed``
                is true.
            max_c_len: Maximum sequence length used otherwise; also stored as
                ``self.max_c_len``.
            is_query_embed: When true, only the records are loaded. When
                false, the records are treated as documents and the mapping
                ``{index: (title, text, intro)}`` is written to
                ``<save_path>/id2doc.json``.
            save_path: Output directory. It is created if missing, in both
                modes.

        Raises:
            KeyError: In document mode, if a record lacks ``"title"`` or
                ``"text"``.
            IndexError: If a non-empty TSV row has fewer than three columns.
        """
        super().__init__()
        self.is_query_embed = is_query_embed
        self.tokenizer = tokenizer
        self.max_c_len = max_c_len

        # makedirs(exist_ok=True) also creates missing parents and does not
        # fail if another process creates the directory concurrently.
        os.makedirs(save_path, exist_ok=True)
        id2doc_path = os.path.join(save_path, "id2doc.json")  # ID to doc mapping

        def read_jsonl(path, show_progress):
            """Parse one JSON object per non-blank line, closing the file promptly."""
            with open(path) as handle:
                lines = handle.readlines()
            if show_progress:
                lines = tqdm(lines)
            # Blank lines (e.g. a trailing newline at EOF) are skipped instead
            # of being fed to json.loads, which would reject them.
            return [json.loads(line) for line in lines if line.strip()]

        print(f"Loading data from {data_path}")
        if self.is_query_embed:
            self.data = read_jsonl(data_path, show_progress=True)
        else:
            if data_path.endswith("tsv"):
                self.data = []
                # newline="" lets the csv module handle line endings itself,
                # which is required for quoted fields with embedded newlines.
                with open(data_path, newline="") as tsvfile:
                    reader = csv.reader(tsvfile, delimiter='\t')
                    for row in reader:
                        # Skip blank rows and the header row (first cell 'id').
                        if not row or row[0] == 'id':
                            continue
                        id_, text, title = row[0], row[1], row[2]
                        self.data.append({"id": id_, "text": text, "title": title})
            elif "fever" in data_path:
                # Records are used as loaded; no title/text normalization is
                # applied here.
                self.data = read_jsonl(data_path, show_progress=True)
            else:
                self.data = read_jsonl(data_path, show_progress=False)
            print(f"load {len(self.data)} documents...")

            # Positional index -> (title, text, intro). The integer keys are
            # serialized as strings by json.dump.
            id2doc = {
                idx: (doc["title"], doc["text"], doc.get("intro", False))
                for idx, doc in enumerate(self.data)
            }
            with open(id2doc_path, "w") as g:
                json.dump(id2doc, g)

        self.max_len = max_q_len if is_query_embed else max_c_len
        print(f"Max sequence length: {self.max_len}")