def index_data()

in code/src/data/dictionary.py [0:0]

    # Module-level dependencies; SPECIAL_WORDS (the count of reserved
    # special tokens) is assumed to be defined elsewhere in this module.
    import os
    import torch
    from logging import getLogger

    logger = getLogger()


    def index_data(txt_path, bin_path, dico, attr_list, attr_cols, attr_values):
        """
        Index sentences with a dictionary.
        """
        if os.path.isfile(bin_path):
            print("Loading data from %s ..." % bin_path)
            data = torch.load(bin_path)
            assert dico == data['dico']
            assert attr_values == data['attr_values']
            return data

        positions = []
        sentences = []
        attributes = []
        unk_words = {}
        count_empty_sentences = 0
        count_unknown_labels = 0
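        # map each attribute label to its integer index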
        label2id = {attr: {label: i for i, label in enumerate(labels)} for attr, labels in attr_values.items()}

        # index sentences
        f = open(txt_path, 'r', encoding='utf-8')
        for i, line in enumerate(f):
            if i % 100000 == 0 and i > 0:
                print(i)
            s = line.rstrip()
            # skip empty sentences
            if len(s) == 0:
                print("Empty sentence in line %i." % i)
                count_empty_sentences += 1
                continue
            s = s.split('\t')
            # index sentence words
            count_unk = 0
            indexed = []
            for w in s[attr_cols[0]].strip().split():
                word_id = dico.index(w, no_unk=False)
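                # ids below 4 + SPECIAL_WORDS are reserved for special tokens,
                # so a regular word should never map there (except <unk>)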
                if word_id < 4 + SPECIAL_WORDS and word_id != dico.unk_index:
                    logger.warning('Found unexpected special word "%s" (%i)!!' % (w, word_id))
                    continue
                indexed.append(word_id)
                if word_id == dico.unk_index:
                    unk_words[w] = unk_words.get(w, 0) + 1
                    count_unk += 1
            # index attributes
            sentence_attrs = []
            for attr, col in zip(attr_list, attr_cols[1:]):
                sentence_attrs.append(label2id[attr].get(s[col], None))
            # skip sentences with unknown attributes
            if any(x is None for x in sentence_attrs):
                count_unknown_labels += 1
                continue
            # add sentence
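            # store the [start, end) span of this sentence in the flat word stream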
            positions.append([len(sentences), len(sentences) + len(indexed)])
            sentences.extend(indexed)
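            # a -1 separator marks the end of each sentence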
            sentences.append(-1)
            attributes.append(sentence_attrs)

        f.close()

        print("Read %i sentences. %i were skipped because empty, and %i because contained unknown attributes."
              % (len(positions), count_empty_sentences, count_unknown_labels))

        # tensorize data
        positions = torch.LongTensor(positions)
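        # use the smallest integer dtype that can hold every word id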
        if len(dico) < 1 << 15:
            sentences = torch.ShortTensor(sentences)
        elif len(dico) < 1 << 31:
            sentences = torch.IntTensor(sentences)
        else:
            sentences = torch.LongTensor(sentences)
        attributes = torch.LongTensor(attributes)
        assert attributes.size() == (len(positions), len(attr_values))
        data = {
            'dico': dico,
            'attr_values': attr_values,
            'positions': positions,
            'sentences': sentences,
            'attributes': attributes,
            'unk_words': unk_words,
        }
        print("Saving the data to %s ..." % bin_path)
        torch.save(data, bin_path)

        return data
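
Below is a minimal usage sketch. The file paths, attribute names, and label
sets are hypothetical, and `dico` is assumed to be an already-built
`Dictionary` instance from this module; the TSV layout (text in column 0,
attribute labels in the following columns) mirrors how `attr_cols` is
consumed above.

    # hypothetical attribute schema: every label a column may contain
    attr_values = {
        'sentiment': ['positive', 'negative'],
        'category': ['books', 'movies', 'music'],
    }
    data = index_data(
        txt_path='train.tsv',                 # tab-separated input file
        bin_path='train.pth',                 # cached binary output
        dico=dico,                            # an existing Dictionary object
        attr_list=['sentiment', 'category'],  # attributes to index
        attr_cols=[0, 1, 2],                  # column 0: text, 1-2: attributes
        attr_values=attr_values,
    )
    # data['positions'][i] gives the [start, end) span of sentence i inside
    # the flat data['sentences'] tensor; data['attributes'][i] holds its
    # attribute label ids.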