src/graph_construction.py [35:75]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def truncate(line, separator_tag="story_separator_special_tag", total_words=500):
    """Trim a multi-source example so it keeps about ``total_words`` words.

    ``line`` contains one or more source documents delimited by
    ``separator_tag``. A line whose whitespace-token count (separator tags
    included) is below ``total_words`` is returned untouched. Otherwise the
    word budget is shared between the sources in rounds: every unfinished
    source receives the same number of words per round, a source drops out
    once all of its words are kept, and the share is recomputed from the
    budget that is still unspent.

    Args:
        line: Raw example text.
        separator_tag: Token that separates the individual sources.
        total_words: Number of source words to keep; the separator tags in
            the result are not counted against it.

    Returns:
        ``line`` itself when it is short enough, otherwise the leading words
        of every source joined by ``" " + separator_tag + " "`` and stripped.
        Empty sources are kept as empty segments so the number of separators
        is preserved.
    """
    if len(line.split()) < total_words:
        return line

    sources_split = line.split(separator_tag)
    # previous dataset had separator at the end of each example
    if sources_split[-1] == "":
        del sources_split[-1]
    words_ar = [source.split() for source in sources_split]
    num_words_ar = [len(words) for words in words_ar]
    num_sources = len(words_ar)

    total_ar = [0] * num_sources
    # A source without words (e.g. whitespace after a trailing separator) is
    # finished from the start; otherwise it would soak up budget it cannot
    # use and could leave the loop with no positive count to take a min of.
    done = {index for index, count in enumerate(num_words_ar) if count == 0}
    total = 0
    while total < total_words and len(done) < num_sources:
        active = [index for index in range(num_sources) if index not in done]
        per_source_count = math.floor((total_words - total) / len(active))
        if per_source_count == 0:
            # Fewer words left than unfinished sources (e.g. total=499):
            # give the remainder to the first source which isn't done.
            total_ar[active[0]] += total_words - total
            break
        # Never hand out more than the shortest unfinished source still has.
        min_amount = min(
            min(num_words_ar[index] for index in active), per_source_count
        )
        for index in active:
            total_ar[index] += min_amount
            num_words_ar[index] -= min_amount
            if num_words_ar[index] == 0:
                done.add(index)
        total += min_amount * len(active)

    final_words_ar = [
        " ".join(words[:keep]) for words, keep in zip(words_ar, total_ar)
    ]
    return (" " + separator_tag + " ").join(final_words_ar).strip()
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



src/prepare_data.py [92:132]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def truncate(line, separator_tag="story_separator_special_tag", total_words=500):
    """Trim a multi-source example so it keeps about ``total_words`` words.

    ``line`` contains one or more source documents delimited by
    ``separator_tag``. A line whose whitespace-token count (separator tags
    included) is below ``total_words`` is returned untouched. Otherwise the
    word budget is shared between the sources in rounds: every unfinished
    source receives the same number of words per round, a source drops out
    once all of its words are kept, and the share is recomputed from the
    budget that is still unspent.

    Args:
        line: Raw example text.
        separator_tag: Token that separates the individual sources.
        total_words: Number of source words to keep; the separator tags in
            the result are not counted against it.

    Returns:
        ``line`` itself when it is short enough, otherwise the leading words
        of every source joined by ``" " + separator_tag + " "`` and stripped.
        Empty sources are kept as empty segments so the number of separators
        is preserved.
    """
    if len(line.split()) < total_words:
        return line

    sources_split = line.split(separator_tag)
    # previous dataset had separator at the end of each example
    if sources_split[-1] == "":
        del sources_split[-1]
    words_ar = [source.split() for source in sources_split]
    num_words_ar = [len(words) for words in words_ar]
    num_sources = len(words_ar)

    total_ar = [0] * num_sources
    # A source without words (e.g. whitespace after a trailing separator) is
    # finished from the start; otherwise it would soak up budget it cannot
    # use and could leave the loop with no positive count to take a min of.
    done = {index for index, count in enumerate(num_words_ar) if count == 0}
    total = 0
    while total < total_words and len(done) < num_sources:
        active = [index for index in range(num_sources) if index not in done]
        per_source_count = math.floor((total_words - total) / len(active))
        if per_source_count == 0:
            # Fewer words left than unfinished sources (e.g. total=499):
            # give the remainder to the first source which isn't done.
            total_ar[active[0]] += total_words - total
            break
        # Never hand out more than the shortest unfinished source still has.
        min_amount = min(
            min(num_words_ar[index] for index in active), per_source_count
        )
        for index in active:
            total_ar[index] += min_amount
            num_words_ar[index] -= min_amount
            if num_words_ar[index] == 0:
                done.add(index)
        total += min_amount * len(active)

    final_words_ar = [
        " ".join(words[:keep]) for words, keep in zip(words_ar, total_ar)
    ]
    return (" " + separator_tag + " ").join(final_words_ar).strip()
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



