def filter_shards()

in filtering/deduplication/filter_oscar_jsonl.py [0:0]


def filter_shards(shard_id):
    """Collect the text of the records in one shard that pass the dup filter.

    Iterates over ``oscar_shards[shard_id]`` (a module-level collection,
    wrapped in ``tqdm`` for a progress bar) and keeps a record when either:

    * ``included_in_dedup`` is truthy and ``dup_ratio`` equals ``0.0``, or
    * ``included_in_dedup`` is falsy and ``has_dup_25`` is falsy.

    Args:
        shard_id: Key/index used to look the shard up in ``oscar_shards``.

    Returns:
        A list of ``{"text": ...}`` dicts, one per kept record, in the
        order the records appear in the shard.
    """
    print("Processing shard {}".format(shard_id))

    def passes_filter(record):
        # The field consulted depends on whether the record was part of the
        # dedup run: ``dup_ratio`` if so, the ``has_dup_25`` flag otherwise.
        if record["included_in_dedup"]:
            return record["dup_ratio"] == 0.0
        return not record["has_dup_25"]

    # Only the text is carried forward; the dedup bookkeeping fields are
    # dropped from the output records.
    return [
        {"text": record["text"]}
        for record in tqdm(oscar_shards[shard_id])
        if passes_filter(record)
    ]