obelics/callers/line_deduplicate_web_documents.py (65 lines of code) (raw):
import argparse
from PIL import Image
from obelics.processors import WebDocumentLineDeduplication
# Useful to avoid DecompressionBombError: Pillow compares an image's pixel count
# against Image.MAX_IMAGE_PIXELS when opening it, and setting the limit to None
# disables that check.
# NOTE(review): this is a module-level assignment on PIL.Image, so it applies to
# every image opened in this process — presumably fine for this offline job; confirm.
Image.MAX_IMAGE_PIXELS = None
def get_args(argv=None):
    """Parse the command-line options of the line deduplication script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default), argparse reads ``sys.argv[1:]``, which is the
            behavior of the original zero-argument call.

    Returns:
        argparse.Namespace: parsed options with the attributes
        ``path_sharded_dataset``, ``path_save_domain_to_positions``,
        ``path_save_domain_to_duplicated_texts``,
        ``id_shard_to_line_deduplicate``, ``num_proc`` and
        ``path_save_line_deduplicated_sharded_dataset``.
    """
    parser = argparse.ArgumentParser(description="Line deduplicate web documents.")
    parser.add_argument(
        "--path_sharded_dataset",
        type=str,
        default="/gpfsscratch/rech/cnw/commun/local_datasets/web_document_dataset_45M_sharded_filtered_2/train",
        help="Path to the folder containing the shards of the web document dataset.",
    )
    parser.add_argument(
        "--path_save_domain_to_positions",
        type=str,
        default="/gpfswork/rech/cnw/urd43gx/line_dedup/domain_to_positions.json",
        help=(
            "Path of the file to save the dictionary to go from a domain name to positions in the web document"
            " dataset."
        ),
    )
    parser.add_argument(
        "--path_save_domain_to_duplicated_texts",
        type=str,
        default="/gpfswork/rech/cnw/urd43gx/line_dedup/domain_to_duplicated_texts.json",
        # The dictionary maps a domain to its *duplicated* texts (see the option
        # and file names); the help text previously said "deduplicated".
        help="Path of the file to save the dictionary containing the duplicated texts for each domain.",
    )
    parser.add_argument(
        "--id_shard_to_line_deduplicate",
        type=int,
        default=2,
        help="Id of the shard to perform line deduplication on.",
    )
    parser.add_argument(
        "--num_proc",
        type=int,
        default=1,
        help="Number of processes for the map operation in the line deduplication.",
    )
    parser.add_argument(
        "--path_save_line_deduplicated_sharded_dataset",
        type=str,
        default="/gpfsscratch/rech/cnw/commun/local_datasets/web_document_dataset_45M_sharded_filtered_2_line_deduplicated/train",
        help="Path to the folder to save the shards of the line deduplicated web document dataset.",
    )
    # parse_args(None) falls back to sys.argv[1:], so existing callers are unaffected.
    return parser.parse_args(argv)
if __name__ == "__main__":
    # Script entry point: read the CLI options and hand them, unchanged, to the
    # line deduplication processor.
    cli_options = get_args()

    deduplicator = WebDocumentLineDeduplication(
        path_sharded_dataset=cli_options.path_sharded_dataset,
        path_save_domain_to_positions=cli_options.path_save_domain_to_positions,
        path_save_domain_to_duplicated_texts=cli_options.path_save_domain_to_duplicated_texts,
        id_shard_to_line_deduplicate=cli_options.id_shard_to_line_deduplicate,
        num_proc=cli_options.num_proc,
        path_save_line_deduplicated_sharded_dataset=cli_options.path_save_line_deduplicated_sharded_dataset,
    )

    deduplicator.get_paths_subdatasets()
    # The two steps below are intentionally left disabled in this script.
    # NOTE(review): presumably their outputs already exist at the
    # `path_save_domain_to_*` locations — confirm before running on fresh data.
    # deduplicator.get_domain_to_positions()
    # deduplicator.get_domain_to_duplicated_texts()
    deduplicator.line_deduplicate_web_documents()