vision/data/datasets_processing_scripts/build_webdocs_dataset/python_scripts/05_filtering_web_docs.py [113:228]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    logger.info("Finished loading the web document dataset")

    # Read the filtering flags and cutoffs from the YAML config file. The
    # encoding is pinned to UTF-8 so that decoding does not depend on the
    # locale of the machine running the script.
    with open(args.path_config_filter_web_documents, encoding="utf-8") as f:
        # NOTE(review): FullLoader can build more than plain scalars, lists and
        # dicts. If this config can ever come from an untrusted source, switch
        # to yaml.safe_load.
        filtering_params = yaml.load(f, Loader=yaml.FullLoader)

    # An empty file parses to None and a top-level sequence parses to a list;
    # both would otherwise fail on the first key lookup with an unhelpful
    # TypeError, so fail here with a message that names the config file.
    if not isinstance(filtering_params, dict):
        raise TypeError(
            f"Expected {args.path_config_filter_web_documents} to contain a YAML mapping of "
            f"filtering parameters, got {type(filtering_params).__name__}"
        )

    # Every name in this tuple is both a key of `filtering_params` and the name
    # of the constructor keyword argument that receives the looked-up value.
    node_level_config_keys = (
        "cond_check_format",
        "valid_formats",
        "cond_check_size_image",
        "original_width_min_cutoff",
        "original_width_max_cutoff",
        "original_height_min_cutoff",
        "original_height_max_cutoff",
        "rendered_width_min_cutoff",
        "rendered_width_max_cutoff",
        "rendered_height_min_cutoff",
        "rendered_height_max_cutoff",
        "aspect_ratio_max_cutoff",
        "cond_remove_non_printing_characters",
        "cond_standardize_whitespace",
        "cond_check_number_words_node_level",
        "number_words_node_level_min_cutoff",
        "number_words_node_level_max_cutoff",
        "cond_check_character_repetition_ratio_node_level",
        "character_repetition_length_node_level",
        "character_repetition_node_level_max_cutoff",
        "cond_check_word_repetition_ratio_node_level",
        "word_repetition_length_node_level",
        "word_repetition_node_level_max_cutoff",
        "cond_check_special_character_ratio_node_level",
        "special_character_ratio_node_level_max_cutoff",
        "cond_check_stopword_ratio_node_level",
        "stopword_ratio_node_level_min_cutoff",
        "cond_check_flagged_word_ratio_node_level",
        "flagged_word_ratio_node_level_max_cutoff",
        "cond_check_punctuation_ratio_node_level",
        "min_number_words_to_check_punctuation_ratio_node_level",
        "punctuation_ratio_node_level_min_cutoff",
        "cond_check_common_word_ratio_node_level",
        "common_word_ratio_node_level_min_cutoff",
        "cond_check_lang_id_node_level",
        "lang_id_node_level_min_cutoff",
        "cond_check_perplexity_score_node_level",
        "perplexity_score_node_level_max_cutoff",
    )
    node_level_config_kwargs = {key: filtering_params[key] for key in node_level_config_keys}

    # The remaining keyword arguments do not come from the YAML config: they are
    # module-level constants and paths taken from the command-line arguments.
    web_document_filtering_node_level = WebDocumentFilteringNodeLevel(
        non_printing_characters_re=NON_PRINTING_CHARACTERS_RE,
        strip_characters=SPECIAL_CHARACTERS,
        stopwords=STOPWORDS,
        flagged_words=FLAGGED_WORDS,
        punctuation=PUNCTUATION,
        digits_re=DIGITS_RE,
        unicode_punctuation=UNICODE_PUNCTUATION,
        path_common_words=args.path_common_words,
        path_lang_id_model=args.path_lang_id_model,
        path_sentencepiece_model=args.path_sentencepiece_model,
        path_kenlm_model=args.path_kenlm_model,
        **node_level_config_kwargs,
    )

    # The node-level filter is applied through `.map`, not `.filter`.
    # NOTE(review): presumably it rewrites each document in place of dropping
    # it -- confirm against WebDocumentFilteringNodeLevel.
    logger.info("Starting filtering the web document dataset at node level")
    web_document_dataset_filtered = web_document_dataset.map(
        web_document_filtering_node_level,
        num_proc=args.num_proc,
    )
    logger.info("Finished filtering the web document dataset at node level")

    # Every name in this tuple is both a key of `filtering_params` and the name
    # of the constructor keyword argument that receives the looked-up value.
    doc_level_config_keys = (
        "cond_check_number_images",
        "number_images_min_cutoff",
        "number_images_max_cutoff",
        "cond_check_number_words_doc_level",
        "number_words_doc_level_min_cutoff",
        "number_words_doc_level_max_cutoff",
        "cond_check_character_repetition_ratio_doc_level",
        "character_repetition_length_doc_level",
        "character_repetition_doc_level_max_cutoff",
        "cond_check_word_repetition_ratio_doc_level",
        "word_repetition_length_doc_level",
        "word_repetition_doc_level_max_cutoff",
        "cond_check_special_character_ratio_doc_level",
        "special_character_ratio_doc_level_max_cutoff",
        "cond_check_stopword_ratio_doc_level",
        "stopword_ratio_doc_level_min_cutoff",
        "cond_check_flagged_word_ratio_doc_level",
        "flagged_word_ratio_doc_level_max_cutoff",
        "cond_check_punctuation_ratio_doc_level",
        "punctuation_ratio_doc_level_min_cutoff",
        "cond_check_common_word_ratio_doc_level",
        "common_word_ratio_doc_level_min_cutoff",
        "cond_check_lang_id_doc_level",
        "lang_id_doc_level_min_cutoff",
        "cond_check_perplexity_score_doc_level",
        "perplexity_score_doc_level_max_cutoff",
    )
    doc_level_config_kwargs = {key: filtering_params[key] for key in doc_level_config_keys}

    # The remaining keyword arguments do not come from the YAML config: they are
    # module-level constants and paths taken from the command-line arguments.
    web_document_filtering_doc_level = WebDocumentFilteringDocLevel(
        strip_characters=SPECIAL_CHARACTERS,
        stopwords=STOPWORDS,
        flagged_words=FLAGGED_WORDS,
        punctuation=PUNCTUATION,
        non_printing_characters_re=NON_PRINTING_CHARACTERS_RE,
        digits_re=DIGITS_RE,
        unicode_punctuation=UNICODE_PUNCTUATION,
        path_common_words=args.path_common_words,
        path_lang_id_model=args.path_lang_id_model,
        path_sentencepiece_model=args.path_sentencepiece_model,
        path_kenlm_model=args.path_kenlm_model,
        **doc_level_config_kwargs,
    )

    # The doc-level filter is applied through `.filter` on the output of the
    # node-level pass.
    # NOTE(review): presumably documents for which the callable returns a falsy
    # value are dropped -- confirm against the dataset type.
    logger.info("Starting filtering the web document dataset at doc level")
    doc_level_filtered = web_document_dataset_filtered.filter(
        web_document_filtering_doc_level,
        num_proc=args.num_proc,
    )
    web_document_dataset_filtered = doc_level_filtered
    logger.info("Finished filtering the web document dataset at doc level")
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



vision/m4/sourcing/data_collection/callers/filter_web_documents.py [93:208]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    logger.info("Finished loading the web document dataset")

    # Read the filtering flags and cutoffs from the YAML config file. The
    # encoding is pinned to UTF-8 so that decoding does not depend on the
    # locale of the machine running the script.
    with open(args.path_config_filter_web_documents, encoding="utf-8") as f:
        # NOTE(review): FullLoader can build more than plain scalars, lists and
        # dicts. If this config can ever come from an untrusted source, switch
        # to yaml.safe_load.
        filtering_params = yaml.load(f, Loader=yaml.FullLoader)

    # An empty file parses to None and a top-level sequence parses to a list;
    # both would otherwise fail on the first key lookup with an unhelpful
    # TypeError, so fail here with a message that names the config file.
    if not isinstance(filtering_params, dict):
        raise TypeError(
            f"Expected {args.path_config_filter_web_documents} to contain a YAML mapping of "
            f"filtering parameters, got {type(filtering_params).__name__}"
        )

    # Every name in this tuple is both a key of `filtering_params` and the name
    # of the constructor keyword argument that receives the looked-up value.
    node_level_config_keys = (
        "cond_check_format",
        "valid_formats",
        "cond_check_size_image",
        "original_width_min_cutoff",
        "original_width_max_cutoff",
        "original_height_min_cutoff",
        "original_height_max_cutoff",
        "rendered_width_min_cutoff",
        "rendered_width_max_cutoff",
        "rendered_height_min_cutoff",
        "rendered_height_max_cutoff",
        "aspect_ratio_max_cutoff",
        "cond_remove_non_printing_characters",
        "cond_standardize_whitespace",
        "cond_check_number_words_node_level",
        "number_words_node_level_min_cutoff",
        "number_words_node_level_max_cutoff",
        "cond_check_character_repetition_ratio_node_level",
        "character_repetition_length_node_level",
        "character_repetition_node_level_max_cutoff",
        "cond_check_word_repetition_ratio_node_level",
        "word_repetition_length_node_level",
        "word_repetition_node_level_max_cutoff",
        "cond_check_special_character_ratio_node_level",
        "special_character_ratio_node_level_max_cutoff",
        "cond_check_stopword_ratio_node_level",
        "stopword_ratio_node_level_min_cutoff",
        "cond_check_flagged_word_ratio_node_level",
        "flagged_word_ratio_node_level_max_cutoff",
        "cond_check_punctuation_ratio_node_level",
        "min_number_words_to_check_punctuation_ratio_node_level",
        "punctuation_ratio_node_level_min_cutoff",
        "cond_check_common_word_ratio_node_level",
        "common_word_ratio_node_level_min_cutoff",
        "cond_check_lang_id_node_level",
        "lang_id_node_level_min_cutoff",
        "cond_check_perplexity_score_node_level",
        "perplexity_score_node_level_max_cutoff",
    )
    node_level_config_kwargs = {key: filtering_params[key] for key in node_level_config_keys}

    # The remaining keyword arguments do not come from the YAML config: they are
    # module-level constants and paths taken from the command-line arguments.
    web_document_filtering_node_level = WebDocumentFilteringNodeLevel(
        non_printing_characters_re=NON_PRINTING_CHARACTERS_RE,
        strip_characters=SPECIAL_CHARACTERS,
        stopwords=STOPWORDS,
        flagged_words=FLAGGED_WORDS,
        punctuation=PUNCTUATION,
        digits_re=DIGITS_RE,
        unicode_punctuation=UNICODE_PUNCTUATION,
        path_common_words=args.path_common_words,
        path_lang_id_model=args.path_lang_id_model,
        path_sentencepiece_model=args.path_sentencepiece_model,
        path_kenlm_model=args.path_kenlm_model,
        **node_level_config_kwargs,
    )

    # The node-level filter is applied through `.map`, not `.filter`.
    # NOTE(review): presumably it rewrites each document in place of dropping
    # it -- confirm against WebDocumentFilteringNodeLevel.
    logger.info("Starting filtering the web document dataset at node level")
    web_document_dataset_filtered = web_document_dataset.map(
        web_document_filtering_node_level,
        num_proc=args.num_proc,
    )
    logger.info("Finished filtering the web document dataset at node level")

    # Every name in this tuple is both a key of `filtering_params` and the name
    # of the constructor keyword argument that receives the looked-up value.
    doc_level_config_keys = (
        "cond_check_number_images",
        "number_images_min_cutoff",
        "number_images_max_cutoff",
        "cond_check_number_words_doc_level",
        "number_words_doc_level_min_cutoff",
        "number_words_doc_level_max_cutoff",
        "cond_check_character_repetition_ratio_doc_level",
        "character_repetition_length_doc_level",
        "character_repetition_doc_level_max_cutoff",
        "cond_check_word_repetition_ratio_doc_level",
        "word_repetition_length_doc_level",
        "word_repetition_doc_level_max_cutoff",
        "cond_check_special_character_ratio_doc_level",
        "special_character_ratio_doc_level_max_cutoff",
        "cond_check_stopword_ratio_doc_level",
        "stopword_ratio_doc_level_min_cutoff",
        "cond_check_flagged_word_ratio_doc_level",
        "flagged_word_ratio_doc_level_max_cutoff",
        "cond_check_punctuation_ratio_doc_level",
        "punctuation_ratio_doc_level_min_cutoff",
        "cond_check_common_word_ratio_doc_level",
        "common_word_ratio_doc_level_min_cutoff",
        "cond_check_lang_id_doc_level",
        "lang_id_doc_level_min_cutoff",
        "cond_check_perplexity_score_doc_level",
        "perplexity_score_doc_level_max_cutoff",
    )
    doc_level_config_kwargs = {key: filtering_params[key] for key in doc_level_config_keys}

    # The remaining keyword arguments do not come from the YAML config: they are
    # module-level constants and paths taken from the command-line arguments.
    web_document_filtering_doc_level = WebDocumentFilteringDocLevel(
        strip_characters=SPECIAL_CHARACTERS,
        stopwords=STOPWORDS,
        flagged_words=FLAGGED_WORDS,
        punctuation=PUNCTUATION,
        non_printing_characters_re=NON_PRINTING_CHARACTERS_RE,
        digits_re=DIGITS_RE,
        unicode_punctuation=UNICODE_PUNCTUATION,
        path_common_words=args.path_common_words,
        path_lang_id_model=args.path_lang_id_model,
        path_sentencepiece_model=args.path_sentencepiece_model,
        path_kenlm_model=args.path_kenlm_model,
        **doc_level_config_kwargs,
    )

    # The doc-level filter is applied through `.filter` on the output of the
    # node-level pass.
    # NOTE(review): presumably documents for which the callable returns a falsy
    # value are dropped -- confirm against the dataset type.
    logger.info("Starting filtering the web document dataset at doc level")
    doc_level_filtered = web_document_dataset_filtered.filter(
        web_document_filtering_doc_level,
        num_proc=args.num_proc,
    )
    web_document_dataset_filtered = doc_level_filtered
    logger.info("Finished filtering the web document dataset at doc level")
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



