# obelics/configs/config_extract_web_documents.yaml (23 lines of code) (raw):

# Configuration made of three top-level sections, one mapping of settings per
# section.
#
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose line breaks had been lost. `css_rules` and
# `css_rules_replace_with_text` are assumed to belong to
# `dom_tree_simplificator` because they appear between its flags and the next
# section name -- confirm against the code that loads this file.

dom_tree_simplificator:
  strip_multiple_linebreaks: true
  strip_multiple_spaces: true
  remove_html_comments: true
  replace_line_break_tags: true
  unwrap_tags: true
  strip_tags: true
  strip_special_divs: true
  remove_dates: true
  remove_empty_leaves: true
  unnest_nodes: true
  remake_tree: true
  # CSS attribute selectors. They must stay quoted: an unquoted leading `[`
  # would be parsed as the start of a YAML flow sequence.
  css_rules:
    - "[class~='footer']"
    - "[class~='site-info']"
  # Maps a CSS selector to a text value. The value is double-quoted on purpose
  # so that each `\n` is read as a newline escape, not a literal backslash-n.
  css_rules_replace_with_text:
    "[class~='more-link']": "\n\nEND_OF_DOCUMENT_TOKEN_TO_BE_REPLACED\n\n"

pre_extraction_simplificator:
  only_text_image_nodes: true
  format_texts: true
  merge_consecutive_text_nodes: true

web_document_extractor:
  image_size: 256
  # NOTE(review): unquoted `no` loads as boolean false under YAML 1.1 loaders
  # (e.g. PyYAML) but as the string "no" under the YAML 1.2 core schema. It is
  # left untouched here to preserve current behavior; if the consumer expects
  # the string, write "no" with quotes -- confirm which is intended.
  resize_mode: no