vision/m4/sourcing/data_collection/processors/__init__.py (15 lines of code) (raw):
# Package-level re-exports for `m4.sourcing.data_collection.processors`.
#
# Each class below is defined in its own submodule of this package and is
# imported here so that it is also available directly from the package
# namespace, e.g.
#     from m4.sourcing.data_collection.processors import HtmlExtractor
#
# NOTE(review): the import order is kept as-is; whether any submodule relies on
# it (e.g. because of import-time side effects) cannot be determined from this
# file alone -- confirm before reordering.
from m4.sourcing.data_collection.processors.dom_tree_simplificator import DOMTreeSimplificator
from m4.sourcing.data_collection.processors.html_extractor import HtmlExtractor
from m4.sourcing.data_collection.processors.image_deduplicator import ImageDeduplicator
from m4.sourcing.data_collection.processors.pair_extractor import TextMediaPairsExtractor
from m4.sourcing.data_collection.processors.pair_filtering import PairFiltering
from m4.sourcing.data_collection.processors.pre_extraction_simplificator import PreExtractionSimplificator
from m4.sourcing.data_collection.processors.warc_downloader import WarcDownloader
from m4.sourcing.data_collection.processors.web_document_extractor import CommonCrawlWebDocumentExtractor
# Three names are exported from the web_document_filtering submodule.
from m4.sourcing.data_collection.processors.web_document_filtering import (
FilteringFunctions,
WebDocumentFilteringDocLevel,
WebDocumentFilteringNodeLevel,
)
from m4.sourcing.data_collection.processors.web_document_image_deduplication import WebDocumentImageDeduplication
from m4.sourcing.data_collection.processors.web_document_line_deduplication import WebDocumentLineDeduplication