fn minify()

in rust/src/main.rs [284:365]


/// Reads a WARC archive at `file_path`, extracts pages containing Question
/// schema markup, and returns one minified record per page that yielded any
/// non-empty question content.
///
/// # Panics
/// Panics if `file_path` cannot be opened as a WARC file.
fn minify(file_path: &str) -> Vec<HTMLMinified> {
    // Processing a single webpage; returns None when the page has no usable questions.
    let single_record_processor = |record: &RawRecord| -> Option<HTMLMinified> {
        // Cheap pre-filter: reject documents without the Question schema
        // before paying for DOM construction.
        let doc_string = String::from_utf8_lossy(&record.body);
        if !contains_question(&doc_string) {
            return None;
        }
        // Generate DOM, retrieve URI and ip-address
        let (uri, ip, _, document) = warc_to_dom(record)?;
        // Find language; "-" marks pages with no detectable lang tag.
        let language = find_lang_tag(document.clone()).unwrap_or_else(|| "-".to_string());
        // Remove everything outside of Question; bail out when nothing remains.
        let questions = transform_outside(document)?;
        // Remove everything without item* attribute inside, then flatten each
        // question into a single string buffer.
        let mut all_questions = String::new();
        for question in questions {
            transform_inside(question.clone());
            remove_empty_nodes(question.clone());
            // Remove newline and carriage returns from the data to avoid additional linebreaks
            let mut string_question = question.to_string().replace('\n', "").replace('\r', "");
            string_question = reduce_tilde(string_question);
            string_question = reduce_breaks(string_question);
            all_questions.push_str(&string_question);
        }
        // Return a minified mhtml object
        Some(HTMLMinified {
            mhtml: all_questions,
            language,
            uri,
            ip_address: ip,
        })
    };

    let from_start = Instant::now();
    let file = WarcReader::from_path(file_path)
        .expect("failed to open WARC file");
    let file_output = file.collect::<Vec<Result<RawRecord, warc::Error>>>();
    // Read WARC file and collect all well formatted webpages
    let file_error_filter_out = file_output
        .iter()
        .filter_map(|x| x.as_ref().ok())
        .collect::<Vec<&RawRecord>>();
    println!(
        "Finished Reading in {} ms",
        from_start.elapsed().as_millis()
    );

    // Parallel process WARC file; filter_map drops the pages that produced
    // no questions, so no partition/unwrap pass is needed afterwards.
    let from_process = Instant::now();
    let file_output_length = file_output.len() as u64;
    println!("{}", file_output_length);
    let minified: Vec<HTMLMinified> = file_error_filter_out
        .into_par_iter()
        .progress_count(file_output_length)
        .filter_map(single_record_processor)
        .collect();
    // .max(1) guards against a division-by-zero panic when a run completes
    // in under one millisecond.
    let process_ms = from_process.elapsed().as_millis();
    println!(
        "Finished Processing in {} ms for a throughput of {} per ms",
        process_ms,
        (file_output_length as u128) / process_ms.max(1)
    );
    let total_ms = from_start.elapsed().as_millis();
    println!(
        "Finished End to End in {} ms, for a throughput of {} per ms",
        total_ms,
        (file_output_length as u128) / total_ms.max(1)
    );

    // Clean out empty webpages
    minified
        .into_iter()
        .filter(|x| !x.mhtml.is_empty())
        .collect::<Vec<HTMLMinified>>()
}