in rust/src/main.rs [284:365]
/// Reads the WARC file at `file_path` and returns one `HTMLMinified` per record that
/// produced non-empty minified Question markup.
///
/// Records that could not be read from the file are skipped. Timing and throughput
/// figures for the read and processing stages are printed to stdout.
///
/// # Panics
///
/// Panics if the WARC file cannot be opened.
fn minify(file_path: &str) -> Vec<HTMLMinified> {
    // Whole records per millisecond. The elapsed time is clamped to at least 1 ms so that
    // a run finishing in under a millisecond cannot trigger a division-by-zero panic.
    fn records_per_ms(record_count: u64, elapsed_ms: u128) -> u128 {
        u128::from(record_count) / elapsed_ms.max(1)
    }

    // Processing a single webpage; `None` means the record contributes nothing.
    let single_record_processor = |record: &RawRecord| -> Option<HTMLMinified> {
        // Remove all documents without the Question schema before generating the DOM to
        // speed up processing
        let doc_string = String::from_utf8_lossy(&record.body);
        if !contains_question(&doc_string) {
            return None;
        }
        // Generate DOM, retrieve URI and ip-address
        let (uri, ip, _, document) = warc_to_dom(record)?;
        // Find language, falling back to "-" when no language tag is found
        let language = find_lang_tag(document.clone()).unwrap_or_else(|| "-".to_string());
        // Remove everything outside of Question
        let questions = transform_outside(document)?;
        // Remove everything without item* attribute inside and concatenate the results
        let mut all_questions = String::new();
        for question in questions {
            // NOTE(review): the helpers receive clones while `question` itself is serialised
            // afterwards, so the clones presumably share the underlying node - confirm.
            transform_inside(question.clone());
            remove_empty_nodes(question.clone());
            // Remove newline and carriage returns from the data to avoid additional linebreaks
            let flattened = question
                .to_string()
                .replace(|c: char| c == '\n' || c == '\r', "");
            all_questions.push_str(&reduce_breaks(reduce_tilde(flattened)));
        }
        // Return a minified mhtml object
        Some(HTMLMinified {
            mhtml: all_questions,
            language,
            uri,
            ip_address: ip,
        })
    };

    let from_start = Instant::now();
    let file = WarcReader::from_path(file_path)
        .unwrap_or_else(|err| panic!("Failed to open WARC file {}: {:?}", file_path, err));
    let file_output = file.collect::<Vec<Result<RawRecord, warc::Error>>>();
    // Read WARC file and collect all well formatted webpages
    let file_error_filter_out = file_output
        .iter()
        .filter_map(|read_result| read_result.as_ref().ok())
        .collect::<Vec<&RawRecord>>();
    println!(
        "Finished Reading in {} ms",
        from_start.elapsed().as_millis()
    );

    // Parallel process WARC file
    let from_process = Instant::now();
    // Counts every record read, including the ones that failed to parse.
    let file_output_length = file_output.len() as u64;
    println!("{}", file_output_length);
    let mut minified: Vec<HTMLMinified> = file_error_filter_out
        .into_par_iter()
        .progress_count(file_output_length)
        .filter_map(single_record_processor)
        .collect();

    // Sample each timer once so the printed duration and the throughput agree.
    let process_ms = from_process.elapsed().as_millis();
    println!(
        "Finished Processing in {} ms for a throughput of {} per ms",
        process_ms,
        records_per_ms(file_output_length, process_ms)
    );
    let total_ms = from_start.elapsed().as_millis();
    println!(
        "Finished End to End in {} ms, for a throughput of {} per ms",
        total_ms,
        records_per_ms(file_output_length, total_ms)
    );

    // Clean out empty webpages
    minified.retain(|page| !page.mhtml.is_empty());
    minified
}