in tantivy/src/indexer/merger.rs [740:916]
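/// Writes the merged postings for `indexed_field` to `serializer`.
///
/// Returns a `TermOrdinalMapping` for field types whose fast fields store
/// term ordinals (facets and fast text fields), so callers can remap those
/// ordinals after the merge; returns `None` for every other field type.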
fn write_postings_for_field(
&self,
indexed_field: Field,
field_type: &FieldType,
serializer: &mut InvertedIndexSerializer,
fieldnorm_reader: Option<FieldNormReader>,
doc_id_mapping: &SegmentDocIdMapping,
) -> crate::Result<Option<TermOrdinalMapping>> {
debug_time!("write-postings-for-field");
let mut positions_buffer: Vec<u32> = Vec::with_capacity(1_000);
let mut delta_computer = DeltaComputer::new();
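// Number of terms in each segment's term dictionary; used below to size the
// `TermOrdinalMapping`.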
let mut max_term_ords: Vec<TermOrdinal> = Vec::new();
let field_readers: Vec<Arc<InvertedIndexReader>> = self
.readers
.iter()
.map(|reader| reader.inverted_index(indexed_field))
.collect::<crate::Result<Vec<_>>>()?;
let mut field_term_streams = Vec::new();
for field_reader in &field_readers {
let terms = field_reader.terms();
field_term_streams.push(terms.stream()?);
max_term_ords.push(terms.num_terms() as u64);
}
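// Facet fields and fast text fields store term ordinals in their fast fields.
// Merging reassigns the ordinals, so for these field types we record an
// old-ordinal -> new-ordinal mapping and hand it back to the caller.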
let mut term_ord_mapping_opt = match field_type {
FieldType::Facet(_) => Some(TermOrdinalMapping::new(max_term_ords)),
FieldType::Str(options) if options.is_fast() => {
Some(TermOrdinalMapping::new(max_term_ords))
}
_ => None,
};
let mut merged_terms = TermMerger::new(field_term_streams);
// Map from the old doc ids of each segment to the corresponding doc id in
// the merged segment. Deleted docs map to `None`.
let mut merged_doc_id_map: Vec<Vec<Option<DocId>>> = self
.readers
.iter()
.map(|reader| vec![None; reader.max_doc() as usize])
.collect();
for (new_doc_id, old_doc_addr) in doc_id_mapping.iter_old_doc_addrs().enumerate() {
let segment_map = &mut merged_doc_id_map[old_doc_addr.segment_ord as usize];
segment_map[old_doc_addr.doc_id as usize] = Some(new_doc_id as DocId);
}
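// Illustrative example: if old doc (segment_ord: 1, doc_id: 4) maps to doc 42
// of the merged segment, then merged_doc_id_map[1][4] == Some(42). Docs that
// were deleted keep their initial `None`.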
// Note that the total number of tokens is only an estimate.
// It is only used as a parameter in the BM25 formula (to compute the
// average field length), so an approximation is acceptable.
let total_num_tokens: u64 = estimate_total_num_tokens(&self.readers, indexed_field)?;
// Create the total list of doc ids
// by stacking the doc ids from the different segments.
//
// In the new segment, the doc ids from the different
// segments are stacked so that:
// - Segment 0's doc ids become [0, seg0.max_doc)
// - Segment 1's doc ids become [seg0.max_doc, seg0.max_doc + seg1.max_doc)
// - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc, seg0.max_doc + seg1.max_doc +
//   seg2.max_doc)
//
// This stacking only applies when the index is not sorted; if it is sorted,
// the doc ids are k-merged by their sort key instead.
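// For instance (illustrative numbers, assuming no deletes): with
// seg0.max_doc = 100 and seg1.max_doc = 50, doc 3 of segment 1 becomes
// doc 103 of the merged segment in the stacking case.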
let mut field_serializer =
serializer.new_field(indexed_field, total_num_tokens, fieldnorm_reader)?;
let field_entry = self.schema.get_field_entry(indexed_field);
// Set the segment postings option for the new field.
let segment_postings_option = field_entry.field_type().get_index_record_option().expect(
"Encountered a field that is not supposed to be indexed. Have you modified the \
 schema?",
);
let mut segment_postings_containing_the_term: Vec<(usize, SegmentPostings)> = vec![];
let mut doc_id_and_positions = vec![];
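// `TermMerger` performs a k-way merge over the segments' term streams:
// `advance()` moves to the next term in lexicographic order across all
// segments, and `key()` returns the current term's bytes.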
while merged_terms.advance() {
segment_postings_containing_the_term.clear();
let term_bytes: &[u8] = merged_terms.key();
let mut total_doc_freq = 0;
// Let's compute the list of non-empty posting lists
for (segment_ord, term_info) in merged_terms.current_segment_ords_and_term_infos() {
let segment_reader = &self.readers[segment_ord];
let inverted_index: &InvertedIndexReader = &field_readers[segment_ord];
let segment_postings = inverted_index
.read_postings_from_terminfo(&term_info, segment_postings_option)?;
let alive_bitset_opt = segment_reader.alive_bitset();
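// If the segment has deletes, the stored doc frequency over-counts;
// recompute it so that only alive documents are counted.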
let doc_freq = if let Some(alive_bitset) = alive_bitset_opt {
segment_postings.doc_freq_given_deletes(alive_bitset)
} else {
segment_postings.doc_freq()
};
if doc_freq > 0u32 {
total_doc_freq += doc_freq;
segment_postings_containing_the_term.push((segment_ord, segment_postings));
}
}
// At this point, `segment_postings_containing_the_term` contains the
// posting lists of all of the segments containing the given term (and
// that are non-empty).
//
// These posting lists are non-empty and already positioned on their
// first document (advance has already been called).
if total_doc_freq == 0u32 {
// All docs that used to contain the term have been deleted. The `term` will be
// entirely removed.
continue;
}
let to_term_ord = field_serializer.new_term(term_bytes, total_doc_freq)?;
if let Some(ref mut term_ord_mapping) = term_ord_mapping_opt {
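// `matching_segments()` yields, for every segment containing the current
// term, that term's ordinal in the segment's term dictionary.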
for (segment_ord, from_term_ord) in merged_terms.matching_segments() {
term_ord_mapping.register_from_to(segment_ord, from_term_ord, to_term_ord);
}
}
// We can now serialize the postings for this term by pushing each
// document to the postings serializer.
for (segment_ord, mut segment_postings) in
segment_postings_containing_the_term.drain(..)
{
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
let mut doc = segment_postings.doc();
while doc != TERMINATED {
// Deleted docs are skipped as they do not have a `remapped_doc_id`.
if let Some(remapped_doc_id) = old_to_new_doc_id[doc as usize] {
// We make sure to only write the term if
// there is at least one document.
let term_freq = segment_postings.term_freq();
segment_postings.positions(&mut positions_buffer);
// If the doc id mapping is non-trivial, the doc ids are reordered rather
// than just stacked. The field serializer expects monotonically
// increasing doc ids, so we collect and sort them before writing.
//
// Note: this is not strictly necessary. The buffering could be avoided
// with some form of k-merge, but the merge logic would then deviate much
// more from the stacking case (unsorted index).
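// For example (hypothetical doc ids): if this loop produces remapped doc
// ids 7, 2, 9 for the current term, they are buffered here and written
// below in sorted order 2, 7, 9.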
if !doc_id_mapping.is_trivial() {
doc_id_and_positions.push((
remapped_doc_id,
term_freq,
positions_buffer.to_vec(),
));
} else {
let delta_positions = delta_computer.compute_delta(&positions_buffer);
field_serializer.write_doc(remapped_doc_id, term_freq, delta_positions);
}
}
doc = segment_postings.advance();
}
}
if !doc_id_mapping.is_trivial() {
doc_id_and_positions.sort_unstable_by_key(|&(doc_id, _, _)| doc_id);
for (doc_id, term_freq, positions) in &doc_id_and_positions {
let delta_positions = delta_computer.compute_delta(positions);
field_serializer.write_doc(*doc_id, *term_freq, delta_positions);
}
doc_id_and_positions.clear();
}
// Close the term.
field_serializer.close_term()?;
}
field_serializer.close()?;
Ok(term_ord_mapping_opt)
}