in tantivy/src/indexer/segment_writer.rs [157:339]
/// Feeds the indexed field values of `doc` into the per-field postings
/// writers, using the current value of `self.max_doc` as the document id.
///
/// The document's field values are sorted by field and grouped, so each field
/// is handled exactly once with all of its values. Fields whose schema entry
/// is not indexed are skipped entirely. For every remaining field the shared
/// term buffer is reset for that field and type, then the values are
/// dispatched on the schema field type:
///
/// * `Facet`: each facet is tokenized and every token is subscribed; the term
///   id returned for the last token is pushed to the field's term id writer.
/// * `Str`: text is tokenized (or a pre-tokenized stream is replayed) and
///   indexed; if the field has fieldnorms, the accumulated token count is
///   recorded.
/// * `U64`, `Date`, `I64`, `F64`, `Bool`, `Bytes`, `IpAddr`: each value is
///   written as a single term; if the field has fieldnorms, the number of
///   values is recorded. Dates are truncated to second precision first.
/// * `JsonObject`: delegated to `index_json_values`.
///
/// # Errors
///
/// Returns `TantivyError::SchemaError` when a value does not have the type
/// required by its field. The `Str` arm is the exception: values that are
/// neither `Value::Str` nor `Value::PreTokStr` are skipped without an error.
/// Errors returned by `index_json_values` are propagated unchanged.
///
/// # Panics
///
/// Panics if a facet field yields a term id but has no term id writer, or if
/// the term buffer is not empty right before a text value is indexed.
fn index_document(&mut self, doc: &Document) -> crate::Result<()> {
    let current_doc = self.max_doc;
    let grouped_values = doc
        .field_values()
        .iter()
        .sorted_by_key(|field_value| field_value.field())
        .group_by(|field_value| field_value.field());

    for (field, group) in &grouped_values {
        let field_entry = self.schema.get_field_entry(field);
        // Fields that are not indexed contribute nothing here.
        if !field_entry.is_indexed() {
            continue;
        }
        let field_type = field_entry.field_type();
        let values = group.map(|field_value| field_value.value());

        // Lazily builds the error reported when a value has the wrong type.
        let type_mismatch = || {
            crate::TantivyError::SchemaError(format!(
                "Expected a {:?} for field {:?}",
                field_type.value_type(),
                field_entry.name()
            ))
        };

        // Borrow the individual writer fields separately so they can be used
        // side by side in the arms below.
        let term = &mut self.term_buffer;
        let indexing_ctx = &mut self.ctx;
        let postings: &mut dyn PostingsWriter =
            self.per_field_postings_writers.get_for_field_mut(field);
        term.clear_with_field_and_type(field_type.value_type(), field);

        match field_type {
            FieldType::Facet(_) => {
                for value in values {
                    let facet = value.as_facet().ok_or_else(type_mismatch)?;
                    // Overwritten on every token, so only the id of the last
                    // token survives the loop.
                    let mut last_term_id = None;
                    FacetTokenizer
                        .token_stream(facet.encoded_str())
                        .process(&mut |token| {
                            term.set_text(&token.text);
                            // TODO pass indexing context directly in subscribe function
                            last_term_id =
                                Some(postings.subscribe(current_doc, 0u32, term, indexing_ctx));
                        });
                    if let Some(term_id) = last_term_id {
                        self.fast_field_writers
                            .get_term_id_writer_mut(field)
                            .expect("writer for facet missing")
                            .add_val(term_id);
                    }
                }
            }
            FieldType::Str(_) => {
                // Shared across all values of the field so that the token
                // count accumulates over the whole group.
                let mut position = IndexingPosition::default();
                for value in values {
                    let mut tokens = match value {
                        Value::PreTokStr(pre_tokenized) => {
                            PreTokenizedStream::from(pre_tokenized.clone()).into()
                        }
                        Value::Str(ref text) => {
                            let analyzer =
                                &self.per_field_text_analyzers[field.field_id() as usize];
                            analyzer.token_stream(text)
                        }
                        // Any other kind of value is ignored, not rejected.
                        _ => continue,
                    };
                    assert!(term.is_empty());
                    postings.index_text(
                        current_doc,
                        &mut *tokens,
                        term,
                        indexing_ctx,
                        &mut position,
                        self.fast_field_writers.get_term_id_writer_mut(field),
                    );
                }
                if field_entry.has_fieldnorms() {
                    self.fieldnorms_writer
                        .record(current_doc, field, position.num_tokens);
                }
            }
            FieldType::U64(_) => {
                let mut value_count = 0;
                for value in values {
                    value_count += 1;
                    let number = value.as_u64().ok_or_else(type_mismatch)?;
                    term.set_u64(number);
                    postings.subscribe(current_doc, 0u32, term, indexing_ctx);
                }
                if field_entry.has_fieldnorms() {
                    self.fieldnorms_writer
                        .record(current_doc, field, value_count);
                }
            }
            FieldType::Date(_) => {
                let mut value_count = 0;
                for value in values {
                    value_count += 1;
                    let date = value.as_date().ok_or_else(type_mismatch)?;
                    // Dates are indexed as u64 after truncation to seconds.
                    term.set_u64(date.truncate(DatePrecision::Seconds).to_u64());
                    postings.subscribe(current_doc, 0u32, term, indexing_ctx);
                }
                if field_entry.has_fieldnorms() {
                    self.fieldnorms_writer
                        .record(current_doc, field, value_count);
                }
            }
            FieldType::I64(_) => {
                let mut value_count = 0;
                for value in values {
                    value_count += 1;
                    let number = value.as_i64().ok_or_else(type_mismatch)?;
                    term.set_i64(number);
                    postings.subscribe(current_doc, 0u32, term, indexing_ctx);
                }
                if field_entry.has_fieldnorms() {
                    self.fieldnorms_writer
                        .record(current_doc, field, value_count);
                }
            }
            FieldType::F64(_) => {
                let mut value_count = 0;
                for value in values {
                    value_count += 1;
                    let number = value.as_f64().ok_or_else(type_mismatch)?;
                    term.set_f64(number);
                    postings.subscribe(current_doc, 0u32, term, indexing_ctx);
                }
                if field_entry.has_fieldnorms() {
                    self.fieldnorms_writer
                        .record(current_doc, field, value_count);
                }
            }
            FieldType::Bool(_) => {
                let mut value_count = 0;
                for value in values {
                    value_count += 1;
                    let flag = value.as_bool().ok_or_else(type_mismatch)?;
                    term.set_bool(flag);
                    postings.subscribe(current_doc, 0u32, term, indexing_ctx);
                }
                if field_entry.has_fieldnorms() {
                    self.fieldnorms_writer
                        .record(current_doc, field, value_count);
                }
            }
            FieldType::Bytes(_) => {
                let mut value_count = 0;
                for value in values {
                    value_count += 1;
                    let payload = value.as_bytes().ok_or_else(type_mismatch)?;
                    term.set_bytes(payload);
                    postings.subscribe(current_doc, 0u32, term, indexing_ctx);
                }
                if field_entry.has_fieldnorms() {
                    self.fieldnorms_writer
                        .record(current_doc, field, value_count);
                }
            }
            FieldType::JsonObject(json_options) => {
                let analyzer = &self.per_field_text_analyzers[field.field_id() as usize];
                // Type mismatches surface as `Err` items that the callee
                // receives through the iterator.
                let json_values = values.map(|value| value.as_json().ok_or_else(type_mismatch));
                index_json_values(
                    current_doc,
                    json_values,
                    analyzer,
                    json_options.is_expand_dots_enabled(),
                    term,
                    postings,
                    indexing_ctx,
                )?;
            }
            FieldType::IpAddr(_) => {
                let mut value_count = 0;
                for value in values {
                    value_count += 1;
                    let address = value.as_ip_addr().ok_or_else(type_mismatch)?;
                    term.set_ip_addr(address);
                    postings.subscribe(current_doc, 0u32, term, indexing_ctx);
                }
                if field_entry.has_fieldnorms() {
                    self.fieldnorms_writer
                        .record(current_doc, field, value_count);
                }
            }
        }
    }
    Ok(())
}