in tantivy/src/query/more_like_this/more_like_this.rs [165:281]
/// Tokenizes `values` for `field` and increments `term_frequencies` once per
/// produced term, skipping terms classified as noise by `self.is_noise_word`.
///
/// Non-indexed fields are a no-op. Field types with no branch below
/// (e.g. bytes) are silently ignored.
///
/// # Errors
/// Returns `TantivyError::InvalidArgument` when a value's runtime variant
/// does not match the field's declared type (e.g. a non-facet `Value` on a
/// facet field, or a value that fails `as_u64`/`as_i64`/`as_f64`/`as_date`).
fn add_term_frequencies(
    &self,
    searcher: &Searcher,
    field: Field,
    values: &[Value],
    term_frequencies: &mut HashMap<Term, usize>,
) -> Result<()> {
    let schema = searcher.schema();
    let tokenizer_manager = searcher.index().tokenizers();
    let field_entry = schema.get_field_entry(field);
    // Terms can only be generated for indexed fields.
    if !field_entry.is_indexed() {
        return Ok(());
    }
    // Extract the raw value, possibly tokenizing & filtering to update the
    // term frequency map.
    match field_entry.field_type() {
        FieldType::Facet(_) => {
            let facets: Vec<&str> = values
                .iter()
                .map(|value| match value {
                    Value::Facet(ref facet) => Ok(facet.encoded_str()),
                    _ => Err(TantivyError::InvalidArgument(
                        "invalid field value".to_string(),
                    )),
                })
                .collect::<Result<Vec<_>>>()?;
            for facet_str in facets {
                FacetTokenizer
                    .token_stream(facet_str)
                    .process(&mut |token| {
                        // BUGFIX: the check was previously un-negated
                        // (`if self.is_noise_word(..)`), which counted *only*
                        // noise words for facet fields — the inverse of every
                        // other branch in this method. Negate it so noise
                        // words are filtered out, consistent with the rest.
                        if !self.is_noise_word(token.text.clone()) {
                            let term = Term::from_field_text(field, &token.text);
                            *term_frequencies.entry(term).or_default() += 1;
                        }
                    });
            }
        }
        FieldType::Str(text_options) => {
            // Collect one token stream per value: pre-tokenized values are
            // replayed as-is; plain strings go through the field's configured
            // tokenizer (values are skipped when no tokenizer is registered).
            let mut token_streams: Vec<BoxTokenStream> = vec![];
            for value in values {
                match value {
                    Value::PreTokStr(tok_str) => {
                        token_streams.push(PreTokenizedStream::from(tok_str.clone()).into());
                    }
                    Value::Str(ref text) => {
                        if let Some(tokenizer) = text_options
                            .get_indexing_options()
                            .map(|text_indexing_options| {
                                text_indexing_options.tokenizer().to_string()
                            })
                            .and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name))
                        {
                            token_streams.push(tokenizer.token_stream(text));
                        }
                    }
                    // Non-string values on a text field are silently skipped
                    // (best-effort, matching prior behavior).
                    _ => (),
                }
            }
            for mut token_stream in token_streams {
                token_stream.process(&mut |token| {
                    if !self.is_noise_word(token.text.clone()) {
                        let term = Term::from_field_text(field, &token.text);
                        *term_frequencies.entry(term).or_default() += 1;
                    }
                });
            }
        }
        FieldType::U64(_) => {
            for value in values {
                let val = value.as_u64().ok_or_else(|| {
                    TantivyError::InvalidArgument("invalid value".to_string())
                })?;
                // Noise filtering operates on the decimal string form.
                if !self.is_noise_word(val.to_string()) {
                    let term = Term::from_field_u64(field, val);
                    *term_frequencies.entry(term).or_default() += 1;
                }
            }
        }
        FieldType::Date(_) => {
            for value in values {
                // Dates are indexed by their microsecond timestamp as i64.
                let timestamp_micros = value
                    .as_date()
                    .ok_or_else(|| TantivyError::InvalidArgument("invalid value".to_string()))?
                    .into_timestamp_micros();
                if !self.is_noise_word(timestamp_micros.to_string()) {
                    let term = Term::from_field_i64(field, timestamp_micros);
                    *term_frequencies.entry(term).or_default() += 1;
                }
            }
        }
        FieldType::I64(_) => {
            for value in values {
                let val = value.as_i64().ok_or_else(|| {
                    TantivyError::InvalidArgument("invalid value".to_string())
                })?;
                if !self.is_noise_word(val.to_string()) {
                    let term = Term::from_field_i64(field, val);
                    *term_frequencies.entry(term).or_default() += 1;
                }
            }
        }
        FieldType::F64(_) => {
            for value in values {
                let val = value.as_f64().ok_or_else(|| {
                    TantivyError::InvalidArgument("invalid value".to_string())
                })?;
                if !self.is_noise_word(val.to_string()) {
                    let term = Term::from_field_f64(field, val);
                    *term_frequencies.entry(term).or_default() += 1;
                }
            }
        }
        // Remaining field types (e.g. bytes, JSON) contribute no terms.
        _ => {}
    }
    Ok(())
}