in datafusion/physical-expr/src/hash_utils.rs [180:208]
/// Hashes a dictionary-encoded array into `hashes_buffer`, one slot per row.
///
/// Every dictionary value is hashed a single time with `create_hashes`; each
/// row then picks up the precomputed hash through its key. This avoids
/// re-hashing the same (possibly large, e.g. string) value for every row
/// that references it.
///
/// * `multi_col == false`: the row's slot is overwritten with the value hash.
/// * `multi_col == true`: the value hash is folded into the slot's current
///   contents with `combine_hashes`.
///
/// Rows that are logically null leave their slot untouched, consistent with
/// how nulls are treated by the other hashers. A row is logically null when
/// its key is null, or when its key is valid but refers to a null entry in
/// the dictionary values.
///
/// # Errors
///
/// Propagates any error returned by `create_hashes` while hashing the
/// dictionary values.
fn hash_dictionary<K: ArrowDictionaryKeyType>(
    array: &DictionaryArray<K>,
    random_state: &RandomState,
    hashes_buffer: &mut [u64],
    multi_col: bool,
) -> Result<()> {
    // Hash each dictionary value once, and then use that computed
    // hash for each key value to avoid a potentially expensive
    // redundant hashing for large dictionary elements (e.g. strings)
    let dict_values = array.values();
    let mut dict_hashes = vec![0; dict_values.len()];
    create_hashes(&[Arc::clone(dict_values)], random_state, &mut dict_hashes)?;

    // Apply the precomputed value hash to every row via its key.
    for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter()) {
        if let Some(key) = key {
            let idx = key.as_usize();
            // A valid key may still point at a null dictionary value. Such a
            // row is null, so its hash must not be touched; otherwise the
            // placeholder in `dict_hashes[idx]` would leak into the result.
            if dict_values.is_valid(idx) {
                *hash = if multi_col {
                    combine_hashes(dict_hashes[idx], *hash)
                } else {
                    dict_hashes[idx]
                };
            }
        } // no update for a null key, consistent with other hashes
    }
    Ok(())
}