data_validation/jellyfish_distance.py (10 lines of code) (raw):
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import jellyfish
def extract_closest_match(search_key, target_list, score_cutoff=0):
    """Pick the entry of ``target_list`` most similar to ``search_key``.

    Similarity is measured with ``jellyfish.jaro_similarity``. A candidate is
    accepted when its score is greater than or equal to the best score seen
    so far (initially ``score_cutoff``), so when several candidates tie, the
    one that appears last in ``target_list`` is returned.

    Args:
        search_key (str): The string to find the closest match for.
        target_list (list): Candidate strings to compare against.
        score_cutoff (float): Minimum score (expected to be between 0 and 1)
            a candidate must reach to be considered a match.

    Returns:
        The best-matching entry of ``target_list``, or ``None`` when
        ``target_list`` is empty or no candidate reaches ``score_cutoff``.
    """
    best_match, best_score = None, score_cutoff

    # Lazily pair every candidate with its score; candidates are still
    # scored one at a time, in their original order.
    scored_candidates = (
        (jellyfish.jaro_similarity(search_key, candidate), candidate)
        for candidate in target_list
    )

    for similarity, candidate in scored_candidates:
        # ">=" (not ">") lets a later candidate replace an equal-scoring one.
        if similarity >= best_score:
            best_match, best_score = candidate, similarity

    return best_match