in taxonomy_mapping/process_species_by_dataset.py [0:0]
def _select_chordate_match_index(candidates) -> int:
    """
    Return the index of the first candidate whose taxonomy contains a level
    with rank 'phylum' and scientific name 'chordata'; return 0 if there is
    only one candidate or if no candidate is a chordate.

    Each candidate is a dict with a 'taxonomy' entry: a sequence of levels,
    each indexed as (taxonID, taxonLevel, scientific, [list of common]).
    """
    # With a single candidate there is nothing to disambiguate, so its
    # taxonomy levels are deliberately not inspected.
    if len(candidates) > 1:
        # Prefer chordates... most of the names that aren't what we want
        # are esoteric insects, like a moth called "cheetah".
        for i_candidate, candidate in enumerate(candidates):
            for taxonomy_level in candidate['taxonomy']:
                taxon_rank = taxonomy_level[1]
                level_name = taxonomy_level[2]
                if taxon_rank == 'phylum' and level_name == 'chordata':
                    return i_candidate

    # If we can't find a chordate, just take the first match.
    return 0


def _select_common_name(query: str, common_names) -> str:
    """
    Choose a single common name from [common_names]: the query itself when
    there are several names and the query is one of them, otherwise the first
    name, or '' when there are no common names.
    """
    if len(common_names) > 1 and query in common_names:
        # Multiple common names: default to returning the query
        return query
    if len(common_names) > 0:
        return common_names[0]
    return ''


def get_preferred_taxonomic_match(query: str) -> TaxonomicMatch:
    """
    Wrapper for species_lookup.py, but expressing a variety of heuristics and
    preferences that are specific to our scenario.

    The query is lower-cased, stripped, and has underscores replaced with
    spaces before being passed to get_taxonomic_info().  Matches are split by
    their 'source' field; iNat matches are used when any exist and the
    module-level [taxonomy_preference] is 'inat', otherwise GBIF matches are
    used when any exist.  Among multiple matches from the selected source, the
    first chordate is preferred, falling back to the first match.

    Returns a TaxonomicMatch built from (scientific name, common name,
    taxonomic level, source, str(taxonomy), taxonomy).  When no source is
    selected, all of those fields are empty strings.

    Raises AssertionError if the selected match has an empty scientific name.
    """

    query = query.lower().strip().replace('_', ' ')
    matches = get_taxonomic_info(query)

    inat_matches = [m for m in matches if m['source'] == 'inat']
    gbif_matches = [m for m in matches if m['source'] == 'gbif']

    # Values returned when neither source is selected
    scientific_name = ''
    common_name = ''
    taxonomic_level = ''
    match = ''
    source = ''

    # Decide which source's matches to use.
    #
    # NOTE(review): when taxonomy_preference is not 'inat' and there are no
    # GBIF matches, any iNat matches are ignored and an empty match is
    # returned.  Confirm whether falling back to iNat is desired here.
    candidates = None
    if len(inat_matches) > 0 and taxonomy_preference == 'inat':
        source = 'inat'
        candidates = inat_matches
    elif len(gbif_matches) > 0:
        source = 'gbif'
        candidates = gbif_matches

    # iNat and GBIF matches are currently handled identically; if
    # source-specific subtleties come up, branch on [source] here.
    if candidates is not None:

        i_match = _select_chordate_match_index(candidates)
        match = candidates[i_match]['taxonomy']

        # This is (taxonID, taxonLevel, scientific, [list of common])
        lowest_level = match[0]
        taxonomic_level = lowest_level[1]
        scientific_name = lowest_level[2]
        assert len(scientific_name) > 0
        common_name = _select_common_name(query, lowest_level[3])

    # ...if we selected a source

    taxonomy_string = str(match)

    return TaxonomicMatch(scientific_name, common_name, taxonomic_level, source,
                          taxonomy_string, match)