in taxonomy_mapping/species_lookup.py
def initialize_taxonomy_lookup() -> None:
"""
Initialize this module by doing the following:
* Downloads and unzips the current GBIF and iNat taxonomies if necessary
(only unzips what's necessary, but does not delete the original zipfiles)
* Builds a bunch of dictionaries and tables to facilitate lookup
* Serializes those tables via pickle
* Skips all of the above if the serialized pickle file already exists
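
Example of a lookup after initialization (illustrative; names are stored
lowercased, so queries should be lowercase as well):

    initialize_taxonomy_lookup()
    taxon_ids = inat_scientific_to_taxon_id['puma concolor']
    row = inat_taxonomy.iloc[inat_taxon_id_to_row[next(iter(taxon_ids))]]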
"""
global inat_taxonomy,\
gbif_taxonomy,\
gbif_common_mapping,\
inat_taxon_id_to_row,\
gbif_taxon_id_to_row,\
inat_taxon_id_to_vernacular,\
inat_vernacular_to_taxon_id,\
inat_taxon_id_to_scientific,\
inat_scientific_to_taxon_id,\
gbif_taxon_id_to_vernacular,\
gbif_vernacular_to_taxon_id,\
gbif_taxon_id_to_scientific,\
gbif_scientific_to_taxon_id
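# All of these module-level structures are either restored from the serialized
# pickle below or rebuilt from scratch from the raw taxonomy files.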
## Load serialized taxonomy info if we've already saved it
if os.path.isfile(serialized_structures_file):
print(f'Reading taxonomy data from {serialized_structures_file}')
with open(serialized_structures_file, 'rb') as f:
structures_to_serialize = pickle.load(f)
inat_taxonomy,\
gbif_taxonomy,\
gbif_common_mapping,\
inat_taxon_id_to_row,\
gbif_taxon_id_to_row,\
inat_taxon_id_to_vernacular,\
inat_vernacular_to_taxon_id,\
inat_taxon_id_to_scientific,\
inat_scientific_to_taxon_id,\
gbif_taxon_id_to_vernacular,\
gbif_vernacular_to_taxon_id,\
gbif_taxon_id_to_scientific,\
gbif_scientific_to_taxon_id = structures_to_serialize
return
## If we don't have serialized taxonomy info, create it from scratch.
# Download and unzip taxonomy files
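# taxonomy_urls maps a taxonomy name (e.g. 'GBIF', 'iNaturalist') to the URL of
# its zip archive, and files_to_unzip maps the same name to the archive members
# we actually need, roughly (illustrative shape, not the exact module constants):
#
#   files_to_unzip = {'GBIF': ['Taxon.tsv', 'VernacularName.tsv'],
#                     'iNaturalist': ['taxa.csv']}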
for taxonomy_name, zip_url in taxonomy_urls.items():
need_to_download = False
# Don't download the zipfile if we've already unzipped what we need
for fn in files_to_unzip[taxonomy_name]:
target_file = os.path.join(
taxonomy_download_dir, taxonomy_name, fn)
if not os.path.isfile(target_file):
need_to_download = True
break
if not need_to_download:
print(f'Bypassing download of {taxonomy_name}, all files available')
continue
zipfile_path = os.path.join(
taxonomy_download_dir, zip_url.split('/')[-1])
# download_url() bypasses the download if the zipfile already exists
ai4e_web_utils.download_url(
zip_url, zipfile_path,
progress_updater=ai4e_web_utils.DownloadProgressBar(),
verbose=True)
# Unzip the files we need
files_we_need = files_to_unzip[taxonomy_name]
with zipfile.ZipFile(zipfile_path, 'r') as zipH:
for fn in files_we_need:
print(f'Unzipping {fn}')
target_file = os.path.join(
taxonomy_download_dir, taxonomy_name, fn)
if os.path.isfile(target_file):
print(f'Bypassing unzip of {target_file}, file exists')
else:
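# Stream the archive member straight to disk; shutil.copyfileobj copies in
# chunks, so we never hold the whole (potentially large) file in memory.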
with zipH.open(fn) as zf, open(target_file, 'wb') as f:
shutil.copyfileobj(zf, f)
# ...for each file that we need from this zipfile
# We intentionally keep the zipfile around (see docstring); uncomment to remove it
# os.remove(zipfile_path)
# ...for each taxonomy
# Create dataframes from each of the taxonomy files, and the GBIF common
# name file
# Load iNat taxonomy
inat_taxonomy = pd.read_csv(os.path.join(taxonomy_download_dir, 'iNaturalist', 'taxa.csv'))
inat_taxonomy['scientificName'] = inat_taxonomy['scientificName'].fillna('').str.strip()
inat_taxonomy['vernacularName'] = inat_taxonomy['vernacularName'].fillna('').str.strip()
# Load GBIF taxonomy
gbif_taxonomy = pd.read_csv(os.path.join(
taxonomy_download_dir, 'GBIF', 'Taxon.tsv'), sep='\t')
gbif_taxonomy['scientificName'] = gbif_taxonomy['scientificName'].fillna('').str.strip()
gbif_taxonomy['canonicalName'] = gbif_taxonomy['canonicalName'].fillna('').str.strip()
# Remove questionable rows from the GBIF taxonomy
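# ('doubtful' and 'misapplied' are GBIF taxonomicStatus values for names whose
# application is uncertain or incorrect; accepted names and synonyms are kept.)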
gbif_taxonomy = gbif_taxonomy[~gbif_taxonomy['taxonomicStatus'].isin(['doubtful', 'misapplied'])]
gbif_taxonomy = gbif_taxonomy.reset_index()
# Load GBIF vernacular name mapping
gbif_common_mapping = pd.read_csv(os.path.join(
taxonomy_download_dir, 'GBIF', 'VernacularName.tsv'), sep='\t')
gbif_common_mapping['vernacularName'] = gbif_common_mapping['vernacularName'].fillna('').str.strip()
# Only keep English mappings
gbif_common_mapping = gbif_common_mapping.loc[gbif_common_mapping['language'] == 'en']
gbif_common_mapping = gbif_common_mapping.reset_index()
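# Non-English vernacular names are dropped here, so the GBIF vernacular-name
# dictionaries built below only cover English common names.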
# Convert everything to lowercase
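# (Note: as of pandas 2.1, DataFrame.applymap is deprecated in favor of
# DataFrame.map; the helper below may need updating for newer pandas versions.)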
def convert_df_to_lowercase(df):
df = df.applymap(lambda s: s.lower() if isinstance(s, str) else s)
return df
inat_taxonomy = convert_df_to_lowercase(inat_taxonomy)
gbif_taxonomy = convert_df_to_lowercase(gbif_taxonomy)
gbif_common_mapping = convert_df_to_lowercase(gbif_common_mapping)
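# Because everything is lowercased here, the lookup tables built below are
# effectively case-insensitive, provided that queries are lowercased too.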
# For each taxonomy table, create a mapping from taxon IDs to rows
inat_taxon_id_to_row = {}
gbif_taxon_id_to_row = {}
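# i_row from iterrows() is the positional row index here (both tables have a
# fresh RangeIndex at this point), so a full row can later be recovered with,
# e.g., gbif_taxonomy.iloc[gbif_taxon_id_to_row[taxon_id]].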
print('Building iNat taxonID --> row table')
for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):
inat_taxon_id_to_row[row['taxonID']] = i_row
print('Building GBIF taxonID --> row table')
for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):
gbif_taxon_id_to_row[row['taxonID']] = i_row
# Create name mapping dictionaries
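# Sets are used as values because these relationships can be many-to-many: the
# same vernacular name may map to several taxon IDs, and a single taxon ID may
# have several vernacular names.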
inat_taxon_id_to_vernacular = defaultdict(set)
inat_vernacular_to_taxon_id = defaultdict(set)
inat_taxon_id_to_scientific = defaultdict(set)
inat_scientific_to_taxon_id = defaultdict(set)
gbif_taxon_id_to_vernacular = defaultdict(set)
gbif_vernacular_to_taxon_id = defaultdict(set)
gbif_taxon_id_to_scientific = defaultdict(set)
gbif_scientific_to_taxon_id = defaultdict(set)
# Build iNat dictionaries
# row = inat_taxonomy.iloc[0]
for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):
taxon_id = row['taxonID']
vernacular_name = row['vernacularName']
scientific_name = row['scientificName']
if len(vernacular_name) > 0:
inat_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
inat_vernacular_to_taxon_id[vernacular_name].add(taxon_id)
assert len(scientific_name) > 0
inat_taxon_id_to_scientific[taxon_id].add(scientific_name)
inat_scientific_to_taxon_id[scientific_name].add(taxon_id)
# Build GBIF dictionaries
for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):
taxon_id = row['taxonID']
# The canonical name is the Latin name without authorship; the "scientific
# name" also includes the author citation.
#
# http://globalnames.org/docs/glossary/
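#
# For example (illustrative), a row might have canonicalName 'puma concolor'
# but scientificName 'puma concolor (linnaeus, 1771)' (values have already
# been lowercased at this point).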
scientific_name = row['canonicalName']
# A missing canonical name only seems to happen for very esoteric taxa that
# are unlikely to matter for our problems, but fall back to the scientific
# name for completeness.
if len(scientific_name) == 0:
scientific_name = row['scientificName']
assert len(scientific_name) > 0
gbif_taxon_id_to_scientific[taxon_id].add(scientific_name)
gbif_scientific_to_taxon_id[scientific_name].add(taxon_id)
for i_row, row in tqdm(gbif_common_mapping.iterrows(), total=len(gbif_common_mapping)):
taxon_id = row['taxonID']
# Don't include taxon IDs that were removed from the master table
if taxon_id not in gbif_taxon_id_to_scientific:
continue
vernacular_name = row['vernacularName']
assert len(vernacular_name) > 0
gbif_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
gbif_vernacular_to_taxon_id[vernacular_name].add(taxon_id)
# Save everything to file
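# Note: the deserialization code near the top of this function unpacks the
# pickle in this exact order, so the two lists must be kept in sync.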
structures_to_serialize = [
inat_taxonomy,
gbif_taxonomy,
gbif_common_mapping,
inat_taxon_id_to_row,
gbif_taxon_id_to_row,
inat_taxon_id_to_vernacular,
inat_vernacular_to_taxon_id,
inat_taxon_id_to_scientific,
inat_scientific_to_taxon_id,
gbif_taxon_id_to_vernacular,
gbif_vernacular_to_taxon_id,
gbif_taxon_id_to_scientific,
gbif_scientific_to_taxon_id
]
# The serialized file normally can't exist at this point (we returned early
# above if it did), but check anyway rather than clobbering it.
if not os.path.isfile(serialized_structures_file):
print('Serializing...', end='')
with open(serialized_structures_file, 'wb') as p:
pickle.dump(structures_to_serialize, p)
print(' done')