tools/cdms/cdms_reader.py

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import string
from netCDF4 import Dataset, num2date
import sys
import datetime
import csv
from collections import OrderedDict
import logging

# TODO: Get rid of numpy errors?
# TODO: Update big SDAP README

LOGGER = logging.getLogger("cdms_reader")


def assemble_matches(filename):
    """
    Read a CDMS netCDF file and return a list of matches.

    Parameters
    ----------
    filename : str
        The CDMS netCDF file name.

    Returns
    -------
    matches : list
        List of matches. Each list element is a dictionary.
        For match m, netCDF group GROUP (PrimaryData or SecondaryData),
        and group variable VARIABLE:
        matches[m][GROUP]['matchID']: MatchedRecords dimension ID for the match
        matches[m][GROUP]['GROUPID']: GROUP dim dimension ID for the record
        matches[m][GROUP][VARIABLE]: variable value
    """
    try:
        # Open the netCDF file
        with Dataset(filename, 'r') as cdms_nc:
            # Check that the number of groups is consistent with the
            # MatchedGroups dimension
            assert len(cdms_nc.groups) == cdms_nc.dimensions['MatchedGroups'].size, \
                "Number of groups isn't the same as MatchedGroups dimension."

            matches = []
            matched_records = cdms_nc.dimensions['MatchedRecords'].size

            # Loop through the match IDs to assemble matches
            for match in range(0, matched_records):
                match_dict = OrderedDict()
                # Grab the data from each platform (group) in the match
                for group_num, group in enumerate(cdms_nc.groups):
                    match_dict[group] = OrderedDict()
                    match_dict[group]['matchID'] = match
                    ID = cdms_nc.variables['matchIDs'][match][group_num]
                    match_dict[group][group + 'ID'] = ID
                    for var in cdms_nc.groups[group].variables.keys():
                        match_dict[group][var] = cdms_nc.groups[group][var][ID]

                    # Create a UTC datetime field from the timestamp
                    dt = num2date(match_dict[group]['time'],
                                  cdms_nc.groups[group]['time'].units)
                    match_dict[group]['datetime'] = dt
                LOGGER.info(match_dict)
                matches.append(match_dict)

            return matches
    except (OSError, IOError) as err:
        LOGGER.exception("Error reading netCDF file " + filename)
        raise err
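
# A minimal usage sketch for assemble_matches (not part of the original tool;
# "my_matchup.nc" is a hypothetical CDMS matchup file). It shows how one list
# element exposes the primary and secondary records of a single match:
#
#   matches = assemble_matches("my_matchup.nc")
#   first = matches[0]
#   print(first['PrimaryData']['datetime'], first['PrimaryData']['PrimaryDataID'])
#   print(first['SecondaryData']['datetime'], first['SecondaryData']['SecondaryDataID'])
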
def assemble_matches_by_primary(filename):
    """
    Read a CDMS netCDF file and return a list of matches, in which secondary
    data points are grouped together by their primary data point match.

    This function returns matches in a different order than the
    'assemble_matches' function. In this function, all secondary data is
    associated with its primary match without the need to access multiple
    matches.

    Parameters
    ----------
    filename : str
        The CDMS netCDF file name.

    Returns
    -------
    matches : list
        List of matches. Each list element is a dictionary that maps a
        primary record to all of its associated secondary records.
        For match m, netCDF group GROUP (PrimaryData or SecondaryData),
        and group variable VARIABLE:
        matches[m][GROUP]['matchID']: MatchedRecords dimension ID for the match
        matches[m][GROUP]['GROUPID']: GROUP dim dimension ID for the record
        matches[m][GROUP][VARIABLE]: variable value. Each VARIABLE is
            returned as a masked array.

        ex. To access the first secondary time value available for a given
        match: matches[m]['SecondaryData']['time'][0]
    """
    try:
        # Open the netCDF file
        with Dataset(filename, 'r') as cdms_nc:
            # Check that the number of groups is consistent with the
            # MatchedGroups dimension
            assert len(cdms_nc.groups) == cdms_nc.dimensions['MatchedGroups'].size, \
                "Number of groups isn't the same as MatchedGroups dimension."

            matched_records = cdms_nc.dimensions['MatchedRecords'].size
            primary_matches = cdms_nc.groups['PrimaryData'].dimensions['dim'].size
            matches = [OrderedDict()] * primary_matches

            for match in range(matched_records):
                PID = int(cdms_nc.variables['matchIDs'][match][0])
                if len(matches[PID]) == 0:
                    # Establish an ordered dictionary for the first match[PID]
                    matches[PID] = OrderedDict()

                for group_num, group in enumerate(cdms_nc.groups):
                    if group_num == 0:  # primary
                        if group not in matches[PID].keys():
                            # Initialization
                            matches[PID][group] = OrderedDict()
                            matches[PID][group]['matchID'] = []
                        matches[PID][group]['matchID'].append(match)
                        ID = cdms_nc.variables['matchIDs'][match][group_num]
                        matches[PID][group][group + 'ID'] = ID
                        for var in cdms_nc.groups[group].variables.keys():
                            matches[PID][group][var] = cdms_nc.groups[group][var][ID]
                        dt = num2date(matches[PID][group]['time'],
                                      cdms_nc.groups[group]['time'].units)
                        matches[PID][group]['datetime'] = dt
                    elif group_num == 1:  # secondary
                        if group not in matches[PID].keys():
                            # Initialization
                            matches[PID][group] = OrderedDict()
                            matches[PID][group]['matchID'] = []
                            matches[PID][group][group + 'ID'] = []
                            matches[PID][group]['datetime'] = []
                        matches[PID][group]['matchID'].append(match)
                        ID = cdms_nc.variables['matchIDs'][match][group_num]
                        matches[PID][group][group + 'ID'].append(ID)
                        for var in cdms_nc.groups[group].variables.keys():
                            if var not in matches[PID][group].keys():
                                matches[PID][group][var] = []
                            matches[PID][group][var].append(cdms_nc.groups[group][var][ID])
                        dt = num2date(matches[PID][group]['time'],
                                      cdms_nc.groups[group]['time'].units)
                        matches[PID][group]['datetime'].append(dt[0])

            return matches
    except (OSError, IOError) as err:
        LOGGER.exception("Error reading netCDF file " + filename)
        raise err


def matches_to_csv(matches, csvfile):
    """
    Write the CDMS matches to a CSV file. Include a header of column names
    which are based on the group and variable names from the netCDF file.

    Parameters
    ----------
    matches : list
        The list of dictionaries containing the CDMS matches as returned from
        assemble_matches.
    csvfile : str
        The name of the CSV output file.
    """
    # Create a header for the CSV. Column names are GROUP_VARIABLE or
    # GROUP_GROUPID.
    header = []
    for key, value in matches[0].items():
        for otherkey in value.keys():
            header.append(key + "_" + otherkey)

    try:
        # Write the CSV file
        with open(csvfile, 'w') as output_file:
            csv_writer = csv.writer(output_file)
            csv_writer.writerow(header)
            for match in matches:
                row = []
                for group, data in match.items():
                    for value in data.values():
                        row.append(value)
                csv_writer.writerow(row)
    except (OSError, IOError) as err:
        LOGGER.exception("Error writing CSV file " + csvfile)
        raise err
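
# A minimal sketch chaining the readers with matches_to_csv (not part of the
# original tool; "my_matchup.nc" is a hypothetical input file). The flat list
# from assemble_matches feeds the CSV writer, while assemble_matches_by_primary
# groups every secondary record under its primary record:
#
#   flat_matches = assemble_matches("my_matchup.nc")
#   matches_to_csv(flat_matches, "my_matchup.csv")
#
#   by_primary = assemble_matches_by_primary("my_matchup.nc")
#   first_secondary_time = by_primary[0]['SecondaryData']['time'][0]
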
def get_globals(filename):
    """
    Write the CDMS global attributes to a text file.

    Additionally, the file contains a description of where the different
    outputs go and how best to utilize this program.

    Parameters
    ----------
    filename : str
        The name of the original '.nc' input file.
    """
    x0 = "README / cdms_reader.py Program Use and Description:\n"
    x1 = "\nThe cdms_reader.py program reads a CDMS netCDF (a NETCDF file with a matchIDs variable)\n"
    x2 = "file into memory, assembles a list of matches of primary and secondary data,\n"
    x3 = "and optionally\n"
    x4 = "outputs the matches to a CSV file. Each matched pair contains one primary\n"
    x5 = "data record and one secondary data record.\n"
    x6 = "\nBelow, this file will list the global attributes of the .nc (NETCDF) file.\n"
    x7 = "If you wish to see a full dump of the data from the .nc file,\n"
    x8 = "please utilize the ncdump command from NETCDF (or look at the CSV file).\n"

    try:
        with Dataset(filename, "r", format="NETCDF4") as ncFile:
            txtName = filename.replace(".nc", ".txt")
            with open(txtName, "w") as txt:
                txt.write(x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8)
                txt.write("\nGlobal Attributes:")
                for x in ncFile.ncattrs():
                    txt.write(f'\t :{x} = "{ncFile.getncattr(x)}" ;\n')
    except (OSError, IOError) as err:
        LOGGER.exception("Error reading netCDF file " + filename)
        print("Error reading file!")
        raise err


def create_logs(user_option, logName):
    """
    Write the CDMS log information to a file. Additionally, the user may opt
    to print this information directly to stdout, or discard it entirely.

    Parameters
    ----------
    user_option : str
        The value of args.log, i.e. which log option the user selected.
    logName : str
        The name of the log file we wish to write to, assuming the user did
        not use the -l option.
    """
    if user_option == 'N':
        print("** Note: No log was created **")
    elif user_option == '1':
        # Print the log contents to stdout
        logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
                            level=logging.INFO,
                            datefmt='%Y-%m-%d %H:%M:%S',
                            handlers=[logging.StreamHandler(sys.stdout)])
    else:
        # Print the log to a .log file
        logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
                            level=logging.INFO,
                            datefmt='%Y-%m-%d %H:%M:%S',
                            handlers=[logging.FileHandler(logName)])
        if user_option != '1' and user_option != 'Y':
            print(f"** Bad usage of log option. Log will print to {logName} **")
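
# A minimal sketch of the logging choices handled by create_logs (hedged; the
# values mirror the -l/--log argument parsed below, and "my_matchup.log" is a
# hypothetical name derived from the input file):
#
#   create_logs('Y', "my_matchup.log")   # default: write the log to my_matchup.log
#   create_logs('1', "my_matchup.log")   # -l1: stream the log to stdout
#   create_logs('N', "my_matchup.log")   # -l: skip log creation entirely
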
if __name__ == '__main__':
    """
    Execution:
        python cdms_reader.py filename
        OR
        python3 cdms_reader.py filename
        OR
        python3 cdms_reader.py filename -c -g
        OR
        python3 cdms_reader.py filename --csv --meta

    Note (for help try):
        python3 cdms_reader.py -h
        OR
        python3 cdms_reader.py --help
    """
    u0 = '\n%(prog)s -h OR --help \n'
    u1 = '%(prog)s filename -c -g\n%(prog)s filename --csv --meta\n'
    u2 = 'Use -l OR -l1 to modify destination of logs'
    p = argparse.ArgumentParser(usage=u0 + u1 + u2)

    # Below block is to customize user options
    p.add_argument('filename', help='CDMS netCDF file to read')
    p.add_argument('-c', '--csv', nargs='?', const='Y', default='N',
                   help='Use -c or --csv to retrieve CSV output')
    p.add_argument('-g', '--meta', nargs='?', const='Y', default='N',
                   help='Use -g or --meta to retrieve global attributes / metadata')
    p.add_argument('-l', '--log', nargs='?', const='N', default='Y',
                   help='Use -l or --log to AVOID creating log files, '
                        'OR use -l1 to print to stdout/console')

    # Arguments are processed by the next line
    args = p.parse_args()

    logName = args.filename.replace(".nc", ".log")
    create_logs(args.log, logName)

    cdms_matches = assemble_matches(args.filename)

    if args.csv == 'Y':
        matches_to_csv(cdms_matches, args.filename.replace(".nc", ".csv"))
    if args.meta == 'Y':
        get_globals(args.filename)
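
# Example command-line invocation (a sketch; "my_matchup.nc" is a hypothetical
# CDMS matchup file). With -c and -g the script also writes my_matchup.csv and
# my_matchup.txt next to the input, and -l1 streams the log to stdout instead
# of writing my_matchup.log:
#
#   python3 cdms_reader.py my_matchup.nc -c -g -l1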