in delta_sharing_bring_your_own_container_local_processing/processing_script.py [0:0]
def main():
print("Processing Started")
# Convert command line args into a map of args
args_iter = iter(sys.argv[1:])
args = dict(zip(args_iter, args_iter))
print('Received arguments {}'.format(args))
profile_files = [os.path.join(profile_path, file) for file in os.listdir(profile_path)]
if len(profile_files) == 0:
raise ValueError(
(
"There are no files in {}.\n"
+ "This usually indicates that the channel ({}) was incorrectly specified,\n"
+ "the data specification in S3 was incorrectly specified or the role specified\n"
+ "does not have permission to access the data."
).format(profile_path)
)
profile_file = profile_files[0]
print(f'Found profile file: {profile_file}')
# Create a SharingClient
client = delta_sharing.SharingClient(profile_file)
table_url = profile_file + "#delta_sharing.default.owid-covid-data"
# Load the table as a Pandas DataFrame
print('Loading owid-covid-data table from Delta Lake')
data = delta_sharing.load_as_pandas(table_url)
print(f'Data shape: {data.shape}')
# Aggregate total_cases per location
cases_per_location = data.groupby(['location'])['total_cases'].sum()
print(f'cases_per_location\n{cases_per_location}\n')
output_file = os.path.join(processed_data_path,'total_cases_per_location.csv')
print(f'Writing output file: {output_file}')
cases_per_location.to_csv(output_file)
print("Processing Complete")