in utilities/Hive_metastore_migration/src/export_from_datacatalog.py
# Imports for this script (an editorial assumption: helpers such as get_options,
# validate_options_in_mode and read_databases_from_catalog are expected to come
# from the companion hive_metastore_migration module, or to be defined earlier
# in this file, which is not shown in this excerpt).
import sys
import argparse

from awsglue.context import GlueContext

from hive_metastore_migration import *


def main():
    # Two export modes: dump the Glue Data Catalog to S3, or push it straight
    # into a Hive metastore over JDBC.
    to_s3 = 'to-s3'
    to_jdbc = 'to-jdbc'

    parser = argparse.ArgumentParser(prog=sys.argv[0])
    parser.add_argument('-m', '--mode', required=True, choices=[to_s3, to_jdbc],
                        help='Choose to migrate the Data Catalog to S3 or to a Hive metastore')
    parser.add_argument('--database-names', required=True,
                        help='Semicolon-separated list of names of databases in the Data Catalog to export')
    parser.add_argument('-o', '--output-path', required=False,
                        help='Output path, either a local directory or an S3 path')
    parser.add_argument('-c', '--connection-name', required=False,
                        help='Glue Connection name for the Hive metastore JDBC connection')
    parser.add_argument('-R', '--region', required=False,
                        help='AWS region of the source Glue Data Catalog, defaults to "us-east-1"')
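
    # Example invocations (a sketch; 'my-bucket' and 'my-hive-connection' are
    # placeholders, and the exact spark-submit wiring depends on how the Glue
    # libraries are made available in your environment):
    #   spark-submit export_from_datacatalog.py -m to-s3 --database-names 'db1;db2' -o s3://my-bucket/prefix/
    #   spark-submit export_from_datacatalog.py -m to-jdbc --database-names 'db1' -c my-hive-connection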

    options = get_options(parser, sys.argv)

    # Each mode needs a different option: --output-path for to-s3,
    # --connection-name for to-jdbc; the other option must not be set.
    if options['mode'] == to_s3:
        validate_options_in_mode(
            options=options, mode=to_s3,
            required_options=['output_path'],
            not_allowed_options=['connection_name']
        )
    elif options['mode'] == to_jdbc:
        validate_options_in_mode(
            options=options, mode=to_jdbc,
            required_options=['connection_name'],
            not_allowed_options=['output_path']
        )
    else:
        raise AssertionError('unknown mode ' + options['mode'])

    validate_aws_regions(options['region'])

    # Spark / Glue environment
    (conf, sc, sql_context) = get_spark_env()
    glue_context = GlueContext(sc)

    # Read databases, tables and partitions for the requested databases out of
    # the Glue Data Catalog
    database_arr = options['database_names'].split(';')

    (databases, tables, partitions) = read_databases_from_catalog(
        sql_context=sql_context,
        glue_context=glue_context,
        datacatalog_name='datacatalog',
        database_arr=database_arr,
        region=options.get('region') or 'us-east-1'
    )
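    # Assumption based on how the results are consumed below: databases, tables
    # and partitions are Spark DataFrames of catalog entities returned by the
    # read_databases_from_catalog helper.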

    if options['mode'] == to_s3:
        output_path = get_output_dir(options['output_path'])
        datacatalog_migrate_to_s3(
            databases=databases,
            tables=tables,
            partitions=partitions,
            output_path=output_path
        )
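        # The exported entities end up under output_path; the exact on-disk
        # layout is decided by datacatalog_migrate_to_s3, which is not shown
        # in this excerpt.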
    elif options['mode'] == to_jdbc:
        connection_name = options['connection_name']
        # extract_jdbc_conf resolves the named Glue Connection into its JDBC
        # configuration (url, user, password, ...) for the metastore writer.
        datacatalog_migrate_to_hive_metastore(
            sc=sc,
            sql_context=sql_context,
            databases=databases,
            tables=tables,
            partitions=partitions,
            connection=glue_context.extract_jdbc_conf(connection_name)
        )
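

# Standard entry point so the script can be submitted directly as a Spark job
# (an assumption: the full file is expected to end with a guard like this).
if __name__ == '__main__':
    main()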