in utilities/Hive_metastore_migration/src/import_into_datacatalog.py [0:0]
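# The body of main() below relies on several helpers (get_options, validate_options_in_mode,
# validate_aws_regions, get_spark_env, metastore_import_from_s3, metastore_full_migration)
# that are not defined in this snippet; in this utility they are assumed to come from the
# sibling module hive_metastore_migration.py, so the import block below is a sketch of what
# the full script needs rather than a verbatim excerpt.
import sys
import argparse

from awsglue.context import GlueContext
from hive_metastore_migration import (
    get_options,
    validate_options_in_mode,
    validate_aws_regions,
    get_spark_env,
    metastore_import_from_s3,
    metastore_full_migration,
)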
def main():
    # Parse command-line arguments.
    from_s3 = 'from-s3'
    from_jdbc = 'from-jdbc'
    parser = argparse.ArgumentParser(prog=sys.argv[0])
    parser.add_argument('-m', '--mode', required=True, choices=[from_s3, from_jdbc],
                        help='Choose to migrate the metastore either from JDBC or from S3')
    parser.add_argument('-c', '--connection-name', required=False,
                        help='Glue Connection name for the Hive metastore JDBC connection')
    parser.add_argument('-R', '--region', required=False,
                        help='AWS region of the target Glue Data Catalog, defaults to "us-east-1"')
    parser.add_argument('-d', '--database-prefix', required=False,
                        help='Optional prefix for database names in the Glue Data Catalog')
    parser.add_argument('-t', '--table-prefix', required=False,
                        help='Optional prefix for table names in the Glue Data Catalog')
    parser.add_argument('-D', '--database-input-path', required=False,
                        help='An S3 path containing JSON files of metastore database entities')
    parser.add_argument('-T', '--table-input-path', required=False,
                        help='An S3 path containing JSON files of metastore table entities')
    parser.add_argument('-P', '--partition-input-path', required=False,
                        help='An S3 path containing JSON files of metastore partition entities')
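    # Illustrative invocations only (bucket names and the connection name are placeholders,
    # not taken from the source); they mirror the per-mode option rules enforced below.
    #
    #   --mode from-s3 --database-input-path s3://bucket/databases \
    #       --table-input-path s3://bucket/tables --partition-input-path s3://bucket/partitions
    #
    #   --mode from-jdbc --connection-name my-hive-metastore --database-prefix hive_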
    options = get_options(parser, sys.argv)

    if options['mode'] == from_s3:
        validate_options_in_mode(
            options=options, mode=from_s3,
            required_options=['database_input_path', 'table_input_path', 'partition_input_path'],
            not_allowed_options=['database_prefix', 'table_prefix']
        )
    elif options['mode'] == from_jdbc:
        validate_options_in_mode(
            options=options, mode=from_jdbc,
            required_options=['connection_name'],
            not_allowed_options=['database_input_path', 'table_input_path', 'partition_input_path']
        )
    else:
        raise AssertionError('unknown mode ' + options['mode'])

    validate_aws_regions(options['region'])

    # spark env
    (conf, sc, sql_context) = get_spark_env()
    glue_context = GlueContext(sc)
    # Launch the migration job for the selected mode.
    if options['mode'] == from_s3:
        metastore_import_from_s3(
            sql_context=sql_context,
            glue_context=glue_context,
            db_input_dir=options['database_input_path'],
            tbl_input_dir=options['table_input_path'],
            parts_input_dir=options['partition_input_path'],
            datacatalog_name='datacatalog',
            region=options.get('region') or 'us-east-1'
        )
    elif options['mode'] == from_jdbc:
        # Resolve the JDBC connection settings once and reuse them, instead of calling
        # extract_jdbc_conf twice as the original code did.
        connection = glue_context.extract_jdbc_conf(options['connection_name'])
        metastore_full_migration(
            sc=sc,
            sql_context=sql_context,
            glue_context=glue_context,
            connection=connection,
            db_prefix=options.get('database_prefix') or '',
            table_prefix=options.get('table_prefix') or '',
            datacatalog_name='datacatalog',
            region=options.get('region') or 'us-east-1'
        )
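
# Standard entry point; the snippet above only defines main(), so this guard is assumed
# from the usual pattern for running the script (e.g. as a Glue/Spark job) rather than
# copied from the source.
if __name__ == '__main__':
    main()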