in petastorm/etl/petastorm_generate_metadata.py

# Imports required by this snippet; example_text and generate_petastorm_metadata
# are defined earlier in this module.
import argparse

from pyspark.sql import SparkSession

def _main(args):
    parser = argparse.ArgumentParser(prog='petastorm_generate_metadata',
                                     description='Add necessary petastorm metadata to an existing dataset',
                                     epilog=example_text,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--dataset_url',
                        help='the url to the dataset base directory', required=True)
    parser.add_argument('--unischema_class',
                        help='the fully qualified class of the dataset unischema. If not specified, will attempt'
                             ' to reuse the schema already stored in the dataset '
                             '(e.g. examples.hello_world.generate_hello_world_dataset.HelloWorldSchema)',
                        required=False)
    parser.add_argument('--master', type=str,
                        help='Spark master. Uses the default master if not specified. To run on a local machine, '
                             'specify "local[W]" (where W is the number of local Spark workers, e.g. local[10])')
    parser.add_argument('--spark-driver-memory', type=str, help='The amount of memory the driver process will have',
                        default='4g')
    parser.add_argument('--use-summary-metadata', action='store_true',
                        help='Whether to use the parquet summary metadata format. '
                             'Not scalable for large numbers of columns and/or row groups.')
    parser.add_argument('--hdfs-driver', type=str, default='libhdfs3',
                        help='A string denoting the HDFS driver to use (if using a dataset on HDFS). '
                             'Current choices are libhdfs (Java through JNI) or libhdfs3 (C++).')

    args = parser.parse_args(args)
    # Configure a Spark session builder; the session itself is created below
    # by getOrCreate().
    spark_session = SparkSession \
        .builder \
        .appName("Petastorm Generate Metadata") \
        .config('spark.driver.memory', args.spark_driver_memory)
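    # SparkSession.Builder.master() records the option on the builder object
    # itself, so its return value does not need to be reassigned.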
    if args.master:
        spark_session.master(args.master)

    spark = spark_session.getOrCreate()
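
    # Add the necessary petastorm metadata to the existing dataset at
    # dataset_url; generate_petastorm_metadata is defined earlier in this module.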
    generate_petastorm_metadata(spark, args.dataset_url, args.unischema_class, args.use_summary_metadata,
                                hdfs_driver=args.hdfs_driver)

    # Shut down the Spark session and its context.
    spark.stop()
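

# A minimal entry-point sketch, not part of the excerpt above: _main accepts
# its argument list explicitly, so it can be driven from the command line here
# or called directly from tests.
if __name__ == '__main__':
    import sys

    _main(sys.argv[1:])

# Example invocation (the dataset URL is a hypothetical placeholder):
#   python -m petastorm.etl.petastorm_generate_metadata \
#       --dataset_url hdfs:///path/to/my/dataset --master local[10]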