def _main(args)

in petastorm/etl/petastorm_generate_metadata.py
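Command-line entry point: parses the arguments below, creates a Spark session, and adds the petastorm metadata needed to read an existing Parquet dataset.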


import argparse

from pyspark.sql import SparkSession

# `example_text` (the epilog shown by --help) and `generate_petastorm_metadata`
# are defined elsewhere in this module.


def _main(args):
    parser = argparse.ArgumentParser(prog='petastorm_generate_metadata',
                                     description='Add necessary petastorm metadata to an existing dataset',
                                     epilog=example_text,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--dataset_url',
                        help='the url to the dataset base directory', required=True)
    parser.add_argument('--unischema_class',
                        help='the fully qualified class of the dataset unischema. If not specified, will attempt'
                             ' to reuse the schema already stored in the dataset '
                             '(e.g. examples.hello_world.generate_hello_world_dataset.HelloWorldSchema)',
                        required=False)
    parser.add_argument('--master', type=str,
                        help='Spark master URL. Uses the Spark default if not specified. To run on a local '
                             'machine, specify "local[W]" (where W is the number of local Spark workers, '
                             'e.g. local[10])')
    parser.add_argument('--spark-driver-memory', type=str, help='The amount of memory the driver process will have',
                        default='4g')
    parser.add_argument('--use-summary-metadata', action='store_true',
                        help='Whether to use the parquet summary metadata format.'
                             ' Does not scale well to large numbers of columns and/or row groups.')
    parser.add_argument('--hdfs-driver', type=str, default='libhdfs3',
                        help='A string denoting the HDFS driver to use (if using a dataset on HDFS). '
                             'Current choices are libhdfs (Java through JNI) or libhdfs3 (C++)')
    args = parser.parse_args(args)

    # Build a Spark session: configure the builder first, then create the session
    session_builder = SparkSession \
        .builder \
        .appName("Petastorm Generate Metadata") \
        .config('spark.driver.memory', args.spark_driver_memory)
    if args.master:
        session_builder.master(args.master)

    spark = session_builder.getOrCreate()

    generate_petastorm_metadata(spark, args.dataset_url, args.unischema_class, args.use_summary_metadata,
                                hdfs_driver=args.hdfs_driver)

    # Shut down the Spark session and its underlying context
    spark.stop()
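
Since _main takes an argument list and passes it to parser.parse_args, it can also be driven programmatically. A minimal sketch follows; the dataset URL is a placeholder and local[2] assumes a small local test run:

# Hypothetical invocation for illustration only; the dataset URL below is a
# placeholder and must point to an existing Parquet dataset.
_main(['--dataset_url', 'file:///tmp/hello_world_dataset',
       '--master', 'local[2]'])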