# args_parser() — from petastorm/tools/copy_dataset.py

def args_parser():
    """Build and return the :class:`argparse.ArgumentParser` for the dataset-copy tool.

    Defines two positional arguments (source/target dataset URLs) and several
    optional tuning flags, then delegates Spark-related options to
    ``add_configure_spark_arguments``.

    :return: a configured ``argparse.ArgumentParser`` instance (not yet parsed).
    """
    parser = argparse.ArgumentParser(description=__doc__)

    parser.add_argument('source_url',
                        help='A url of a source petastorm dataset',
                        type=str)

    parser.add_argument('target_url',
                        help='A url of a target petastorm dataset',
                        type=str)

    # store_true flag: original help claimed behavior when "set to false",
    # which is impossible for a store_true action; reworded to match reality.
    parser.add_argument('--overwrite-output', action='store_true',
                        help='If this flag is not set, the script will fail '
                             'in case when the output directory already exists')

    parser.add_argument('--field-regex', type=str, nargs='+',
                        help='A list of regular expressions. Only fields that match one of the regex patterns will '
                             'be copied.')

    parser.add_argument('--not-null-fields', type=str, nargs='+',
                        help='All names in this list must be not null in the source dataset in order to be copied to '
                             'the target dataset.')

    parser.add_argument('--partition-count', type=int, required=False,
                        help='Specifies number of partitions in the output dataset')

    parser.add_argument('--row-group-size-mb', type=int, required=False,
                        help='Specifies the row group size in the created dataset')

    parser.add_argument('--hdfs-driver', type=str, default='libhdfs3',
                        help='A string denoting the hdfs driver to use (if using a dataset on hdfs). '
                             'Current choices are libhdfs (java through JNI) or libhdfs3 (C++)')

    # Spark session flags (master, spark-driver-memory, etc.) live in a shared helper.
    add_configure_spark_arguments(parser)

    return parser