in petastorm/tools/copy_dataset.py [0:0]
import argparse


def args_parser():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('source_url',
                        help='A URL of a source petastorm dataset',
                        type=str)
    parser.add_argument('target_url',
                        help='A URL of a target petastorm dataset',
                        type=str)
    parser.add_argument('--overwrite-output', action='store_true',
                        help='If this flag is not set, the script will fail '
                             'when the output directory already exists')
    parser.add_argument('--field-regex', type=str, nargs='+',
                        help='A list of regular expressions. Only fields that match one of the regex patterns will '
                             'be copied.')
    parser.add_argument('--not-null-fields', type=str, nargs='+',
                        help='A list of field names. Only rows in which all of these fields are non-null in the '
                             'source dataset are copied to the target dataset.')
    parser.add_argument('--partition-count', type=int, required=False,
                        help='Specifies the number of partitions in the output dataset')
    parser.add_argument('--row-group-size-mb', type=int, required=False,
                        help='Specifies the row group size, in megabytes, of the created dataset')
    parser.add_argument('--hdfs-driver', type=str, default='libhdfs3',
                        help='A string denoting the HDFS driver to use (if using a dataset on HDFS). '
                             'Current choices are libhdfs (Java through JNI) or libhdfs3 (C++)')
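    # Appends shared Spark-session CLI arguments; this helper is imported elsewhere in the module.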
    add_configure_spark_arguments(parser)
    return parser
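
Below is a minimal usage sketch, not part of the original file, showing how the parser might be exercised with hypothetical command-line values. It assumes add_configure_spark_arguments adds only optional flags; only the standard argparse API is used, and the URLs and field names are made up for illustration.

if __name__ == '__main__':
    # Hypothetical arguments; argparse derives dest names such as
    # args.overwrite_output from the '--overwrite-output' flag.
    args = args_parser().parse_args([
        'hdfs:///data/source_dataset',   # source_url (example)
        'file:///tmp/target_dataset',    # target_url (example)
        '--overwrite-output',
        '--field-regex', 'id', r'image_.*',
        '--partition-count', '10',
    ])
    print(args.source_url, args.target_url, args.overwrite_output)
    print(args.field_regex, args.partition_count, args.hdfs_driver)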