in perfkitbenchmarker/scripts/spark_sql_test_scripts/spark_sql_runner.py [0:0]
def parse_args(args=None):
"""Parse argv."""
parser = argparse.ArgumentParser(description=__doc__)
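  # Each --sql-queries value is split on commas, and action='append' collects
  # one inner list per occurrence, so args.sql_queries is a list of lists
  # (one query list/stream per flag occurrence).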
parser.add_argument(
'--sql-queries',
action='append',
type=lambda csv: csv.split(','),
required=True,
help=(
          'Comma-separated list of SQL files to run. Pass this argument'
          ' multiple times to run each SQL query list/stream in parallel.'
),
)
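  # The table data source is optional, but at most one of the following may be
  # given: a Hive database, an HCFS base directory, or a BigQuery dataset.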
data_group = parser.add_mutually_exclusive_group()
data_group.add_argument(
'--database', help='Hive database to look for data in.'
)
data_group.add_argument(
'--table-base-dir',
help=(
          'Base HCFS path containing the table data to be registered as Spark'
          ' temporary views.'
),
)
data_group.add_argument(
'--bigquery-dataset',
help=(
          'BigQuery dataset containing the tables passed in --table-names, to'
          ' be registered as Spark temporary views.'
),
)
parser.add_argument(
'--table-names',
nargs='+',
      help='Names of the tables to be registered as Spark temporary views.',
)
parser.add_argument(
'--table-format',
help=(
          'Format of the data to be registered as Spark temporary views, as'
          ' passed to `spark.read.format()`. If unset, assumed to be'
          ' "parquet", or "bigquery" if a BigQuery dataset is specified.'
),
)
parser.add_argument(
'--bigquery-read-data-format',
help=(
'The record format to use when connecting to BigQuery storage. See:'
' https://github.com/GoogleCloudDataproc/spark-bigquery-connector#properties'
),
)
parser.add_argument(
      '--csv-delimiter', help='Delimiter to use when loading CSV files.', default=','
)
  parser.add_argument(
      '--enable-hive',
      # Parse the value explicitly: a bare type=bool would treat any non-empty
      # string, including 'False', as True.
      type=lambda s: s.lower() in ('true', '1'),
      default=False,
      help='Whether to try to read data from Hive.',
  )
parser.add_argument(
'--table-cache',
choices=['eager', 'lazy'],
      help='Whether to cache the tables in memory, spilling to local disk.',
)
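  # Exactly one results destination is required: either log timings to
  # stdout/stderr or write them out to --report-dir.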
results_group = parser.add_mutually_exclusive_group(required=True)
  results_group.add_argument(
      '--log-results',
      type=lambda s: s.lower() in ('true', '1'),
      default=False,
      help=(
          'Log query timings to stdout/stderr instead of writing them to an'
          ' object storage location. Reduces runner latency (and hence its'
          ' total wall time), but it is not supported by all DPB services.'
      ),
  )
results_group.add_argument(
'--report-dir',
help='Directory to write out query timings to.',
)
  parser.add_argument(
      '--fail-on-query-execution-errors',
      type=lambda s: s.lower() in ('true', '1'),
      default=False,
      help=(
          'Fail the whole script on an error while executing the queries,'
          " instead of continuing without reporting that query's run time"
          ' (the default).'
      ),
  )
parser.add_argument(
'--dump-spark-conf',
help=(
          'Directory to dump the Spark configuration properties for this job.'
          ' For debugging purposes.'
),
)
if args is None:
return parser.parse_args()
return parser.parse_args(args)
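
# Illustrative use of parse_args (a minimal sketch; the file names and GCS
# paths below are hypothetical, not taken from the benchmark itself):
#
#   args = parse_args([
#       '--sql-queries', 'q1.sql,q2.sql',
#       '--sql-queries', 'q3.sql',
#       '--table-base-dir', 'gs://my-bucket/tpcds_parquet',
#       '--table-names', 'store_sales', 'item',
#       '--report-dir', 'gs://my-bucket/report',
#   ])
#   # args.sql_queries == [['q1.sql', 'q2.sql'], ['q3.sql']]
#   # args.table_base_dir == 'gs://my-bucket/tpcds_parquet'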