def parse_args()

in perfkitbenchmarker/scripts/spark_sql_test_scripts/spark_sql_runner.py [0:0]


import argparse


def parse_args(args=None):
  """Parse argv."""

  parser = argparse.ArgumentParser(description=__doc__)
  parser.add_argument(
      '--sql-queries',
      action='append',
      type=lambda csv: csv.split(','),
      required=True,
      help=(
          'Comma-separated list of SQL files to run. If this argument is'
          ' passed multiple times, each SQL query list/stream is run in'
          ' parallel.'
      ),
  )
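  # At most one of the following data source flags may be specified; the group
  # is mutually exclusive but not required.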
  data_group = parser.add_mutually_exclusive_group()
  data_group.add_argument(
      '--database', help='Hive database to look for data in.'
  )
  data_group.add_argument(
      '--table-base-dir',
      help=(
          'Base HCFS path containing the table data to be registered as Spark'
          ' temporary views.'
      ),
  )
  data_group.add_argument(
      '--bigquery-dataset',
      help=(
          'BQ Dataset containing the tables passed in --table-names to be'
          ' registered as Spark temporary views.'
      ),
  )
  parser.add_argument(
      '--table-names',
      nargs='+',
      help='Names of the tables to be registered as Spark temporary views.',
  )
  parser.add_argument(
      '--table-format',
      help=(
          'Format of the data to be registered as Spark temporary views, as'
          ' passed to `spark.read.format()`. Assumed to be "parquet", or'
          ' "bigquery" if a BQ dataset is also specified.'
      ),
  )
  parser.add_argument(
      '--bigquery-read-data-format',
      help=(
          'The record format to use when connecting to BigQuery storage. See:'
          ' https://github.com/GoogleCloudDataproc/spark-bigquery-connector#properties'
      ),
  )
  parser.add_argument(
      '--csv-delimiter', help='Delimiter for loading CSV files.', default=','
  )
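  # NOTE: argparse's type=bool converts with bool(str), so any non-empty value
  # (including the string 'False') parses as True. Pass the boolean flags
  # below (--enable-hive, --log-results, --fail-on-query-execution-errors)
  # only to enable them and omit them otherwise.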
  parser.add_argument(
      '--enable-hive',
      type=bool,
      default=False,
      help='Whether to try to read data from Hive.',
  )
  parser.add_argument(
      '--table-cache',
      choices=['eager', 'lazy'],
      help='Whether to cache the tables in memory, spilling to local disk.',
  )
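  # Exactly one results destination must be chosen: logging timings to
  # stdout/stderr or writing them out under --report-dir.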
  results_group = parser.add_mutually_exclusive_group(required=True)
  results_group.add_argument(
      '--log-results',
      type=bool,
      default=False,
      help=(
          'Log query timings to stdout/stderr instead of writing them to an'
          ' object storage location. Reduces runner latency (and hence total'
          ' wall time), but is not supported by all DPB services.'
      ),
  )
  results_group.add_argument(
      '--report-dir',
      help='Directory to write out query timings to.',
  )
  parser.add_argument(
      '--fail-on-query-execution-errors',
      type=bool,
      default=False,
      help=(
          'Fail the whole script on an error while executing the queries, '
          "instead of continuing and not reporting that query's run time "
          '(the default).'
      ),
  )
  parser.add_argument(
      '--dump-spark-conf',
      help=(
          'Directory to dump the spark conf props for this job. For debugging '
          'purposes.'
      ),
  )
  if args is None:
    return parser.parse_args()
  return parser.parse_args(args)
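
A minimal usage sketch (the SQL file names, bucket paths, and table names below
are hypothetical; in practice the argument list is assembled by the PKB Spark
SQL benchmark that submits this runner):

# Each --sql-queries flag defines one query stream; streams run in parallel.
args = parse_args([
    '--sql-queries', '1.sql,2.sql',
    '--sql-queries', '3.sql',
    '--table-base-dir', 'gs://my-bucket/tpcds_data',
    '--table-names', 'store_sales', 'date_dim',
    '--report-dir', 'gs://my-bucket/report',
])
# action='append' plus the csv-splitting type yields one list per flag:
# args.sql_queries == [['1.sql', '2.sql'], ['3.sql']]
# args.table_names == ['store_sales', 'date_dim']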