utils/dataproc.py [449:507]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    job_name=None,
    aws_conn_id=None,
    gcp_conn_id="google_cloud_airflow_dataproc",
    project_id="airflow-dataproc",
    master_disk_type="pd-standard",
    worker_disk_type="pd-standard",
    master_disk_size=1024,
    worker_disk_size=1024,
    master_num_local_ssds=0,
    worker_num_local_ssds=0,
):
    """
    Create a GCP Dataproc cluster with Anaconda/Jupyter/Component gateway.

    Then we call DataprocSubmitJobOperator to execute the jar defined by the jar_urls and
    main_class arguments (see the payload sketch below this excerpt). Once that succeeds,
    we tear down the cluster.

    **Example**: ::

        # Unsalted cluster name so subsequent runs fail if a cluster with this name already exists
        cluster_name = 'test-dataproc-cluster-hwoo'

        # Defined in Airflow's UI -> Admin -> Connections
        gcp_conn_id = 'google_cloud_airflow_dataproc'

        run_dataproc_jar = SubDagOperator(
            task_id='run_dataproc_jar',
            dag=dag,
            subdag=moz_dataproc_jar_runner(
                parent_dag_name=dag.dag_id,
                dag_name='run_dataproc_jar',
                job_name='Run_some_spark_jar_on_dataproc',
                default_args=default_args,
                cluster_name=cluster_name,
                jar_urls=['gs://some_bucket/some_jar.jar'],
                main_class='com.mozilla.path.to.ClassName',
                jar_args=["-d", "{{ ds_nodash }}"],
                gcp_conn_id=gcp_conn_id)
        )

    Airflow related args:
    ---
    See moz_dataproc_pyspark_runner

    Dataproc Cluster related args:
    ---
    See moz_dataproc_pyspark_runner

    Jar runner related args:
    ---
    :param list jar_urls:               URIs of jars provisioned in Cloud Storage (e.g. jars
                                        containing UDFs and libs); these are ideal to put in
                                        default arguments.
    :param str main_class:              Name of the job class entrypoint to execute.
    :param list jar_args:               Arguments for the job.

    """

    if optional_components is None:
        optional_components = ["ANACONDA"]
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
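
The jar runner's submit step can be pictured as a Dataproc Spark job payload. Below is a
minimal sketch, assuming the standard Dataproc `SparkJob` schema accepted by
DataprocSubmitJobOperator; the concrete values mirror the docstring example above, and the
task_id and region in the commented-out operator call are assumptions, not taken from this
module.

    # Minimal sketch (not this module's actual construction) of the Spark job
    # payload that DataprocSubmitJobOperator submits for the jar runner.
    spark_job = {
        "reference": {"project_id": "airflow-dataproc"},
        "placement": {"cluster_name": "test-dataproc-cluster-hwoo"},
        "spark_job": {
            "jar_file_uris": ["gs://some_bucket/some_jar.jar"],  # jar_urls
            "main_class": "com.mozilla.path.to.ClassName",       # main_class
            "args": ["-d", "{{ ds_nodash }}"],                   # jar_args
        },
    }

    # The payload would then be handed to the submit operator, e.g.:
    #
    #   DataprocSubmitJobOperator(
    #       task_id="submit_jar",           # hypothetical task_id
    #       job=spark_job,
    #       region="us-west1",              # assumed region
    #       project_id="airflow-dataproc",
    #       gcp_conn_id="google_cloud_airflow_dataproc",
    #   )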



utils/dataproc.py [602:668]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    job_name=None,
    aws_conn_id=None,
    gcp_conn_id="google_cloud_airflow_dataproc",
    project_id="airflow-dataproc",
    master_disk_type="pd-standard",
    worker_disk_type="pd-standard",
    master_disk_size=1024,
    worker_disk_size=1024,
    master_num_local_ssds=0,
    worker_num_local_ssds=0,
):
    """
    Create a GCP Dataproc cluster with Anaconda/Jupyter/Component gateway.

    Then we execute a script URI (either HTTPS or GCS), similar to how we use our custom AWS
    EmrSparkOperator. This calls DataprocSubmitJobOperator using EMR's script-runner.jar, which
    then executes the airflow_gcp.sh entrypoint script. The entrypoint script expects another
    script URI, along with its arguments, as parameters (see the payload sketch below this
    excerpt). Once that succeeds, we tear down the cluster.

    **Example**: ::

        # Unsalted cluster name so subsequent runs fail if a cluster with this name already exists
        cluster_name = 'test-dataproc-cluster-hwoo'

        # Defined in Airflow's UI -> Admin -> Connections
        gcp_conn_id = 'google_cloud_airflow_dataproc'

        run_dataproc_script = SubDagOperator(
            task_id='run_dataproc_script',
            dag=dag,
            subdag=moz_dataproc_scriptrunner(
                parent_dag_name=dag.dag_id,
                dag_name='run_dataproc_script',
                default_args=default_args,
                cluster_name=cluster_name,
                job_name='Run_a_script_on_dataproc',
                uri='https://raw.githubusercontent.com/mozilla/telemetry-airflow/main/jobs/some_bash_or_py_script.py',
                env={"date": "{{ ds_nodash }}"},
                arguments="-d {{ ds_nodash }}",
                gcp_conn_id=gcp_conn_id)
        )

    Airflow related args:
    ---
    See moz_dataproc_pyspark_runner

    Dataproc Cluster related args:
    ---
    See moz_dataproc_pyspark_runner

    Scriptrunner specific args:
    ---
    :param str uri:                     The HTTP or GCS URI of the script to run. It can be a
                                        .py, .jar, or other type of script (e.g. bash), and is
                                        run via the airflow_gcp.sh entrypoint. Ipynb is no
                                        longer supported.
    :param dict env:                    If env is not None, it must be a mapping that defines
                                        the environment variables for the new process
                                        (templated).
    :param str arguments:               Arguments passed to `airflow_gcp.sh` as one long
                                        string of space-separated args.

    """

    if optional_components is None:
        optional_components = ["ANACONDA"]
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
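
The scriptrunner's submit step is described above as script-runner.jar invoking the
airflow_gcp.sh entrypoint with the script URI and its arguments. A hypothetical sketch of
that step as a Dataproc Hadoop job follows; the field names come from the standard Dataproc
`HadoopJob` schema, but the GCS paths for script-runner.jar and airflow_gcp.sh and the exact
argument layout are assumptions, not taken from this module.

    # Hypothetical sketch: script-runner.jar run as a Dataproc Hadoop job,
    # handing airflow_gcp.sh the script URI plus its space-separated arguments.
    hadoop_job = {
        "reference": {"project_id": "airflow-dataproc"},
        "placement": {"cluster_name": "test-dataproc-cluster-hwoo"},
        "hadoop_job": {
            # assumed artifact locations, not defined in this excerpt
            "main_jar_file_uri": "gs://some_bucket/script-runner.jar",
            "args": [
                "gs://some_bucket/airflow_gcp.sh",
                "https://raw.githubusercontent.com/mozilla/telemetry-airflow/main/jobs/some_bash_or_py_script.py",
                "-d",
                "{{ ds_nodash }}",
            ],
        },
    }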



