datasets/idc/pipelines/copy_tcia_data/pipeline.yaml (67 lines of code) (raw):

# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Pipeline definition for the `copy_tcia_data` DAG.
#
# Three tasks run strictly in sequence (see `graph_paths` at the bottom):
#   1. copy_gcs_bucket   - GCS-to-GCS transfer between two buckets
#   2. copy_bq_datasets  - pod that copies the versioned `idc` BigQuery datasets
#   3. generate_bq_views - pod that generates BigQuery views
#
# Every "{{ var.json.idc.* }}" value is a template expression reading a field
# of the `idc` JSON variable; none of those values are defined in this file.

---
# No resources are declared for this pipeline (explicit null).
resources: ~

dag:
  airflow_version: 2

  initialize:
    dag_id: copy_tcia_data
    default_args:
      owner: "Google"
      depends_on_past: False
      start_date: '2021-11-23'
    # NOTE(review): the four keys below are placed beside `dag_id` (DAG-level
    # settings) rather than inside `default_args` - confirm against the
    # generator's schema.
    max_active_runs: 1
    schedule_interval: "@monthly"
    catchup: False
    default_view: graph

  tasks:
    # Task 1: bucket-to-bucket copy.
    - operator: "CloudDataTransferServiceGCSToGCSOperator"
      description: "Task to run a GCS to GCS operation using Google resources"
      args:
        task_id: copy_gcs_bucket
        timeout: 43200  # 12 hours
        retries: 0
        wait: True
        project_id: bigquery-public-data
        source_bucket: "{{ var.json.idc.source_bucket }}"
        destination_bucket: "{{ var.json.idc.destination_bucket}}"
        transfer_options:
          # False: objects present only in the destination are not deleted.
          deleteObjectsUniqueInSink: False

    # Task 2: copy every listed version of the `idc` dataset.
    - operator: "KubernetesPodOperator"
      description: "Transfer IDC Databases"
      args:
        task_id: "copy_bq_datasets"
        name: "copy_bq_datasets"
        namespace: "composer"
        service_account_name: "datasets"
        image_pull_policy: "Always"
        image: "{{ var.json.idc.container_registry.copy_bq_datasets }}"
        env_vars:
          SOURCE_PROJECT_ID: "{{ var.json.idc.source_project_id }}"
          TARGET_PROJECT_ID: "{{ var.json.idc.target_project_id }}"
          SERVICE_ACCOUNT: "{{ var.json.idc.service_account }}"
          DATASET_NAME: "idc"
          # Folded scalar (`>-`): the wrapped lines are joined with single
          # spaces, so the container receives one JSON-style list string.
          DATASET_VERSIONS: >-
            ["v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
            "v11", "v11_clinical", "v12", "v12_clinical", "v13",
            "v13_clinical"]
          # Passed as a string; presumably seconds - TODO confirm in the image.
          TIMEOUT: "36000"
        # NOTE(review): `resources` is placed as a sibling of `env_vars`
        # (pod resource requests), not as an environment variable - confirm.
        resources:
          request_memory: "128M"
          request_cpu: "200m"

    # Task 3: generate views for every listed BigQuery dataset.
    - operator: "KubernetesPodOperator"
      description: "Generate BQ views"
      args:
        task_id: "generate_bq_views"
        name: "generate_bq_views"
        namespace: "composer"
        service_account_name: "datasets"
        image_pull_policy: "Always"
        image: "{{ var.json.idc.container_registry.generate_bq_views }}"
        env_vars:
          SOURCE_PROJECT_ID: "{{ var.json.idc.source_project_id }}"
          TARGET_PROJECT_ID: "{{ var.json.idc.target_project_id }}"
          # Folded scalar (`>-`): joined into a single-line list string.
          BQ_DATASETS: >-
            ["idc_v1", "idc_v2", "idc_v3", "idc_v4", "idc_v5", "idc_v6",
            "idc_v7", "idc_v8", "idc_v9", "idc_v10", "idc_v11",
            "idc_v11_clinical", "idc_v12", "idc_v12_clinical", "idc_v13",
            "idc_v13_clinical", "idc_current", "idc_current_clinical"]
          SERVICE_ACCOUNT: "{{ var.json.idc.service_account }}"
        resources:
          request_memory: "128M"
          request_cpu: "200m"

  # Execution order: each task starts only after the previous one completes.
  graph_paths:
    - "copy_gcs_bucket >> copy_bq_datasets >> generate_bq_views"