# datasets/idc/pipelines/copy_tcia_data/pipeline.yaml (67 lines of code) (raw):
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# Pipeline definition for the IDC (Imaging Data Commons) copy_tcia_data DAG.
# No additional GCS/BQ resources are provisioned by this pipeline.
resources: ~

dag:
  airflow_version: 2
  initialize:
    dag_id: copy_tcia_data
    default_args:
      owner: "Google"
      depends_on_past: false
      start_date: '2021-11-23'
    max_active_runs: 1
    schedule_interval: "@monthly"
    catchup: false
    default_view: graph

  tasks:
    # Copy the raw TCIA imaging objects between GCS buckets using the
    # Storage Transfer Service (runs server-side, not in the worker).
    - operator: "CloudDataTransferServiceGCSToGCSOperator"
      description: "Task to run a GCS to GCS operation using Google resources"
      args:
        task_id: copy_gcs_bucket
        timeout: 43200  # 12 hours
        retries: 0
        wait: true  # block until the transfer job completes
        project_id: bigquery-public-data
        source_bucket: "{{ var.json.idc.source_bucket }}"
        destination_bucket: "{{ var.json.idc.destination_bucket }}"
        transfer_options:
          deleteObjectsUniqueInSink: false  # never delete pre-existing sink objects

    # Copy the versioned IDC BigQuery datasets into the target project.
    - operator: "KubernetesPodOperator"
      description: "Transfer IDC Databases"
      args:
        task_id: "copy_bq_datasets"
        name: "copy_bq_datasets"
        namespace: "composer"
        service_account_name: "datasets"
        image_pull_policy: "Always"
        image: "{{ var.json.idc.container_registry.copy_bq_datasets }}"
        env_vars:
          SOURCE_PROJECT_ID: "{{ var.json.idc.source_project_id }}"
          TARGET_PROJECT_ID: "{{ var.json.idc.target_project_id }}"
          SERVICE_ACCOUNT: "{{ var.json.idc.service_account }}"
          DATASET_NAME: "idc"
          # JSON array consumed by the container; folded scalar keeps it on one logical line.
          DATASET_VERSIONS: >-
            ["v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v11_clinical", "v12", "v12_clinical", "v13", "v13_clinical"]
          TIMEOUT: "36000"
        resources:
          request_memory: "128M"
          request_cpu: "200m"

    # Create the idc_current / idc_current_clinical views over the copied datasets.
    - operator: "KubernetesPodOperator"
      description: "Generate BQ views"
      args:
        task_id: "generate_bq_views"
        name: "generate_bq_views"
        namespace: "composer"
        service_account_name: "datasets"
        image_pull_policy: "Always"
        image: "{{ var.json.idc.container_registry.generate_bq_views }}"
        env_vars:
          SOURCE_PROJECT_ID: "{{ var.json.idc.source_project_id }}"
          TARGET_PROJECT_ID: "{{ var.json.idc.target_project_id }}"
          # JSON array of every dataset that receives generated views.
          BQ_DATASETS: >-
            ["idc_v1", "idc_v2", "idc_v3", "idc_v4", "idc_v5", "idc_v6", "idc_v7", "idc_v8", "idc_v9", "idc_v10", "idc_v11", "idc_v11_clinical", "idc_v12", "idc_v12_clinical", "idc_v13", "idc_v13_clinical", "idc_current", "idc_current_clinical"]
          SERVICE_ACCOUNT: "{{ var.json.idc.service_account }}"
        resources:
          request_memory: "128M"
          request_cpu: "200m"

  graph_paths:
    - "copy_gcs_bucket >> copy_bq_datasets >> generate_bq_views"