datasets/human_variant_annotation/pipelines/clinvar/pipeline.yaml (74 lines of code) (raw):
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
resources: ~
dag:
airflow_version: 2
initialize:
dag_id: clinvar
default_args:
owner: "Google"
depends_on_past: False
start_date: "2021-03-01"
max_active_runs: 1
schedule_interval: "@daily"
catchup: False
default_view: graph
tasks:
- operator: "KubernetesPodOperator"
description: "Run CSV transform within kubernetes pod"
args:
task_id: "clinvar_vcf_grch37"
startup_timeout_seconds: 600
name: "name_basics"
namespace: "composer-user-workloads"
service_account_name: "default"
config_file: "/home/airflow/composer_kube_config"
image_pull_policy: "Always"
image: "{{ var.json.human_variant_annotation.container_registry.run_csv_transform_kub }}"
env_vars:
BASE_URL: "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/"
FOLDER: "vcf_GRCh37"
VERSION: "2.0"
GCS_BUCKET: "{{ var.value.composer_bucket }}"
TARGET_GCS_FOLDER: "data/human_variant_annotation/clinVar-vcf_GRCh37/"
PIPELINE: "clinvar"
- operator: "GoogleCloudStorageToGoogleCloudStorageOperator"
description: "Task to run a GoogleCloudStorageToGoogleCloudStorageOperator"
args:
task_id: "copy_clinvar_v1_to_gcs_destination_bucket"
source_bucket: "{{ var.value.composer_bucket }}"
source_object: "data/human_variant_annotation/clinVar-vcf_GRCh37/*"
destination_bucket: "{{ var.json.human_variant_annotation.destination_bucket }}"
destination_object: "human-variant-annotation/clinVar-vcf_GRCh37/"
move_object: False
replace: False
- operator: "KubernetesPodOperator"
description: "Run CSV transform within kubernetes pod"
args:
task_id: "clinvar_vcf_grch38"
startup_timeout_seconds: 600
name: "name_basics"
namespace: "composer-user-workloads"
service_account_name: "default"
config_file: "/home/airflow/composer_kube_config"
image_pull_policy: "Always"
image: "{{ var.json.human_variant_annotation.container_registry.run_csv_transform_kub }}"
env_vars:
BASE_URL: "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/"
FOLDER: "vcf_GRCh38"
VERSION: "2.0"
GCS_BUCKET: "{{ var.value.composer_bucket }}"
TARGET_GCS_FOLDER: "data/human_variant_annotation/clinVar-vcf_GRCh38/"
PIPELINE: "db_snp"
- operator: "GoogleCloudStorageToGoogleCloudStorageOperator"
description: "Task to run a GoogleCloudStorageToGoogleCloudStorageOperator"
args:
task_id: "copy_clinvar_v2_to_gcs_destination_bucket"
source_bucket: "{{ var.value.composer_bucket }}"
source_object: "data/human_variant_annotation/clinVar-vcf_GRCh38/*"
destination_bucket: "{{ var.json.human_variant_annotation.destination_bucket }}"
destination_object: "human-variant-annotation/clinVar-vcf_GRCh38/"
move_object: False
replace: False
graph_paths:
- "clinvar_vcf_grch37 >> copy_clinvar_v1_to_gcs_destination_bucket"
- "clinvar_vcf_grch38 >> copy_clinvar_v2_to_gcs_destination_bucket"