o2a/mappers/mapreduce_mapper.py (57 lines of code) (raw):
# -*- coding: utf-8 -*-
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Maps Oozie pig node to Airflow's DAG"""
from typing import Dict, List, Optional, Set
from xml.etree.ElementTree import Element
from o2a.converter.task import Task
from o2a.converter.relation import Relation
from o2a.mappers.action_mapper import ActionMapper
from o2a.mappers.extensions.prepare_mapper_extension import PrepareMapperExtension
from o2a.o2a_libs.src.o2a_lib.property_utils import PropertySet
from o2a.utils.file_archive_extractors import ArchiveExtractor, FileExtractor
# pylint: disable=too-many-instance-attributes
from o2a.utils.param_extractor import extract_param_values_from_action_node
from o2a.utils.xml_utils import get_tag_el_text
class MapReduceMapper(ActionMapper):
"""
Converts a MapReduce Oozie node to an Airflow task.
"""
def __init__(self, oozie_node: Element, name: str, dag_name: str, props: PropertySet, **kwargs):
ActionMapper.__init__(
self, oozie_node=oozie_node, name=name, dag_name=dag_name, props=props, **kwargs
)
self.params_dict: Dict[str, str] = {}
self.file_extractor = FileExtractor(oozie_node=oozie_node, props=self.props)
self.archive_extractor = ArchiveExtractor(oozie_node=oozie_node, props=self.props)
self.name_node: Optional[str] = None
self.hdfs_files: Optional[List[str]] = None
self.hdfs_archives: Optional[List[str]] = None
self.prepare_extension: PrepareMapperExtension = PrepareMapperExtension(self)
def on_parse_node(self):
super().on_parse_node()
self.name_node = get_tag_el_text(self.oozie_node, "name-node")
self.params_dict = extract_param_values_from_action_node(self.oozie_node)
_, self.hdfs_files = self.file_extractor.parse_node()
_, self.hdfs_archives = self.archive_extractor.parse_node()
def to_tasks_and_relations(self):
action_task = Task(
task_id=self.name,
template_name="mapreduce.tpl",
template_params=dict(
props=self.props,
params_dict=self.params_dict,
hdfs_files=self.hdfs_files,
hdfs_archives=self.hdfs_archives,
action_node_properties=self.props.action_node_properties,
),
)
tasks = [action_task]
relations: List[Relation] = []
prepare_task = self.prepare_extension.get_prepare_task()
if prepare_task:
tasks, relations = self.prepend_task(prepare_task, tasks, relations)
return tasks, relations
@staticmethod
def _validate_paths(input_directory_path, output_directory_path):
if not input_directory_path:
raise Exception(f"The input_directory_path should be set and is {input_directory_path}")
if not output_directory_path:
raise Exception(f"The output_directory_path should be set and is {output_directory_path}")
def required_imports(self) -> Set[str]:
return {
"from airflow.utils import dates",
"from airflow.providers.google.cloud.operators.dataproc import DataprocSubmitJobOperator"
}