plugins/spark_upgrade/gradient_boost_trees.py
# Copyright (c) 2023 Uber Technologies, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
# except in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the
# License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, List, Dict

from execute_piranha import ExecutePiranha
from polyglot_piranha import (
    Rule,
    Filter,
)
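
# Tree-sitter query matching a `new Instance(...)` expression with three arguments.
# The rules below use it in a `not_contains` filter so call sites whose dataset is
# already wrapped into `Instance` objects are left untouched.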
_INSTANCE_EXPR_QUERY = """(
    (instance_expression
        (type_identifier) @typ_id
        arguments: (arguments
            (_)
            (_)
            (_)
        )
        (#eq? @typ_id "Instance")
    ) @inst
)"""


class GradientBoostTrees(ExecutePiranha):
    def __init__(self, paths_to_codebase: List[str]):
        super().__init__(
            paths_to_codebase=paths_to_codebase,
            substitutions={"gbt": "GradientBoostedTrees"},
            language="scala",
        )

    def step_name(self) -> str:
        return "Access execution plans"

    def get_rules(self) -> List[Rule]:
        # Rewrites a four-argument `GradientBoostedTrees.run(...)` call so the dataset
        # argument is mapped into weighted `Instance(label, 1.0, features)` objects.
        # The filter skips call sites that already construct `Instance`s.
        gradient_boost_trees = Rule(
            name="transform_GradientBoostedTrees_run",
            query="""cs GradientBoostedTrees.run(
                :[dataset],
                :[strategy],
                :[seed],
                :[featureSubsetStrategy]
            )""",
            replace_node="*",
            replace="""GradientBoostedTrees.run(
                @dataset.map(data => new Instance(data.label, 1.0, data.features)),
                @strategy,
                @seed,
                @featureSubsetStrategy
            )""",
            filters={
                Filter(
                    not_contains=[_INSTANCE_EXPR_QUERY],
                )
            },
            holes={"gbt"},
        )

        # Variant of the rule above for call sites where the last argument is followed by
        # an extra trailing element (per the hole name, a comment); whatever `:[comment]`
        # captures is dropped from the rewritten call.
        gradient_boost_trees_comment = Rule(
            name="transform_GradientBoostedTrees_run_comment",
            query="""cs GradientBoostedTrees.run(
                :[dataset],
                :[strategy],
                :[seed],
                :[featureSubsetStrategy] :[comment]
            )""",
            replace_node="*",
            replace="""GradientBoostedTrees.run(
                @dataset.map(data => new Instance(data.label, 1.0, data.features)),
                @strategy,
                @seed,
                @featureSubsetStrategy
            )""",
            filters={
                Filter(
                    not_contains=[_INSTANCE_EXPR_QUERY],
                )
            },
            holes={"gbt"},
        )

        return [gradient_boost_trees, gradient_boost_trees_comment]

    def summaries_to_custom_dict(self, _) -> Dict[str, Any]:
        return {}
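

# --- Illustrative sketch, not part of the plugin ---
# Assumption: the spark_upgrade plugins are normally driven through their own entry point
# via the ExecutePiranha base class; the helper below is a hypothetical, minimal way to run
# the same rules through polyglot_piranha directly, assuming ExecutePiranha.__init__ only
# records its arguments.
def run_gradient_boost_trees_migration(path_to_codebase: str):
    from polyglot_piranha import PiranhaArguments, RuleGraph, execute_piranha

    plugin = GradientBoostTrees([path_to_codebase])
    args = PiranhaArguments(
        "scala",
        paths_to_codebase=[path_to_codebase],
        substitutions={"gbt": "GradientBoostedTrees"},
        rule_graph=RuleGraph(rules=plugin.get_rules(), edges=[]),
    )
    # Returns one summary per rewritten file; an empty list means no call sites matched.
    return execute_piranha(args)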