python/pyspark/sql/connect/plan.py (2,133 lines of code)

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# mypy: disable-error-code="operator"

from pyspark.resource import ResourceProfile
from pyspark.sql.connect.utils import check_dependencies

check_dependencies(__name__)

from typing import (
    Any,
    List,
    Optional,
    Type,
    Sequence,
    Union,
    cast,
    TYPE_CHECKING,
    Mapping,
    Dict,
    Tuple,
)

import functools
import json
import pickle
from threading import Lock
from inspect import signature, isclass

import pyarrow as pa

from pyspark.serializers import CloudPickleSerializer
from pyspark.storagelevel import StorageLevel
from pyspark.sql.types import DataType

import pyspark.sql.connect.proto as proto
from pyspark.sql.column import Column
from pyspark.sql.connect.logging import logger
from pyspark.sql.connect.proto import base_pb2 as spark_dot_connect_dot_base__pb2
from pyspark.sql.connect.conversion import storage_level_to_proto
from pyspark.sql.connect.expressions import Expression, SubqueryExpression
from pyspark.sql.connect.types import pyspark_types_to_proto_types, UnparsedDataType
from pyspark.errors import (
    AnalysisException,
    PySparkValueError,
    PySparkPicklingError,
)

if TYPE_CHECKING:
    from pyspark.sql.connect.client import SparkConnectClient
    from pyspark.sql.connect.udf import UserDefinedFunction
    from pyspark.sql.connect.observation import Observation
    from pyspark.sql.connect.session import SparkSession


class LogicalPlan:
    _lock: Lock = Lock()
    _nextPlanId: int = 0

    INDENT = 2

    def __init__(
        self, child: Optional["LogicalPlan"], references: Optional[Sequence["LogicalPlan"]] = None
    ) -> None:
        """
        Parameters
        ----------
        child : :class:`LogicalPlan`, optional.
            The child logical plan.
        references : list of :class:`LogicalPlan`, optional.
            The list of logical plans that are referenced as subqueries in this logical plan.
        """
        self._child = child
        self._root_plan_id = LogicalPlan._fresh_plan_id()
        self._references: Sequence["LogicalPlan"] = references or []
        self._plan_id_with_rel: Optional[int] = None
        if len(self._references) > 0:
            assert all(isinstance(r, LogicalPlan) for r in self._references)
            self._plan_id_with_rel = LogicalPlan._fresh_plan_id()

    @property
    def _plan_id(self) -> int:
        return self._plan_id_with_rel or self._root_plan_id

    @staticmethod
    def _fresh_plan_id() -> int:
        plan_id: Optional[int] = None
        with LogicalPlan._lock:
            plan_id = LogicalPlan._nextPlanId
            LogicalPlan._nextPlanId += 1
        assert plan_id is not None
        return plan_id

    def _create_proto_relation(self) -> proto.Relation:
        plan = proto.Relation()
        plan.common.plan_id = self._root_plan_id
        return plan

    def plan(self, session: "SparkConnectClient") -> proto.Relation:  # type: ignore[empty-body]
        ...

    def command(self, session: "SparkConnectClient") -> proto.Command:  # type: ignore[empty-body]
        ...
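    # Illustrative note: each node grabs a fresh id from the class-wide counter
    # above (under `_lock`, so ids are unique within the process), and
    # `_create_proto_relation()` stamps that id into `Relation.common.plan_id`.
    # A minimal doctest-style sketch of the proto round trip, assuming a
    # connected `SparkConnectClient` is available as `client` (a hypothetical
    # name, not defined in this module):
    #
    #   >>> rg = Range(0, 10, 1)
    #   >>> rel = rg.plan(client)                # proto.Relation for this node
    #   >>> rel.common.plan_id == rg._root_plan_id
    #   True
    #   >>> plan = rg.to_proto(client)           # proto.Plan wrapping the relation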
def _verify(self, session: "SparkConnectClient") -> bool: """This method is used to verify that the current logical plan can be serialized to Proto and back and afterwards is identical.""" plan = proto.Plan() plan.root.CopyFrom(self.plan(session)) serialized_plan = plan.SerializeToString() test_plan = proto.Plan() test_plan.ParseFromString(serialized_plan) return test_plan == plan def to_proto(self, session: "SparkConnectClient", debug: bool = False) -> proto.Plan: """ Generates connect proto plan based on this LogicalPlan. Parameters ---------- session : :class:`SparkConnectClient`, optional. a session that connects remote spark cluster. debug: bool if enabled, the proto plan will be printed. """ plan = proto.Plan() plan.root.CopyFrom(self.plan(session)) if debug: print(plan) return plan @property def observations(self) -> Dict[str, "Observation"]: if self._child is None: return {} else: return self._child.observations @staticmethod def _collect_references( cols_or_exprs: Sequence[Union[Column, Expression]] ) -> Sequence["LogicalPlan"]: references: List[LogicalPlan] = [] def append_reference(e: Expression) -> None: if isinstance(e, SubqueryExpression): references.append(e._plan) for col_or_expr in cols_or_exprs: if isinstance(col_or_expr, Column): col_or_expr._expr.foreach(append_reference) else: col_or_expr.foreach(append_reference) return references def _with_relations( self, root: proto.Relation, session: "SparkConnectClient" ) -> proto.Relation: if len(self._references) == 0: return root else: # When there are references to other DataFrame, e.g., subqueries, build new plan like: # with_relations [id 10] # root: plan [id 9] # reference: # refs#1: [id 8] # refs#2: [id 5] plan = proto.Relation() assert isinstance(self._plan_id_with_rel, int) plan.common.plan_id = self._plan_id_with_rel plan.with_relations.root.CopyFrom(root) plan.with_relations.references.extend([ref.plan(session) for ref in self._references]) return plan def _parameters_to_print(self, parameters: Mapping[str, Any]) -> Mapping[str, Any]: """ Extracts the parameters that are able to be printed. It looks up the signature in the constructor of this :class:`LogicalPlan`, and retrieves the variables from this instance by the same name (or the name with prefix `_`) defined in the constructor. Parameters ---------- parameters : map Parameter mapping from ``inspect.signature(...).parameters`` Returns ------- dict A dictionary consisting of a string name and variable found in this :class:`LogicalPlan`. Notes ----- :class:`LogicalPlan` itself is filtered out and considered as a non-printable parameter. Examples -------- The example below returns a dictionary from `self._start`, `self._end`, `self._num_partitions`. 
>>> rg = Range(0, 10, 1) >>> rg._parameters_to_print(signature(rg.__class__.__init__).parameters) {'start': 0, 'end': 10, 'step': 1, 'num_partitions': None} If the child is defined, it is not considered as a printable instance >>> project = Project(rg, "value") >>> project._parameters_to_print(signature(project.__class__.__init__).parameters) {'columns': ['value']} """ params = {} for name, tpe in parameters.items(): # LogicalPlan is not to print, e.g., LogicalPlan is_logical_plan = isclass(tpe.annotation) and isinstance(tpe.annotation, LogicalPlan) # Look up the string argument defined as a forward reference e.g., "LogicalPlan" is_forwardref_logical_plan = getattr(tpe.annotation, "__forward_arg__", "").endswith( "LogicalPlan" ) # Wrapped LogicalPlan, e.g., Optional[LogicalPlan] is_nested_logical_plan = any( isclass(a) and issubclass(a, LogicalPlan) for a in getattr(tpe.annotation, "__args__", ()) ) # Wrapped forward reference of LogicalPlan, e.g., Optional["LogicalPlan"]. is_nested_forwardref_logical_plan = any( getattr(a, "__forward_arg__", "").endswith("LogicalPlan") for a in getattr(tpe.annotation, "__args__", ()) ) if ( not is_logical_plan and not is_forwardref_logical_plan and not is_nested_logical_plan and not is_nested_forwardref_logical_plan ): # Searches self.name or self._name try: params[name] = getattr(self, name) except AttributeError: try: params[name] = getattr(self, "_" + name) except AttributeError: pass # Simply ignore return params def print(self, indent: int = 0) -> str: """ Print the simple string representation of the current :class:`LogicalPlan`. Parameters ---------- indent : int The number of leading spaces for the output string. Returns ------- str Simple string representation of this :class:`LogicalPlan`. """ params = self._parameters_to_print(signature(self.__class__.__init__).parameters) pretty_params = [f"{name}='{param}'" for name, param in params.items()] if len(pretty_params) == 0: pretty_str = "" else: pretty_str = " " + ", ".join(pretty_params) return f"{' ' * indent}<{self.__class__.__name__}{pretty_str}>\n{self._child_print(indent)}" def _repr_html_(self) -> str: """Returns a :class:`LogicalPlan` with HTML code. This is generally called in third-party systems such as Jupyter. Returns ------- str HTML representation of this :class:`LogicalPlan`. 
""" params = self._parameters_to_print(signature(self.__class__.__init__).parameters) pretty_params = [ f"\n {name}: " f"{param} <br/>" for name, param in params.items() ] if len(pretty_params) == 0: pretty_str = "" else: pretty_str = "".join(pretty_params) return f""" <ul> <li> <b>{self.__class__.__name__}</b><br/>{pretty_str} {self._child_repr()} </li> </ul> """ def _child_print(self, indent: int) -> str: return self._child.print(indent + LogicalPlan.INDENT) if self._child else "" def _child_repr(self) -> str: return self._child._repr_html_() if self._child is not None else "" class DataSource(LogicalPlan): """A datasource with a format and optional a schema from which Spark reads data""" def __init__( self, format: Optional[str] = None, schema: Optional[str] = None, options: Optional[Mapping[str, str]] = None, paths: Optional[List[str]] = None, predicates: Optional[List[str]] = None, is_streaming: Optional[bool] = None, ) -> None: super().__init__(None) assert format is None or isinstance(format, str) assert schema is None or isinstance(schema, str) if options is not None: new_options = {} for k, v in options.items(): if v is not None: assert isinstance(k, str) assert isinstance(v, str) new_options[k] = v options = new_options if paths is not None: assert isinstance(paths, list) assert all(isinstance(path, str) for path in paths) if predicates is not None: assert isinstance(predicates, list) assert all(isinstance(predicate, str) for predicate in predicates) self._format = format self._schema = schema self._options = options self._paths = paths self._predicates = predicates self._is_streaming = is_streaming def plan(self, session: "SparkConnectClient") -> proto.Relation: plan = self._create_proto_relation() if self._format is not None: plan.read.data_source.format = self._format if self._schema is not None: plan.read.data_source.schema = self._schema if self._options is not None and len(self._options) > 0: for k, v in self._options.items(): plan.read.data_source.options[k] = v if self._paths is not None and len(self._paths) > 0: plan.read.data_source.paths.extend(self._paths) if self._predicates is not None and len(self._predicates) > 0: plan.read.data_source.predicates.extend(self._predicates) if self._is_streaming is not None: plan.read.is_streaming = self._is_streaming return plan class Read(LogicalPlan): def __init__( self, table_name: str, options: Optional[Dict[str, str]] = None, is_streaming: Optional[bool] = None, ) -> None: super().__init__(None) self.table_name = table_name self.options = options or {} self._is_streaming = is_streaming def plan(self, session: "SparkConnectClient") -> proto.Relation: plan = self._create_proto_relation() plan.read.named_table.unparsed_identifier = self.table_name if self._is_streaming is not None: plan.read.is_streaming = self._is_streaming for k, v in self.options.items(): plan.read.named_table.options[k] = v return plan def print(self, indent: int = 0) -> str: return f"{' ' * indent}<Read table_name={self.table_name}>\n" class LocalRelation(LogicalPlan): """Creates a LocalRelation plan object based on a PyArrow Table.""" def __init__( self, table: Optional["pa.Table"], schema: Optional[str] = None, ) -> None: super().__init__(None) if table is None: assert schema is not None else: assert isinstance(table, pa.Table) assert schema is None or isinstance(schema, str) self._table = table self._schema = schema def plan(self, session: "SparkConnectClient") -> proto.Relation: plan = self._create_proto_relation() if self._table is not None: sink = 
pa.BufferOutputStream() with pa.ipc.new_stream(sink, self._table.schema) as writer: for b in self._table.to_batches(): writer.write_batch(b) plan.local_relation.data = sink.getvalue().to_pybytes() if self._schema is not None: plan.local_relation.schema = self._schema return plan def serialize(self, session: "SparkConnectClient") -> bytes: p = self.plan(session) return bytes(p.local_relation.SerializeToString()) def print(self, indent: int = 0) -> str: return f"{' ' * indent}<LocalRelation>\n" def _repr_html_(self) -> str: return """ <ul> <li><b>LocalRelation</b></li> </ul> """ class CachedLocalRelation(LogicalPlan): """Creates a CachedLocalRelation plan object based on a hash of a LocalRelation.""" def __init__(self, hash: str) -> None: super().__init__(None) self._hash = hash def plan(self, session: "SparkConnectClient") -> proto.Relation: plan = self._create_proto_relation() clr = plan.cached_local_relation clr.hash = self._hash return plan def print(self, indent: int = 0) -> str: return f"{' ' * indent}<CachedLocalRelation>\n" def _repr_html_(self) -> str: return """ <ul> <li><b>CachedLocalRelation</b></li> </ul> """ class ShowString(LogicalPlan): def __init__( self, child: Optional["LogicalPlan"], num_rows: int, truncate: int, vertical: bool ) -> None: super().__init__(child) self.num_rows = num_rows self.truncate = truncate self.vertical = vertical def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.show_string.input.CopyFrom(self._child.plan(session)) plan.show_string.num_rows = self.num_rows plan.show_string.truncate = self.truncate plan.show_string.vertical = self.vertical return plan class HtmlString(LogicalPlan): def __init__(self, child: Optional["LogicalPlan"], num_rows: int, truncate: int) -> None: super().__init__(child) self.num_rows = num_rows self.truncate = truncate def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.html_string.input.CopyFrom(self._child.plan(session)) plan.html_string.num_rows = self.num_rows plan.html_string.truncate = self.truncate return plan class Project(LogicalPlan): """Logical plan object for a projection. All input arguments are directly serialized into the corresponding protocol buffer objects. This class only provides very limited error handling and input validation. To be compatible with PySpark, we validate that the input arguments are all expressions to be able to serialize them to the server. 
""" def __init__( self, child: Optional["LogicalPlan"], columns: List[Column], ) -> None: assert all(isinstance(c, Column) for c in columns) super().__init__(child, self._collect_references(columns)) self._columns = columns def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.project.input.CopyFrom(self._child.plan(session)) plan.project.expressions.extend([c.to_plan(session) for c in self._columns]) return self._with_relations(plan, session) class WithColumns(LogicalPlan): """Logical plan object for a withColumns operation.""" def __init__( self, child: Optional["LogicalPlan"], columnNames: Sequence[str], columns: Sequence[Column], metadata: Optional[Sequence[str]] = None, ) -> None: assert isinstance(columnNames, list) assert len(columnNames) > 0 assert all(isinstance(c, str) for c in columnNames) assert isinstance(columns, list) assert len(columns) == len(columnNames) assert all(isinstance(c, Column) for c in columns) if metadata is not None: assert isinstance(metadata, list) assert len(metadata) == len(columnNames) for m in metadata: assert isinstance(m, str) # validate json string assert m == "" or json.loads(m) is not None super().__init__(child, self._collect_references(columns)) self._columnNames = columnNames self._columns = columns self._metadata = metadata def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.with_columns.input.CopyFrom(self._child.plan(session)) for i in range(0, len(self._columnNames)): alias = proto.Expression.Alias() alias.expr.CopyFrom(self._columns[i].to_plan(session)) alias.name.append(self._columnNames[i]) if self._metadata is not None: alias.metadata = self._metadata[i] plan.with_columns.aliases.append(alias) return self._with_relations(plan, session) class WithWatermark(LogicalPlan): """Logical plan object for a WithWatermark operation.""" def __init__(self, child: Optional["LogicalPlan"], event_time: str, delay_threshold: str): super().__init__(child) self._event_time = event_time self._delay_threshold = delay_threshold def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.with_watermark.input.CopyFrom(self._child.plan(session)) plan.with_watermark.event_time = self._event_time plan.with_watermark.delay_threshold = self._delay_threshold return plan class CachedRemoteRelation(LogicalPlan): """Logical plan object for a DataFrame reference which represents a DataFrame that's been cached on the server with a given id.""" def __init__(self, relation_id: str, spark_session: "SparkSession"): super().__init__(None) self._relation_id = relation_id # Needs to hold the session to make a request itself. self._spark_session = spark_session def plan(self, session: "SparkConnectClient") -> proto.Relation: plan = self._create_proto_relation() plan.cached_remote_relation.relation_id = self._relation_id return plan def __del__(self) -> None: session = self._spark_session # If session is already closed, all cached DataFrame should be released. 
if session is not None and not session.client.is_closed and self._relation_id is not None: try: command = RemoveRemoteCachedRelation(self).command(session=session.client) req = session.client._execute_plan_request_with_metadata() if session.client._user_id: req.user_context.user_id = session.client._user_id req.plan.command.CopyFrom(command) for attempt in session.client._retrying(): with attempt: # !!HACK ALERT!! # unary_stream does not work on Python's exit for an unknown reasons # Therefore, here we open unary_unary channel instead. # See also :class:`SparkConnectServiceStub`. request_serializer = ( spark_dot_connect_dot_base__pb2.ExecutePlanRequest.SerializeToString ) response_deserializer = ( spark_dot_connect_dot_base__pb2.ExecutePlanResponse.FromString ) channel = session.client._channel.unary_unary( "/spark.connect.SparkConnectService/ExecutePlan", request_serializer=request_serializer, response_deserializer=response_deserializer, ) metadata = session.client._builder.metadata() channel(req, metadata=metadata) # type: ignore[arg-type] except Exception as e: logger.warn(f"RemoveRemoteCachedRelation failed with exception: {e}.") class Hint(LogicalPlan): """Logical plan object for a Hint operation.""" def __init__( self, child: Optional["LogicalPlan"], name: str, parameters: Sequence[Column], ) -> None: assert isinstance(name, str) assert parameters is not None and isinstance(parameters, List) for param in parameters: assert isinstance(param, Column) super().__init__(child, self._collect_references(parameters)) self._name = name self._parameters = parameters def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.hint.input.CopyFrom(self._child.plan(session)) plan.hint.name = self._name plan.hint.parameters.extend([param.to_plan(session) for param in self._parameters]) return self._with_relations(plan, session) class Filter(LogicalPlan): def __init__(self, child: Optional["LogicalPlan"], filter: Column) -> None: super().__init__(child, self._collect_references([filter])) self.filter = filter def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.filter.input.CopyFrom(self._child.plan(session)) plan.filter.condition.CopyFrom(self.filter.to_plan(session)) return self._with_relations(plan, session) class Limit(LogicalPlan): def __init__(self, child: Optional["LogicalPlan"], limit: int) -> None: super().__init__(child) self.limit = limit def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.limit.input.CopyFrom(self._child.plan(session)) plan.limit.limit = self.limit return plan class Tail(LogicalPlan): def __init__(self, child: Optional["LogicalPlan"], limit: int) -> None: super().__init__(child) self.limit = limit def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.tail.input.CopyFrom(self._child.plan(session)) plan.tail.limit = self.limit return plan class Offset(LogicalPlan): def __init__(self, child: Optional["LogicalPlan"], offset: int = 0) -> None: super().__init__(child) self.offset = offset def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.offset.input.CopyFrom(self._child.plan(session)) plan.offset.offset = self.offset return plan class Deduplicate(LogicalPlan): 
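    # Descriptive note: this node backs the drop-duplicates operations. With
    # `all_columns_as_keys=True` every column acts as a key; otherwise
    # `column_names` lists the key columns, and `within_watermark=True`
    # corresponds to the streaming dropDuplicatesWithinWatermark variant.
    # A minimal sketch, reusing the hypothetical `client` from above:
    #
    #   >>> dedup = Deduplicate(Range(0, 10, 1), column_names=["id"])
    #   >>> list(dedup.plan(client).deduplicate.column_names)
    #   ['id']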
def __init__( self, child: Optional["LogicalPlan"], all_columns_as_keys: bool = False, column_names: Optional[List[str]] = None, within_watermark: bool = False, ) -> None: super().__init__(child) self.all_columns_as_keys = all_columns_as_keys self.column_names = column_names self.within_watermark = within_watermark def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.deduplicate.input.CopyFrom(self._child.plan(session)) plan.deduplicate.all_columns_as_keys = self.all_columns_as_keys plan.deduplicate.within_watermark = self.within_watermark if self.column_names is not None: plan.deduplicate.column_names.extend(self.column_names) return plan class Sort(LogicalPlan): def __init__( self, child: Optional["LogicalPlan"], columns: List[Column], is_global: bool, ) -> None: assert all(isinstance(c, Column) for c in columns) assert isinstance(is_global, bool) super().__init__(child, self._collect_references(columns)) self.columns = columns self.is_global = is_global def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.sort.input.CopyFrom(self._child.plan(session)) plan.sort.order.extend([c.to_plan(session).sort_order for c in self.columns]) plan.sort.is_global = self.is_global return self._with_relations(plan, session) class Drop(LogicalPlan): def __init__( self, child: Optional["LogicalPlan"], columns: List[Union[Column, str]], ) -> None: if len(columns) > 0: assert all(isinstance(c, (Column, str)) for c in columns) super().__init__( child, self._collect_references([c for c in columns if isinstance(c, Column)]) ) self._columns = columns def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.drop.input.CopyFrom(self._child.plan(session)) for c in self._columns: if isinstance(c, Column): plan.drop.columns.append(c.to_plan(session)) else: plan.drop.column_names.append(c) return self._with_relations(plan, session) class Sample(LogicalPlan): def __init__( self, child: Optional["LogicalPlan"], lower_bound: float, upper_bound: float, with_replacement: bool, seed: int, deterministic_order: bool = False, ) -> None: super().__init__(child) self.lower_bound = lower_bound self.upper_bound = upper_bound self.with_replacement = with_replacement self.seed = seed self.deterministic_order = deterministic_order def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.sample.input.CopyFrom(self._child.plan(session)) plan.sample.lower_bound = self.lower_bound plan.sample.upper_bound = self.upper_bound plan.sample.with_replacement = self.with_replacement plan.sample.seed = self.seed plan.sample.deterministic_order = self.deterministic_order return plan class Aggregate(LogicalPlan): def __init__( self, child: Optional["LogicalPlan"], group_type: str, grouping_cols: Sequence[Column], aggregate_cols: Sequence[Column], pivot_col: Optional[Column], pivot_values: Optional[Sequence[Column]], grouping_sets: Optional[Sequence[Sequence[Column]]], ) -> None: assert isinstance(group_type, str) and group_type in [ "groupby", "rollup", "cube", "pivot", "grouping_sets", ] assert isinstance(grouping_cols, list) and all(isinstance(c, Column) for c in grouping_cols) assert isinstance(aggregate_cols, list) and all( isinstance(c, Column) for c in aggregate_cols ) if group_type == "pivot": assert pivot_col is 
not None and isinstance(pivot_col, Column) assert pivot_values is None or isinstance(pivot_values, list) elif group_type == "grouping_sets": assert grouping_sets is None or isinstance(grouping_sets, list) else: assert pivot_col is None assert pivot_values is None assert grouping_sets is None super().__init__( child, self._collect_references( grouping_cols + aggregate_cols + ([pivot_col] if pivot_col is not None else []) + (pivot_values if pivot_values is not None else []) + ([g for gs in grouping_sets for g in gs] if grouping_sets is not None else []) ), ) self._group_type = group_type self._grouping_cols = grouping_cols self._aggregate_cols = aggregate_cols self._pivot_col = pivot_col self._pivot_values = pivot_values self._grouping_sets = grouping_sets def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.aggregate.input.CopyFrom(self._child.plan(session)) plan.aggregate.grouping_expressions.extend( [c.to_plan(session) for c in self._grouping_cols] ) plan.aggregate.aggregate_expressions.extend( [c.to_plan(session) for c in self._aggregate_cols] ) if self._group_type == "groupby": plan.aggregate.group_type = proto.Aggregate.GroupType.GROUP_TYPE_GROUPBY elif self._group_type == "rollup": plan.aggregate.group_type = proto.Aggregate.GroupType.GROUP_TYPE_ROLLUP elif self._group_type == "cube": plan.aggregate.group_type = proto.Aggregate.GroupType.GROUP_TYPE_CUBE elif self._group_type == "pivot": plan.aggregate.group_type = proto.Aggregate.GroupType.GROUP_TYPE_PIVOT assert self._pivot_col is not None plan.aggregate.pivot.col.CopyFrom(self._pivot_col.to_plan(session)) if self._pivot_values is not None and len(self._pivot_values) > 0: plan.aggregate.pivot.values.extend( [v.to_plan(session).literal for v in self._pivot_values] ) elif self._group_type == "grouping_sets": plan.aggregate.group_type = proto.Aggregate.GroupType.GROUP_TYPE_GROUPING_SETS assert self._grouping_sets is not None for grouping_set in self._grouping_sets: plan.aggregate.grouping_sets.append( proto.Aggregate.GroupingSets( grouping_set=[c.to_plan(session) for c in grouping_set] ) ) return self._with_relations(plan, session) class Join(LogicalPlan): def __init__( self, left: Optional["LogicalPlan"], right: "LogicalPlan", on: Optional[Union[str, List[str], Column, List[Column]]], how: Optional[str], ) -> None: super().__init__( left, self._collect_references( [] if on is None or isinstance(on, str) else [on] if isinstance(on, Column) else [c for c in on if isinstance(c, Column)] ), ) self.left = cast(LogicalPlan, left) self.right = right self.on = on if how is None: join_type = proto.Join.JoinType.JOIN_TYPE_INNER elif how == "inner": join_type = proto.Join.JoinType.JOIN_TYPE_INNER elif how in ["outer", "full", "fullouter"]: join_type = proto.Join.JoinType.JOIN_TYPE_FULL_OUTER elif how in ["leftouter", "left"]: join_type = proto.Join.JoinType.JOIN_TYPE_LEFT_OUTER elif how in ["rightouter", "right"]: join_type = proto.Join.JoinType.JOIN_TYPE_RIGHT_OUTER elif how in ["leftsemi", "semi"]: join_type = proto.Join.JoinType.JOIN_TYPE_LEFT_SEMI elif how in ["leftanti", "anti"]: join_type = proto.Join.JoinType.JOIN_TYPE_LEFT_ANTI elif how == "cross": join_type = proto.Join.JoinType.JOIN_TYPE_CROSS else: raise AnalysisException( errorClass="UNSUPPORTED_JOIN_TYPE", messageParameters={ "typ": how, "supported": ( "'" + "', '".join( [ "inner", "outer", "full", "fullouter", "full_outer", "leftouter", "left", "left_outer", "rightouter", "right", "right_outer", 
"leftsemi", "left_semi", "semi", "leftanti", "left_anti", "anti", "cross", ] ) + "'" ), }, ) self.how = join_type def plan(self, session: "SparkConnectClient") -> proto.Relation: plan = self._create_proto_relation() plan.join.left.CopyFrom(self.left.plan(session)) plan.join.right.CopyFrom(self.right.plan(session)) if self.on is not None: if not isinstance(self.on, list): if isinstance(self.on, str): plan.join.using_columns.append(self.on) else: plan.join.join_condition.CopyFrom(self.on.to_plan(session)) elif len(self.on) > 0: if isinstance(self.on[0], str): plan.join.using_columns.extend(cast(str, self.on)) else: merge_column = functools.reduce(lambda c1, c2: c1 & c2, self.on) plan.join.join_condition.CopyFrom(cast(Column, merge_column).to_plan(session)) plan.join.join_type = self.how return self._with_relations(plan, session) @property def observations(self) -> Dict[str, "Observation"]: return dict(**super().observations, **self.right.observations) def print(self, indent: int = 0) -> str: i = " " * indent o = " " * (indent + LogicalPlan.INDENT) n = indent + LogicalPlan.INDENT * 2 return ( f"{i}<Join on={self.on} how={self.how}>\n{o}" f"left=\n{self.left.print(n)}\n{o}right=\n{self.right.print(n)}" ) def _repr_html_(self) -> str: return f""" <ul> <li> <b>Join</b><br /> Left: {self.left._repr_html_()} Right: {self.right._repr_html_()} </li> </uL> """ class AsOfJoin(LogicalPlan): def __init__( self, left: LogicalPlan, right: LogicalPlan, left_as_of: Column, right_as_of: Column, on: Optional[Union[str, List[str], Column, List[Column]]], how: str, tolerance: Optional[Column], allow_exact_matches: bool, direction: str, ) -> None: super().__init__( left, self._collect_references( [left_as_of, right_as_of] + ( [] if on is None or isinstance(on, str) else [on] if isinstance(on, Column) else [c for c in on if isinstance(c, Column)] ) + ([tolerance] if tolerance is not None else []) ), ) self.left = left self.right = right self.left_as_of = left_as_of self.right_as_of = right_as_of self.on = on self.how = how self.tolerance = tolerance self.allow_exact_matches = allow_exact_matches self.direction = direction def plan(self, session: "SparkConnectClient") -> proto.Relation: plan = self._create_proto_relation() plan.as_of_join.left.CopyFrom(self.left.plan(session)) plan.as_of_join.right.CopyFrom(self.right.plan(session)) plan.as_of_join.left_as_of.CopyFrom(self.left_as_of.to_plan(session)) plan.as_of_join.right_as_of.CopyFrom(self.right_as_of.to_plan(session)) if self.on is not None: if not isinstance(self.on, list): if isinstance(self.on, str): plan.as_of_join.using_columns.append(self.on) else: plan.as_of_join.join_expr.CopyFrom(self.on.to_plan(session)) elif len(self.on) > 0: if isinstance(self.on[0], str): plan.as_of_join.using_columns.extend(cast(List[str], self.on)) else: merge_column = functools.reduce(lambda c1, c2: c1 & c2, self.on) plan.as_of_join.join_expr.CopyFrom(cast(Column, merge_column).to_plan(session)) plan.as_of_join.join_type = self.how if self.tolerance is not None: plan.as_of_join.tolerance.CopyFrom(self.tolerance.to_plan(session)) plan.as_of_join.allow_exact_matches = self.allow_exact_matches plan.as_of_join.direction = self.direction return self._with_relations(plan, session) @property def observations(self) -> Dict[str, "Observation"]: return dict(**super().observations, **self.right.observations) def print(self, indent: int = 0) -> str: assert self.left is not None assert self.right is not None i = " " * indent o = " " * (indent + LogicalPlan.INDENT) n = indent + 
LogicalPlan.INDENT * 2 return ( f"{i}<AsOfJoin left_as_of={self.left_as_of}, right_as_of={self.right_as_of}, " f"on={self.on} how={self.how}>\n{o}" f"left=\n{self.left.print(n)}\n{o}right=\n{self.right.print(n)}" ) def _repr_html_(self) -> str: assert self.left is not None assert self.right is not None return f""" <ul> <li> <b>AsOfJoin</b><br /> Left: {self.left._repr_html_()} Right: {self.right._repr_html_()} </li> </uL> """ class LateralJoin(LogicalPlan): def __init__( self, left: Optional[LogicalPlan], right: LogicalPlan, on: Optional[Column], how: Optional[str], ) -> None: super().__init__(left, self._collect_references([on] if on is not None else [])) self.left = cast(LogicalPlan, left) self.right = right self.on = on if how is None: join_type = proto.Join.JoinType.JOIN_TYPE_INNER elif how == "inner": join_type = proto.Join.JoinType.JOIN_TYPE_INNER elif how in ["leftouter", "left"]: join_type = proto.Join.JoinType.JOIN_TYPE_LEFT_OUTER elif how == "cross": join_type = proto.Join.JoinType.JOIN_TYPE_CROSS else: raise AnalysisException( errorClass="UNSUPPORTED_JOIN_TYPE", messageParameters={ "typ": how, "supported": ( "'" + "', '".join(["inner", "leftouter", "left", "left_outer", "cross"]) + "'" ), }, ) self.how = join_type def plan(self, session: "SparkConnectClient") -> proto.Relation: plan = self._create_proto_relation() plan.lateral_join.left.CopyFrom(self.left.plan(session)) plan.lateral_join.right.CopyFrom(self.right.plan(session)) if self.on is not None: plan.lateral_join.join_condition.CopyFrom(self.on.to_plan(session)) plan.lateral_join.join_type = self.how return self._with_relations(plan, session) @property def observations(self) -> Dict[str, "Observation"]: return dict(**super().observations, **self.right.observations) def print(self, indent: int = 0) -> str: i = " " * indent o = " " * (indent + LogicalPlan.INDENT) n = indent + LogicalPlan.INDENT * 2 return ( f"{i}<LateralJoin on={self.on} how={self.how}>\n{o}" f"left=\n{self.left.print(n)}\n{o}right=\n{self.right.print(n)}" ) def _repr_html_(self) -> str: return f""" <ul> <li> <b>LateralJoin</b><br /> Left: {self.left._repr_html_()} Right: {self.right._repr_html_()} </li> </uL> """ class SetOperation(LogicalPlan): def __init__( self, child: Optional["LogicalPlan"], other: Optional["LogicalPlan"], set_op: str, is_all: bool = True, by_name: bool = False, allow_missing_columns: bool = False, ) -> None: super().__init__(child) self.other = other self.by_name = by_name self.is_all = is_all self.set_op = set_op self.allow_missing_columns = allow_missing_columns def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() if self._child is not None: plan.set_op.left_input.CopyFrom(self._child.plan(session)) if self.other is not None: plan.set_op.right_input.CopyFrom(self.other.plan(session)) if self.set_op == "union": plan.set_op.set_op_type = proto.SetOperation.SET_OP_TYPE_UNION elif self.set_op == "intersect": plan.set_op.set_op_type = proto.SetOperation.SET_OP_TYPE_INTERSECT elif self.set_op == "except": plan.set_op.set_op_type = proto.SetOperation.SET_OP_TYPE_EXCEPT else: raise PySparkValueError( errorClass="UNSUPPORTED_OPERATION", messageParameters={"operation": self.set_op}, ) plan.set_op.is_all = self.is_all plan.set_op.by_name = self.by_name plan.set_op.allow_missing_columns = self.allow_missing_columns return plan @property def observations(self) -> Dict[str, "Observation"]: return dict( **super().observations, **(self.other.observations if self.other 
is not None else {}), ) def print(self, indent: int = 0) -> str: assert self._child is not None assert self.other is not None i = " " * indent o = " " * (indent + LogicalPlan.INDENT) n = indent + LogicalPlan.INDENT * 2 return ( f"{i}SetOperation\n{o}child1=\n{self._child.print(n)}" f"\n{o}child2=\n{self.other.print(n)}" ) def _repr_html_(self) -> str: assert self._child is not None assert self.other is not None return f""" <ul> <li> <b>SetOperation</b><br /> Left: {self._child._repr_html_()} Right: {self.other._repr_html_()} </li> </uL> """ class Repartition(LogicalPlan): """Repartition Relation into a different number of partitions.""" def __init__(self, child: Optional["LogicalPlan"], num_partitions: int, shuffle: bool) -> None: super().__init__(child) self._num_partitions = num_partitions self._shuffle = shuffle def plan(self, session: "SparkConnectClient") -> proto.Relation: plan = self._create_proto_relation() if self._child is not None: plan.repartition.input.CopyFrom(self._child.plan(session)) plan.repartition.shuffle = self._shuffle plan.repartition.num_partitions = self._num_partitions return plan class RepartitionByExpression(LogicalPlan): """Repartition Relation into a different number of partitions using Expression""" def __init__( self, child: Optional["LogicalPlan"], num_partitions: Optional[int], columns: List[Column], ) -> None: assert all(isinstance(c, Column) for c in columns) super().__init__(child, self._collect_references(columns)) self.num_partitions = num_partitions self.columns = columns def plan(self, session: "SparkConnectClient") -> proto.Relation: plan = self._create_proto_relation() plan.repartition_by_expression.partition_exprs.extend( [c.to_plan(session) for c in self.columns] ) if self._child is not None: plan.repartition_by_expression.input.CopyFrom(self._child.plan(session)) if self.num_partitions is not None: plan.repartition_by_expression.num_partitions = self.num_partitions return self._with_relations(plan, session) class SubqueryAlias(LogicalPlan): """Alias for a relation.""" def __init__(self, child: Optional["LogicalPlan"], alias: str) -> None: super().__init__(child) self._alias = alias def plan(self, session: "SparkConnectClient") -> proto.Relation: plan = self._create_proto_relation() if self._child is not None: plan.subquery_alias.input.CopyFrom(self._child.plan(session)) plan.subquery_alias.alias = self._alias return plan class WithRelations(LogicalPlan): def __init__( self, child: Optional["LogicalPlan"], references: Sequence["LogicalPlan"], ) -> None: super().__init__(child) assert references is not None and len(references) > 0 assert all(isinstance(ref, LogicalPlan) for ref in references) self._references = references def plan(self, session: "SparkConnectClient") -> proto.Relation: plan = self._create_proto_relation() if self._child is not None: plan.with_relations.root.CopyFrom(self._child.plan(session)) for ref in self._references: plan.with_relations.references.append(ref.plan(session)) return plan class SQL(LogicalPlan): def __init__( self, query: str, args: Optional[List[Column]] = None, named_args: Optional[Dict[str, Column]] = None, views: Optional[Sequence[SubqueryAlias]] = None, ) -> None: if args is not None: assert isinstance(args, List) assert all(isinstance(arg, Column) for arg in args) if named_args is not None: assert isinstance(named_args, Dict) for k, arg in named_args.items(): assert isinstance(k, str) assert isinstance(arg, Column) if views is not None: assert isinstance(views, List) assert all(isinstance(v, SubqueryAlias) 
for v in views) super().__init__(None, views) self._query = query self._args = args self._named_args = named_args self._views = views def plan(self, session: "SparkConnectClient") -> proto.Relation: plan = self._create_proto_relation() plan.sql.query = self._query if self._args is not None and len(self._args) > 0: plan.sql.pos_arguments.extend([arg.to_plan(session) for arg in self._args]) if self._named_args is not None and len(self._named_args) > 0: for k, arg in self._named_args.items(): plan.sql.named_arguments[k].CopyFrom(arg.to_plan(session)) return self._with_relations(plan, session) def command(self, session: "SparkConnectClient") -> proto.Command: cmd = proto.Command() cmd.sql_command.input.CopyFrom(self.plan(session)) return cmd class Range(LogicalPlan): def __init__( self, start: int, end: int, step: int, num_partitions: Optional[int] = None, ) -> None: super().__init__(None) self._start = start self._end = end self._step = step self._num_partitions = num_partitions def plan(self, session: "SparkConnectClient") -> proto.Relation: plan = self._create_proto_relation() plan.range.start = self._start plan.range.end = self._end plan.range.step = self._step if self._num_partitions is not None: plan.range.num_partitions = self._num_partitions return plan class ToSchema(LogicalPlan): def __init__(self, child: Optional["LogicalPlan"], schema: DataType) -> None: super().__init__(child) self._schema = schema def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.to_schema.input.CopyFrom(self._child.plan(session)) plan.to_schema.schema.CopyFrom(pyspark_types_to_proto_types(self._schema)) return plan class WithColumnsRenamed(LogicalPlan): def __init__(self, child: Optional["LogicalPlan"], colsMap: Mapping[str, str]) -> None: super().__init__(child) self._colsMap = colsMap def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.with_columns_renamed.input.CopyFrom(self._child.plan(session)) if len(self._colsMap) > 0: for k, v in self._colsMap.items(): rename = proto.WithColumnsRenamed.Rename() rename.col_name = k rename.new_col_name = v plan.with_columns_renamed.renames.append(rename) return plan class Unpivot(LogicalPlan): """Logical plan object for a unpivot operation.""" def __init__( self, child: Optional["LogicalPlan"], ids: List[Column], values: Optional[List[Column]], variable_column_name: str, value_column_name: str, ) -> None: super().__init__(child, self._collect_references(ids + (values or []))) self.ids = ids self.values = values self.variable_column_name = variable_column_name self.value_column_name = value_column_name def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.unpivot.input.CopyFrom(self._child.plan(session)) plan.unpivot.ids.extend([id.to_plan(session) for id in self.ids]) if self.values is not None: plan.unpivot.values.values.extend([v.to_plan(session) for v in self.values]) plan.unpivot.variable_column_name = self.variable_column_name plan.unpivot.value_column_name = self.value_column_name return self._with_relations(plan, session) class Transpose(LogicalPlan): """Logical plan object for a transpose operation.""" def __init__( self, child: Optional["LogicalPlan"], index_columns: Sequence[Column], ) -> None: super().__init__(child, self._collect_references(index_columns)) self.index_columns = index_columns def plan(self, 
session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.transpose.input.CopyFrom(self._child.plan(session)) if self.index_columns is not None and len(self.index_columns) > 0: for index_column in self.index_columns: plan.transpose.index_columns.append(index_column.to_plan(session)) return self._with_relations(plan, session) class UnresolvedTableValuedFunction(LogicalPlan): def __init__(self, name: str, args: Sequence[Column]): super().__init__(None, self._collect_references(args)) self._name = name self._args = args def plan(self, session: "SparkConnectClient") -> proto.Relation: plan = self._create_proto_relation() plan.unresolved_table_valued_function.function_name = self._name for arg in self._args: plan.unresolved_table_valued_function.arguments.append(arg.to_plan(session)) return self._with_relations(plan, session) class CollectMetrics(LogicalPlan): """Logical plan object for a CollectMetrics operation.""" def __init__( self, child: Optional["LogicalPlan"], observation: Union[str, "Observation"], exprs: List[Column], ) -> None: assert all(isinstance(e, Column) for e in exprs) super().__init__(child, self._collect_references(exprs)) self._observation = observation self._exprs = exprs def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.collect_metrics.input.CopyFrom(self._child.plan(session)) plan.collect_metrics.name = ( self._observation if isinstance(self._observation, str) else str(self._observation._name) ) plan.collect_metrics.metrics.extend([e.to_plan(session) for e in self._exprs]) return self._with_relations(plan, session) @property def observations(self) -> Dict[str, "Observation"]: from pyspark.sql.connect.observation import Observation if isinstance(self._observation, Observation): observations = {str(self._observation._name): self._observation} else: observations = {} return dict(**super().observations, **observations) class NAFill(LogicalPlan): def __init__( self, child: Optional["LogicalPlan"], cols: Optional[List[str]], values: List[Any] ) -> None: super().__init__(child) assert ( isinstance(values, list) and len(values) > 0 and all(isinstance(v, (bool, int, float, str)) for v in values) ) if cols is not None and len(cols) > 0: assert isinstance(cols, list) and all(isinstance(c, str) for c in cols) if len(values) > 1: assert len(cols) == len(values) self.cols = cols self.values = values def _convert_value(self, v: Any) -> proto.Expression.Literal: value = proto.Expression.Literal() if isinstance(v, bool): value.boolean = v elif isinstance(v, int): value.long = v elif isinstance(v, float): value.double = v else: value.string = v return value def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.fill_na.input.CopyFrom(self._child.plan(session)) if self.cols is not None and len(self.cols) > 0: plan.fill_na.cols.extend(self.cols) plan.fill_na.values.extend([self._convert_value(v) for v in self.values]) return plan class NADrop(LogicalPlan): def __init__( self, child: Optional["LogicalPlan"], cols: Optional[List[str]], min_non_nulls: Optional[int], ) -> None: super().__init__(child) self.cols = cols self.min_non_nulls = min_non_nulls def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.drop_na.input.CopyFrom(self._child.plan(session)) if self.cols is not None 
and len(self.cols) > 0: plan.drop_na.cols.extend(self.cols) if self.min_non_nulls is not None: plan.drop_na.min_non_nulls = self.min_non_nulls return plan class NAReplace(LogicalPlan): def __init__( self, child: Optional["LogicalPlan"], cols: Optional[List[str]], replacements: Sequence[Tuple[Column, Column]], ) -> None: assert replacements is not None and isinstance(replacements, List) for k, v in replacements: assert k is not None and isinstance(k, Column) assert v is not None and isinstance(v, Column) super().__init__(child, self._collect_references([e for t in replacements for e in t])) self.cols = cols self.replacements = replacements def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.replace.input.CopyFrom(self._child.plan(session)) if self.cols is not None and len(self.cols) > 0: plan.replace.cols.extend(self.cols) if len(self.replacements) > 0: for old_value, new_value in self.replacements: replacement = proto.NAReplace.Replacement() replacement.old_value.CopyFrom(old_value.to_plan(session).literal) replacement.new_value.CopyFrom(new_value.to_plan(session).literal) plan.replace.replacements.append(replacement) return self._with_relations(plan, session) class StatSummary(LogicalPlan): def __init__(self, child: Optional["LogicalPlan"], statistics: List[str]) -> None: super().__init__(child) self.statistics = statistics def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.summary.input.CopyFrom(self._child.plan(session)) plan.summary.statistics.extend(self.statistics) return plan class StatDescribe(LogicalPlan): def __init__(self, child: Optional["LogicalPlan"], cols: List[str]) -> None: super().__init__(child) self.cols = cols def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.describe.input.CopyFrom(self._child.plan(session)) plan.describe.cols.extend(self.cols) return plan class StatCov(LogicalPlan): def __init__(self, child: Optional["LogicalPlan"], col1: str, col2: str) -> None: super().__init__(child) self._col1 = col1 self._col2 = col2 def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.cov.input.CopyFrom(self._child.plan(session)) plan.cov.col1 = self._col1 plan.cov.col2 = self._col2 return plan class StatApproxQuantile(LogicalPlan): def __init__( self, child: Optional["LogicalPlan"], cols: List[str], probabilities: List[float], relativeError: float, ) -> None: super().__init__(child) self._cols = cols self._probabilities = probabilities self._relativeError = relativeError def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.approx_quantile.input.CopyFrom(self._child.plan(session)) plan.approx_quantile.cols.extend(self._cols) plan.approx_quantile.probabilities.extend(self._probabilities) plan.approx_quantile.relative_error = self._relativeError return plan class StatCrosstab(LogicalPlan): def __init__(self, child: Optional["LogicalPlan"], col1: str, col2: str) -> None: super().__init__(child) self.col1 = col1 self.col2 = col2 def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.crosstab.input.CopyFrom(self._child.plan(session)) plan.crosstab.col1 = self.col1 
plan.crosstab.col2 = self.col2 return plan class StatFreqItems(LogicalPlan): def __init__( self, child: Optional["LogicalPlan"], cols: List[str], support: float, ) -> None: super().__init__(child) self._cols = cols self._support = support def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.freq_items.input.CopyFrom(self._child.plan(session)) plan.freq_items.cols.extend(self._cols) plan.freq_items.support = self._support return plan class StatSampleBy(LogicalPlan): def __init__( self, child: Optional["LogicalPlan"], col: Column, fractions: Sequence[Tuple[Column, float]], seed: int, ) -> None: assert col is not None and isinstance(col, (Column, str)) assert fractions is not None and isinstance(fractions, List) for k, v in fractions: assert k is not None and isinstance(k, Column) assert v is not None and isinstance(v, float) assert seed is None or isinstance(seed, int) super().__init__( child, self._collect_references( [col] if isinstance(col, Column) else [] + [c for c, _ in fractions] ), ) self._col = col self._fractions = fractions self._seed = seed def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.sample_by.input.CopyFrom(self._child.plan(session)) plan.sample_by.col.CopyFrom(self._col._expr.to_plan(session)) if len(self._fractions) > 0: for k, v in self._fractions: fraction = proto.StatSampleBy.Fraction() fraction.stratum.CopyFrom(k.to_plan(session).literal) fraction.fraction = float(v) plan.sample_by.fractions.append(fraction) plan.sample_by.seed = self._seed return self._with_relations(plan, session) class StatCorr(LogicalPlan): def __init__(self, child: Optional["LogicalPlan"], col1: str, col2: str, method: str) -> None: super().__init__(child) self._col1 = col1 self._col2 = col2 self._method = method def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.corr.input.CopyFrom(self._child.plan(session)) plan.corr.col1 = self._col1 plan.corr.col2 = self._col2 plan.corr.method = self._method return plan class ToDF(LogicalPlan): def __init__(self, child: Optional["LogicalPlan"], cols: Sequence[str]) -> None: super().__init__(child) self._cols = cols def plan(self, session: "SparkConnectClient") -> proto.Relation: assert self._child is not None plan = self._create_proto_relation() plan.to_df.input.CopyFrom(self._child.plan(session)) plan.to_df.column_names.extend(self._cols) return plan class CreateView(LogicalPlan): def __init__( self, child: Optional["LogicalPlan"], name: str, is_global: bool, replace: bool ) -> None: super().__init__(child) self._name = name self._is_global = is_global self._replace = replace def command(self, session: "SparkConnectClient") -> proto.Command: assert self._child is not None plan = proto.Command() plan.create_dataframe_view.replace = self._replace plan.create_dataframe_view.is_global = self._is_global plan.create_dataframe_view.name = self._name plan.create_dataframe_view.input.CopyFrom(self._child.plan(session)) return plan class WriteOperation(LogicalPlan): def __init__(self, child: "LogicalPlan") -> None: super(WriteOperation, self).__init__(child) self.source: Optional[str] = None self.path: Optional[str] = None self.table_name: Optional[str] = None self.table_save_method: Optional[str] = None self.mode: Optional[str] = None self.sort_cols: List[str] = [] self.partitioning_cols: List[str] = [] 
self.clustering_cols: List[str] = [] self.options: Dict[str, Optional[str]] = {} self.num_buckets: int = -1 self.bucket_cols: List[str] = [] def command(self, session: "SparkConnectClient") -> proto.Command: assert self._child is not None plan = proto.Command() plan.write_operation.input.CopyFrom(self._child.plan(session)) if self.source is not None: plan.write_operation.source = self.source plan.write_operation.sort_column_names.extend(self.sort_cols) plan.write_operation.partitioning_columns.extend(self.partitioning_cols) plan.write_operation.clustering_columns.extend(self.clustering_cols) if self.num_buckets > 0: plan.write_operation.bucket_by.bucket_column_names.extend(self.bucket_cols) plan.write_operation.bucket_by.num_buckets = self.num_buckets for k in self.options: if self.options[k] is None: plan.write_operation.options.pop(k, None) else: plan.write_operation.options[k] = cast(str, self.options[k]) if self.table_name is not None: plan.write_operation.table.table_name = self.table_name if self.table_save_method is not None: tsm = self.table_save_method.lower() if tsm == "save_as_table": plan.write_operation.table.save_method = ( proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_SAVE_AS_TABLE # noqa: E501 ) elif tsm == "insert_into": plan.write_operation.table.save_method = ( proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_INSERT_INTO ) else: raise PySparkValueError( errorClass="UNSUPPORTED_OPERATION", messageParameters={"operation": tsm}, ) elif self.path is not None: plan.write_operation.path = self.path if self.mode is not None: wm = self.mode.lower() if wm == "append": plan.write_operation.mode = proto.WriteOperation.SaveMode.SAVE_MODE_APPEND elif wm == "overwrite": plan.write_operation.mode = proto.WriteOperation.SaveMode.SAVE_MODE_OVERWRITE elif wm == "error": plan.write_operation.mode = proto.WriteOperation.SaveMode.SAVE_MODE_ERROR_IF_EXISTS elif wm == "ignore": plan.write_operation.mode = proto.WriteOperation.SaveMode.SAVE_MODE_IGNORE else: raise PySparkValueError( errorClass="UNSUPPORTED_OPERATION", messageParameters={"operation": self.mode}, ) return plan def print(self, indent: int = 0) -> str: i = " " * indent return ( f"{i}" f"<WriteOperation source='{self.source}' " f"path='{self.path} " f"table_name='{self.table_name}' " f"table_save_method='{self.table_save_method}' " f"mode='{self.mode}' " f"sort_cols='{self.sort_cols}' " f"partitioning_cols='{self.partitioning_cols}' " f"clustering_cols='{self.clustering_cols}' " f"num_buckets='{self.num_buckets}' " f"bucket_cols='{self.bucket_cols}' " f"options='{self.options}'>" ) def _repr_html_(self) -> str: return ( f"<uL><li>WriteOperation <br />source='{self.source}'<br />" f"path: '{self.path}<br />" f"table_name: '{self.table_name}' <br />" f"table_save_method: '{self.table_save_method}' <br />" f"mode: '{self.mode}' <br />" f"sort_cols: '{self.sort_cols}' <br />" f"partitioning_cols: '{self.partitioning_cols}' <br />" f"clustering_cols: '{self.clustering_cols}' <br />" f"num_buckets: '{self.num_buckets}' <br />" f"bucket_cols: '{self.bucket_cols}' <br />" f"options: '{self.options}'<br />" f"</li></ul>" ) class WriteOperationV2(LogicalPlan): def __init__(self, child: "LogicalPlan", table_name: str) -> None: super(WriteOperationV2, self).__init__(child) self.table_name: Optional[str] = table_name self.provider: Optional[str] = None self.partitioning_columns: List[Column] = [] self.clustering_columns: List[str] = [] self.options: dict[str, Optional[str]] = {} self.table_properties: dict[str, 
class WriteOperationV2(LogicalPlan):
    def __init__(self, child: "LogicalPlan", table_name: str) -> None:
        super(WriteOperationV2, self).__init__(child)
        self.table_name: Optional[str] = table_name
        self.provider: Optional[str] = None
        self.partitioning_columns: List[Column] = []
        self.clustering_columns: List[str] = []
        self.options: Dict[str, Optional[str]] = {}
        self.table_properties: Dict[str, Optional[str]] = {}
        self.mode: Optional[str] = None
        self.overwrite_condition: Optional[Column] = None

    def command(self, session: "SparkConnectClient") -> proto.Command:
        assert self._child is not None
        plan = proto.Command()
        plan.write_operation_v2.input.CopyFrom(self._child.plan(session))
        if self.table_name is not None:
            plan.write_operation_v2.table_name = self.table_name
        if self.provider is not None:
            plan.write_operation_v2.provider = self.provider

        plan.write_operation_v2.partitioning_columns.extend(
            [c.to_plan(session) for c in self.partitioning_columns]
        )
        plan.write_operation_v2.clustering_columns.extend(self.clustering_columns)

        for k in self.options:
            if self.options[k] is None:
                plan.write_operation_v2.options.pop(k, None)
            else:
                plan.write_operation_v2.options[k] = cast(str, self.options[k])

        for k in self.table_properties:
            if self.table_properties[k] is None:
                plan.write_operation_v2.table_properties.pop(k, None)
            else:
                plan.write_operation_v2.table_properties[k] = cast(str, self.table_properties[k])

        if self.mode is not None:
            wm = self.mode.lower()
            if wm == "create":
                plan.write_operation_v2.mode = proto.WriteOperationV2.Mode.MODE_CREATE
            elif wm == "overwrite":
                plan.write_operation_v2.mode = proto.WriteOperationV2.Mode.MODE_OVERWRITE
                if self.overwrite_condition is not None:
                    plan.write_operation_v2.overwrite_condition.CopyFrom(
                        self.overwrite_condition.to_plan(session)
                    )
            elif wm == "overwrite_partitions":
                plan.write_operation_v2.mode = proto.WriteOperationV2.Mode.MODE_OVERWRITE_PARTITIONS
            elif wm == "append":
                plan.write_operation_v2.mode = proto.WriteOperationV2.Mode.MODE_APPEND
            elif wm == "replace":
                plan.write_operation_v2.mode = proto.WriteOperationV2.Mode.MODE_REPLACE
            elif wm == "create_or_replace":
                plan.write_operation_v2.mode = proto.WriteOperationV2.Mode.MODE_CREATE_OR_REPLACE
            else:
                raise PySparkValueError(
                    errorClass="UNSUPPORTED_OPERATION",
                    messageParameters={"operation": self.mode},
                )
        return plan


class WriteStreamOperation(LogicalPlan):
    def __init__(self, child: "LogicalPlan") -> None:
        super(WriteStreamOperation, self).__init__(child)
        self.write_op = proto.WriteStreamOperationStart()

    def command(self, session: "SparkConnectClient") -> proto.Command:
        assert self._child is not None
        self.write_op.input.CopyFrom(self._child.plan(session))
        cmd = proto.Command()
        cmd.write_stream_operation_start.CopyFrom(self.write_op)
        return cmd


class RemoveRemoteCachedRelation(LogicalPlan):
    def __init__(self, relation: CachedRemoteRelation) -> None:
        super().__init__(None)
        self._relation = relation

    def command(self, session: "SparkConnectClient") -> proto.Command:
        plan = self._create_proto_relation()
        plan.cached_remote_relation.relation_id = self._relation._relation_id
        cmd = proto.Command()
        cmd.remove_cached_remote_relation_command.relation.CopyFrom(plan.cached_remote_relation)
        return cmd


class Checkpoint(LogicalPlan):
    def __init__(
        self,
        child: Optional["LogicalPlan"],
        local: bool,
        eager: bool,
        storage_level: Optional[StorageLevel] = None,
    ) -> None:
        super().__init__(child)
        self._local = local
        self._eager = eager
        self._storage_level = storage_level

    def command(self, session: "SparkConnectClient") -> proto.Command:
        cmd = proto.Command()
        assert self._child is not None
        checkpoint_command = proto.CheckpointCommand(
            relation=self._child.plan(session),
            local=self._local,
            eager=self._eager,
        )
        if self._storage_level is not None:
            checkpoint_command.storage_level.CopyFrom(storage_level_to_proto(self._storage_level))
        cmd.checkpoint_command.CopyFrom(checkpoint_command)
        return cmd
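# Illustration (not part of the upstream file): DataFrame.checkpoint() and
# DataFrame.localCheckpoint() are expected to wrap the current plan in a
# Checkpoint node (local=False and local=True respectively, eager=True by
# default); command() then emits a proto.CheckpointCommand, optionally carrying
# a StorageLevel.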
# Catalog API (internal-only)


class CurrentDatabase(LogicalPlan):
    def __init__(self) -> None:
        super().__init__(None)

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        plan.catalog.current_database.SetInParent()
        return plan


class SetCurrentDatabase(LogicalPlan):
    def __init__(self, db_name: str) -> None:
        super().__init__(None)
        self._db_name = db_name

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        plan.catalog.set_current_database.db_name = self._db_name
        return plan


class ListDatabases(LogicalPlan):
    def __init__(self, pattern: Optional[str] = None) -> None:
        super().__init__(None)
        self._pattern = pattern

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        plan.catalog.list_databases.SetInParent()
        if self._pattern is not None:
            plan.catalog.list_databases.pattern = self._pattern
        return plan


class ListTables(LogicalPlan):
    def __init__(self, db_name: Optional[str] = None, pattern: Optional[str] = None) -> None:
        super().__init__(None)
        self._db_name = db_name
        self._pattern = pattern

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        plan.catalog.list_tables.SetInParent()
        if self._db_name is not None:
            plan.catalog.list_tables.db_name = self._db_name
        if self._pattern is not None:
            plan.catalog.list_tables.pattern = self._pattern
        return plan


class ListFunctions(LogicalPlan):
    def __init__(self, db_name: Optional[str] = None, pattern: Optional[str] = None) -> None:
        super().__init__(None)
        self._db_name = db_name
        self._pattern = pattern

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        plan.catalog.list_functions.SetInParent()
        if self._db_name is not None:
            plan.catalog.list_functions.db_name = self._db_name
        if self._pattern is not None:
            plan.catalog.list_functions.pattern = self._pattern
        return plan


class ListColumns(LogicalPlan):
    def __init__(self, table_name: str, db_name: Optional[str] = None) -> None:
        super().__init__(None)
        self._table_name = table_name
        self._db_name = db_name

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        plan.catalog.list_columns.table_name = self._table_name
        if self._db_name is not None:
            plan.catalog.list_columns.db_name = self._db_name
        return plan


class GetDatabase(LogicalPlan):
    def __init__(self, db_name: str) -> None:
        super().__init__(None)
        self._db_name = db_name

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        plan.catalog.get_database.db_name = self._db_name
        return plan


class GetTable(LogicalPlan):
    def __init__(self, table_name: str, db_name: Optional[str] = None) -> None:
        super().__init__(None)
        self._table_name = table_name
        self._db_name = db_name

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        plan.catalog.get_table.table_name = self._table_name
        if self._db_name is not None:
            plan.catalog.get_table.db_name = self._db_name
        return plan


class GetFunction(LogicalPlan):
    def __init__(self, function_name: str, db_name: Optional[str] = None) -> None:
        super().__init__(None)
        self._function_name = function_name
        self._db_name = db_name

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        plan.catalog.get_function.function_name = self._function_name
        if self._db_name is not None:
            plan.catalog.get_function.db_name = self._db_name
        return plan
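# Illustration (not part of the upstream file): the Connect Catalog methods are
# expected to map onto these leaf plans roughly one-to-one, e.g.
# spark.catalog.listTables("default", "tbl*") corresponds to
# ListTables(db_name="default", pattern="tbl*"), whose plan() fills in the
# catalog.list_tables fields on the proto.Relation.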
class DatabaseExists(LogicalPlan):
    def __init__(self, db_name: str) -> None:
        super().__init__(None)
        self._db_name = db_name

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        plan.catalog.database_exists.db_name = self._db_name
        return plan


class TableExists(LogicalPlan):
    def __init__(self, table_name: str, db_name: Optional[str] = None) -> None:
        super().__init__(None)
        self._table_name = table_name
        self._db_name = db_name

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        plan.catalog.table_exists.table_name = self._table_name
        if self._db_name is not None:
            plan.catalog.table_exists.db_name = self._db_name
        return plan


class FunctionExists(LogicalPlan):
    def __init__(self, function_name: str, db_name: Optional[str] = None) -> None:
        super().__init__(None)
        self._function_name = function_name
        self._db_name = db_name

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        plan.catalog.function_exists.function_name = self._function_name
        if self._db_name is not None:
            plan.catalog.function_exists.db_name = self._db_name
        return plan


class CreateTable(LogicalPlan):
    def __init__(
        self,
        table_name: str,
        path: str,
        source: Optional[str] = None,
        description: Optional[str] = None,
        schema: Optional[DataType] = None,
        options: Mapping[str, str] = {},
    ) -> None:
        super().__init__(None)
        self._table_name = table_name
        self._path = path
        self._source = source
        self._description = description
        self._schema = schema
        self._options = options

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        plan.catalog.create_table.table_name = self._table_name
        if self._path is not None:
            plan.catalog.create_table.path = self._path
        if self._source is not None:
            plan.catalog.create_table.source = self._source
        if self._description is not None:
            plan.catalog.create_table.description = self._description
        if self._schema is not None:
            plan.catalog.create_table.schema.CopyFrom(pyspark_types_to_proto_types(self._schema))
        for k in self._options.keys():
            v = self._options.get(k)
            if v is not None:
                plan.catalog.create_table.options[k] = v
        return plan


class DropTempView(LogicalPlan):
    def __init__(self, view_name: str) -> None:
        super().__init__(None)
        self._view_name = view_name

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        plan.catalog.drop_temp_view.view_name = self._view_name
        return plan


class DropGlobalTempView(LogicalPlan):
    def __init__(self, view_name: str) -> None:
        super().__init__(None)
        self._view_name = view_name

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        plan.catalog.drop_global_temp_view.view_name = self._view_name
        return plan


class RecoverPartitions(LogicalPlan):
    def __init__(self, table_name: str) -> None:
        super().__init__(None)
        self._table_name = table_name

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        plan.catalog.recover_partitions.table_name = self._table_name
        return plan


class IsCached(LogicalPlan):
    def __init__(self, table_name: str) -> None:
        super().__init__(None)
        self._table_name = table_name

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        plan.catalog.is_cached.table_name = self._table_name
        return plan
class CacheTable(LogicalPlan):
    def __init__(self, table_name: str, storage_level: Optional[StorageLevel] = None) -> None:
        super().__init__(None)
        self._table_name = table_name
        self._storage_level = storage_level

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        _cache_table = proto.CacheTable(table_name=self._table_name)
        if self._storage_level:
            _cache_table.storage_level.CopyFrom(storage_level_to_proto(self._storage_level))
        plan.catalog.cache_table.CopyFrom(_cache_table)
        return plan


class UncacheTable(LogicalPlan):
    def __init__(self, table_name: str) -> None:
        super().__init__(None)
        self._table_name = table_name

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        plan.catalog.uncache_table.table_name = self._table_name
        return plan


class ClearCache(LogicalPlan):
    def __init__(self) -> None:
        super().__init__(None)

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        plan.catalog.clear_cache.SetInParent()
        return plan


class RefreshTable(LogicalPlan):
    def __init__(self, table_name: str) -> None:
        super().__init__(None)
        self._table_name = table_name

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        plan.catalog.refresh_table.table_name = self._table_name
        return plan


class RefreshByPath(LogicalPlan):
    def __init__(self, path: str) -> None:
        super().__init__(None)
        self._path = path

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        plan.catalog.refresh_by_path.path = self._path
        return plan


class CurrentCatalog(LogicalPlan):
    def __init__(self) -> None:
        super().__init__(None)

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        plan.catalog.current_catalog.SetInParent()
        return plan


class SetCurrentCatalog(LogicalPlan):
    def __init__(self, catalog_name: str) -> None:
        super().__init__(None)
        self._catalog_name = catalog_name

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        plan.catalog.set_current_catalog.catalog_name = self._catalog_name
        return plan


class ListCatalogs(LogicalPlan):
    def __init__(self, pattern: Optional[str] = None) -> None:
        super().__init__(None)
        self._pattern = pattern

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        plan.catalog.list_catalogs.SetInParent()
        if self._pattern is not None:
            plan.catalog.list_catalogs.pattern = self._pattern
        return plan


class MapPartitions(LogicalPlan):
    """Logical plan object for a mapPartitions-equivalent API: mapInPandas, mapInArrow."""

    def __init__(
        self,
        child: Optional["LogicalPlan"],
        function: "UserDefinedFunction",
        cols: List[str],
        is_barrier: bool,
        profile: Optional[ResourceProfile],
    ) -> None:
        super().__init__(child)
        self._function = function._build_common_inline_user_defined_function(*cols)
        self._is_barrier = is_barrier
        self._profile = profile

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        assert self._child is not None
        plan = self._create_proto_relation()
        plan.map_partitions.input.CopyFrom(self._child.plan(session))
        plan.map_partitions.func.CopyFrom(self._function.to_plan_udf(session))
        plan.map_partitions.is_barrier = self._is_barrier
        if self._profile is not None:
            plan.map_partitions.profile_id = self._profile.id
        return plan
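# Illustration (not part of the upstream file): a call like
#   df.mapInPandas(func, schema="id long", barrier=False)
# is expected to produce a MapPartitions plan over the child relation, with the
# iterator UDF inlined through to_plan_udf() and is_barrier=False; mapInArrow
# follows the same shape with a different eval type.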
class GroupMap(LogicalPlan):
    """Logical plan object for a Group Map API: apply, applyInPandas."""

    def __init__(
        self,
        child: Optional["LogicalPlan"],
        grouping_cols: Sequence[Column],
        function: "UserDefinedFunction",
        cols: List[str],
    ):
        assert isinstance(grouping_cols, list) and all(
            isinstance(c, Column) for c in grouping_cols
        )

        super().__init__(child, self._collect_references(grouping_cols))
        self._grouping_cols = grouping_cols
        self._function = function._build_common_inline_user_defined_function(*cols)

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        assert self._child is not None
        plan = self._create_proto_relation()
        plan.group_map.input.CopyFrom(self._child.plan(session))
        plan.group_map.grouping_expressions.extend(
            [c.to_plan(session) for c in self._grouping_cols]
        )
        plan.group_map.func.CopyFrom(self._function.to_plan_udf(session))
        return self._with_relations(plan, session)


class CoGroupMap(LogicalPlan):
    """Logical plan object for a CoGroup Map API: applyInPandas."""

    def __init__(
        self,
        input: Optional["LogicalPlan"],
        input_grouping_cols: Sequence[Column],
        other: Optional["LogicalPlan"],
        other_grouping_cols: Sequence[Column],
        function: "UserDefinedFunction",
    ):
        assert isinstance(input_grouping_cols, list) and all(
            isinstance(c, Column) for c in input_grouping_cols
        )
        assert isinstance(other_grouping_cols, list) and all(
            isinstance(c, Column) for c in other_grouping_cols
        )

        super().__init__(
            input, self._collect_references(input_grouping_cols + other_grouping_cols)
        )
        self._input_grouping_cols = input_grouping_cols
        self._other_grouping_cols = other_grouping_cols
        self._other = cast(LogicalPlan, other)
        # The function takes entire DataFrame as inputs, no need to do
        # column binding (no input columns).
        self._function = function._build_common_inline_user_defined_function()

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        assert self._child is not None
        plan = self._create_proto_relation()
        plan.co_group_map.input.CopyFrom(self._child.plan(session))
        plan.co_group_map.input_grouping_expressions.extend(
            [c.to_plan(session) for c in self._input_grouping_cols]
        )
        plan.co_group_map.other.CopyFrom(self._other.plan(session))
        plan.co_group_map.other_grouping_expressions.extend(
            [c.to_plan(session) for c in self._other_grouping_cols]
        )
        plan.co_group_map.func.CopyFrom(self._function.to_plan_udf(session))
        return self._with_relations(plan, session)


class ApplyInPandasWithState(LogicalPlan):
    """Logical plan object for a applyInPandasWithState."""

    def __init__(
        self,
        child: Optional["LogicalPlan"],
        grouping_cols: Sequence[Column],
        function: "UserDefinedFunction",
        output_schema: str,
        state_schema: str,
        output_mode: str,
        timeout_conf: str,
        cols: List[str],
    ):
        assert isinstance(grouping_cols, list) and all(
            isinstance(c, Column) for c in grouping_cols
        )

        super().__init__(child, self._collect_references(grouping_cols))
        self._grouping_cols = grouping_cols
        self._function = function._build_common_inline_user_defined_function(*cols)
        self._output_schema = output_schema
        self._state_schema = state_schema
        self._output_mode = output_mode
        self._timeout_conf = timeout_conf

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        assert self._child is not None
        plan = self._create_proto_relation()
        plan.apply_in_pandas_with_state.input.CopyFrom(self._child.plan(session))
        plan.apply_in_pandas_with_state.grouping_expressions.extend(
            [c.to_plan(session) for c in self._grouping_cols]
        )
        plan.apply_in_pandas_with_state.func.CopyFrom(self._function.to_plan_udf(session))
        plan.apply_in_pandas_with_state.output_schema = self._output_schema
        plan.apply_in_pandas_with_state.state_schema = self._state_schema
        plan.apply_in_pandas_with_state.output_mode = self._output_mode
        plan.apply_in_pandas_with_state.timeout_conf = self._timeout_conf
        return self._with_relations(plan, session)
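# Illustration (not part of the upstream file): the grouped-map APIs are
# expected to build the plans above, e.g.
#   df.groupBy("id").applyInPandas(fn, schema)                      -> GroupMap
#   df.groupBy("id").cogroup(other.groupBy("id")).applyInPandas(fn) -> CoGroupMap
# with the grouping Columns serialized as grouping expressions and the user
# function shipped as an inline UDF.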
class BaseTransformWithStateInPySpark(LogicalPlan):
    """Base implementation of logical plan object for a TransformWithStateIn(PySpark/Pandas)."""

    def __init__(
        self,
        child: Optional["LogicalPlan"],
        grouping_cols: Sequence[Column],
        function: "UserDefinedFunction",
        output_schema: Union[DataType, str],
        output_mode: str,
        time_mode: str,
        event_time_col_name: str,
        cols: List[str],
        initial_state_plan: Optional["LogicalPlan"],
        initial_state_grouping_cols: Optional[Sequence[Column]],
    ):
        assert isinstance(grouping_cols, list) and all(
            isinstance(c, Column) for c in grouping_cols
        )
        if initial_state_plan is not None:
            assert isinstance(initial_state_grouping_cols, list) and all(
                isinstance(c, Column) for c in initial_state_grouping_cols
            )
            super().__init__(
                child, self._collect_references(grouping_cols + initial_state_grouping_cols)
            )
        else:
            super().__init__(child, self._collect_references(grouping_cols))
        self._grouping_cols = grouping_cols
        self._output_schema: DataType = (
            UnparsedDataType(output_schema) if isinstance(output_schema, str) else output_schema
        )
        self._output_mode = output_mode
        self._time_mode = time_mode
        self._event_time_col_name = event_time_col_name
        self._function = function._build_common_inline_user_defined_function(*cols)
        self._initial_state_plan = initial_state_plan
        self._initial_state_grouping_cols = initial_state_grouping_cols

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        assert self._child is not None
        plan = self._create_proto_relation()
        plan.group_map.input.CopyFrom(self._child.plan(session))
        plan.group_map.grouping_expressions.extend(
            [c.to_plan(session) for c in self._grouping_cols]
        )
        plan.group_map.output_mode = self._output_mode

        # fill in initial state related fields
        if self._initial_state_plan is not None:
            plan.group_map.initial_input.CopyFrom(self._initial_state_plan.plan(session))
            assert self._initial_state_grouping_cols is not None
            plan.group_map.initial_grouping_expressions.extend(
                [c.to_plan(session) for c in self._initial_state_grouping_cols]
            )

        # fill in transformWithStateInPySpark/Pandas related fields
        tws_info = proto.TransformWithStateInfo()
        tws_info.time_mode = self._time_mode
        tws_info.event_time_column_name = self._event_time_col_name
        tws_info.output_schema.CopyFrom(pyspark_types_to_proto_types(self._output_schema))
        plan.group_map.transform_with_state_info.CopyFrom(tws_info)

        # wrap transformWithStateInPySparkUdf in a function
        plan.group_map.func.CopyFrom(self._function.to_plan_udf(session))

        return self._with_relations(plan, session)


class TransformWithStateInPySpark(BaseTransformWithStateInPySpark):
    """Logical plan object for a TransformWithStateInPySpark."""

    pass


# Retaining this to avoid breaking backward compatibility.
class TransformWithStateInPandas(BaseTransformWithStateInPySpark):
    """Logical plan object for a TransformWithStateInPandas."""

    pass


class PythonUDTF:
    """Represents a Python user-defined table function."""

    def __init__(
        self,
        func: Type,
        return_type: Optional[Union[DataType, str]],
        eval_type: int,
        python_ver: str,
    ) -> None:
        self._func = func
        self._name = func.__name__
        self._return_type: Optional[DataType] = (
            None
            if return_type is None
            else UnparsedDataType(return_type)
            if isinstance(return_type, str)
            else return_type
        )
        self._eval_type = eval_type
        self._python_ver = python_ver

    def to_plan(self, session: "SparkConnectClient") -> proto.PythonUDTF:
        udtf = proto.PythonUDTF()
        if self._return_type is not None:
            udtf.return_type.CopyFrom(pyspark_types_to_proto_types(self._return_type))
        udtf.eval_type = self._eval_type
        try:
            udtf.command = CloudPickleSerializer().dumps(self._func)
        except pickle.PicklingError:
            raise PySparkPicklingError(
                errorClass="UDTF_SERIALIZATION_ERROR",
                messageParameters={
                    "name": self._name,
                    "message": "Please check the stack trace and "
                    "make sure the function is serializable.",
                },
            )
        udtf.python_ver = self._python_ver
        return udtf

    def __repr__(self) -> str:
        return (
            f"PythonUDTF({self._name}, {self._return_type}, "
            f"{self._eval_type}, {self._python_ver})"
        )


class CommonInlineUserDefinedTableFunction(LogicalPlan):
    """
    Logical plan object for a user-defined table function with an inlined function body.
    """

    def __init__(
        self,
        function_name: str,
        function: PythonUDTF,
        deterministic: bool,
        arguments: Sequence[Expression],
    ) -> None:
        super().__init__(None, self._collect_references(arguments))
        self._function_name = function_name
        self._deterministic = deterministic
        self._arguments = arguments
        self._function = function

    def plan(self, session: "SparkConnectClient") -> proto.Relation:
        plan = self._create_proto_relation()
        plan.common_inline_user_defined_table_function.function_name = self._function_name
        plan.common_inline_user_defined_table_function.deterministic = self._deterministic
        if len(self._arguments) > 0:
            plan.common_inline_user_defined_table_function.arguments.extend(
                [arg.to_plan(session) for arg in self._arguments]
            )
        plan.common_inline_user_defined_table_function.python_udtf.CopyFrom(
            self._function.to_plan(session)
        )
        return self._with_relations(plan, session)
""" plan = proto.CommonInlineUserDefinedTableFunction() plan.function_name = self._function_name plan.deterministic = self._deterministic if len(self._arguments) > 0: plan.arguments.extend([arg.to_plan(session) for arg in self._arguments]) plan.python_udtf.CopyFrom( cast(proto.PythonUDF, self._function.to_plan(session)) # type: ignore[arg-type] ) return plan def __repr__(self) -> str: return f"{self._function_name}({', '.join([str(arg) for arg in self._arguments])})" class PythonDataSource: """Represents a user-defined Python data source.""" def __init__(self, data_source: Type, python_ver: str): self._data_source = data_source self._python_ver = python_ver def to_plan(self, session: "SparkConnectClient") -> proto.PythonDataSource: ds = proto.PythonDataSource() ds.command = CloudPickleSerializer().dumps(self._data_source) ds.python_ver = self._python_ver return ds class CommonInlineUserDefinedDataSource(LogicalPlan): """Logical plan object for a user-defined data source""" def __init__(self, name: str, data_source: PythonDataSource) -> None: super().__init__(None) self._name = name self._data_source = data_source def plan(self, session: "SparkConnectClient") -> proto.Relation: plan = self._create_proto_relation() plan.common_inline_user_defined_data_source.name = self._name plan.common_inline_user_defined_data_source.python_data_source.CopyFrom( self._data_source.to_plan(session) ) return plan def to_data_source_proto( self, session: "SparkConnectClient" ) -> "proto.CommonInlineUserDefinedDataSource": plan = proto.CommonInlineUserDefinedDataSource() plan.name = self._name plan.python_data_source.CopyFrom(self._data_source.to_plan(session)) return plan class CachedRelation(LogicalPlan): def __init__(self, plan: proto.Relation) -> None: super(CachedRelation, self).__init__(None) self._plan = plan # Update the plan ID based on the incremented counter. self._plan.common.plan_id = self._plan_id def plan(self, session: "SparkConnectClient") -> proto.Relation: return self._plan