core/maxframe/dataframe/misc/eval.py (318 lines of code) (raw):

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 1999-2025 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import ast
import binascii
import operator
import sys
import textwrap
import tokenize
from collections import OrderedDict
from functools import reduce
from io import StringIO

import numpy as np
import pandas as pd

from ... import opcodes
from ...core import ENTITY_TYPE, OutputType, get_output_types
from ...serialization.serializables import BoolField, DictField, StringField
from ..operators import DataFrameOperator, DataFrameOperatorMixin
from ..utils import parse_index

# Prefix substituted for ``@`` so that ``@name`` becomes a valid identifier.
LOCAL_TAG = "_local_var_"
# Prefix of the identifier generated for a backtick-quoted name; the quoted
# text itself follows the prefix as a hex string.
BACKTICK_TAG = "_backtick_var_"


def _tokenize_str(reader):
    """
    Rewrite the token stream of a query string into plain Python tokens.

    Yields ``(toknum, tokval)`` pairs suitable for ``tokenize.untokenize``:

    - ``@`` is replaced with ``LOCAL_TAG`` so that ``@var`` becomes the
      identifier ``LOCAL_TAG + "var"``.
    - ``&`` and ``|`` are replaced with the keywords ``and`` / ``or``.
    - a backtick-quoted span is replaced with a single NAME token made of
      ``BACKTICK_TAG`` followed by the hex encoding of the quoted text.

    Parameters
    ----------
    reader : callable
        A ``readline``-style callable, as accepted by
        ``tokenize.generate_tokens``.

    Raises
    ------
    SyntaxError
        If an opening backtick has no matching closing backtick.
    """
    token_generator = tokenize.generate_tokens(reader)

    def _iter_backtick_string(gen, line, back_start):
        # Consume tokens up to the closing backtick; the quoted text is
        # sliced out of the physical line of the opening backtick.
        for _, tokval, start, _, _ in gen:
            if tokval == "`":
                quoted = line[back_start[1] + 1 : start[1]]
                return BACKTICK_TAG + binascii.b2a_hex(quoted.encode()).decode()
        # token stream exhausted without meeting the closing backtick
        raise SyntaxError(f"backtick quote at {back_start} does not match")

    for toknum, tokval, start, _, line in token_generator:
        if toknum == tokenize.OP:
            if tokval == "@":
                tokval = LOCAL_TAG
            if tokval == "&":
                toknum = tokenize.NAME
                tokval = "and"
            elif tokval == "|":
                toknum = tokenize.NAME
                tokval = "or"
        elif tokval == "`":
            yield tokenize.NAME, _iter_backtick_string(token_generator, line, start)
            continue
        yield toknum, tokval


def _isin(lhs, rhs):
    """
    Evaluate ``lhs in rhs``.

    Prefers ``lhs.isin(rhs)`` (e.g. ``col in [1, 2]``); when ``lhs`` has no
    ``isin`` method (e.g. ``[1, 2] in col``) falls back to ``rhs.isin(lhs)``.
    """
    if hasattr(lhs, "isin"):
        return lhs.isin(rhs)
    return rhs.isin(lhs)


def _not_isin(lhs, rhs):
    """Evaluate ``lhs not in rhs`` as the inversion of :func:`_isin`."""
    return ~_isin(lhs, rhs)


class CollectionVisitor(ast.NodeVisitor):
    """
    AST visitor that evaluates an eval / query expression eagerly.

    Names are resolved against ``resolvers``, then ``env``, then ``target``.
    While visiting, the visitor records

    - ``referenced_vars``: names that were looked up in ``env``,
    - ``assigned``: whether an assignment statement was executed,
    - ``entity_subscribe``: whether an ``ENTITY_TYPE`` object was subscripted.
    """

    # Maps AST operator node types to the callables implementing them.
    _op_handlers = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.FloorDiv: operator.floordiv,
        # NOTE: must be ``ast.Mod`` (the ``%`` operator); ``ast.mod`` is the
        # base class of ``ast.Module`` and never appears as ``node.op``.
        ast.Mod: operator.mod,
        ast.Pow: operator.pow,
        ast.Eq: operator.eq,
        ast.NotEq: operator.ne,
        ast.Lt: operator.lt,
        ast.LtE: operator.le,
        ast.Gt: operator.gt,
        ast.GtE: operator.ge,
        ast.In: _isin,
        ast.NotIn: _not_isin,
        ast.UAdd: operator.pos,
        ast.USub: operator.neg,
        ast.Invert: operator.invert,
        ast.And: operator.and_,
        ast.Or: operator.or_,
    }

    def __init__(self, resolvers, target, env):
        self.env = env
        self.target = target
        self.resolvers = resolvers

        self.referenced_vars = set()
        self.assigned = False
        self.entity_subscribe = False

    def _preparse(self, expr):
        """Rewrite ``expr`` with :func:`_tokenize_str` into plain Python source."""
        reader = StringIO(expr).readline
        return tokenize.untokenize(list(_tokenize_str(reader)))

    def eval(self, expr, rewrite=True):
        """
        Parse and evaluate ``expr``.

        Parameters
        ----------
        expr : str
            Expression source.
        rewrite : bool, default True
            Whether to pre-process ``expr`` with :meth:`_preparse` first.

        Returns
        -------
        The value of the last statement, ``None`` if it was an assignment.
        """
        if rewrite:
            expr = self._preparse(expr)
        node = ast.fix_missing_locations(ast.parse(expr))
        return self.visit(node)

    def get_named_object(self, obj_name):
        """
        Look ``obj_name`` up in the resolvers, then ``env``, then ``target``.

        Raises
        ------
        KeyError
            If the name cannot be found anywhere.
        """
        for resolver in self.resolvers:
            try:
                return resolver[obj_name]
            except (IndexError, KeyError):
                continue
        if obj_name in self.env:
            self.referenced_vars.add(obj_name)
            return self.env[obj_name]
        # target is None when no assignment target was supplied (e.g. query);
        # skip it so that an unknown name raises KeyError, not TypeError
        if self.target is not None:
            try:
                return self.target[obj_name]
            except KeyError:
                pass
        raise KeyError(f"name {obj_name} is not defined")

    def visit(self, node):
        """
        Dispatch ``node`` to its ``visit_<NodeClass>`` method.

        ``ENTITY_TYPE`` objects are returned unchanged, which lets already
        evaluated operands be embedded into synthesized AST nodes.

        Raises
        ------
        SyntaxError
            If there is no visitor method for the node type.
        """
        if isinstance(node, ENTITY_TYPE):
            return node
        node_name = node.__class__.__name__
        method = "visit_" + node_name
        try:
            visitor = getattr(self, method)
        except AttributeError:
            raise SyntaxError(
                "Query string contains unsupported syntax: {}".format(node_name)
            )
        return visitor(node)

    def visit_Module(self, node):
        # without an assignment target only a single expression makes sense
        if self.target is None and len(node.body) != 1:
            raise SyntaxError("Only a single expression is allowed")
        result = None
        for expr in node.body:
            result = self.visit(expr)
        return result

    def visit_Expr(self, node):
        return self.visit(node.value)

    def visit_BinOp(self, node):
        left = self.visit(node.left)
        right = self.visit(node.right)
        return self._op_handlers[type(node.op)](left, right)

    def visit_Call(self, node):
        func = self.visit(node.func)
        args = [self.visit(n) for n in node.args]
        kwargs = OrderedDict([(kw.arg, self.visit(kw.value)) for kw in node.keywords])
        return func(*args, **kwargs)

    def visit_Compare(self, node):
        ops = node.ops
        comps = node.comparators
        if len(comps) == 1:
            # simple comparison: evaluate it like a binary operation
            binop = ast.BinOp(op=ops[0], left=node.left, right=comps[0])
            return self.visit(binop)
        # chained comparison ``a < b < c`` -> ``(a < b) and (b < c)``
        left = node.left
        values = []
        for op, comp in zip(ops, comps):
            new_node = ast.Compare(comparators=[comp], left=left, ops=[op])
            left = comp
            values.append(new_node)
        return self.visit(ast.BoolOp(op=ast.And(), values=values))

    def visit_BoolOp(self, node):
        # Evaluate every operand exactly once and fold the results with the
        # operator, so that intermediate results of any type are accepted.
        handler = self._op_handlers[type(node.op)]
        return reduce(handler, [self.visit(value) for value in node.values])

    def visit_UnaryOp(self, node):
        op = self.visit(node.operand)
        return self._op_handlers[type(node.op)](op)

    def visit_Name(self, node):
        if node.id.startswith(LOCAL_TAG):
            # ``@name``: strip the prefix only and look it up in the env
            local_name = node.id[len(LOCAL_TAG) :]
            self.referenced_vars.add(local_name)
            return self.env[local_name]
        if node.id.startswith(BACKTICK_TAG):
            # backtick-quoted name: decode the hex payload back to the text
            local_name = binascii.a2b_hex(
                node.id[len(BACKTICK_TAG) :].encode()
            ).decode()
            return self.get_named_object(local_name)
        return self.get_named_object(node.id)

    def visit_NameConstant(self, node):  # pragma: no cover
        return node.value

    def visit_Num(self, node):  # pragma: no cover
        return node.n

    def visit_Str(self, node):  # pragma: no cover
        return node.s

    def visit_Constant(self, node):
        return node.value

    def visit_List(self, node):
        return [self.visit(e) for e in node.elts]

    def visit_Assign(self, node):
        """
        Execute ``name = <expr>`` by item assignment on ``self.target``.

        Raises
        ------
        ValueError
            If no target was given or the assignment target is a tuple.
        """
        if self.target is None:
            raise ValueError("Target not specified for assignment")
        if isinstance(node.targets[0], ast.Tuple):
            raise ValueError("Does not support assigning to multiple objects")

        target = node.targets[0].id
        value = self.visit(node.value)
        self.target[target] = value
        self.assigned = True

    # tuples are evaluated to lists, same as list literals
    visit_Tuple = visit_List

    def visit_Attribute(self, node):
        attr = node.attr
        value = node.value

        ctx = node.ctx
        if isinstance(ctx, ast.Load):
            resolved = self.visit(value)
            return getattr(resolved, attr)

        # ``ctx`` is an instance, hence the class name has to be taken
        # from its type
        raise ValueError("Invalid Attribute context {0}".format(type(ctx).__name__))

    def visit_Subscript(self, node):
        value = self.visit(node.value)
        sub = self.visit(node.slice)
        if isinstance(value, ENTITY_TYPE):
            self.entity_subscribe = True
        return value[sub]

    def visit_Index(self, node):
        return self.visit(node.value)

    def visit_Slice(self, node):
        lower = node.lower
        if lower is not None:
            lower = self.visit(lower)
        upper = node.upper
        if upper is not None:
            upper = self.visit(upper)
        step = node.step
        if step is not None:
            step = self.visit(step)

        return slice(lower, upper, step)


class DataFrameEval(DataFrameOperator, DataFrameOperatorMixin):
    """Operator carrying an eval / query expression to run on one DataFrame."""

    _op_type_ = opcodes.DATAFRAME_EVAL

    # expression source text
    expr = StringField("expr", default=None)
    # parser name forwarded by ``maxframe_eval``
    parser = StringField("parser", default=None)
    # engine name forwarded by ``maxframe_eval``
    engine = StringField("engine", default=None)
    # environment variables referenced by the expression
    variables = DictField("variables", default=None)
    # True when the expression assigns and the resolver is the target itself
    self_target = BoolField("self_target", default=None)
    # True when the operator was produced by ``convert_to_query``
    is_query = BoolField("is_query", default=None)

    def __call__(self, df, output_type, shape, dtypes):
        """
        Build the output tileable of the operator.

        Parameters
        ----------
        df : input tileable
        output_type : OutputType
            Type of the result.
        shape : tuple
            Shape of the result.
        dtypes
            Dtypes of the result when it is a DataFrame, otherwise a
            ``(name, dtype)`` pair.
        """
        self._output_types = [output_type]
        params = df.params
        # when the row count is unknown (NaN) the input index is not reused
        new_index_value = (
            df.index_value if not np.isnan(shape[0]) else parse_index(pd.RangeIndex(-1))
        )
        if output_type == OutputType.dataframe:
            params.update(
                dict(
                    dtypes=dtypes,
                    shape=shape,
                    columns_value=parse_index(dtypes.index, store_data=True),
                    index_value=new_index_value,
                )
            )
        else:
            name, dtype = dtypes
            params = dict(
                name=name,
                dtype=dtype,
                shape=shape,
                index_value=new_index_value,
            )
        return self.new_tileable([df], **params)

    def convert_to_query(self, df, output_type, shape, dtypes):
        """
        Call a copy of this operator, flagged as a query, on ``df``.

        The copy has ``is_query=True`` and ``self_target=False``; remaining
        arguments are the same as for :meth:`__call__`.
        """
        new_op = self.copy().reset_key()
        new_op.is_query = True
        new_op.self_target = False
        return new_op(df, output_type, shape, dtypes)


def maxframe_eval(
    expr,
    parser="maxframe",
    engine=None,
    local_dict=None,
    global_dict=None,
    resolvers=(),
    level=0,
    target=None,
    inplace=False,
):
    """
    Evaluate a Python expression as a string using various backends.

    The following arithmetic operations are supported: ``+``, ``-``, ``*``,
    ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following
    boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not).
    Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`,
    :keyword:`or`, and :keyword:`not` with the same semantics as the
    corresponding bitwise operators.  :class:`~pandas.Series` and
    :class:`~pandas.DataFrame` objects are supported and behave as they would
    with plain ol' Python evaluation.

    Parameters
    ----------
    expr : str
        The expression to evaluate. This string cannot contain any Python
        `statements
        <https://docs.python.org/3/reference/simple_stmts.html#simple-statements>`__,
        only Python `expressions
        <https://docs.python.org/3/reference/simple_stmts.html#expression-statements>`__.
    parser : str, default 'maxframe'
        Parser name. When the evaluated result is returned directly, only
        ``'maxframe'`` is accepted. Otherwise it is passed on to the created
        operator, with ``'maxframe'`` replaced by ``'pandas'``.
    engine : str or None, default None
        Engine name. Must be None when the evaluated result is returned
        directly; otherwise it is passed on to the created operator.
    local_dict : dict or None, optional
        A dictionary of local variables, taken from locals() by default.
        The dictionary passed in is not modified.
    global_dict : dict or None, optional
        A dictionary of global variables, taken from globals() by default.
        The dictionary passed in is not modified.
    resolvers : list of dict-like or None, optional
        A list of objects implementing the ``__getitem__`` special method that
        you can use to inject an additional collection of namespaces to use for
        variable lookup. For example, this is used in the
        :meth:`~DataFrame.query` method to inject the
        ``DataFrame.index`` and ``DataFrame.columns``
        variables that refer to their respective :class:`~pandas.DataFrame`
        instance attributes.
    level : int, optional
        The number of prior stack frames to traverse and add to the current
        scope. Most users will **not** need to change this parameter.
    target : object, optional, default None
        This is the target object for assignment. It is used when there is
        variable assignment in the expression. If so, then `target` must
        support item assignment with string keys, and if a copy is being
        returned, it must also support `.copy()`.
    inplace : bool, default False
        If `target` is provided, and the expression mutates `target`, whether
        to modify `target` inplace. Otherwise, return a copy of `target` with
        the mutation.

    Returns
    -------
    ndarray, numeric scalar, DataFrame, Series

    Raises
    ------
    TypeError
        If `expr` is not a string.
    NotImplementedError
        If the result is returned directly and `parser` is not
        ``'maxframe'`` or `engine` is not None.
    ValueError
        There are many instances where such an error can be raised:

        - `target=None`, but the expression is multiline.
        - The expression is multiline, but not all them have item assignment.
          An example of such an arrangement is this:

          a = b + 1
          a + 2

          Here, there are expressions on different lines, making it multiline,
          but the last line has no variable assigned to the output of `a + 2`.
        - `inplace=True`, but the expression is missing item assignment.
        - Item assignment is provided, but the `target` does not support
          string item assignment.
        - Item assignment is provided and `inplace=False`, but the `target`
          does not support the `.copy()` method

    See Also
    --------
    DataFrame.query : Evaluates a boolean expression to query the columns
            of a frame.
    DataFrame.eval : Evaluate a string describing operations on
            DataFrame columns.

    Notes
    -----
    The ``dtype`` of any objects involved in an arithmetic ``%`` operation are
    recursively cast to ``float64``.

    See the :ref:`enhancing performance <enhancingperf.eval>` documentation for
    more details.

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> df = md.DataFrame({"animal": ["dog", "pig"], "age": [10, 20]})
    >>> df.execute()
      animal  age
    0    dog   10
    1    pig   20

    We can add a new column using ``md.eval``:

    >>> md.eval("double_age = df.age * 2", target=df).execute()
      animal  age  double_age
    0    dog   10          20
    1    pig   20          40
    """
    if not isinstance(expr, str):
        raise TypeError("expr must be a string")
    expr = textwrap.dedent(expr)

    # Acquire the frame outside ``try`` so that a failing lookup is not
    # masked by the cleanup below. Must stay in this function: the depth
    # passed to ``sys._getframe`` is relative to ``maxframe_eval`` itself.
    frame = sys._getframe(level + 1)
    try:
        # Build the environment in a fresh dict so that the caller's
        # dictionaries are left untouched. Later updates win: globals are
        # overridden by locals, and frame variables override passed-in ones.
        env = dict(global_dict or ())
        env.update(frame.f_globals)
        env.update(local_dict or ())
        env.update(frame.f_locals)
    finally:
        # drop the frame reference to avoid a reference cycle
        del frame

    ref_frames = set(resolvers) | set([target] if target is not None else [])
    self_target = len(resolvers) > 0 and resolvers[0] is target

    if target is not None and not inplace:
        target = target.copy()

    visitor = CollectionVisitor(resolvers, target, env)
    result = visitor.eval(expr)
    # assignments evaluate to None, the (possibly copied) target is the result
    result = result if result is not None else target
    has_var_frame = any(
        isinstance(env[k], ENTITY_TYPE) for k in visitor.referenced_vars
    )
    if len(ref_frames) != 1 or visitor.entity_subscribe or has_var_frame:
        # The expression is not about exactly one frame: return the object
        # built while visiting instead of wrapping it into an operator.
        if parser != "maxframe":
            raise NotImplementedError(
                "Does not support parser names other than maxframe"
            )
        if engine is not None:
            raise NotImplementedError("Does not support specifying engine names")
        return result
    else:
        parser = "pandas" if parser == "maxframe" else parser
        referenced_env = {k: env[k] for k in visitor.referenced_vars}
        op = DataFrameEval(
            expr=expr,
            parser=parser,
            engine=engine,
            variables=referenced_env,
            self_target=visitor.assigned and self_target,
            is_query=False,
        )
        # the visited result is only used to derive output metadata here
        output_type = get_output_types(result)[0]
        dtypes = result.dtypes if result.ndim == 2 else (result.name, result.dtype)
        return op(resolvers[0], output_type, result.shape, dtypes)


def df_eval(df, expr, inplace=False, **kwargs):
    """
    Evaluate a string describing operations on DataFrame columns.

    Operates on columns only, not specific rows or elements.  This allows
    `eval` to run arbitrary code, which can make you vulnerable to code
    injection if you pass user input to this function.

    Parameters
    ----------
    df : DataFrame
        The DataFrame to evaluate against; it is used both as the name
        resolver and as the assignment target.
    expr : str
        The expression string to evaluate.
    inplace : bool, default False
        If the expression contains an assignment, whether to perform the
        operation inplace and mutate the existing DataFrame. Otherwise,
        a new DataFrame is returned.
    **kwargs
        See the documentation for :func:`eval` for complete details
        on the keyword arguments accepted by
        :meth:`~pandas.DataFrame.query`.

    Returns
    -------
    ndarray, scalar, or pandas object
        The result of the evaluation. Nothing is returned when
        ``inplace=True``.

    See Also
    --------
    DataFrame.query : Evaluates a boolean expression to query the columns
        of a frame.
    DataFrame.assign : Can evaluate an expression or function to create new
        values for a column.
    eval : Evaluate a Python expression as a string using various
        backends.

    Notes
    -----
    For more details see the API documentation for :func:`~eval`.
    For detailed examples see :ref:`enhancing performance with eval
    <enhancingperf.eval>`.

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> df = md.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
    >>> df.execute()
       A   B
    0  1  10
    1  2   8
    2  3   6
    3  4   4
    4  5   2
    >>> df.eval('A + B').execute()
    0    11
    1    10
    2     9
    3     8
    4     7
    dtype: int64

    Assignment is allowed though by default the original DataFrame is not
    modified.

    >>> df.eval('C = A + B').execute()
       A   B   C
    0  1  10  11
    1  2   8  10
    2  3   6   9
    3  4   4   8
    4  5   2   7
    >>> df.execute()
       A   B
    0  1  10
    1  2   8
    2  3   6
    3  4   4
    4  5   2

    Use ``inplace=True`` to modify the original DataFrame.

    >>> df.eval('C = A + B', inplace=True)
    >>> df.execute()
       A   B   C
    0  1  10  11
    1  2   8  10
    2  3   6   9
    3  4   4   8
    4  5   2   7

    Multiple columns can be assigned to using multi-line expressions:

    >>> df.eval('''
    ...     C = A + B
    ...     D = A - B
    ... ''').execute()
       A   B   C   D
    0  1  10  11  -9
    1  2   8  10  -6
    2  3   6   9  -3
    3  4   4   8   0
    4  5   2   7   3
    """
    level = kwargs.pop("level", None) or 0
    kwargs["inplace"] = inplace
    # one more frame: variables are looked up in the caller of this function
    val = maxframe_eval(expr, resolvers=(df,), target=df, level=level + 1, **kwargs)
    if not inplace:
        return val


def df_query(df, expr, inplace=False, **kwargs):
    """
    Query the columns of a DataFrame with a boolean expression.

    Parameters
    ----------
    df : DataFrame
        The DataFrame to query.
    expr : str
        The query string to evaluate.

        You can refer to variables
        in the environment by prefixing them with an '@' character like
        ``@a + b``.

        You can refer to column names that contain spaces or operators by
        surrounding them in backticks. This way you can also escape
        names that start with a digit, or those that  are a Python keyword.
        Basically when it is not valid Python identifier. See notes down
        for more details.

        For example, if one of your columns is called ``a a`` and you want
        to sum it with ``b``, your query should be ```a a` + b``.

    inplace : bool
        Whether the query should modify the data in place or return
        a modified copy.
    **kwargs
        See the documentation for :func:`eval` for complete details
        on the keyword arguments accepted by :meth:`DataFrame.query`.

    Returns
    -------
    DataFrame
        DataFrame resulting from the provided query expression. Nothing is
        returned when ``inplace=True``.

    See Also
    --------
    eval : Evaluate a string describing operations on
        DataFrame columns.
    DataFrame.eval : Evaluate a string describing operations on
        DataFrame columns.

    Notes
    -----
    The result of the evaluation of this expression is first passed to
    :attr:`DataFrame.loc` and if that fails because of a
    multidimensional key (e.g., a DataFrame) then the result will be passed
    to :meth:`DataFrame.__getitem__`.

    This method uses the top-level :func:`eval` function to
    evaluate the passed query.

    The :meth:`~pandas.DataFrame.query` method uses a slightly
    modified Python syntax by default. For example, the ``&`` and ``|``
    (bitwise) operators have the precedence of their boolean cousins,
    :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python,
    however the semantics are different.

    You can change the semantics of the expression by passing the keyword
    argument ``parser='python'``. This enforces the same semantics as
    evaluation in Python space. Likewise, you can pass ``engine='python'``
    to evaluate an expression using Python itself as a backend. This is not
    recommended as it is inefficient compared to using ``numexpr`` as the
    engine.

    The :attr:`DataFrame.index` and
    :attr:`DataFrame.columns` attributes of the
    :class:`~pandas.DataFrame` instance are placed in the query namespace
    by default, which allows you to treat both the index and columns of the
    frame as a column in the frame.
    The identifier ``index`` is used for the frame index; you can also
    use the name of the index to identify it in a query. Please note that
    Python keywords may not be used as identifiers.

    For further details and examples see the ``query`` documentation in
    :ref:`indexing <indexing.query>`.

    *Backtick quoted variables*

    Backtick quoted variables are parsed as literal Python code and
    are converted internally to a Python valid identifier.
    This can lead to the following problems.

    During parsing a number of disallowed characters inside the backtick
    quoted string are replaced by strings that are allowed as a Python identifier.
    These characters include all operators in Python, the space character, the
    question mark, the exclamation mark, the dollar sign, and the euro sign.
    For other characters that fall outside the ASCII range (U+0001..U+007F)
    and those that are not further specified in PEP 3131,
    the query parser will raise an error.
    This excludes whitespace different than the space character,
    but also the hashtag (as it is used for comments) and the backtick
    itself (backtick can also not be escaped).

    In a special case, quotes that make a pair around a backtick can
    confuse the parser.
    For example, ```it's` > `that's``` will raise an error,
    as it forms a quoted string (``'s > `that'``) with a backtick inside.

    See also the Python documentation about lexical analysis
    (https://docs.python.org/3/reference/lexical_analysis.html)
    in combination with the source code in :mod:`pandas.core.computation.parsing`.

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> df = md.DataFrame({'A': range(1, 6),
    ...                    'B': range(10, 0, -2),
    ...                    'C C': range(10, 5, -1)})
    >>> df.execute()
       A   B  C C
    0  1  10   10
    1  2   8    9
    2  3   6    8
    3  4   4    7
    4  5   2    6
    >>> df.query('A > B').execute()
       A  B  C C
    4  5  2    6

    The previous expression is equivalent to

    >>> df[df.A > df.B].execute()
       A  B  C C
    4  5  2    6

    For columns with spaces in their name, you can use backtick quoting.

    >>> df.query('B == `C C`').execute()
       A   B  C C
    0  1  10   10

    The previous expression is equivalent to

    >>> df[df.B == df['C C']].execute()
       A   B  C C
    0  1  10   10
    """
    level = kwargs.pop("level", None) or 0
    # one more frame: variables are looked up in the caller of this function
    predicate = maxframe_eval(expr, resolvers=(df,), level=level + 1, **kwargs)
    result = df[predicate]

    if isinstance(predicate.op, DataFrameEval):
        # The predicate is a single eval operator: replace the filtering
        # with that operator flagged as a query, keeping the metadata
        # derived from ``df[predicate]``.
        output_type = get_output_types(result)[0]
        dtypes = result.dtypes if result.ndim == 2 else (result.name, result.dtype)
        result = predicate.op.convert_to_query(df, output_type, result.shape, dtypes)

    if inplace:
        df.data = result.data
    else:
        return result