core/maxframe/dataframe/misc/eval.py (318 lines of code) (raw):
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 1999-2025 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ast
import binascii
import operator
import sys
import textwrap
import tokenize
from collections import OrderedDict
from functools import reduce
from io import StringIO
import numpy as np
import pandas as pd
from ... import opcodes
from ...core import ENTITY_TYPE, OutputType, get_output_types
from ...serialization.serializables import BoolField, DictField, StringField
from ..operators import DataFrameOperator, DataFrameOperatorMixin
from ..utils import parse_index
# Identifier prefix substituted for the ``@`` marker that flags a variable
# taken from the calling environment.
LOCAL_TAG = "_local_var_"
# Identifier prefix for backtick-quoted names; the hex-encoded quoted text is
# appended to it to form a valid Python identifier.
BACKTICK_TAG = "_backtick_var_"
def _tokenize_str(reader):
    """
    Tokenize a query expression and rewrite it into plain Python tokens.

    The following rewrites are applied:

    * ``@`` is replaced with ``LOCAL_TAG``. The token type stays ``OP`` so
      that :func:`tokenize.untokenize` emits no space after it and the tag
      is glued to the following name.
    * ``&`` and ``|`` become the keywords ``and`` and ``or``.
    * a backtick-quoted span becomes one ``NAME`` token made of
      ``BACKTICK_TAG`` followed by the hex encoding of the quoted text.

    Parameters
    ----------
    reader : callable
        A ``readline``-style callable, as accepted by
        :func:`tokenize.generate_tokens`.

    Yields
    ------
    tuple
        ``(token_type, token_string)`` pairs suitable for
        :func:`tokenize.untokenize`.

    Raises
    ------
    SyntaxError
        If an opening backtick has no matching closing backtick.
    """
    token_generator = tokenize.generate_tokens(reader)

    def _iter_backtick_string(gen, line, back_start):
        # Consume tokens up to the closing backtick, then encode the raw
        # text of the source line found between the two backticks.
        for _, tokval, start, _, _ in gen:
            if tokval == "`":
                quoted = line[back_start[1] + 1 : start[1]]
                return BACKTICK_TAG + binascii.b2a_hex(quoted.encode()).decode()
        # the token stream ended without a closing backtick
        raise SyntaxError(f"backtick quote at {back_start} does not match")

    for toknum, tokval, start, _, line in token_generator:
        if tokval == "`":
            # Matched by value rather than by token type: the backtick is
            # not a Python operator, so the type reported for it by the
            # tokenizer is not guaranteed to be ``OP``.
            yield tokenize.NAME, _iter_backtick_string(token_generator, line, start)
            continue
        if toknum == tokenize.OP:
            if tokval == "@":
                tokval = LOCAL_TAG
            elif tokval == "&":
                toknum = tokenize.NAME
                tokval = "and"
            elif tokval == "|":
                toknum = tokenize.NAME
                tokval = "or"
        yield toknum, tokval
def _isin(lhs, rhs):
    """Evaluate ``lhs in rhs`` element-wise through ``isin``."""
    try:
        return lhs.isin(rhs)
    except AttributeError:
        # ``lhs`` is a plain container (e.g. ``[1, 2] in col``): keep
        # supporting the reversed form.
        return rhs.isin(lhs)


def _not_isin(lhs, rhs):
    """Evaluate ``lhs not in rhs`` element-wise through ``isin``."""
    return ~_isin(lhs, rhs)


class CollectionVisitor(ast.NodeVisitor):
    """
    AST visitor that evaluates an expression by applying Python operators
    to the objects its names resolve to.

    Names are looked up in ``resolvers`` first, then in ``env`` and finally
    in ``target``. While visiting, the instance records:

    * ``referenced_vars``: names that were read from ``env``;
    * ``assigned``: whether an assignment statement was executed;
    * ``entity_subscribe``: whether an object of ``ENTITY_TYPE`` was
      subscripted.

    Parameters
    ----------
    resolvers : iterable
        Objects supporting ``__getitem__`` that are searched first.
    target : object or None
        Object receiving item assignments; also the last lookup fallback.
    env : dict
        Mapping of variable names available to the expression.
    """

    # maps operator node types to the callables implementing them
    _op_handlers = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.FloorDiv: operator.floordiv,
        # ``ast.Mod`` is the ``%`` operator; ``ast.mod`` is the unrelated
        # base class of ``ast.Module``
        ast.Mod: operator.mod,
        ast.Pow: operator.pow,
        ast.Eq: operator.eq,
        ast.NotEq: operator.ne,
        ast.Lt: operator.lt,
        ast.LtE: operator.le,
        ast.Gt: operator.gt,
        ast.GtE: operator.ge,
        ast.In: _isin,
        ast.NotIn: _not_isin,
        ast.UAdd: operator.pos,
        ast.USub: operator.neg,
        ast.Invert: operator.invert,
        # ``and`` / ``or`` are evaluated with the bitwise operators
        ast.And: operator.and_,
        ast.Or: operator.or_,
    }

    def __init__(self, resolvers, target, env):
        self.env = env
        self.target = target
        self.resolvers = resolvers
        self.referenced_vars = set()
        self.assigned = False
        self.entity_subscribe = False

    def _preparse(self, expr):
        """Rewrite query-only syntax (``@``, backticks, ``&``, ``|``)."""
        reader = StringIO(expr).readline
        return tokenize.untokenize(list(_tokenize_str(reader)))

    def eval(self, expr, rewrite=True):
        """
        Parse and evaluate ``expr``.

        Parameters
        ----------
        expr : str
            Source of the expression.
        rewrite : bool, default True
            Whether to pre-process ``expr`` with :meth:`_preparse` first.

        Returns
        -------
        object
            Value of the last statement, ``None`` for an assignment.
        """
        if rewrite:
            expr = self._preparse(expr)
        node = ast.fix_missing_locations(ast.parse(expr))
        return self.visit(node)

    def get_named_object(self, obj_name):
        """
        Resolve ``obj_name`` against resolvers, environment and target.

        Raises
        ------
        KeyError
            If the name cannot be found anywhere.
        """
        for resolver in self.resolvers:
            try:
                return resolver[obj_name]
            except (IndexError, KeyError):
                continue
        if obj_name in self.env:
            self.referenced_vars.add(obj_name)
            return self.env[obj_name]
        # without a target there is nothing left to search; skip the lookup
        # so that the informative KeyError below is raised
        if self.target is not None:
            try:
                return self.target[obj_name]
            except KeyError:
                pass
        raise KeyError(f"name {obj_name} is not defined")

    def visit(self, node):
        """
        Dispatch ``node`` to its ``visit_<NodeType>`` method.

        Objects of ``ENTITY_TYPE`` are returned unchanged.

        Raises
        ------
        SyntaxError
            If no handler exists for the node type.
        """
        if isinstance(node, ENTITY_TYPE):
            return node
        node_name = node.__class__.__name__
        visitor = getattr(self, "visit_" + node_name, None)
        if visitor is None:
            raise SyntaxError(
                "Query string contains unsupported syntax: {}".format(node_name)
            )
        return visitor(node)

    def visit_Module(self, node):
        # several statements only make sense when they can assign to a target
        if self.target is None and len(node.body) != 1:
            raise SyntaxError("Only a single expression is allowed")
        result = None
        for expr in node.body:
            result = self.visit(expr)
        return result

    def visit_Expr(self, node):
        return self.visit(node.value)

    def visit_BinOp(self, node):
        left = self.visit(node.left)
        right = self.visit(node.right)
        return self._op_handlers[type(node.op)](left, right)

    def visit_Call(self, node):
        func = self.visit(node.func)
        args = [self.visit(n) for n in node.args]
        kwargs = OrderedDict([(kw.arg, self.visit(kw.value)) for kw in node.keywords])
        return func(*args, **kwargs)

    def visit_Compare(self, node):
        # ``a < b < c`` is evaluated as ``(a < b) and (b < c)``; every
        # operand is visited exactly once
        combine = self._op_handlers[ast.And]
        left = self.visit(node.left)
        result = None
        first = True
        for op, comp in zip(node.ops, node.comparators):
            right = self.visit(comp)
            current = self._op_handlers[type(op)](left, right)
            result = current if first else combine(result, current)
            first = False
            left = right
        return result

    def visit_BoolOp(self, node):
        # Evaluate each operand once and fold the results; intermediate
        # results are never fed back into ``visit``, so plain Python values
        # are handled as well.
        handler = self._op_handlers[type(node.op)]
        return reduce(handler, (self.visit(value) for value in node.values))

    def visit_UnaryOp(self, node):
        op = self.visit(node.operand)
        return self._op_handlers[type(node.op)](op)

    def visit_Name(self, node):
        if node.id.startswith(LOCAL_TAG):
            # strip only the leading tag; the name itself may contain it
            local_name = node.id[len(LOCAL_TAG) :]
            self.referenced_vars.add(local_name)
            return self.env[local_name]
        if node.id.startswith(BACKTICK_TAG):
            # decode the hex payload back into the quoted name
            local_name = binascii.a2b_hex(
                node.id[len(BACKTICK_TAG) :].encode()
            ).decode()
            return self.get_named_object(local_name)
        return self.get_named_object(node.id)

    def visit_NameConstant(self, node):  # pragma: no cover
        return node.value

    def visit_Num(self, node):  # pragma: no cover
        return node.n

    def visit_Str(self, node):  # pragma: no cover
        return node.s

    def visit_Constant(self, node):
        return node.value

    def visit_List(self, node):
        return [self.visit(e) for e in node.elts]

    def visit_Assign(self, node):
        """
        Store the value of the right-hand side into ``target``.

        Raises
        ------
        ValueError
            If no target is available, or a tuple is being assigned to.
        """
        if self.target is None:
            raise ValueError("Target not specified for assignment")
        if isinstance(node.targets[0], ast.Tuple):
            raise ValueError("Does not support assigning to multiple objects")
        target = node.targets[0].id
        value = self.visit(node.value)
        self.target[target] = value
        self.assigned = True

    # tuples are evaluated into lists as well
    visit_Tuple = visit_List

    def visit_Attribute(self, node):
        attr = node.attr
        value = node.value
        ctx = node.ctx
        if isinstance(ctx, ast.Load):
            resolved = self.visit(value)
            return getattr(resolved, attr)
        # ``ctx`` is an instance, the name lives on its class
        raise ValueError("Invalid Attribute context {0}".format(type(ctx).__name__))

    def visit_Subscript(self, node):
        value = self.visit(node.value)
        sub = self.visit(node.slice)
        if isinstance(value, ENTITY_TYPE):
            self.entity_subscribe = True
        return value[sub]

    def visit_Index(self, node):
        return self.visit(node.value)

    def visit_Slice(self, node):
        lower = node.lower
        if lower is not None:
            lower = self.visit(lower)
        upper = node.upper
        if upper is not None:
            upper = self.visit(upper)
        step = node.step
        if step is not None:
            step = self.visit(step)
        return slice(lower, upper, step)
class DataFrameEval(DataFrameOperator, DataFrameOperatorMixin):
    """
    Operator describing the evaluation of an expression string on a
    DataFrame, either as an ``eval`` or as a ``query``.
    """

    _op_type_ = opcodes.DATAFRAME_EVAL

    # source text of the expression
    expr = StringField("expr", default=None)
    # parser / engine names recorded for the evaluation
    parser = StringField("parser", default=None)
    engine = StringField("engine", default=None)
    # environment variables referenced by the expression
    variables = DictField("variables", default=None)
    # whether the expression assigns into the input frame itself
    self_target = BoolField("self_target", default=None)
    # True once the operator has been converted into a query
    is_query = BoolField("is_query", default=None)

    def __call__(self, df, output_type, shape, dtypes):
        """
        Create the output tileable of this operator applied on ``df``.

        Parameters
        ----------
        df
            Input frame of the operator.
        output_type : OutputType
            Kind of the result.
        shape : tuple
            Shape of the result; its first item may be NaN.
        dtypes
            Dtypes of a DataFrame result, otherwise a ``(name, dtype)``
            pair.
        """
        self._output_types = [output_type]
        params = df.params

        # a NaN row count means the index of the input cannot be reused
        if np.isnan(shape[0]):
            result_index = parse_index(pd.RangeIndex(-1))
        else:
            result_index = df.index_value

        if output_type == OutputType.dataframe:
            params.update(
                {
                    "dtypes": dtypes,
                    "shape": shape,
                    "columns_value": parse_index(dtypes.index, store_data=True),
                    "index_value": result_index,
                }
            )
        else:
            name, dtype = dtypes
            params = {
                "name": name,
                "dtype": dtype,
                "shape": shape,
                "index_value": result_index,
            }
        return self.new_tileable([df], **params)

    def convert_to_query(self, df, output_type, shape, dtypes):
        """
        Build a query flavour of this operator and apply it on ``df``.

        The operator is copied with its key reset, flagged as a query with
        ``self_target`` turned off, and then called with the arguments.
        """
        query_op = self.copy().reset_key()
        query_op.is_query = True
        query_op.self_target = False
        return query_op(df, output_type, shape, dtypes)
def maxframe_eval(
    expr,
    parser="maxframe",
    engine=None,
    local_dict=None,
    global_dict=None,
    resolvers=(),
    level=0,
    target=None,
    inplace=False,
):
    """
    Evaluate a Python expression given as a string.

    Arithmetic, comparison, ``in`` / ``not in`` and unary operators are
    supported. ``&`` and ``|`` are read as ``and`` and ``or``, which are in
    turn evaluated with the bitwise operators. Names prefixed with ``@``
    are taken from the calling environment and names wrapped in backticks
    may contain arbitrary characters.

    Parameters
    ----------
    expr : str
        The expression to evaluate. Several statements are only accepted
        when ``target`` is given.
    parser : str, default "maxframe"
        Only ``"maxframe"`` is accepted when the result is built by
        composing operators directly. Otherwise the value is stored on the
        created operator, ``"maxframe"`` being stored as ``"pandas"``.
    engine : str or None, default None
        Must be ``None`` when the result is built by composing operators
        directly. Otherwise the value is stored on the created operator.
    local_dict : dict or None, optional
        Extra local variables. Locals of the calling frame are added on top
        of them. The passed dictionary is not modified.
    global_dict : dict or None, optional
        Extra global variables. Globals of the calling frame are added on
        top of them. The passed dictionary is not modified.
    resolvers : sequence of dict-like or None, optional
        Objects implementing ``__getitem__`` which are searched first when
        looking up a name. :meth:`DataFrame.eval` and
        :meth:`DataFrame.query` pass the frame itself.
    level : int, optional
        The number of prior stack frames to traverse to find the frame
        whose variables are made available. Most users will **not** need
        to change this parameter.
    target : object, optional, default None
        The target object for assignment. It must support item assignment
        with string keys, and ``.copy()`` when ``inplace`` is False.
    inplace : bool, default False
        Whether assignments modify ``target`` itself. Otherwise they are
        applied to a copy of ``target``.

    Returns
    -------
    object
        The value of the expression, or the (copy of the) target when the
        expression ends with an assignment.

    Raises
    ------
    TypeError
        If ``expr`` is not a string.
    SyntaxError
        If the expression uses unsupported syntax, or contains several
        statements while ``target`` is None.
    ValueError
        If the expression assigns while ``target`` is None, or assigns to
        a tuple.
    KeyError
        If a name in the expression cannot be resolved.
    NotImplementedError
        If ``parser`` is not ``"maxframe"`` or ``engine`` is given while the
        result is built by composing operators directly.

    See Also
    --------
    DataFrame.query : Evaluates a boolean expression to query the columns
            of a frame.
    DataFrame.eval : Evaluate a string describing operations on
            DataFrame columns.

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> df = md.DataFrame({"animal": ["dog", "pig"], "age": [10, 20]})
    >>> df.execute()
      animal  age
    0    dog   10
    1    pig   20

    We can add a new column using ``md.eval``:

    >>> md.eval("double_age = df.age * 2", target=df).execute()
      animal  age  double_age
    0    dog   10          20
    1    pig   20          40
    """
    if not isinstance(expr, str):
        raise TypeError("expr must be a string")
    expr = textwrap.dedent(expr)
    if resolvers is None:
        resolvers = ()

    # The frame depth is relative to this very function, hence the call has
    # to stay here. It is made before the ``try`` block so that a failure
    # is not masked by the cleanup below.
    frame = sys._getframe(level + 1)
    try:
        # work on copies: dictionaries owned by the caller stay untouched
        local_dict = dict(local_dict or {})
        local_dict.update(frame.f_locals)
        global_dict = dict(global_dict or {})
        global_dict.update(frame.f_globals)
    finally:
        # do not keep the frame alive through a reference cycle
        del frame

    # locals shadow globals
    env = dict()
    env.update(global_dict)
    env.update(local_dict)

    ref_frames = set(resolvers)
    if target is not None:
        ref_frames.add(target)
    # must be computed before ``target`` is possibly replaced with a copy
    self_target = len(resolvers) > 0 and resolvers[0] is target

    if target is not None and not inplace:
        target = target.copy()

    visitor = CollectionVisitor(resolvers, target, env)
    result = visitor.eval(expr)
    # an expression ending with an assignment evaluates to the target
    result = result if result is not None else target

    has_var_frame = any(
        isinstance(env[k], ENTITY_TYPE) for k in visitor.referenced_vars
    )
    if (
        not resolvers
        or len(ref_frames) != 1
        or visitor.entity_subscribe
        or has_var_frame
    ):
        # The expression cannot be expressed as a single operator over one
        # frame (or there is no resolver to apply it on): hand back the
        # object composed while visiting.
        if parser != "maxframe":
            raise NotImplementedError(
                "Does not support parser names other than maxframe"
            )
        if engine is not None:
            raise NotImplementedError("Does not support specifying engine names")
        return result

    parser = "pandas" if parser == "maxframe" else parser
    referenced_env = {k: env[k] for k in visitor.referenced_vars}
    op = DataFrameEval(
        expr=expr,
        parser=parser,
        engine=engine,
        variables=referenced_env,
        self_target=visitor.assigned and self_target,
        is_query=False,
    )
    # the visited result is only used to infer the metadata of the output
    output_type = get_output_types(result)[0]
    dtypes = result.dtypes if result.ndim == 2 else (result.name, result.dtype)
    return op(resolvers[0], output_type, result.shape, dtypes)
def df_eval(df, expr, inplace=False, **kwargs):
    """
    Evaluate an expression string whose names refer to columns of a frame.

    The frame is used both as the name resolver and as the assignment
    target of the expression. Only whole columns can be addressed, not
    single rows or elements. The expression may call any function it can
    resolve, so passing untrusted input can make you vulnerable to code
    injection.

    Parameters
    ----------
    expr : str
        The expression string to evaluate.
    inplace : bool, default False
        If the expression contains an assignment, whether to apply it to
        the existing DataFrame. Otherwise a new DataFrame is returned.
    **kwargs
        Additional keyword arguments forwarded to :func:`eval`. ``level``
        is counted from the caller of this method.

    Returns
    -------
    object or None
        The result of the evaluation, or None when ``inplace`` is True.

    See Also
    --------
    DataFrame.query : Evaluates a boolean expression to query the columns
        of a frame.
    DataFrame.assign : Can evaluate an expression or function to create new
        values for a column.
    eval : Evaluate a Python expression as a string.

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> df = md.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
    >>> df.execute()
       A   B
    0  1  10
    1  2   8
    2  3   6
    3  4   4
    4  5   2
    >>> df.eval('A + B').execute()
    0    11
    1    10
    2     9
    3     8
    4     7
    dtype: int64

    Assignment is allowed though by default the original DataFrame is not
    modified.

    >>> df.eval('C = A + B').execute()
       A   B   C
    0  1  10  11
    1  2   8  10
    2  3   6   9
    3  4   4   8
    4  5   2   7
    >>> df.execute()
       A   B
    0  1  10
    1  2   8
    2  3   6
    3  4   4
    4  5   2

    Use ``inplace=True`` to modify the original DataFrame.

    >>> df.eval('C = A + B', inplace=True)
    >>> df.execute()
       A   B   C
    0  1  10  11
    1  2   8  10
    2  3   6   9
    3  4   4   8
    4  5   2   7

    Multiple columns can be assigned to using multi-line expressions:

    >>> df.eval('''
    ... C = A + B
    ... D = A - B
    ... ''').execute()
       A   B   C  D
    0  1  10  11 -9
    1  2   8  10 -6
    2  3   6   9 -3
    3  4   4   8  0
    4  5   2   7  3
    """
    # one more frame has to be skipped to reach the caller of this method
    depth = kwargs.pop("level", None) or 0
    evaluated = maxframe_eval(
        expr,
        resolvers=(df,),
        target=df,
        level=depth + 1,
        inplace=inplace,
        **kwargs,
    )
    if inplace:
        return None
    return evaluated
def df_query(df, expr, inplace=False, **kwargs):
    """
    Select the rows of a DataFrame for which a boolean expression holds.

    The expression is evaluated with the top-level :func:`eval` function,
    using the frame as the name resolver, and the frame is then indexed
    with the outcome.

    Parameters
    ----------
    expr : str
        The query string to evaluate.

        You can refer to variables in the environment by prefixing them
        with an '@' character like ``@a + b``.

        You can refer to column names that are not valid Python
        identifiers (names with spaces or operators, names starting with a
        digit, Python keywords) by surrounding them in backticks.

        For example, if one of your columns is called ``a a`` and you want
        to sum it with ``b``, your query should be ```a a` + b``.
    inplace : bool
        Whether the query should modify the data in place or return
        a modified copy.
    **kwargs
        Additional keyword arguments forwarded to :func:`eval`. ``level``
        is counted from the caller of this method.

    Returns
    -------
    DataFrame or None
        DataFrame resulting from the provided query expression, or None
        when ``inplace`` is True.

    See Also
    --------
    eval : Evaluate a Python expression as a string.
    DataFrame.eval : Evaluate a string describing operations on
        DataFrame columns.

    Notes
    -----
    The ``&`` and ``|`` operators are read as :keyword:`and` and
    :keyword:`or`, hence they have the precedence of the boolean keywords
    rather than the one of the bitwise operators.

    The text between a pair of backticks is taken verbatim from the query
    string. A backtick itself cannot be part of a quoted name, and an
    unmatched backtick is reported as a syntax error.

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> df = md.DataFrame({'A': range(1, 6),
    ...                    'B': range(10, 0, -2),
    ...                    'C C': range(10, 5, -1)})
    >>> df.execute()
       A   B  C C
    0  1  10   10
    1  2   8    9
    2  3   6    8
    3  4   4    7
    4  5   2    6
    >>> df.query('A > B').execute()
       A  B  C C
    4  5  2    6

    The previous expression is equivalent to

    >>> df[df.A > df.B].execute()
       A  B  C C
    4  5  2    6

    For columns with spaces in their name, you can use backtick quoting.

    >>> df.query('B == `C C`').execute()
       A   B  C C
    0  1  10   10

    The previous expression is equivalent to

    >>> df[df.B == df['C C']].execute()
       A   B  C C
    0  1  10   10
    """
    # one more frame has to be skipped to reach the caller of this method
    depth = kwargs.pop("level", None) or 0
    predicate = maxframe_eval(expr, resolvers=(df,), level=depth + 1, **kwargs)

    filtered = df[predicate]
    if isinstance(predicate.op, DataFrameEval):
        # The predicate is a single eval operator: turn it into a query
        # operator applied on the frame, reusing the metadata of the
        # filtered result.
        out_type = get_output_types(filtered)[0]
        if filtered.ndim == 2:
            dtypes = filtered.dtypes
        else:
            dtypes = (filtered.name, filtered.dtype)
        filtered = predicate.op.convert_to_query(
            df, out_type, filtered.shape, dtypes
        )

    if not inplace:
        return filtered
    df.data = filtered.data