core/maxframe/dataframe/extensions/flatjson.py (53 lines of code) (raw):
# Copyright 1999-2025 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from ... import opcodes
from ...core import OutputType
from ...serialization.serializables import ListField
from ...serialization.serializables.field_type import FieldTypes
from ..core import DataFrame
from ..operators import DataFrameOperator, DataFrameOperatorMixin
from ..utils import make_dtypes, parse_index
class SeriesFlatJSONOperator(DataFrameOperator, DataFrameOperatorMixin):
    """
    Operator registered under ``opcodes.FLATJSON``.

    Calling the operator on a series creates the output object through
    ``new_series`` or ``new_dataframe``, depending on the first entry of
    ``_output_types``.
    """

    _op_type_ = opcodes.FLATJSON

    # List of string query paths carried by the operator (serialized field).
    query_paths = ListField("query_paths", field_type=FieldTypes.string, default=None)

    def __call__(self, series, dtypes):
        """
        Build the output object for the given input series.

        Parameters
        ----------
        series
            Input series; its ``shape`` and ``index_value`` are reused for
            the output.
        dtypes
            A ``(name, dtype)`` pair when the output type is a series.
            Otherwise an object exposing ``len()`` and ``.index`` that
            describes the output columns
            (presumably a pandas Series of dtypes -- TODO confirm).

        Returns
        -------
        The object produced by ``new_series`` or ``new_dataframe``.
        """
        produce_series = self._output_types[0] == OutputType.series

        if produce_series:
            # Series output: dtypes carries the name and dtype of the result.
            result_name, result_dtype = dtypes
            result = self.new_series(
                [series],
                shape=series.shape,
                index_value=series.index_value,
                name=result_name,
                dtype=result_dtype,
            )
        else:
            # DataFrame output: same number of rows as the input, one column
            # per entry of dtypes.
            out_shape = (series.shape[0], len(dtypes))
            out_index = series.index_value
            out_columns = parse_index(dtypes.index, store_data=True)
            out_dtypes = make_dtypes(dtypes)
            result = self.new_dataframe(
                [series],
                shape=out_shape,
                index_value=out_index,
                columns_value=out_columns,
                dtypes=out_dtypes,
            )
        return result
def series_flatjson(
    series,
    query_paths: List[str],
    dtypes=None,
    dtype=None,
    name: str = None,
) -> DataFrame:
    """
    Flatten JSON objects stored in a series according to JSON query paths.

    Exactly one of ``dtypes`` and ``dtype`` must be given: ``dtypes`` yields
    a DataFrame with one column per query path, ``dtype`` yields a Series
    built from a single query path.

    Parameters
    ----------
    series : Series
        The series of json strings.
    query_paths : List[str] or str
        The JSON query paths for each generated column. A single string is
        treated as a one-element list. The path format should follow
        [RFC9535](https://datatracker.ietf.org/doc/rfc9535/).
    dtypes : Series, default None
        Specify dtypes of returned DataFrame. Its length must equal the
        number of query paths. Can't work with dtype.
    dtype : numpy.dtype, default None
        Specify dtype of returned Series. Requires exactly one query path.
        Can't work with dtypes.
    name : str, default None
        Specify name of the returned Series. Only used when ``dtype`` is
        given.

    Returns
    -------
    DataFrame or Series
        Result of DataFrame when dtypes specified, else Series.

    Raises
    ------
    ValueError
        If both or neither of ``dtypes`` and ``dtype`` are specified, if
        ``dtype`` is given with more than one query path, or if the lengths
        of ``dtypes`` and ``query_paths`` differ.

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> import pandas as pd
    >>> s = md.Series(
    ...     [
    ...         '{"age": 24, "gender": "male", "graduated": false}',
    ...         '{"age": 25, "gender": "female", "graduated": true}',
    ...     ]
    ... )
    >>> s.execute()
    0    {"age": 24, "gender": "male", "graduated": false}
    1    {"age": 25, "gender": "female", "graduated": true}
    dtype: object

    >>> df = s.mf.flatjson(
    ...     ["$.age", "$.gender", "$.graduated"],
    ...     dtypes=pd.Series(["int32", "object", "bool"], index=["age", "gender", "graduated"]),
    ... )
    >>> df.execute()
       age  gender  graduated
    0   24    male       True
    1   25  female       True

    >>> s2 = s.mf.flatjson("$.age", name="age", dtype="int32")
    >>> s2.execute()
    0    24
    1    25
    Name: age, dtype: int32
    """
    # Normalize a lone path string into a one-element list.
    paths = [query_paths] if isinstance(query_paths, str) else query_paths

    series_requested = dtype is not None
    frame_requested = dtypes is not None

    # The two type specifications are mutually exclusive, and one is required.
    if series_requested and frame_requested:
        raise ValueError("Both dtypes and dtype cannot be specified at the same time.")
    if not (series_requested or frame_requested):
        raise ValueError("dtypes or dtype should be specified")

    if series_requested:
        if len(paths) != 1:
            raise ValueError("query_paths should have only one path if dtype is set")
        output_type = OutputType.series
        # The operator expects a (name, dtype) pair for series output.
        type_spec = (name, dtype)
    else:
        if len(dtypes) != len(paths):
            raise ValueError("query_paths and dtypes should have same length")
        output_type = OutputType.dataframe
        type_spec = dtypes

    op = SeriesFlatJSONOperator(query_paths=paths, _output_types=[output_type])
    return op(series, type_spec)