core/maxframe/dataframe/extensions/flatjson.py (53 lines of code) (raw):

# Copyright 1999-2025 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Optional, Union

from ... import opcodes
from ...core import OutputType
from ...serialization.serializables import ListField
from ...serialization.serializables.field_type import FieldTypes
from ..core import DataFrame
from ..operators import DataFrameOperator, DataFrameOperatorMixin
from ..utils import make_dtypes, parse_index


class SeriesFlatJSONOperator(DataFrameOperator, DataFrameOperatorMixin):
    """
    Operator used by ``series_flatjson``.

    It stores the JSON query paths and, when called, declares the output
    object: a series when the first output type is ``OutputType.series``,
    otherwise a dataframe.
    """

    _op_type_ = opcodes.FLATJSON

    # JSON query paths handed over by ``series_flatjson`` (a list of strings).
    query_paths = ListField("query_paths", field_type=FieldTypes.string, default=None)

    def __call__(self, series, dtypes):
        """
        Declare the output of the operator for the given input series.

        Parameters
        ----------
        series : Series
            Input series. Its ``shape`` and ``index_value`` are reused for
            the output.
        dtypes : tuple or Series
            A ``(name, dtype)`` pair when the output type is a series.
            Otherwise the dtypes of the output columns; after normalization
            through ``make_dtypes`` its index provides the column labels.

        Returns
        -------
        DataFrame or Series
            The object created by ``new_series`` or ``new_dataframe``.
        """
        if self._output_types[0] == OutputType.series:
            name, dtype = dtypes
            return self.new_series(
                [series],
                shape=series.shape,
                index_value=series.index_value,
                name=name,
                dtype=dtype,
            )

        # Normalize once and derive the column count, the column labels and
        # the stored dtypes from the same object, instead of reading
        # ``.index`` from the raw argument.
        # NOTE(review): presumably ``make_dtypes`` returns a pandas Series
        # here -- confirm against its implementation.
        dtypes = make_dtypes(dtypes)
        return self.new_dataframe(
            [series],
            shape=(series.shape[0], len(dtypes)),
            index_value=series.index_value,
            columns_value=parse_index(dtypes.index, store_data=True),
            dtypes=dtypes,
        )


def series_flatjson(
    series,
    query_paths: Union[List[str], str],
    dtypes=None,
    dtype=None,
    name: Optional[str] = None,
) -> DataFrame:
    """
    Flatten JSON objects in the series into a DataFrame or a Series
    according to JSON query paths.

    Parameters
    ----------
    series : Series
        The series of json strings.

    query_paths : List[str] or str
        The JSON query paths for each generated column. The path format
        should follow [RFC9535](https://datatracker.ietf.org/doc/rfc9535/).
        A single string is treated as a list with one path.

    dtypes : Series, default None
        Specify dtypes of returned DataFrame. Can't work with dtype.

    dtype : numpy.dtype, default None
        Specify dtype of returned Series. Can't work with dtypes.

    name : str, default None
        Specify name of the returned Series. Only used when ``dtype`` is
        specified.

    Returns
    -------
    DataFrame or Series
        Result of DataFrame when dtypes specified, else Series.

    Raises
    ------
    ValueError
        If both ``dtypes`` and ``dtype`` are specified, if neither is
        specified, if ``dtype`` is specified with more or fewer than one
        query path, or if ``dtypes`` and ``query_paths`` differ in length.

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> import pandas as pd
    >>> s = md.Series(
    ...     [
    ...         '{"age": 24, "gender": "male", "graduated": false}',
    ...         '{"age": 25, "gender": "female", "graduated": true}',
    ...     ]
    ... )
    >>> s.execute()
    0    {"age": 24, "gender": "male", "graduated": false}
    1    {"age": 25, "gender": "female", "graduated": true}
    dtype: object

    >>> df = s.mf.flatjson(
    ...     ["$.age", "$.gender", "$.graduated"],
    ...     dtypes=pd.Series(["int32", "object", "bool"], index=["age", "gender", "graduated"]),
    ... )
    >>> df.execute()
       age  gender  graduated
    0   24    male      False
    1   25  female       True

    >>> s2 = s.mf.flatjson("$.age", name="age", dtype="int32")
    >>> s2.execute()
    0    24
    1    25
    Name: age, dtype: int32
    """
    if isinstance(query_paths, str):
        query_paths = [query_paths]
    else:
        # Copy into a fresh list so the operator does not alias a list owned
        # by the caller; this also accepts tuples and other iterables.
        query_paths = list(query_paths)

    if dtypes is not None and dtype is not None:
        raise ValueError("Both dtypes and dtype cannot be specified at the same time.")

    if dtype is not None:
        if len(query_paths) != 1:
            raise ValueError("query_paths should have only one path if dtype is set")
        output_type = OutputType.series
        # The operator receives the series name together with its dtype.
        output_spec = (name, dtype)
    elif dtypes is not None:
        if len(dtypes) != len(query_paths):
            raise ValueError("query_paths and dtypes should have same length")
        output_type = OutputType.dataframe
        output_spec = dtypes
    else:
        raise ValueError("dtypes or dtype should be specified")

    op = SeriesFlatJSONOperator(query_paths=query_paths, _output_types=[output_type])
    return op(series, output_spec)