core/maxframe/learn/contrib/xgboost/dmatrix.py (115 lines of code) (raw):

# Copyright 1999-2025 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .... import opcodes
from ....core.entity.output_types import get_output_types
from ....core.operator.base import Operator
from ....core.operator.core import TileableOperatorMixin
from ....dataframe.core import DATAFRAME_TYPE
from ....serialization.serializables import Float64Field, KeyField, ListField
from ....serialization.serializables.field import AnyField, BoolField, Int64Field
from ....tensor import tensor as astensor
from ....tensor.core import TENSOR_TYPE
from ....typing_ import TileableType
from ...utils import convert_to_tensor_or_dataframe


class ToDMatrix(Operator, TileableOperatorMixin):
    """Operator that wraps data (plus optional label/weight/base_margin).

    ``data``, ``label``, ``weight`` and ``base_margin`` are key fields and are
    passed as operator inputs by ``__call__``; every other field is stored on
    the operator as a plain argument.  The argument names presumably mirror
    those of ``xgboost.DMatrix`` -- confirm against the executing backend.
    """

    _op_type_ = opcodes.TO_DMATRIX

    # Tileable-valued fields; re-bound to the operator inputs in _set_inputs.
    data = KeyField("data", default=None)
    label = KeyField("label", default=None)
    missing = Float64Field("missing", default=None)
    weight = KeyField("weight", default=None)
    base_margin = KeyField("base_margin", default=None)
    feature_names = ListField("feature_names", default=None)
    feature_types = ListField("feature_types", default=None)
    feature_weights = AnyField("feature_weights", default=None)
    nthread = Int64Field("nthread", default=None)
    group = AnyField("group", default=None)
    qid = AnyField("qid", default=None)
    label_lower_bound = AnyField("label_lower_bound", default=None)
    label_upper_bound = AnyField("label_upper_bound", default=None)
    enable_categorical = BoolField("enable_categorical", default=None)

    @property
    def output_limit(self):
        """Number of outputs of this operator; always 1."""
        return 1

    def _set_inputs(self, inputs):
        """Re-bind the key fields to the entries of ``self._inputs``.

        The positions follow the order in which ``__call__`` assembles the
        inputs: data, then label, then weight, then base_margin.
        """
        super()._set_inputs(inputs)

        if self.data is not None:
            self.data = self._inputs[0]

        label_given = self.label is not None
        if label_given:
            self.label = self._inputs[1]

        if self.weight is not None:
            # weight sits right after label when a label is present,
            # otherwise right after data.
            weight_pos = 2 if label_given else 1
            self.weight = self._inputs[weight_pos]

        if self.base_margin is not None:
            # base_margin is appended last by __call__.
            self.base_margin = self._inputs[-1]

    @staticmethod
    def _get_kw(obj):
        """Return the metadata keywords used to create an output like ``obj``.

        Tensors yield shape/dtype/order; anything else is described with
        shape/dtypes/index_value/columns_value.
        """
        if not isinstance(obj, TENSOR_TYPE):
            return {
                "shape": obj.shape,
                "dtypes": obj.dtypes,
                "index_value": obj.index_value,
                "columns_value": obj.columns_value,
            }
        return {"shape": obj.shape, "dtype": obj.dtype, "order": obj.order}

    def __call__(self):
        """Create the output tileable, carrying over the metadata of ``data``."""
        optional_inputs = (self.label, self.weight, self.base_margin)
        inputs = [self.data]
        inputs.extend(item for item in optional_inputs if item is not None)
        meta = self._get_kw(self.data)
        return self.new_tileable(inputs, **meta)


def check_data(data):
    """Convert ``data`` to a tensor or DataFrame and require it to be 2-d.

    Raises:
        ValueError: if the converted data does not have exactly 2 dimensions.
    """
    data = convert_to_tensor_or_dataframe(data)
    if data.ndim == 2:
        return data
    raise ValueError(f"Expecting 2-d data, got: {data.ndim}-d")


def check_array_like(y: TileableType, name: str) -> TileableType:
    """Convert an optional array-like argument into a tensor.

    ``None`` is passed through as ``None``.  When the converted value is a
    DataFrame only its first column is used.  ``name`` is accepted but not
    used by this function.
    """
    if y is None:
        return None

    converted = convert_to_tensor_or_dataframe(y)
    if isinstance(converted, DATAFRAME_TYPE):
        # Keep only the first column of a DataFrame.
        converted = converted.iloc[:, 0]
    return astensor(converted)


def to_dmatrix(
    data,
    label=None,
    missing=None,
    weight=None,
    base_margin=None,
    feature_names=None,
    feature_types=None,
    feature_weights=None,
    nthread=None,
    group=None,
    qid=None,
    label_lower_bound=None,
    label_upper_bound=None,
    enable_categorical=None,
):
    """Build a ``ToDMatrix`` operator and return the tileable it produces.

    ``data`` is validated as 2-d; ``label``, ``weight`` and ``base_margin``
    are converted to tensors when given.  All remaining arguments are
    forwarded to the operator unchanged.

    Raises:
        ValueError: if ``data`` is not 2-dimensional.
    """
    data = check_data(data)
    label = check_array_like(label, "label")
    weight = check_array_like(weight, "weight")
    base_margin = check_array_like(base_margin, "base_margin")

    # NOTE(review): the original comment here read "If not multiple outputs,
    # try to collect the chunks on same worker into one to feed the data into
    # XGBoost for training."  Nothing in this function does that visibly;
    # presumably it happens at execution time -- confirm or drop the note.
    op_kwargs = dict(
        data=data,
        label=label,
        missing=missing,
        weight=weight,
        base_margin=base_margin,
        feature_names=feature_names,
        feature_types=feature_types,
        feature_weights=feature_weights,
        nthread=nthread,
        group=group,
        qid=qid,
        label_lower_bound=label_lower_bound,
        label_upper_bound=label_upper_bound,
        gpu=data.op.gpu,
        _output_types=get_output_types(data),
        enable_categorical=enable_categorical,
    )
    return ToDMatrix(**op_kwargs)()


# Public alias for to_dmatrix.
DMatrix = to_dmatrix