in python/pyarrow/src/arrow/python/inference.cc [454:550]
// Resolve the Arrow data type implied by the per-kind counters collected so
// far and store it in *out.
//
// Fails with NotImplemented when make_unions_ is set, forwards any error
// reported by Validate(), and fails with Invalid when pyarrow scalars or
// numpy.datetime64 scalars are mixed with other kinds of Python values.
// Errors from the nested helpers (list inferrer, struct type construction,
// decimal type construction, NumPy dtype conversion) are propagated as well.
// Otherwise the first non-zero counter, in the priority order written below,
// selects the type; when no counter is set the result is null().
Status GetType(std::shared_ptr<DataType>* out) {
  // TODO(wesm): handling forming unions
  if (make_unions_) {
    return Status::NotImplemented("Creating union types not yet supported");
  }

  RETURN_NOT_OK(Validate());

  // pyarrow scalars are only accepted on their own or alongside Nones.
  const bool has_arrow_scalars = arrow_scalar_count_ > 0;
  if (has_arrow_scalars && arrow_scalar_count_ + none_count_ != total_count_) {
    return Status::Invalid(
        "pyarrow scalars cannot be mixed "
        "with other Python scalar values currently");
  }

  if (numpy_dtype_count_ > 0) {
    // Every value is either a NumPy scalar or a None/null: the unified NumPy
    // dtype decides the result directly.
    const bool only_numpy_and_none =
        numpy_dtype_count_ + none_count_ == total_count_;
    if (only_numpy_and_none) {
      return NumPyDtypeToArrow(numpy_unifier_.current_dtype()).Value(out);
    }

    // The "bad path": NumPy scalars are mixed with other kinds of scalars.
    // This can happen innocuously because numpy.nan is not a NumPy scalar
    // (it's a built-in PyFloat). The NumPy count is folded into the matching
    // built-in counter so that the priority checks below take it into
    // account.
    // TODO(ARROW-5564): Merge together type unification so this
    // hack is not necessary
    switch (numpy_unifier_.current_type_num()) {
      case NPY_BOOL:
        bool_count_ += numpy_dtype_count_;
        break;
      case NPY_INT8:
      case NPY_INT16:
      case NPY_INT32:
      case NPY_INT64:
      case NPY_UINT8:
      case NPY_UINT16:
      case NPY_UINT32:
      case NPY_UINT64:
        int_count_ += numpy_dtype_count_;
        break;
      case NPY_FLOAT32:
      case NPY_FLOAT64:
        float_count_ += numpy_dtype_count_;
        break;
      case NPY_DATETIME:
        return Status::Invalid(
            "numpy.datetime64 scalars cannot be mixed "
            "with other Python scalar values currently");
      default:
        // Any other type number leaves the counters untouched.
        break;
    }
  }

  // Nested and parametrized types are checked first.
  if (list_count_) {
    std::shared_ptr<DataType> item_type;
    RETURN_NOT_OK(list_inferrer_->GetType(&item_type));
    *out = list(item_type);
    return Status::OK();
  }
  if (struct_count_) {
    RETURN_NOT_OK(GetStructType(out));
    return Status::OK();
  }
  if (decimal_count_) {
    const auto precision = max_decimal_metadata_.precision();
    const auto scale = max_decimal_metadata_.scale();
    // Go through Make() rather than the constructor: the default constructor
    // does not validate the precision and scale.
    if (precision <= Decimal128Type::kMaxPrecision) {
      ARROW_ASSIGN_OR_RAISE(*out, Decimal128Type::Make(precision, scale));
    } else {
      ARROW_ASSIGN_OR_RAISE(*out, Decimal256Type::Make(precision, scale));
    }
    return Status::OK();
  }

  // Primitive types, highest priority first.
  if (float_count_) {
    // Floats take precedence over integers.
    *out = float64();
    return Status::OK();
  }
  if (int_count_) {
    *out = int64();
    return Status::OK();
  }
  if (date_count_) {
    *out = date32();
    return Status::OK();
  }
  if (time_count_) {
    *out = time64(TimeUnit::MICRO);
    return Status::OK();
  }
  if (timestamp_micro_count_) {
    *out = timestamp(TimeUnit::MICRO, timezone_);
    return Status::OK();
  }
  if (duration_count_) {
    *out = duration(TimeUnit::MICRO);
    return Status::OK();
  }
  if (bool_count_) {
    *out = boolean();
    return Status::OK();
  }
  if (binary_count_) {
    *out = binary();
    return Status::OK();
  }
  if (unicode_count_) {
    *out = utf8();
    return Status::OK();
  }
  if (interval_count_) {
    *out = month_day_nano_interval();
    return Status::OK();
  }
  if (arrow_scalar_count_) {
    *out = scalar_type_;
    return Status::OK();
  }

  // No counter was set: the type is null.
  *out = null();
  return Status::OK();
}