in python/pyarrow/array.pxi [0:0]
def array(object obj, type=None, mask=None, size=None, from_pandas=None,
bint safe=True, MemoryPool memory_pool=None):
"""
Create pyarrow.Array instance from a Python object.

Parameters
----------
obj : sequence, iterable, ndarray, pandas.Series, Arrow-compatible array
If both type and size are specified, may be a single-use iterable. If
not strongly-typed, the Arrow type will be inferred for the resulting array.
Any Arrow-compatible array that implements the Arrow PyCapsule Protocol
(has an ``__arrow_c_array__`` or ``__arrow_c_device_array__`` method)
can be passed as well.
type : pyarrow.DataType
Explicit type to attempt to coerce to, otherwise will be inferred from
the data.
mask : array[bool], optional
Indicate which values are null (True) or not null (False).
size : int64, optional
Size of the elements. If the input is larger than size, bail at this
length. For iterators, if size is larger than the input iterator this
will be treated as a "max size", but will involve an initial allocation
of size followed by a resize to the actual size (so if you know the
exact size, specifying it correctly will give you better performance).
from_pandas : bool, default None
Use pandas's semantics for inferring nulls from values in
ndarray-like data. If passed, the mask takes precedence, but
if a value is unmasked (not-null) yet still null according to
pandas semantics, then it is null. Defaults to False if not
passed explicitly by the user, or True if a pandas object is
passed in.
safe : bool, default True
Check for overflows or other unsafe conversions.
memory_pool : pyarrow.MemoryPool, optional
If not passed, will allocate memory from the currently-set default
memory pool.

Returns
-------
array : pyarrow.Array or pyarrow.ChunkedArray
A ChunkedArray instead of an Array is returned if:

- the object data overflowed binary storage.
- the object's ``__arrow_array__`` protocol method returned a chunked
array.

Notes
-----
Timezone will be preserved in the returned array for timezone-aware data,
else no timezone will be returned for naive timestamps.
Internally, UTC values are stored for timezone-aware data with the
timezone set in the data type.

Pandas's DateOffsets and dateutil.relativedelta.relativedelta are by
default converted as MonthDayNanoIntervalArray. relativedelta leapdays
are ignored, as are all absolute fields on both objects. datetime.timedelta
can also be converted to MonthDayNanoIntervalArray but this requires
passing MonthDayNanoIntervalType explicitly.

Converting to dictionary array will promote to a wider integer type for
indices if the number of distinct values cannot be represented, even if
the index type was explicitly set. This means that if there are more than
127 values the returned dictionary array's index type will be at least
pa.int16() even if pa.int8() was passed to the function. Note that an
explicit index type will not be demoted even if it is wider than required.

Examples
--------
>>> import pandas as pd
>>> import pyarrow as pa
>>> pa.array(pd.Series([1, 2]))
<pyarrow.lib.Int64Array object at ...>
[
1,
2
]

>>> pa.array(["a", "b", "a"], type=pa.dictionary(pa.int8(), pa.string()))
<pyarrow.lib.DictionaryArray object at ...>
...
-- dictionary:
[
"a",
"b"
]
-- indices:
[
0,
1,
0
]
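
When both ``type`` and ``size`` are given, a single-use iterable such as a
generator can be converted as well; ``size`` then acts as a maximum length
(an illustrative sketch of the behavior described above):

>>> pa.array(iter(range(1024)), type=pa.int64(), size=3)
<pyarrow.lib.Int64Array object at ...>
[
0,
1,
2
]
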
>>> import numpy as np
>>> pa.array(pd.Series([1, 2]), mask=np.array([0, 1], dtype=bool))
<pyarrow.lib.Int64Array object at ...>
[
1,
null
]
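
With ``from_pandas=True``, NaN values in floating-point data are inferred
as nulls (illustrative; with the default ``from_pandas=False`` the NaN
would be kept as a value):

>>> pa.array(np.array([1.0, np.nan]), from_pandas=True)
<pyarrow.lib.DoubleArray object at ...>
[
1,
null
]
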
>>> arr = pa.array(range(1024), type=pa.dictionary(pa.int8(), pa.int64()))
>>> arr.type.index_type
DataType(int16)
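
As described in the Notes, ``datetime.timedelta`` values convert to a
MonthDayNanoIntervalArray only when the interval type is passed explicitly
(illustrative; the element repr is abbreviated here):

>>> import datetime
>>> pa.array([datetime.timedelta(days=1)], type=pa.month_day_nano_interval())
<pyarrow.lib.MonthDayNanoIntervalArray object at ...>
[
...
]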
"""
cdef:
CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
bint is_pandas_object = False
bint c_from_pandas
type = ensure_type(type, allow_none=True)
extension_type = None
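# Extension types are converted via their storage type; the result is
# re-wrapped with ExtensionArray.from_storage at the end of this function.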
if type is not None and type.id == _Type_EXTENSION:
extension_type = type
type = type.storage_type
if from_pandas is None:
c_from_pandas = False
else:
c_from_pandas = from_pandas
if isinstance(obj, Array):
if type is not None and not obj.type.equals(type):
obj = obj.cast(type, safe=safe, memory_pool=memory_pool)
return obj
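# Objects implementing the __arrow_array__ protocol convert themselves
# and may return either an Array or a ChunkedArray.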
if hasattr(obj, '__arrow_array__'):
return _handle_arrow_array_protocol(obj, type, mask, size)
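# Arrow PyCapsule Protocol: let the producer export schema/array
# capsules, optionally requesting the desired type up front.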
elif hasattr(obj, '__arrow_c_device_array__'):
if type is not None:
requested_type = type.__arrow_c_schema__()
else:
requested_type = None
schema_capsule, array_capsule = obj.__arrow_c_device_array__(requested_type)
out_array = Array._import_from_c_device_capsule(schema_capsule, array_capsule)
if type is not None and out_array.type != type:
# PyCapsule interface type coercion is best effort, so we need to
# check the type of the returned array and cast if necessary
out_array = out_array.cast(type, safe=safe, memory_pool=memory_pool)
return out_array
elif hasattr(obj, '__arrow_c_array__'):
if type is not None:
requested_type = type.__arrow_c_schema__()
else:
requested_type = None
schema_capsule, array_capsule = obj.__arrow_c_array__(requested_type)
out_array = Array._import_from_c_capsule(schema_capsule, array_capsule)
if type is not None and out_array.type != type:
# PyCapsule interface type coercion is best effort, so we need to
# check the type of the returned array and cast if necessary
out_array = out_array.cast(type, safe=safe, memory_pool=memory_pool)
return out_array
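# ndarray-like inputs (numpy arrays, pandas Series/Index): extract the
# underlying values and an optional validity mask before converting.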
elif _is_array_like(obj):
if mask is not None:
if _is_array_like(mask):
mask = get_values(mask, &is_pandas_object)
else:
raise TypeError("Mask must be a numpy array "
"when converting numpy arrays")
values = get_values(obj, &is_pandas_object)
if is_pandas_object and from_pandas is None:
c_from_pandas = True
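# numpy masked arrays carry their own mask; adopt it unless the caller
# already supplied one.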
if isinstance(values, np.ma.MaskedArray):
if mask is not None:
raise ValueError("Cannot pass a numpy masked array and "
"specify a mask at the same time")
else:
# don't use shrunken masks
mask = None if values.mask is np.ma.nomask else values.mask
values = values.data
if mask is not None:
if mask.dtype != np.bool_:
raise TypeError("Mask must be boolean dtype")
if mask.ndim != 1:
raise ValueError("Mask must be 1D array")
if len(values) != len(mask):
raise ValueError(
"Mask is a different length from sequence being converted")
if hasattr(values, '__arrow_array__'):
return _handle_arrow_array_protocol(values, type, mask, size)
elif (pandas_api.is_categorical(values) and
type is not None and type.id != Type_DICTIONARY):
result = _ndarray_to_array(
np.asarray(values), mask, type, c_from_pandas, safe, pool
)
elif pandas_api.is_categorical(values):
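# pandas Categorical: convert the codes to indices and the categories
# to a dictionary, then assemble a DictionaryArray.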
if type is not None:
index_type = type.index_type
value_type = type.value_type
if values.ordered != type.ordered:
raise ValueError(
"The 'ordered' flag of the passed categorical values "
"does not match the 'ordered' flag of the specified type.")
else:
index_type = None
value_type = None
indices = _codes_to_indices(
values.codes, mask, index_type, memory_pool)
try:
dictionary = array(
values.categories.values, type=value_type,
memory_pool=memory_pool)
except TypeError:
# TODO when removing the deprecation warning, this whole
# try/except can be removed (to bubble the TypeError of
# the first array(..) call)
if value_type is not None:
warnings.warn(
"The dtype of the 'categories' of the passed "
"categorical values ({0}) does not match the "
"specified type ({1}). For now ignoring the specified "
"type, but in the future this mismatch will raise a "
"TypeError".format(
values.categories.dtype, value_type),
FutureWarning, stacklevel=2)
dictionary = array(
values.categories.values, memory_pool=memory_pool)
else:
raise
return DictionaryArray.from_arrays(
indices, dictionary, ordered=values.ordered, safe=safe)
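# Other ndarray-like values: resolve pandas timezone-aware dtypes, then
# convert via _ndarray_to_array, run-end encoding the result if a
# run-end-encoded type was requested.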
else:
if pandas_api.have_pandas:
values, type = pandas_api.compat.get_datetimetz_type(
values, obj.dtype, type)
if type and type.id == _Type_RUN_END_ENCODED:
arr = _ndarray_to_array(
values, mask, type.value_type, c_from_pandas, safe, pool)
result = _pc().run_end_encode(arr, run_end_type=type.run_end_type,
memory_pool=memory_pool)
else:
result = _ndarray_to_array(values, mask, type, c_from_pandas, safe,
pool)
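# Fallback: generic Python sequences and single-use iterables are
# converted through _sequence_to_array.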
else:
if type and type.id == _Type_RUN_END_ENCODED:
arr = _sequence_to_array(
obj, mask, size, type.value_type, pool, c_from_pandas)
result = _pc().run_end_encode(arr, run_end_type=type.run_end_type,
memory_pool=memory_pool)
else:
# ConvertPySequence does strict conversion if type is explicitly passed
result = _sequence_to_array(obj, mask, size, type, pool, c_from_pandas)
if extension_type is not None:
result = ExtensionArray.from_storage(extension_type, result)
return result