in python/pyarrow/array.pxi [0:0]
def array(object obj, type=None, mask=None, size=None, from_pandas=None,
bint safe=True, MemoryPool memory_pool=None):
"""
Create pyarrow.Array instance from a Python object.

Parameters
----------
obj : sequence, iterable, ndarray, pandas.Series, Arrow-compatible array
If both type and size are specified, may be a single-use iterable. If
not strongly-typed, the Arrow type will be inferred for the resulting array.
Any Arrow-compatible array that implements the Arrow PyCapsule Protocol
(has an ``__arrow_c_array__`` or ``__arrow_c_device_array__`` method)
can be passed as well.
type : pyarrow.DataType
Explicit type to attempt to coerce to, otherwise will be inferred from
the data.
mask : array[bool], optional
Indicate which values are null (True) or not null (False).
size : int64, optional
Size of the elements. If the input is larger than size, bail at this
length. For iterators, if size is larger than the input iterator this
will be treated as a "max size", but will involve an initial allocation
of size followed by a resize to the actual size (so if you know the
exact size, specifying it correctly will give you better performance).
from_pandas : bool, default None
Use pandas's semantics for inferring nulls from values in
ndarray-like data. If passed, the mask takes precedence, but
if a value is unmasked (not-null) yet still null according to
pandas semantics, then it is null. Defaults to False if not
passed explicitly by the user, or True if a pandas object is
passed in.
safe : bool, default True
Check for overflows or other unsafe conversions.
memory_pool : pyarrow.MemoryPool, optional
If not passed, will allocate memory from the currently-set default
memory pool.

Returns
-------
array : pyarrow.Array or pyarrow.ChunkedArray
A ChunkedArray instead of an Array is returned if:

- the object data overflowed binary storage.
- the object's ``__arrow_array__`` protocol method returned a chunked
array.

Notes
-----
Timezone will be preserved in the returned array for timezone-aware data,
else no timezone will be returned for naive timestamps.
Internally, UTC values are stored for timezone-aware data with the
timezone set in the data type.

Pandas's DateOffsets and dateutil.relativedelta.relativedelta are by
default converted as MonthDayNanoIntervalArray. relativedelta leapdays
are ignored, as are all absolute fields on both objects. datetime.timedelta
can also be converted to MonthDayNanoIntervalArray but this requires
passing MonthDayNanoIntervalType explicitly.

Converting to dictionary array will promote to a wider integer type for
indices if the number of distinct values cannot be represented, even if
the index type was explicitly set. This means that if there are more than
127 values the returned dictionary array's index type will be at least
pa.int16() even if pa.int8() was passed to the function. Note that an
explicit index type will not be demoted even if it is wider than required.

Examples
--------
>>> import pandas as pd
>>> import pyarrow as pa
>>> pa.array(pd.Series([1, 2]))
<pyarrow.lib.Int64Array object at ...>
[
1,
2
]

>>> pa.array(["a", "b", "a"], type=pa.dictionary(pa.int8(), pa.string()))
<pyarrow.lib.DictionaryArray object at ...>
...
-- dictionary:
[
"a",
"b"
]
-- indices:
[
0,
1,
0
]
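
When both ``type`` and ``size`` are given, a single-use iterable such as a
generator can be converted as well; ``size`` then acts as a maximum length
(an illustrative sketch of the behavior described above):

>>> pa.array(iter(range(1024)), type=pa.int64(), size=3)
<pyarrow.lib.Int64Array object at ...>
[
0,
1,
2
]
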
>>> import numpy as np
>>> pa.array(pd.Series([1, 2]), mask=np.array([0, 1], dtype=bool))
<pyarrow.lib.Int64Array object at ...>
[
1,
null
]
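
With ``from_pandas=True``, NaN values in floating-point data are inferred
as nulls (illustrative; with the default ``from_pandas=False`` the NaN
would be kept as a value):

>>> pa.array(np.array([1.0, np.nan]), from_pandas=True)
<pyarrow.lib.DoubleArray object at ...>
[
1,
null
]
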
>>> arr = pa.array(range(1024), type=pa.dictionary(pa.int8(), pa.int64()))
>>> arr.type.index_type
DataType(int16)
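
As described in the Notes, ``datetime.timedelta`` values convert to a
MonthDayNanoIntervalArray only when the interval type is passed explicitly
(illustrative; the element repr is abbreviated here):

>>> import datetime
>>> pa.array([datetime.timedelta(days=1)], type=pa.month_day_nano_interval())
<pyarrow.lib.MonthDayNanoIntervalArray object at ...>
[
...
]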
"""
cdef:
CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
bint is_pandas_object = False
bint c_from_pandas
type = ensure_type(type, allow_none=True)
extension_type = None
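# Extension types are converted via their storage type; the result is
# re-wrapped with ExtensionArray.from_storage at the end of this function.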
if type is not None and type.id == _Type_EXTENSION:
extension_type = type
type = type.storage_type
if from_pandas is None:
c_from_pandas = False
else:
c_from_pandas = from_pandas
if isinstance(obj, Array):
if type is not None and not obj.type.equals(type):
obj = obj.cast(type, safe=safe, memory_pool=memory_pool)
return obj
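# Objects implementing the __arrow_array__ protocol convert themselves
# and may return either an Array or a ChunkedArray.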
if hasattr(obj, '__arrow_array__'):
return _handle_arrow_array_protocol(obj, type, mask, size)
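# Arrow PyCapsule Protocol: let the producer export schema/array
# capsules, optionally requesting the desired type up front.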
elif hasattr(obj, '__arrow_c_device_array__'):
if type is not None:
requested_type = type.__arrow_c_schema__()
else:
requested_type = None
schema_capsule, array_capsule = obj.__arrow_c_device_array__(requested_type)
out_array = Array._import_from_c_device_capsule(schema_capsule, array_capsule)
if type is not None and out_array.type != type:
# PyCapsule interface type coercion is best effort, so we need to
# check the type of the returned array and cast if necessary
out_array = out_array.cast(type, safe=safe, memory_pool=memory_pool)
return out_array
elif hasattr(obj, '__arrow_c_array__'):
if type is not None:
requested_type = type.__arrow_c_schema__()
else:
requested_type = None
schema_capsule, array_capsule = obj.__arrow_c_array__(requested_type)
out_array = Array._import_from_c_capsule(schema_capsule, array_capsule)
if type is not None and out_array.type != type:
# PyCapsule interface type coercion is best effort, so we need to
# check the type of the returned array and cast if necessary
out_array = out_array.cast(type, safe=safe, memory_pool=memory_pool)
return out_array
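# ndarray-like inputs (numpy arrays, pandas Series/Index): extract the
# underlying values and an optional validity mask before converting.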
elif _is_array_like(obj):
if mask is not None:
if _is_array_like(mask):
mask = get_values(mask, &is_pandas_object)
else:
raise TypeError("Mask must be a numpy array "
"when converting numpy arrays")
values = get_values(obj, &is_pandas_object)
if is_pandas_object and from_pandas is None:
c_from_pandas = True
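# numpy masked arrays carry their own mask; adopt it unless the caller
# already supplied one.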
if isinstance(values, np.ma.MaskedArray):
if mask is not None:
raise ValueError("Cannot pass a numpy masked array and "
"specify a mask at the same time")
else:
# don't use shrunken masks
mask = None if values.mask is np.ma.nomask else values.mask
values = values.data
if mask is not None:
if mask.dtype != np.bool_:
raise TypeError("Mask must be boolean dtype")
if mask.ndim != 1:
raise ValueError("Mask must be 1D array")
if len(values) != len(mask):
raise ValueError(
"Mask is a different length from sequence being converted")
if hasattr(values, '__arrow_array__'):
return _handle_arrow_array_protocol(values, type, mask, size)
elif (pandas_api.is_categorical(values) and
type is not None and type.id != Type_DICTIONARY):
result = _ndarray_to_array(
np.asarray(values), mask, type, c_from_pandas, safe, pool
)
elif pandas_api.is_categorical(values):
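# pandas Categorical: convert the codes to indices and the categories
# to a dictionary, then assemble a DictionaryArray.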
if type is not None:
index_type = type.index_type
value_type = type.value_type
if values.ordered != type.ordered:
raise ValueError(
"The 'ordered' flag of the passed categorical values "
"does not match the 'ordered' flag of the specified type.")
else:
index_type = None
value_type = None
indices = _codes_to_indices(
values.codes, mask, index_type, memory_pool)
try:
dictionary = array(
values.categories.values, type=value_type,
memory_pool=memory_pool)
except TypeError:
# TODO when removing the deprecation warning, this whole
# try/except can be removed (to bubble the TypeError of
# the first array(..) call)
if value_type is not None:
warnings.warn(
"The dtype of the 'categories' of the passed "
"categorical values ({0}) does not match the "
"specified type ({1}). For now ignoring the specified "
"type, but in the future this mismatch will raise a "
"TypeError".format(
values.categories.dtype, value_type),
FutureWarning, stacklevel=2)
dictionary = array(
values.categories.values, memory_pool=memory_pool)
else:
raise
return DictionaryArray.from_arrays(
indices, dictionary, ordered=values.ordered, safe=safe)
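# Other ndarray-like values: resolve pandas timezone-aware dtypes, then
# convert via _ndarray_to_array, run-end encoding the result if a
# run-end-encoded type was requested.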
else:
if pandas_api.have_pandas:
values, type = pandas_api.compat.get_datetimetz_type(
values, obj.dtype, type)
if type and type.id == _Type_RUN_END_ENCODED:
arr = _ndarray_to_array(
values, mask, type.value_type, c_from_pandas, safe, pool)
result = _pc().run_end_encode(arr, run_end_type=type.run_end_type,
memory_pool=memory_pool)
else:
result = _ndarray_to_array(values, mask, type, c_from_pandas, safe,
pool)
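# Fallback: generic Python sequences and single-use iterables are
# converted through _sequence_to_array.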
else:
if type and type.id == _Type_RUN_END_ENCODED:
arr = _sequence_to_array(
obj, mask, size, type.value_type, pool, c_from_pandas)
result = _pc().run_end_encode(arr, run_end_type=type.run_end_type,
memory_pool=memory_pool)
else:
# ConvertPySequence does strict conversion if type is explicitly passed
result = _sequence_to_array(obj, mask, size, type, pool, c_from_pandas)
if extension_type is not None:
result = ExtensionArray.from_storage(extension_type, result)
return result