def loadtxt()

in amplify/backend/function/iamxawswrangler/lib/python/numpy/lib/npyio.py [0:0]


def loadtxt(fname, dtype=float, comments='#', delimiter=None,
            converters=None, skiprows=0, usecols=None, unpack=False,
            ndmin=0, encoding='bytes', max_rows=None, *, like=None):
    r"""
    Load data from a text file.

    Each row in the text file must have the same number of values.

    Parameters
    ----------
    fname : file, str, or pathlib.Path
        File, filename, or generator to read.  If the filename extension is
        ``.gz`` or ``.bz2``, the file is first decompressed. Note that
        generators should return byte strings.
    dtype : data-type, optional
        Data-type of the resulting array; default: float.  If this is a
        structured data-type, the resulting array will be 1-dimensional, and
        each row will be interpreted as an element of the array.  In this
        case, the number of columns used must match the number of fields in
        the data-type.
    comments : str or sequence of str, optional
        The characters or list of characters used to indicate the start of a
        comment. None implies no comments. For backwards compatibility, byte
        strings will be decoded as 'latin1'. The default is '#'.
    delimiter : str, optional
        The string used to separate values. For backwards compatibility, byte
        strings will be decoded as 'latin1'. The default is whitespace.
    converters : dict, optional
        A dictionary mapping column number to a function that will parse the
        column string into the desired value.  E.g., if column 0 is a date
        string: ``converters = {0: datestr2num}``.  Converters can also be
        used to provide a default value for missing data (but see also
        `genfromtxt`): ``converters = {3: lambda s: float(s.strip() or 0)}``.
        Default: None.
    skiprows : int, optional
        Skip the first `skiprows` lines, including comments; default: 0.
    usecols : int or sequence, optional
        Which columns to read, with 0 being the first. For example,
        ``usecols = (1,4,5)`` will extract the 2nd, 5th and 6th columns.
        The default, None, results in all columns being read.

        .. versionchanged:: 1.11.0
            When a single column has to be read, it is possible to use
            an integer instead of a tuple. E.g., ``usecols = 3`` reads the
            fourth column the same way as ``usecols = (3,)`` would.
    unpack : bool, optional
        If True, the returned array is transposed, so that arguments may be
        unpacked using ``x, y, z = loadtxt(...)``.  When used with a
        structured data-type, arrays are returned for each field.
        Default is False.
    ndmin : int, optional
        The returned array will have at least `ndmin` dimensions.
        Otherwise mono-dimensional axes will be squeezed.
        Legal values: 0 (default), 1 or 2.

        .. versionadded:: 1.6.0
    encoding : str, optional
        Encoding used to decode the input file. Does not apply to input
        streams. The special value 'bytes' enables backward-compatibility
        workarounds that ensure you receive byte arrays as results if
        possible, and that pass 'latin1'-encoded strings to converters.
        Override this value to receive unicode arrays and pass strings as
        input to converters.  If set to None the system default is used.
        The default value is 'bytes'.

        .. versionadded:: 1.14.0
    max_rows : int, optional
        Read `max_rows` lines of content after `skiprows` lines. The default
        is to read all the lines.

        .. versionadded:: 1.16.0
    ${ARRAY_FUNCTION_LIKE}

        .. versionadded:: 1.20.0

    Returns
    -------
    out : ndarray
        Data read from the text file.

    See Also
    --------
    load, fromstring, fromregex
    genfromtxt : Load data with missing values handled as specified.
    scipy.io.loadmat : reads MATLAB data files

    Notes
    -----
    This function aims to be a fast reader for simply formatted files.  The
    `genfromtxt` function provides more sophisticated handling of, e.g.,
    lines with missing values.

    .. versionadded:: 1.10.0

    The strings produced by the Python float.hex method can be used as
    input for floats.

    Examples
    --------
    >>> from io import StringIO   # StringIO behaves like a file object
    >>> c = StringIO("0 1\n2 3")
    >>> np.loadtxt(c)
    array([[0., 1.],
           [2., 3.]])

    >>> d = StringIO("M 21 72\nF 35 58")
    >>> np.loadtxt(d, dtype={'names': ('gender', 'age', 'weight'),
    ...                      'formats': ('S1', 'i4', 'f4')})
    array([(b'M', 21, 72.), (b'F', 35, 58.)],
          dtype=[('gender', 'S1'), ('age', '<i4'), ('weight', '<f4')])

    >>> c = StringIO("1,0,2\n3,0,4")
    >>> x, y = np.loadtxt(c, delimiter=',', usecols=(0, 2), unpack=True)
    >>> x
    array([1., 3.])
    >>> y
    array([2., 4.])

    This example shows how `converters` can be used to convert a field
    with a trailing minus sign into a negative number.

    >>> s = StringIO('10.01 31.25-\n19.22 64.31\n17.57- 63.94')
    >>> def conv(fld):
    ...     return -float(fld[:-1]) if fld.endswith(b'-') else float(fld)
    ...
    >>> np.loadtxt(s, converters={0: conv, 1: conv})
    array([[ 10.01, -31.25],
           [ 19.22,  64.31],
           [-17.57,  63.94]])
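
    `skiprows` and `max_rows` can be combined to read a bounded window of
    lines; here, with some made-up sample data, the header line is skipped
    and only the next two rows are parsed:

    >>> e = StringIO("# header\n1 2\n3 4\n5 6")
    >>> np.loadtxt(e, skiprows=1, max_rows=2)
    array([[1., 2.],
           [3., 4.]])

    Setting `ndmin` keeps a single row from being squeezed down to one
    dimension:

    >>> np.loadtxt(StringIO("5 6"))
    array([5., 6.])
    >>> np.loadtxt(StringIO("5 6"), ndmin=2)
    array([[5., 6.]])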
    """

    if like is not None:
        return _loadtxt_with_like(
            fname, dtype=dtype, comments=comments, delimiter=delimiter,
            converters=converters, skiprows=skiprows, usecols=usecols,
            unpack=unpack, ndmin=ndmin, encoding=encoding,
            max_rows=max_rows, like=like
        )

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Nested functions used by loadtxt.
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    # not to be confused with the flatten_dtype we import...
    @recursive
    def flatten_dtype_internal(self, dt):
        """Unpack a structured data-type, and produce re-packing info."""
        if dt.names is None:
            # If the dtype is flattened, return.
            # If the dtype has a shape, the dtype occurs
            # in the list more than once.
            shape = dt.shape
            if len(shape) == 0:
                return ([dt.base], None)
            else:
                packing = [(shape[-1], list)]
                if len(shape) > 1:
                    for dim in dt.shape[-2::-1]:
                        packing = [(dim*packing[0][0], packing*dim)]
                return ([dt.base] * int(np.prod(dt.shape)), packing)
        else:
            types = []
            packing = []
            for field in dt.names:
                tp, _ = dt.fields[field]  # (dtype, byte offset)
                flat_dt, flat_packing = self(tp)
                types.extend(flat_dt)
                # Avoid extra nesting for subarrays
                if tp.ndim > 0:
                    packing.extend(flat_packing)
                else:
                    packing.append((len(flat_dt), flat_packing))
            return (types, packing)

    @recursive
    def pack_items(self, items, packing):
        """Pack items into nested lists based on re-packing info."""
        if packing is None:
            return items[0]
        elif packing is tuple:
            return tuple(items)
        elif packing is list:
            return list(items)
        else:
            start = 0
            ret = []
            for length, subpacking in packing:
                ret.append(self(items[start:start+length], subpacking))
                start += length
            return tuple(ret)

    def split_line(line):
        """Chop off comments, strip, and split at delimiter. """
        line = _decode_line(line, encoding=encoding)

        if comments is not None:
            line = regex_comments.split(line, maxsplit=1)[0]
        line = line.strip('\r\n')
        return line.split(delimiter) if line else []

    def read_data(chunk_size):
        """Parse each line, including the first.

        The file handle `fh` and the first data line `first_line` are
        defined in the enclosing scope.

        Parameters
        ----------
        chunk_size : int
            At most `chunk_size` lines are read at a time, with iteration
            until all lines are read.

        """
        X = []
        line_iter = itertools.chain([first_line], fh)
        line_iter = itertools.islice(line_iter, max_rows)
        for i, line in enumerate(line_iter):
            vals = split_line(line)
            if len(vals) == 0:
                continue
            if usecols:
                vals = [vals[j] for j in usecols]
            if len(vals) != N:
                line_num = i + skiprows + 1
                raise ValueError("Wrong number of columns at line %d"
                                 % line_num)

            # Convert each value according to its column and store
            items = [conv(val) for (conv, val) in zip(converters, vals)]

            # Then pack it according to the dtype's nesting
            items = pack_items(items, packing)
            X.append(items)
            if len(X) > chunk_size:
                yield X
                X = []
        if X:
            yield X

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Main body of loadtxt.
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    # Check correctness of the values of `ndmin`
    if ndmin not in [0, 1, 2]:
        raise ValueError('Illegal value of ndmin keyword: %s' % ndmin)

    # Type conversions for Py3 convenience
    if comments is not None:
        if isinstance(comments, (str, bytes)):
            comments = [comments]
        comments = [_decode_line(x) for x in comments]
        # Compile regex for comments beforehand
        comments = (re.escape(comment) for comment in comments)
        regex_comments = re.compile('|'.join(comments))
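        # e.g. comments=['#', '//'] (hypothetical values) compiles to
        # the pattern r'\#|//', and split_line() cuts each line at the
        # first occurrence of either token.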

    if delimiter is not None:
        delimiter = _decode_line(delimiter)

    user_converters = converters

    byte_converters = False
    if encoding == 'bytes':
        encoding = None
        byte_converters = True
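        # Legacy default: hand byte strings to user converters via
        # `tobytes_first` below, and leave `encoding=None` so the file's
        # own (or the locale's) encoding is picked up further down.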

    if usecols is not None:
        # Allow usecols to be a single int or a sequence of ints
        try:
            usecols_as_list = list(usecols)
        except TypeError:
            usecols_as_list = [usecols]
        for col_idx in usecols_as_list:
            try:
                opindex(col_idx)
            except TypeError as e:
                e.args = (
                    "usecols must be an int or a sequence of ints but "
                    "it contains at least one element of type %s" %
                    type(col_idx),
                    )
                raise
        # Fall back to existing code
        usecols = usecols_as_list

    # Make sure we're dealing with a proper dtype
    dtype = np.dtype(dtype)
    defconv = _getconv(dtype)

    dtype_types, packing = flatten_dtype_internal(dtype)

    fown = False
    try:
        if isinstance(fname, os_PathLike):
            fname = os_fspath(fname)
        if _is_string_like(fname):
            fh = np.lib._datasource.open(fname, 'rt', encoding=encoding)
            fencoding = getattr(fh, 'encoding', 'latin1')
            fh = iter(fh)
            fown = True
        else:
            fh = iter(fname)
            fencoding = getattr(fname, 'encoding', 'latin1')
    except TypeError as e:
        raise ValueError(
            'fname must be a string, file handle, or generator'
        ) from e

    # input may be a python2 io stream
    if encoding is not None:
        fencoding = encoding
    # we must assume local encoding
    # TODO emit portability warning?
    elif fencoding is None:
        import locale
        fencoding = locale.getpreferredencoding()

    try:
        # Skip the first `skiprows` lines
        for i in range(skiprows):
            next(fh)

        # Read until we find a line with some values, and use
        # it to estimate the number of columns, N.
        first_vals = None
        try:
            while not first_vals:
                first_line = next(fh)
                first_vals = split_line(first_line)
        except StopIteration:
            # End of lines reached
            first_line = ''
            first_vals = []
            warnings.warn('loadtxt: Empty input file: "%s"' % fname,
                          stacklevel=2)
        N = len(usecols or first_vals)

        # Now that we know N, create the default converters list, and
        # set packing, if necessary.
        if len(dtype_types) > 1:
            # We're dealing with a structured array, each field of
            # the dtype matches a column
            converters = [_getconv(dt) for dt in dtype_types]
        else:
            # All fields have the same dtype
            converters = [defconv for i in range(N)]
            if N > 1:
                packing = [(N, tuple)]
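        # e.g. (hypothetical) a plain float dtype with N == 3 gives three
        # copies of the default converter and packing == [(3, tuple)],
        # so each parsed row is packed into a 3-tuple.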

        # By preference, use the converters specified by the user
        for i, conv in (user_converters or {}).items():
            if usecols:
                try:
                    i = usecols.index(i)
                except ValueError:
                    # Unused converter specified
                    continue
            if byte_converters:
                # converters may use decode to workaround numpy's old
                # behaviour, so encode the string again before passing to
                # the user converter
                def tobytes_first(x, conv):
                    if type(x) is bytes:
                        return conv(x)
                    return conv(x.encode("latin1"))
                converters[i] = functools.partial(tobytes_first, conv=conv)
            else:
                converters[i] = conv

        converters = [conv if conv is not bytes else
                      lambda x: x.encode(fencoding) for conv in converters]

        # read data in chunks and fill it into an array via resize
        # over-allocating and shrinking the array later may be faster but is
        # probably not relevant compared to the cost of actually reading and
        # converting the data
        X = None
        for x in read_data(_loadtxt_chunksize):
            if X is None:
                X = np.array(x, dtype)
            else:
                nshape = list(X.shape)
                pos = nshape[0]
                nshape[0] += len(x)
                X.resize(nshape, refcheck=False)
                X[pos:, ...] = x
    finally:
        if fown:
            fh.close()

    if X is None:
        X = np.array([], dtype)

    # Multicolumn data are returned with shape (1, N, M), i.e.
    # (1, 1, M) for a single row - remove the singleton dimension there
    if X.ndim == 3 and X.shape[:2] == (1, 1):
        X.shape = (1, -1)

    # Verify that the array has at least dimensions `ndmin`.
    # Tweak the size and shape of the arrays - remove extraneous dimensions
    if X.ndim > ndmin:
        X = np.squeeze(X)
    # and ensure we have the minimum number of dimensions asked for
    # - has to be in this order for the odd case ndmin=1, X.squeeze().ndim=0
    if X.ndim < ndmin:
        if ndmin == 1:
            X = np.atleast_1d(X)
        elif ndmin == 2:
            X = np.atleast_2d(X).T

    if unpack:
        if len(dtype_types) > 1:
            # For structured arrays, return an array for each field.
            return [X[field] for field in dtype.names]
        else:
            return X.T
    else:
        return X