def read_block()

in azure/datalake/store/utils.py [0:0]


def read_block(f, offset, length, delimiter=None):
    """ Read a block of bytes from a file

    Parameters
    ----------
    f: file object
        a file object that supports seek and read.
    offset: int
        Byte offset to start read
    length: int
        Maximum number of bytes to read
    delimiter: bytes (optional)
        Ensure the returned block ends on this delimiter bytestring

    Returns
    -------
    The bytes read from ``f`` starting at ``offset``. Without a delimiter
    (``None`` or empty) this is simply up to ``length`` bytes. With a
    delimiter, the block is truncated just after the LAST occurrence of the
    delimiter inside the bytes read, so the bytestring returned WILL include
    the terminating delimiter string. If the delimiter does not occur in the
    block and ``length`` is below the maximum record size, the block is
    returned untruncated.

    Raises
    ------
    IndexError
        If a delimiter is given and ``length`` exceeds the maximum record
        size (4MB, i.e. ``2**22`` bytes), or if ``length`` equals that
        maximum and no delimiter is found in the block. For ADL, a single
        record cannot exceed 4MB and be guaranteed to land contiguously.

    Examples
    --------

    >>> from io import BytesIO  # doctest: +SKIP
    >>> f = BytesIO(b'Alice, 100\\nBob, 200\\nCharlie, 300')  # doctest: +SKIP
    >>> read_block(f, 0, 13)  # doctest: +SKIP
    b'Alice, 100\\nBo'

    >>> read_block(f, 0, 13, delimiter=b'\\n')  # doctest: +SKIP
    b'Alice, 100\\n'

    >>> read_block(f, 10, 10, delimiter=b'\\n')  # doctest: +SKIP
    b'\\nBob, 200\\n'

    >>> f = BytesIO(bytearray(2**22))  # doctest: +SKIP
    >>> read_block(f, 0, 2**22, delimiter=b'\\n')  # doctest: +SKIP
    Traceback (most recent call last):
        ...
    IndexError: No delimiter found within max record size of 4194304 bytes. Transfer without specifying a delimiter (as binary) instead.
    """
    # max record size is 4MB
    max_record = 2**22

    # Reject oversized delimited requests before doing any I/O.
    if delimiter and length > max_record:
        raise IndexError(
            'Records larger than {} bytes are not supported. '
            'The length requested was: {} bytes'.format(max_record, length))

    f.seek(offset)
    data = f.read(length)
    if not delimiter:
        return data

    # rfind locates the start of the last complete delimiter; unlike searching
    # a reversed copy of the data, this is also correct for multi-byte
    # delimiters (e.g. b'\r\n').
    last_delim_index = data.rfind(delimiter)
    if last_delim_index >= 0:
        # include the whole delimiter in the returned block
        return data[:last_delim_index + len(delimiter)]

    # TODO: Before delimiters can be supported through the ADLUploader logic, the number of chunks being uploaded
    # needs to be visible to this method, since it needs to throw if:
    # 1. We cannot find a delimiter in <= 4MB of data
    # 2. If the remaining size is less than 4MB but there are multiple chunks that need to be stitched together,
    #   since the delimiter could be split across chunks.
    # 3. If delimiters are specified, there must be logic during segment determination that ensures all chunks
    #   terminate at the end of a record (on a new line), even if that makes the chunk < 256MB.
    if length >= max_record:
        raise IndexError(
            'No delimiter found within max record size of {} bytes. '
            'Transfer without specifying a delimiter (as binary) instead.'.format(max_record))

    return data