in azure/datalake/store/utils.py [0:0]
def read_block(f, offset, length, delimiter=None):
""" Read a block of bytes from a file
Parameters
----------
fn: file object
a file object that supports seek, tell and read.
offset: int
Byte offset to start read
length: int
Maximum number of bytes to read
delimiter: bytes (optional)
Ensure reading stops at delimiter bytestring
If using the ``delimiter=`` keyword argument we ensure that the read
stops at or before the delimiter boundaries that follow the location
``offset + length``. For ADL, if no delimiter is found and the data
requested is > 4MB an exception is raised, since a single record cannot
exceed 4MB and be guaranteed to land contiguously in ADL.
The bytestring returned WILL include the
terminating delimiter string.
Examples
--------
>>> from io import BytesIO # doctest: +SKIP
>>> f = BytesIO(b'Alice, 100\\nBob, 200\\nCharlie, 300') # doctest: +SKIP
>>> read_block(f, 0, 13) # doctest: +SKIP
b'Alice, 100\\nBo'
>>> read_block(f, 0, 13, delimiter=b'\\n') # doctest: +SKIP
b'Alice, 100\\n'
>>> read_block(f, 10, 10, delimiter=b'\\n') # doctest: +SKIP
b'\\nCharlie, 300'
>>> f = BytesIO(bytearray(2**22)) # doctest: +SKIP
>>> read_block(f,0,2**22, delimiter=b'\\n') # doctest: +SKIP
IndexError: No delimiter found within max record size of 4MB.
Transfer without specifying a delimiter (as binary) instead.
"""
    f.seek(offset)
    data = f.read(length)
    if delimiter:
        # max record size is 4MB
        max_record = 2**22
        if length > max_record:
            raise IndexError('Records larger than {} bytes are not supported. '
                             'The length requested was: {} bytes'.format(max_record, length))
        # get the index of the last complete delimiter, if it exists;
        # ``rindex`` searches from the end and, unlike a reversed-bytes scan,
        # also handles delimiters that are more than one byte long
        try:
            last_delim_index = data.rindex(delimiter)
            # this ensures the length includes all of the last delimiter
            # (in the event that it is more than one character)
            length = last_delim_index + len(delimiter)
            return data[0:length]
        except ValueError:
            # TODO: Before delimiters can be supported through the ADLUploader logic, the number of chunks being uploaded
            # needs to be visible to this method, since it needs to throw if:
            # 1. We cannot find a delimiter in <= 4MB of data.
            # 2. The remaining size is less than 4MB but there are multiple chunks that need to be stitched together,
            #    since the delimiter could be split across chunks.
            # 3. If delimiters are specified, there must be logic during segment determination that ensures all chunks
            #    terminate at the end of a record (on a new line), even if that makes the chunk < 256MB.
            if length >= max_record:
                raise IndexError('No delimiter found within max record size of {} bytes. '
                                 'Transfer without specifying a delimiter (as binary) instead.'.format(max_record))
    return data
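

# A minimal usage sketch (an editorial illustration, not part of the original
# module): iterating over a file in delimiter-aligned blocks with read_block.
# The helper name ``iter_blocks``, the 1MB default block size, and the
# assumption that every record fits within a single block are illustrative.
def iter_blocks(f, block_size=2**20, delimiter=b'\n'):
    """Yield successive delimiter-aligned blocks from a file object."""
    offset = 0
    while True:
        block = read_block(f, offset, block_size, delimiter=delimiter)
        if not block:
            break
        yield block
        # advance only past the bytes actually returned; read_block may have
        # truncated the block at the last delimiter it found
        offset += len(block)

# Example (hypothetical): splitting an in-memory file into record-aligned
# blocks; the first block is cut back to end at the last newline it contains.
# >>> from io import BytesIO
# >>> list(iter_blocks(BytesIO(b'a,1\nb,2\nc,3'), block_size=8))
# [b'a,1\nb,2\n', b'c,3']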