in src/processors.py [0:0]
def _trim_to_max_bytes(self, s, max_bytes):
"""
Ensure that the UTF-8 encoding of a string has not more than max_bytes bytes.
The table below summarizes the format of these different octet types.
Char. number range | UTF-8 octet sequence
(hexadecimal) | (binary)
--------------------+---------------------------------------------
0000 0000-0000 007F | 0xxxxxxx
0000 0080-0000 07FF | 110xxxxx 10xxxxxx
0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
"""
def safe_b_of_i(b, i):
try:
return b[i]
except IndexError:
return 0
# Edge cases
if s == '' or max_bytes < 1:
return ''
# cut it twice to avoid encoding potentially GBs of string just to get e.g. 10 bytes?
bytes_array = s[:max_bytes].encode('utf-8')[:max_bytes]
# find the first byte from end which contains the starting byte of a utf8 character which is this format 11xxxxxx for
# multi byte character. For single byte character the format is 0xxxxxxx as described above
if bytes_array[-1] & 0b10000000:
last_11xxxxxx_index = [
i
for i in range(-1, -5, -1)
if safe_b_of_i(bytes_array, i) & 0b11000000 == 0b11000000
][0]
# As described above in the table , we can determine the total size(in bytes) of char from the first byte itself
starting_byte = bytes_array[last_11xxxxxx_index]
if not starting_byte & 0b00100000:
last_char_length = 2
elif not starting_byte & 0b00010000:
last_char_length = 3
elif not starting_byte & 0b00001000:
last_char_length = 4
else:
raise Exception(f"Unexpected utf-8 {starting_byte} byte encountered")
if last_char_length > -last_11xxxxxx_index:
# remove the incomplete character
bytes_array = bytes_array[:last_11xxxxxx_index]
return bytes_array.decode('utf-8')