def _trim_to_max

def _trim_to_max_bytes()

in src/processors.py [0:0]
27 lines of code
12 McCabe index (conditional complexity)

    def _trim_to_max_bytes(self, s, max_bytes):
        """
        Ensure that the UTF-8 encoding of a string has not more than max_bytes bytes.

        The table below summarizes the format of these different octet types.
           Char. number range  |        UTF-8 octet sequence
              (hexadecimal)    |              (binary)
           --------------------+---------------------------------------------
           0000 0000-0000 007F | 0xxxxxxx
           0000 0080-0000 07FF | 110xxxxx 10xxxxxx
           0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
           0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        """

        def safe_b_of_i(b, i):
            try:
                return b[i]
            except IndexError:
                return 0

        # Edge cases
        if s == '' or max_bytes < 1:
            return ''

        # cut it twice to avoid encoding potentially GBs of string just to get e.g. 10 bytes?
        bytes_array = s[:max_bytes].encode('utf-8')[:max_bytes]

        # find the first byte from end which contains the starting byte of a utf8 character which is this format 11xxxxxx for
        # multi byte character. For single byte character the format is 0xxxxxxx as described above
        if bytes_array[-1] & 0b10000000:
            last_11xxxxxx_index = [
                i
                for i in range(-1, -5, -1)
                if safe_b_of_i(bytes_array, i) & 0b11000000 == 0b11000000
            ][0]
            # As described above in the table , we can determine the total size(in bytes) of char from the first byte itself
            starting_byte = bytes_array[last_11xxxxxx_index]
            if not starting_byte & 0b00100000:
                last_char_length = 2
            elif not starting_byte & 0b00010000:
                last_char_length = 3
            elif not starting_byte & 0b00001000:
                last_char_length = 4
            else:
                raise Exception(f"Unexpected utf-8 {starting_byte} byte encountered")

            if last_char_length > -last_11xxxxxx_index:
                # remove the incomplete character
                bytes_array = bytes_array[:last_11xxxxxx_index]

        return bytes_array.decode('utf-8')