public static int encodeUtf8()

in hugegraph-store/hg-store-core/src/main/java/org/apache/hugegraph/store/util/UnsafeUtf8Util.java [86:149]


    /**
     * Encodes {@code in} as UTF-8 into {@code out}, starting at {@code offset} and using at
     * most {@code length} bytes.
     *
     * <p>Bytes are written incrementally, so when an exception is thrown part of the output
     * region may already have been overwritten.
     *
     * @param in     the UTF-16 character sequence to encode
     * @param out    the destination array
     * @param offset index in {@code out} of the first byte to write; must not be negative
     * @param length maximum number of bytes that may be written
     * @return the index in {@code out} immediately after the last byte written (equal to
     *         {@code offset} when {@code in} is empty)
     * @throws ArrayIndexOutOfBoundsException if {@code offset} is negative, if
     *         {@code offset + length} exceeds {@code out.length}, or if the encoded form of
     *         {@code in} does not fit into {@code length} bytes
     * @throws IllegalArgumentException if {@code in} contains an unpaired surrogate
     */
    public static int encodeUtf8(CharSequence in, byte[] out, int offset, int length) {
        long outIx = offset;
        final long outLimit = outIx + length;
        final int inLimit = in.length();
        // A negative offset must be rejected explicitly: "out.length - length < offset" alone
        // lets it through, and the writes below go through UnsafeUtil.putByte, which
        // presumably performs no bounds checking of its own (verify against UnsafeUtil).
        if (offset < 0 || inLimit > length || out.length - length < offset) {
            // Not even enough room for an ASCII-encoded string.
            // Guard the empty-input case so building the message cannot itself fail with
            // charAt(-1) and mask the intended exception type.
            final String last = inLimit == 0 ? "<empty>" : String.valueOf(in.charAt(inLimit - 1));
            throw new ArrayIndexOutOfBoundsException(
                    "Failed writing " + last + " at index " + (offset + length));
        }

        // Designed to take advantage of
        // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination
        // Fast path: copy the leading run of ASCII chars. This cannot overrun the output
        // region because inLimit <= length was established above.
        int inIx = 0;
        for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) {
            UnsafeUtil.putByte(out, outIx++, (byte) c);
        }
        if (inIx == inLimit) {
            // We're done, it was ASCII encoded.
            return (int) outIx;
        }

        // Slow path: every branch checks the remaining room before writing.
        for (char c; inIx < inLimit; ++inIx) {
            c = in.charAt(inIx);
            if (c < 0x80 && outIx < outLimit) {
                UnsafeUtil.putByte(out, outIx++, (byte) c);
            } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes
                // (0xF << 6) is 0x3C0; the byte cast keeps only the low 8 bits, i.e. the
                // 0xC0 two-byte lead marker. The same trick yields 0xE0 and 0xF0 below.
                UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 6) | (c >>> 6)));
                UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & c)));
            } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) {
                // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes
                UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 5) | (c >>> 12)));
                UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (c >>> 6))));
                UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & c)));
            } else if (outIx <= outLimit - 4L) {
                // Only surrogates reach this branch with at least four bytes of room left.
                // Minimum code point represented by a surrogate pair is 0x10000, 17 bits,
                // four UTF-8 bytes.
                // Remember where the surrogate sits: inIx is only advanced when a following
                // char exists, so deriving the reported index from inIx afterwards would be
                // off by one for a surrogate at the very end of the input.
                final int surrogateIx = inIx;
                final char low;
                if (surrogateIx + 1 == inLimit
                    || !isSurrogatePair(c, (low = in.charAt(++inIx)))) {
                    throw new IllegalArgumentException(
                            "Unpaired surrogate at index " + surrogateIx + " of " + inLimit);
                }
                int codePoint = toCodePoint(c, low);
                UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 4) | (codePoint >>> 18)));
                UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12))));
                UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))));
                UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & codePoint)));
            } else {
                // Out of room. Malformed input takes precedence over the space error so the
                // caller sees the same exception regardless of buffer size.
                if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE)
                    && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)))) {
                    // We are surrogates and we're not a surrogate pair.
                    throw new IllegalArgumentException(
                            "Unpaired surrogate at index " + inIx + " of " + inLimit);
                }
                // Not enough space in the output buffer.
                throw new ArrayIndexOutOfBoundsException(
                        "Failed writing " + c + " at index " + outIx);
            }
        }

        // All bytes have been encoded.
        return (int) outIx;
    }