lib/twitter_cldr/collation/implicit_collation_elements.rb (89 lines of code) (raw):

# encoding: UTF-8 # Copyright 2012 Twitter, Inc # http://www.apache.org/licenses/LICENSE-2.0 module TwitterCldr module Collation # ImplicitCollationElements generates implicit collation elements for code points (including some CJK characters), # that are not explicitly mentioned in the collation elements table. # # This module was ported from the ICU4J library (ImplicitCEGenerator class). See NOTICE file for license information. # module ImplicitCollationElements DEFAULT_SECONDARY_AND_TERTIARY = 5 class << self def for_code_point(code_point) [[primary_weight(swapCJK(code_point) + 1), DEFAULT_SECONDARY_AND_TERTIARY, DEFAULT_SECONDARY_AND_TERTIARY]] end private # Generates the primary weight of the implicit CE for a given code point. # def primary_weight(code_point) byte0 = code_point - MIN_4_BOUNDARY if byte0 < 0 byte1 = code_point / FINAL_3_COUNT byte0 = code_point % FINAL_3_COUNT byte2 = byte1 / MEDIAL_COUNT byte1 %= MEDIAL_COUNT # spread out, leaving gap at start byte0 = MIN_TRAIL + byte0 * FINAL_3_MULTIPLIER # offset byte1 += MIN_TRAIL byte2 += MIN_PRIMARY (byte2 << 16) + (byte1 << 8) + byte0 else byte1 = byte0 / FINAL_4_COUNT byte0 %= FINAL_4_COUNT byte2 = byte1 / MEDIAL_COUNT byte1 %= MEDIAL_COUNT byte3 = byte2 / MEDIAL_COUNT byte2 %= MEDIAL_COUNT # spread out, leaving gap at start byte0 = MIN_TRAIL + byte0 * FINAL_4_MULTIPLIER # offset byte1 += MIN_TRAIL byte2 += MIN_TRAIL byte3 += MIN_4_PRIMARY (byte3 << 24) + (byte2 << 16) + (byte1 << 8) + byte0 end end # Method used to: # a) collapse two different Han ranges from UCA into one (in the right order) # b) bump any non-CJK characters by NON_CJK_OFFSET. # # The relevant blocks are: # A: 4E00..9FFF; CJK Unified Ideographs # F900..FAFF; CJK Compatibility Ideographs # B: 3400..4DBF; CJK Unified Ideographs Extension A # 20000..XX; CJK Unified Ideographs Extension B (and others later on) # # As long as # no new B characters are allocated between 4E00 and FAFF, and # no new A characters are outside of this range, # (very high probability) this simple code will work. # # The reordered blocks are: # Block1 is CJK # Block2 is CJK_COMPAT_USED # Block3 is CJK_A # (all contiguous) # # Any other CJK gets its normal code point. # # When we reorder Block1, we make sure that it is at the very start, so that it will use a 3-byte form. # def swapCJK(code_point) if code_point >= CJK_BASE return code_point - CJK_BASE if code_point < CJK_LIMIT return code_point + NON_CJK_OFFSET if code_point < CJK_COMPAT_USED_BASE return code_point - CJK_COMPAT_USED_BASE + (CJK_LIMIT - CJK_BASE) if code_point < CJK_COMPAT_USED_LIMIT return code_point + NON_CJK_OFFSET if code_point < CJK_B_BASE return code_point if code_point < CJK_B_LIMIT # non-BMP-CJK return code_point + NON_CJK_OFFSET if code_point < CJK_C_BASE return code_point if code_point < CJK_C_LIMIT # non-BMP-CJK return code_point + NON_CJK_OFFSET if code_point < CJK_D_BASE return code_point if code_point < CJK_D_LIMIT # non-BMP-CJK return code_point + NON_CJK_OFFSET # non-CJK end return code_point + NON_CJK_OFFSET if code_point < CJK_A_BASE return code_point - CJK_A_BASE + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE) if code_point < CJK_A_LIMIT code_point + NON_CJK_OFFSET # non-CJK end end # primary value MIN_PRIMARY = 0xE0 MAX_PRIMARY = 0xe4 # final byte MIN_TRAIL = 0x04 MAX_TRAIL = 0xFE # gap for tailoring of 3-byte forms GAP_3 = 1 # number of 3-byte primaries that can be used PRIMARIES_3_COUNT = 1 # 2 * [Unicode range] + 2 MAX_INPUT = 0x220001 # medials can use full range MEDIAL_COUNT = MAX_TRAIL - MIN_TRAIL + 1 # number of values we can use in trailing bytes # leave room for empty values between AND above, e.g., if gap = 2 # range 3..7 => +3 -4 -5 -6 -7: so 1 value # range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values # range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values FINAL_3_MULTIPLIER = GAP_3 + 1 FINAL_3_COUNT = MEDIAL_COUNT / FINAL_3_MULTIPLIER # find out how many values fit in each form THREE_BYTE_COUNT = MEDIAL_COUNT * FINAL_3_COUNT # now determine where the 3/4 boundary is # we use 3 bytes below the boundary, and 4 above PRIMARIES_AVAILABLE = MAX_PRIMARY - MIN_PRIMARY + 1 PRIMARIES_4_COUNT = PRIMARIES_AVAILABLE - PRIMARIES_3_COUNT MIN_4_PRIMARY = MIN_PRIMARY + PRIMARIES_3_COUNT MIN_4_BOUNDARY = PRIMARIES_3_COUNT * THREE_BYTE_COUNT TOTAL_NEEDED = MAX_INPUT - MIN_4_BOUNDARY NEEDED_PER_PRIMARY_BYTE = (TOTAL_NEEDED - 1) / PRIMARIES_4_COUNT + 1 NEEDED_PER_FINAL_BYTE = (NEEDED_PER_PRIMARY_BYTE - 1) / (MEDIAL_COUNT * MEDIAL_COUNT) + 1 GAP_4 = (MAX_TRAIL - MIN_TRAIL - 1) / NEEDED_PER_FINAL_BYTE FINAL_4_MULTIPLIER = GAP_4 + 1 FINAL_4_COUNT = NEEDED_PER_FINAL_BYTE # CJK constants NON_CJK_OFFSET = 0x110000 CJK_COMPAT_USED_BASE = 0xFA0E CJK_COMPAT_USED_LIMIT = 0xFA2F + 1 CJK_BASE = 0x4E00 # 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; CJK_LIMIT = 0x9FCC + 1 # 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; CJK_A_BASE = 0x3400 # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; CJK_A_LIMIT = 0x4DB5 + 1 # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; CJK_B_BASE = 0x20000 # 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;; CJK_B_LIMIT = 0x2A6D6 + 1 # 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;; CJK_C_BASE = 0x2A700 # 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;; CJK_C_LIMIT = 0x2B734 + 1 # 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;; CJK_D_BASE = 0x2B740 # 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;; CJK_D_LIMIT = 0x2B81D + 1 # 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;; end end end