int dump_zh_hans()

in mysql_strings/uca9-dump.cc [563:724]


/**
  Build the reordered weights for the Chinese Han characters listed in the
  language data read from infile and dump them as C++ source to outfile.

  The function
    1. reads the UTF-8 encoded list of Han characters from infile,
    2. gives every listed character a single collation element whose primary
       weight is ZH_CORE_HAN_BASE_WT plus its position in the list,
    3. fills in the remaining characters of every touched page, either from
       the DUCET page (uca900_weight) or from computed implicit weights,
       shifting them according to the reorder table below,
    4. prints the touched pages, the page pointer table and the
       "code point -> single weight" pairs to outfile.

  @param[in,out] uca         Weight items, indexed by code point; updated in
                             place for every character of a touched page.
  @param[in,out] pageloaded  Per-page counters; incremented for each page that
                             contains a listed Han character.
  @param[in]     infile      Source of the UTF-8 encoded Han character list.
  @param[out]    outfile     Receives the generated C++ source.

  @return 1 if read_in_lang_data() reports a failure, 0 otherwise.
*/
int dump_zh_hans(MY_UCA *uca, int *pageloaded, FILE *infile, FILE *outfile) {
  /*
    zh.xml of cldr v33 defines 41336 Chinese Han characters. This xml file is
    encoded in utf8. Most of the Han characters are encoded in 3 bytes, and some
    are encoded in 4 bytes.
   */
  constexpr int ZH_HAN_CNT = 41336;
  unsigned char zh_bytes[ZH_HAN_CNT * 4]{0};

  if (read_in_lang_data((char *)zh_bytes, sizeof(zh_bytes), infile)) return 1;
  /*
    Since the rule [reorder Hani], Chinese Han character's weight should be
    smaller than any other non-ignorable characters (except of the core
    characters like spaces, symbols).

    To make the reordering, we decide to change the weight of all characters
    as:
    Char Group   | Origin Weight Range         | Reordered Weight Range
    -------------|-----------------------------|----------------------------
    core chars   | 0200 - 1C46                 | 0200 - 1C46
    Han in zh.xml| [FB40, AAAA] - [FB85, BBBB] | 1C47 - BDBE
    Other Han    | [FB40, CCCC] - [FB85, DDDD] | [BDBF, CCCC] - [BDC3, DDDD]
    Latin, etc   | 1C47 - 54A3                 | BDC4 - F620
    Others       | [FBC0, XXXX] - [FBE1, YYYY] | [F621, XXXX] - [F642, YYYY]

    This function changes only the weight of the Han characters defined in
    zh.xml and other characters in the same pages these Han characters reside.
   */
  constexpr int ZH_CORE_HAN_BASE_WT = 0x1C47;

  std::map<int, int> zh_han_to_single_weight_map;
  unsigned char *zh_ch = zh_bytes;
  // Locate the end of the data once. The scan is bounded by the buffer size,
  // so a completely filled buffer (no terminating NUL) is handled as well.
  void *nul_pos = memchr(zh_bytes, 0, sizeof(zh_bytes));
  unsigned char *const zh_end = nul_pos ? static_cast<unsigned char *>(nul_pos)
                                        : zh_bytes + sizeof(zh_bytes);
  int min_page = 0x1100;  // the max code point utf8mb4 supports is 0x10FFFF.
  int max_page = 0;
  for (int i = 0; i < ZH_HAN_CNT; i++) {
    my_wc_t ch = 0;
    // The end of input is fixed; it must not move along with zh_ch, otherwise
    // decoding would continue into the zero padding behind the real data.
    int bytes = my_mb_wc_utf8mb4(&ch, zh_ch, zh_end);
    if (bytes <= 0) break;
    zh_ch += bytes;
    int page = ch >> 8;
    // A listed Han character gets exactly one CE: its position in the list
    // decides the primary weight.
    uca->item[ch].num_of_ce = 1;
    uca->item[ch].weight[0] = ZH_CORE_HAN_BASE_WT + i;
    uca->item[ch].weight[1] = 0x20;
    uca->item[ch].weight[2] = 0x02;
    pageloaded[page]++;
    min_page = std::min(min_page, page);
    max_page = std::max(max_page, page);
    // NOTE(review): the computed implicit weights are not used afterwards;
    // the call is kept as is because the helper is not visible here.
    MY_UCA_ITEM tmp_item;
    set_implicit_weights(&tmp_item, ch);
    zh_han_to_single_weight_map[ch] = ZH_CORE_HAN_BASE_WT + i;
  }

  // Chinese Han characters defined in zh.xml are all in pages 0x2E ~ 0x9F and
  // pages 0x200 ~ 0x2B8.
  for (int page = min_page; page <= max_page; page++) {
    if (pageloaded[page]) {
      // There is same page in DUCET.
      if (uca900_weight[page]) {
        for (int off = 0; off < MY_UCA_CHARS_PER_PAGE; off++) {
          int ch_off = (page << 8) + off;
          // Copy other characters' weight from DUCET.
          if (uca->item[ch_off].num_of_ce == 0) {
            uca->item[ch_off].num_of_ce =
                UCA900_NUM_OF_CE(uca900_weight[page], off);
            for (int level = 0; level < 3; level++) {
              uint16 *weight =
                  UCA900_WEIGHT_ADDR(uca900_weight[page], level, off);
              uint16 *dst = uca->item[ch_off].weight + level;
              // 'weight' and 'dst' advance by exactly one CE for every
              // increment of 'ce', including the extra increments done inside
              // the implicit-weight branches below.
              for (int ce = 0; ce < uca->item[ch_off].num_of_ce; ce++) {
                if (*weight >= 0x1C47 && *weight <= 0x54A3) {
                  // "Latin, etc" range: shift behind the Han block.
                  *dst = *weight + 0xBDC4 - 0x1C47;
                } else if (*weight >= 0xFB00) {  // implicit weight
                  // An implicit weight spans two consecutive CEs.
                  uint16 next_implicit =
                      *(weight + UCA900_DISTANCE_BETWEEN_WEIGHTS);
                  my_wc_t ch = convert_implicit_to_ch(*weight, next_implicit);
                  auto found = zh_han_to_single_weight_map.find(ch);
                  if (found != zh_han_to_single_weight_map.end()) {
                    // Han character listed in zh.xml: the pair is replaced by
                    // its single weight; the slot of the second CE is not
                    // written.
                    *dst = found->second;
                    dst += 3;
                    weight += UCA900_DISTANCE_BETWEEN_WEIGHTS;
                    ce++;
                  } else {
                    // Any other implicit weight: remap the first CE and copy
                    // the second CE unchanged. The cursor is moved past the
                    // second CE by the common advance below.
                    *dst = change_zh_implicit(*weight);
                    dst += 3;
                    weight += UCA900_DISTANCE_BETWEEN_WEIGHTS;
                    ce++;
                    *dst = *weight;
                  }
                } else {
                  *dst = *weight;
                }
                dst += 3;
                weight += UCA900_DISTANCE_BETWEEN_WEIGHTS;
              }
            }
          }
        }
      } else {
        for (int off = 0; off < MY_UCA_CHARS_PER_PAGE; off++) {
          int ch = (page << 8) + off;
          if (uca->item[ch].num_of_ce == 0) {
            // calculate its implicit weight.
            set_implicit_weights(&uca->item[ch], ch);
            // Only the first primary weight needs to be changed in place.
            uca->item[ch].weight[0] =
                change_zh_implicit(uca->item[ch].weight[0]);
          }
        }
      }
    }
  }

  fprintf(outfile, "#include \"my_inttypes.h\"\n\n");
  fprintf(outfile, "namespace myodbc {\n\n");
  fprintf(outfile, "extern const int MIN_ZH_HAN_PAGE = 0x%X;\n", min_page);
  fprintf(outfile, "extern const int MAX_ZH_HAN_PAGE = 0x%X;\n\n", max_page);
  // One weight array per touched page.
  for (int page = min_page; page <= max_page; page++) {
    if (pageloaded[page]) {
      int maxnum = 0;
      get_page_statistics(uca, page, &maxnum);

      maxnum = maxnum * MY_UCA_CE_SIZE + 1;
      print_one_page(uca, page, "zh_han_p", maxnum, outfile);
    }
  }

  // Page pointer table, five entries per output line; untouched pages are
  // emitted as NULL.
  fprintf(outfile, "uint16* zh_han_pages[%d] = {\n", max_page - min_page + 1);
  for (int page = min_page; page <= max_page; page++) {
    if (!((page - min_page) % 5)) {
      if (pageloaded[page]) {
        fprintf(outfile, "%10s%03X", "zh_han_p", page);
      } else {
        fprintf(outfile, "%13s", "NULL");
      }
    } else {
      if (pageloaded[page]) {
        fprintf(outfile, "%9s%03X", "zh_han_p", page);
      } else {
        fprintf(outfile, "%12s", "NULL");
      }
    }
    // NOTE(review): the separator logic compares against MY_UCA_NPAGES rather
    // than the number of entries of this table, so the last entry is followed
    // by a comma as well unless both happen to be equal. Left unchanged to
    // keep the generated output stable.
    if ((page - min_page + 1) != MY_UCA_NPAGES) fprintf(outfile, ",");
    if (!((page - min_page + 1) % 5) || (page - min_page + 1) == MY_UCA_NPAGES)
      fprintf(outfile, "\n");
  }
  fprintf(outfile, "\n};\n\n");

  // Flat list of (code point, single weight) pairs, ordered by code point.
  fprintf(outfile, "int zh_han_to_single_weight[] = {\n");
  for (const auto &han_weight : zh_han_to_single_weight_map) {
    fprintf(outfile, "  0x%05X, 0x%04X,\n", han_weight.first,
            han_weight.second);
  }
  fprintf(outfile, "\n};\n\n");
  fprintf(outfile, "extern const int ZH_HAN_WEIGHT_PAIRS = %lu;\n",
          static_cast<unsigned long>(zh_han_to_single_weight_map.size()));
  fprintf(outfile, "} /* namespace myodbc */\n");

  return 0;
}