Squiggly/ane/WinDLLHunspellANE/HunspellNativeExtension/EncConv.cpp (180 lines of code) (raw):

//////////////////////////////////////////////////////////////////////////////// // // Licensed to the Apache Software Foundation (ASF) under one or more // contributor license agreements. See the NOTICE file distributed with // this work for additional information regarding copyright ownership. // The ASF licenses this file to You under the Apache License, Version 2.0 // (the "License"); you may not use this file except in compliance with // the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // //////////////////////////////////////////////////////////////////////////////// /* CharConv.cpp Author: Sudhakar Pandey <sudhakar@adobe.com> */ #include <iostream> #include <cstring> #include <stdexcept> #include "EncConv.h" #include "unicode/ucnv.h" //Adding this as an alternative to native iconv library #include "unicode/ustring.h" //Adding this as an alternative to native iconv library #ifndef MAX #define MAX(a,b) (((a)>(b))?(a):(b)) #endif /* MAX */ // ============================================================================= namespace EncConv { // ----------------------------------------------------------------------------- U16Char_t* convSpecialCharsInU16Str(const U16Char_t* src) { const LM_UInt32 srcLen = GetNumOfUnits(src); U16Char_t* pdst = new U16Char_t [srcLen + 1]; LM_UInt32 i; for(i = 0; i < srcLen; ++i) { U16Char_t c = src[i]; switch (c) { case 0x2018: // U+2018: Left Single Quotation Mark case 0x2019: // U+2019: Right Single Quotation Mark c = '\''; break; case 0x201C: // U+201C: Left Double Quotation Mark case 0x201D: // U+201D: Right Double Quotation Mark c = '"'; break; } pdst[i] = c; } pdst[i] = 0; return pdst; //delete[] pdst; //should be deleted by calling releaseU16CharString() function. } const char * getPlatformEncoding(const char* enc) { //If encoding is null or not specified then try the default encoding "ISO-8859-1" if(strlen(enc) == 0) return "ISO-8859-1"; if(strcmp(enc,"ISO8859-1") == 0) return "ISO-8859-1"; else if (strcmp(enc,"ISO8859-2") == 0) return "ISO-8859-2"; else if (strcmp(enc,"ISO8859-3") == 0) return "ISO-8859-3"; else if (strcmp(enc,"ISO8859-4") == 0) return "ISO-8859-4"; else if (strcmp(enc,"ISO8859-5") == 0) return "ISO-8859-5"; else if (strcmp(enc,"ISO8859-6") == 0) return "ISO-8859-6"; else if (strcmp(enc,"ISO8859-7") == 0) return "ISO-8859-7"; else if (strcmp(enc,"ISO8859-8") == 0) return "ISO-8859-8"; else if (strcmp(enc,"ISO8859-9") == 0) return "ISO-8859-9"; else if (strcmp(enc,"ISO8859-10") == 0) return "ISO-8859-10"; else if (strcmp(enc,"KOI8-R") == 0) return "KOI8-R"; else if (strcmp(enc,"KOI8-U") == 0) return "KOI8-U"; else if (strcmp(enc,"microsoft-cp1251") == 0) return "cp1251"; else if (strcmp(enc,"ISO8859-13") == 0) return "ISO-8859-13"; else if (strcmp(enc,"ISO8859-14") == 0) return "ISO-8859-14"; else if (strcmp(enc,"ISO8859-15") == 0) return "ISO-8859-15"; else if (strcmp(enc,"ISCII-DEVANAGARI") == 0) return "ibm-1137"; else if (strcmp(enc,"TIS620-2533") == 0) return "TIS-620"; else if (strcmp(enc,"UTF-8") == 0) return "UTF-8"; else return enc; } const std::string convU16StrToCharStr(const U16Char_t* src, const char* Encoding) { //static char const* const tocode = CHARCONV_ICONV_UTF16; char const* const tocode = getPlatformEncoding(Encoding); UErrorCode status = U_ZERO_ERROR; #ifdef ENCCONV_DEBUG std::cout << "\t" "convString" << std::endl; std::cout << "\t\t" "tocode = " << tocode << std::endl; //std::cout << "\t\t" "fromcode = " << fromcode << std::endl; #endif //iconv_t cd = iconv_open(tocode, fromcode); // Initializing ICU converter UConverter *conv = ucnv_open(tocode, &status); #ifdef CHARCONV_DEBUG std::cout << "\t\t" "aft ucnv_open: status = " << status << std::endl; #endif if (conv == NULL) { // try default encoding "ISO-8859-1" //throw std::runtime_error("Unable to create Unicode converter object"); status = U_ZERO_ERROR; conv = ucnv_open("ISO-8859-1", &status); } //still if conv is null simply return blank string if (conv == NULL) { return std::string(""); } U16Char_t const* srcWrk = src; const size_t srcSizeInUnits = GetNumOfUnits(src); const size_t srcSizeInBytes = srcSizeInUnits * sizeof(U16Char_t); const size_t dstSizeInBytes = MAX(256, (srcSizeInUnits + 1)) * 4; // How much byte buffer is needed? (UTF16 --> MBCS) char* dst = new char [dstSizeInBytes]; if(dst==NULL) return std::string(""); char* dstWrk =(char*)(dst); size_t srcLeftInBytes = srcSizeInBytes; size_t dstLeftInBytes = dstSizeInBytes - sizeof(char); status = U_ZERO_ERROR; ucnv_fromUChars(conv, dstWrk, dstLeftInBytes, (UChar*)srcWrk, -1, &status); U16Char_t* reverseConvertedVal = convCharStrToU16Str(dstWrk,Encoding); if(strcmp((char*)reverseConvertedVal,(char*)src)!=0) { EncConv::releaseU16Str(reverseConvertedVal); delete[] dst; return std::string(""); } EncConv::releaseU16Str(reverseConvertedVal); #ifdef CHARCONV_DEBUG std::cout << "\t\t" "aft iconv: status = " << status << std::endl; #endif if (status != U_ZERO_ERROR ) { // throw std::runtime_error("Unable to convert to string"); *dstWrk = 0; } std::string dst2(dst); delete[] dst; //const int err = iconv_close(cd); ucnv_close(conv); //if (err == -1) // throw std::runtime_error("Unable to deallocate iconv_t object"); return dst2; } U16Char_t* convCharStrToU16Str(const char* src, const char* Encoding) { //static char const* const tocode = CHARCONV_ICONV_UTF16; char const* const fromcode = getPlatformEncoding(Encoding); UErrorCode status = U_ZERO_ERROR; #ifdef ENCCONV_DEBUG std::cout << "\t" "convString" << std::endl; //std::cout << "\t\t" "tocode = " << tocode << std::endl; std::cout << "\t\t" "fromcode = " << fromcode << std::endl; #endif //iconv_t cd = iconv_open(tocode, fromcode); // Initializing ICU converter UConverter *conv = ucnv_open(fromcode, &status); #ifdef CHARCONV_DEBUG std::cout << "\t\t" "aft ucnv_open: status = " << status << std::endl; #endif if (conv == NULL) { // try default encoding "ISO-8859-1" //throw std::runtime_error("Unable to create Unicode converter object"); conv = ucnv_open("ISO-8859-1", &status); } char const* srcWrk = src; const size_t srcSizeInBytes = std::strlen(src); const size_t dstSizeInBytes = MAX(256, (srcSizeInBytes + 1)) * sizeof(U16Char_t); U16Char_t* dst = new U16Char_t [dstSizeInBytes / sizeof(U16Char_t)]; U16Char_t* dstWrk = dst; size_t srcLeftInBytes = srcSizeInBytes; size_t dstLeftInBytes = dstSizeInBytes - sizeof(U16Char_t); status = U_ZERO_ERROR; //still if conv is null simply return blank string if (conv == NULL) { dst[0] = NULL; return dst; } ucnv_toUChars(conv, (UChar *) dstWrk, dstLeftInBytes, (char*)srcWrk, srcLeftInBytes, &status); #ifdef CHARCONV_DEBUG std::cout << "\t\t" "aft iconv: status = " << status << std::endl; #endif if (status != U_ZERO_ERROR ) { // throw std::runtime_error("Unable to convert to string"); *dstWrk = 0; } //const int err = iconv_close(cd); ucnv_close(conv); //if (err == -1) // throw std::runtime_error("Unable to deallocate iconv_t object"); return dst; } void releaseU16Str(const U16Char_t* buf) { if(buf != NULL) { delete[] buf; buf = NULL; } return; } }// namespace // -----------------------------------------------------------------------------