src/cas/xcasdeserializer_handler.cpp (731 lines of code) (raw):

/** @name xcasdeserializer_handler.cpp ----------------------------------------------------------------------------- * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. ----------------------------------------------------------------------------- 10/18/2005 Initial creation -------------------------------------------------------------------------- */ //TODO support multiple indexed FS // --------------------------------------------------------------------------- // Includes // --------------------------------------------------------------------------- #include "uima/pragmas.hpp" #include <iostream> #include <sstream> #include <algorithm> using namespace std; #include "xercesc/sax2/Attributes.hpp" #include "xercesc/sax/SAXParseException.hpp" #include "xercesc/sax/SAXException.hpp" #include "uima/msg.h" #include "uima/exceptions.hpp" #include "uima/lowlevel_typesystem.hpp" #include "uima/lowlevel_indexrepository.hpp" #include "uima/xcasdeserializer_handler.hpp" #include "uima/internal_fspromoter.hpp" #include "uima/internal_typeshortcuts.hpp" #include "uima/internal_casimpl.hpp" #include "uima/fsindexrepository.hpp" #include "uima/arrayfs.hpp" #include "uima/annotator_context.hpp" #include "uima/resmgr.hpp" #define DEBUG 0 namespace uima { // --------------------------------------------------------------------------- // XCASDeserialiserHandler: Constructors and Destructor // --------------------------------------------------------------------------- XCASDeserializerHandler::XCASDeserializerHandler(CAS & cas, AnnotatorContext * const ctx) : iv_cas(cas.getBaseCas() ), iv_locator(NULL), iv_ctx(ctx), iv_casimpl( uima::internal::CASImpl::promoteCAS(*iv_cas) // ,iv_typesystem(iv_casimpl.getHeap().getTypeSystem()) ) { if (DEBUG) std::cerr << " XCASDeserializerHandler::constructor " << std::endl; currentContentFeat.append(DEFAULT_CONTENT_FEATURE); sofaTypeCode = uima::internal::gs_tySofaType; FSIndexRepository * fsidx = &iv_cas->getBaseIndexRepository(); indexRepositories.push_back((lowlevel::IndexRepository*)fsidx); // There should always be another index for the Initial View fsidx = &iv_cas->getView(CAS::NAME_DEFAULT_SOFA)->getIndexRepository(); indexRepositories.push_back((lowlevel::IndexRepository*)fsidx); // get temp heap handle for checking if an FS is an annotation lowlevel::FSHeap const & crHeap = iv_casimpl.getHeap(); // uima::lowlevel::FSHeap::TyFSHeap const & tyTempHeap = crHeap.iv_clTemporaryHeap; iv_typesystem = &crHeap.getTypeSystem(); // add entry for baseCAS ... point non-compliant annotations at first Sofa sofaRefMap.push_back(1); // add entry for baseCAS ... _indexed=0 stays in 0 indexMap.push_back(0); } XCASDeserializerHandler::~XCASDeserializerHandler() { if (DEBUG) std::cout << " XCASDeserializerHandler::destructor " << std::endl; for (size_t i = 0; i < fsTree.size(); i++) { FSInfo * fsinfo = (FSInfo*) fsTree[i]; if (fsinfo != 0) { delete fsinfo->indexRep; delete fsinfo; } } for (size_t i = 0; i < idLess.size(); i++) { FSInfo * fsinfo = (FSInfo*) idLess[i]; if (fsinfo != 0) { delete fsinfo->indexRep; delete fsinfo; } } // free some storage fsTree.clear(); sofaRefMap.clear(); indexMap.clear(); } // --------------------------------------------------------------------------- // XCASDeserializerHandler: Implementation of the SAX2 ContentHandler interface // --------------------------------------------------------------------------- void XCASDeserializerHandler::setDocumentLocator(const Locator* const locator) { if (DEBUG) std::cerr << " XCASDeserializerHandler::setDocumentLocator() " << std::endl; iv_locator = locator; } void XCASDeserializerHandler::startDocument() { if (DEBUG) cout << " XCASDeserializerHandler::startDocument() " << endl; iv_state = DOC_STATE; } void XCASDeserializerHandler::startElement(const XMLCh* const uri, const XMLCh* const localname, const XMLCh* const qname, const Attributes & attrs) { if (DEBUG) std::cerr << " XCASDeserializerHandler::startElement() " << icu::UnicodeString((UChar*)qname, XMLString::stringLen(qname)) << endl; assert(sizeof(XMLCh) == sizeof(UChar)); icu::UnicodeString qualifiedName( (UChar const *) qname, XMLString::stringLen(qname)); buffer.remove(); switch (iv_state) { case DOC_STATE: { if (qualifiedName.compare(CASTAGNAME) != 0) { ErrorInfo errInfo; errInfo.setErrorId((TyErrorId)UIMA_ERR_RESOURCE_CORRUPTED); ErrorMessage msg(UIMA_MSG_ID_EXC_XML_SAXPARSE_FATALERROR); assertWithMsg(sizeof(XMLCh) == sizeof(UChar), "Port required"); msg.addParam( qualifiedName ); errInfo.setMessage(msg); errInfo.setSeverity(ErrorInfo::unrecoverable); ExcIllFormedInputError exc(errInfo); throw exc; } iv_state = FS_STATE; break; } case FS_STATE: { currentContentFeat = DEFAULT_CONTENT_FEATURE; if (qualifiedName.compare(DEFAULT_DOC_TYPE_NAME) == 0) { iv_state = DOC_TEXT_STATE; } else { readFS(qualifiedName, attrs); } break; } case ARRAY_ELE_STATE: { readArrayElement(qualifiedName, attrs); break; } default: { // If we're not in an element expecting state, raise an error. ErrorInfo errInfo; errInfo.setErrorId((TyErrorId)UIMA_ERR_RESOURCE_CORRUPTED); ErrorMessage msg(UIMA_MSG_ID_EXC_XML_SAXPARSE_FATALERROR); assertWithMsg(sizeof(XMLCh) == sizeof(UChar), "Port required"); msg.addParam( qualifiedName ); errInfo.setMessage(msg); errInfo.setSeverity(ErrorInfo::unrecoverable); ExcIllFormedInputError exc(errInfo); throw exc; } } } void XCASDeserializerHandler::characters( const XMLCh* const cpwsz, const XMLSize_t uiLength) { if (DEBUG) std::cerr << "XCASDeserializerHandler::characters: \"" << icu::UnicodeString(cpwsz, uiLength) << "\"" << endl; assert(sizeof(XMLCh) == sizeof(UChar)); switch (this->iv_state) { case DOC_TEXT_STATE: case CONTENT_STATE: case ARRAY_ELE_CONTENT_STATE: case FEAT_CONTENT_STATE: buffer.append( (UChar const *) cpwsz, 0, uiLength ); break; default: break; } } void XCASDeserializerHandler::endElement(const XMLCh* const uri, const XMLCh* const localname, const XMLCh* const qname) { if (DEBUG) std::cerr << " XCASDeserializerHandler::endElement() " << icu::UnicodeString( (UChar*) qname, XMLString::stringLen(qname) ) << " -- buffer is >" << buffer << "<" << endl; icu::UnicodeString qualifiedName( (UChar const *) qname, XMLString::stringLen(qname)); assert(sizeof(XMLCh) == sizeof(UChar)); switch (iv_state) { case DOC_STATE: { // Do nothing. break; } case FS_STATE: { iv_state = DOC_STATE; break; } case FEAT_STATE: { iv_state = FS_STATE; break; } case CONTENT_STATE: { // Set the value of the content feature. //if (!isAllWhitespace(buffer)) //{ handleFeature(currentAddr, currentContentFeat, buffer, true); //} } iv_state = FS_STATE; break; } case FEAT_CONTENT_STATE: { // Create a feature value from an element. handleFeature(currentAddr, qualifiedName, buffer, false); iv_state = FEAT_STATE; break; } case ARRAY_ELE_CONTENT_STATE: { // Create an array value. addArrayElement(buffer); iv_state = ARRAY_ELE_STATE; break; } case ARRAY_ELE_STATE: { iv_state = FS_STATE; break; } case DOC_TEXT_STATE: { // Assume old style TCAS with one text Sofa SofaFS newSofa = iv_cas->createInitialSofa(icu::UnicodeString("text")); CAS* cas = iv_cas->getInitialView(); cas->registerView(newSofa); // Set the document text without creating a documentAnnotation cas->setDocTextFromDeserializtion(UnicodeStringRef(buffer.getBuffer(), buffer.length())); // and assume the new Sofa is at location 1! int addr = 1; int id = 1; sofaRefMap.push_back(id); // and register the id for this Sofa FSInfo * fsInfo = new FSInfo(addr, new vector<int>); // FSInfo * fsInfo = new FSInfo(addr, -1); //??? Should be 0 or -1 ??? fsTree[id] = fsInfo; iv_state = FS_STATE; break; } } } void XCASDeserializerHandler::endDocument() { //cout << " XCASDeserializerHandler::endDocument() " << endl; //update features that are FSs for (size_t i = 0; i < fsTree.size(); i++) { FSInfo * fsinfo = (FSInfo*) fsTree[i]; if (fsinfo != 0) finalizeFS(*fsinfo); } //update features that are FSs for (size_t i = 0; i < idLess.size(); i++) { FSInfo * fsinfo = (FSInfo*) idLess[i]; if (fsinfo != 0) finalizeFS(*fsinfo); } //update document annotation info in tcas for (size_t i = 0; i < tcasInstances.size(); i++) { CAS * tcas = (CAS *) tcasInstances[i]; if (tcas != 0) { tcas->pickupDocumentAnnotation(); } } } void XCASDeserializerHandler::ignorableWhitespace(const XMLCh* const cpwsz, const unsigned int length) { cout << " XCASDeserializerHandler::ignorableWhitespace() " << endl; } // Create a new FS. void XCASDeserializerHandler::readFS(icu::UnicodeString & qualifiedName, const Attributes & attrs) { icu::UnicodeString typeName(qualifiedName); Type type = iv_cas->getTypeSystem().getType(typeName); uima::lowlevel::TyFSType typecode = uima::internal::FSPromoter::demoteType(type); if (!type.isValid() ) { cout << "INFO: invalid type " << typeName << endl; iv_state = CONTENT_STATE; } else { if (iv_cas->getTypeSystem().isArrayType(typecode)) { readArray(type, attrs); return; } uima::lowlevel::TyFS addr = uima::internal::FSPromoter::demoteFS(iv_cas->createFS(type)); readFS(addr, attrs, true); } } void XCASDeserializerHandler::readFS(lowlevel::TyFS addr, const Attributes & attrs, bool toIndex) { // Hang on address for setting content feature currentAddr = addr; int id = -1; // int sofaRef = -1; // 0 ==> baseCas indexRepository vector<int>* sofaRef = new vector<int>; icu::UnicodeString attrName; icu::UnicodeString attrValue; bool nameMapping = false; UChar ubuff[256]; UErrorCode errorCode = U_ZERO_ERROR; lowlevel::TyFS heapValue = iv_casimpl.getHeap().getType(addr); // Special handling for Sofas if (sofaTypeCode == heapValue) { // create some maps to handle v1 format XCAS ... // ... where the sofa feature of annotations was an int not a ref // determine if this is the one and only initial view Sofa bool isInitialView = false; int extsz = icu::UnicodeString(CAS::FEATURE_BASE_NAME_SOFAID).extract(ubuff, 256, errorCode); if (extsz > 256) { cout << "ACK!" << endl; } const UChar* sofaID = attrs.getValue(ubuff); if (0==UnicodeStringRef(sofaID).compare(icu::UnicodeString("_DefaultTextSofaName"))) { sofaID = ubuff; } // no Sofa mapping for now // if (iv_ctx != NULL) { // // Map incoming SofaIDs // sofaID = iv_ctx->mapToSofaID(sofaID).getSofaId(); // } if (0==UnicodeStringRef(sofaID).compare(icu::UnicodeString(CAS::NAME_DEFAULT_SOFA))) { isInitialView = true; } // get the sofaNum extsz = icu::UnicodeString(CAS::FEATURE_BASE_NAME_SOFANUM).extract(ubuff, 256, errorCode); if (extsz > 256) { cout << "ACK!" << endl; } const UChar* aString = attrs.getValue(ubuff); int thisSofaNum = atoi(UnicodeStringRef(aString).asUTF8().c_str()); // get the sofa's FeatureStructure id icu::UnicodeString(ID_ATTR_NAME).extract(ubuff,256, errorCode); aString = attrs.getValue(ubuff); int sofaFsId = atoi(UnicodeStringRef(aString).asUTF8().c_str()); // for v1 and v2 formats, create the index map // ***we assume Sofas are always received in Sofanum order*** // Two scenarios ... the initial view is the first sofa, or not. // If not, the _indexed values need to be remapped to leave room for the initial view, // which may or may not be in the received CAS. if (indexMap.size() == 1) { if (isInitialView) { // the first Sofa an initial view if (thisSofaNum == 2) { // this sofa was mapped to the initial view indexMap.push_back(-1); // for this CAS, there should not be a sofanum = 1 indexMap.push_back(1); // map 2 to 1 nextIndex = 2; } else { indexMap.push_back(1); nextIndex = 2; } } else { if (thisSofaNum > 1) { // the first Sofa not initial, but sofaNum > 1 // must be a v2 format, and sofaNum better be 2 indexMap.push_back(1); assert (thisSofaNum == 2); indexMap.push_back(2); nextIndex = 3; } else { // must be v1 format indexMap.push_back(2); nextIndex = 3; } } } else { // if the new Sofa is the initial view, always map to 1 if (isInitialView) { // the initial view is not the first // if v2 format, space already reserved in mapping if (indexMap.size() == thisSofaNum) { // v1 format, add mapping for initial view indexMap.push_back(1); } } else { indexMap.push_back(nextIndex); nextIndex++; } } // Now update the mapping from annotation int to ref values if (sofaRefMap.size() == thisSofaNum) { // Sofa received in sofaNum order, add new one sofaRefMap.push_back(sofaFsId); } else if ((int)sofaRefMap.size() > thisSofaNum) { // new Sofa has lower sofaNum than last one sofaRefMap[thisSofaNum] = sofaFsId; } else { // new Sofa has skipped ahead more than 1 sofaRefMap.resize(thisSofaNum + 1); sofaRefMap[thisSofaNum] = sofaFsId; } } Type type = uima::internal::FSPromoter::promoteType(heapValue, iv_cas->getTypeSystem().getLowlevelTypeSystem()); for (size_t i = 0; i < attrs.getLength(); i++) { assertWithMsg( sizeof(XMLCh) == sizeof(UChar), "Port required!"); attrName = (UChar*)attrs.getQName(i); attrValue = (UChar*)attrs.getValue(i); if (attrName.startsWith("_")) { if (attrName.compare(ID_ATTR_NAME) == 0) { id = atoi(UnicodeStringRef(attrValue).asUTF8().c_str()); } else if (attrName.compare(CONTENT_ATTR_NAME) == 0) { currentContentFeat = attrValue; } else if (attrName.compare(INDEXED_ATTR_NAME)== 0) { // if (toIndex) // { // suppress indexing of document annotation if old CAS // if (attrValue.compare(TRUE_VALUE) == 0) // sofaRef = 1; // else if (!attrValue.compare("false") == 0) // sofaRef = atoi(uniStr2SingleByteStr(attrValue,"UTF-8").c_str()); // } char indexes[256]; // we have a problem here if number of indexed views is ridiculously big strcpy(indexes, UnicodeStringRef(attrValue).asUTF8().c_str()); char* ptr = strtok (indexes," "); while (ptr != NULL) { sofaRef->push_back(atoi(ptr)); ptr = strtok (NULL, " "); } } else { handleFeature(type, addr, attrName, attrValue, false); } } else { if (nameMapping && attrName.compare(CAS::FEATURE_BASE_NAME_SOFAID) == 0) { if (iv_ctx != NULL) { attrValue = iv_ctx->mapToSofaID(attrValue).getSofaId(); } } handleFeature(type, addr, attrName, attrValue, false); } } if (sofaTypeCode == heapValue) { // If a Sofa, create CAS view to get new indexRepository SofaFS sofa = (SofaFS) uima::internal::FSPromoter::promoteFS(addr, *iv_cas); //also add to indexes so we can retrieve the Sofa later iv_cas->getBaseIndexRepository().addFS(sofa); CAS * tcas = iv_cas->getView(sofa); assert ( EXISTS(tcas) ); if (sofa.getSofaRef() == 1) { iv_cas->registerInitialSofa(); } else { // add indexRepo for views other than the initial view lowlevel::IndexRepository * indexRep = iv_cas->getIndexRepositoryForSofa(sofa); assert ( EXISTS(indexRep) ); indexRepositories.push_back(indexRep); } tcasInstances.push_back(tcas); } // sofaRef.size()==0 means not indexed FSInfo * fsInfo = new FSInfo(addr, sofaRef); if (id < 0) { idLess.push_back(fsInfo); } else { fsTree[id] = fsInfo; } iv_state = CONTENT_STATE; } void XCASDeserializerHandler::readArray(Type & type, const Attributes & attrs) { vector<int>* indexRep = new vector<int>; int id = -1; int size=0; icu::UnicodeString attrName; icu::UnicodeString attrValue; for (size_t i = 0; i < attrs.getLength(); i++) { assertWithMsg( sizeof(XMLCh) == sizeof(UChar), "Port required!"); attrName = (UChar*)attrs.getQName(i); attrValue = (UChar*)attrs.getValue(i); if (attrName.compare(ID_ATTR_NAME) == 0) { id = atoi(UnicodeStringRef(attrValue).asUTF8().c_str()); } else if (attrName.compare(ARRAY_SIZE_ATTR) == 0) { size = atoi(UnicodeStringRef(attrValue).asUTF8().c_str()); } else if (attrName.compare(INDEXED_ATTR_NAME)== 0) { // // suppress indexing of document annotation if old CAS // if (attrValue.compare(TRUE_VALUE) == 0) // indexRep = 1; // else if (!attrValue.compare("false") == 0) // indexRep = atoi(uniStr2SingleByteStr(attrValue,"UTF-8").c_str()); char indexes[256]; // we have a problem here if number of indexed views is ridiculously big strcpy(indexes, UnicodeStringRef(attrValue).asUTF8().c_str()); char* ptr = strtok (indexes," "); while (ptr != NULL) { indexRep->push_back(atoi(ptr)); ptr = strtok (NULL, " "); } } else { ErrorInfo errInfo; errInfo.setErrorId((TyErrorId)UIMA_ERR_RESOURCE_CORRUPTED); ErrorMessage msg(UIMA_MSG_ID_EXC_XML_SAXPARSE_FATALERROR); assertWithMsg(sizeof(XMLCh) == sizeof(UChar), "Port required"); msg.addParam( attrName ); msg.addParam( attrValue ); errInfo.setMessage(msg); errInfo.setSeverity(ErrorInfo::unrecoverable); ExcIllFormedInputError exc(errInfo); throw exc; } } arrayType = uima::internal::FSPromoter::demoteType(type); currentAddr = iv_casimpl.getHeap().createArrayFS(arrayType, size); arrayPos=0; // indexRep.size()==0 means not indexed FSInfo * fsInfo = new FSInfo(currentAddr, indexRep); if (id < 0) { idLess.push_back(fsInfo); } else { fsTree[id] = fsInfo; } iv_state = ARRAY_ELE_STATE; } void XCASDeserializerHandler::readArrayElement(icu::UnicodeString & qualifiedName, const Attributes & attrs) { if (qualifiedName.compare(ARRAY_ELEMENT_TAG) != 0) { ErrorInfo errInfo; errInfo.setErrorId((TyErrorId)UIMA_ERR_RESOURCE_CORRUPTED); ErrorMessage msg(UIMA_MSG_ID_EXC_XML_SAXPARSE_FATALERROR); assertWithMsg(sizeof(XMLCh) == sizeof(UChar), "Port required"); msg.addParam( qualifiedName ); errInfo.setMessage(msg); errInfo.setSeverity(ErrorInfo::unrecoverable); ExcIllFormedInputError exc(errInfo); throw exc; } if (attrs.getLength() > 0) { ErrorInfo errInfo; errInfo.setErrorId((TyErrorId)UIMA_ERR_RESOURCE_CORRUPTED); ErrorMessage msg(UIMA_MSG_ID_EXC_XML_SAXPARSE_FATALERROR); assertWithMsg(sizeof(XMLCh) == sizeof(UChar), "Port required"); msg.addParam( qualifiedName ); errInfo.setMessage(msg); errInfo.setSeverity(ErrorInfo::unrecoverable); ExcIllFormedInputError exc(errInfo); throw exc; } iv_state = ARRAY_ELE_CONTENT_STATE; } void XCASDeserializerHandler::addArrayElement(icu::UnicodeString & buffer) { if (arrayPos >= iv_casimpl.getHeap().getArraySize(currentAddr) ) { ErrorInfo errInfo; errInfo.setErrorId((TyErrorId)UIMA_ERR_RESOURCE_CORRUPTED); ErrorMessage msg(UIMA_MSG_ID_EXC_XML_SAXPARSE_FATALERROR); assertWithMsg(sizeof(XMLCh) == sizeof(UChar), "Port required"); msg.addParam("Invalid array FS in the CAS" ); errInfo.setMessage(msg); errInfo.setSeverity(ErrorInfo::unrecoverable); ExcIllFormedInputError exc(errInfo); throw exc; } FeatureStructure fs = uima::internal::FSPromoter::promoteFS(currentAddr, *iv_cas); switch (arrayType) { case internal::gs_tyIntArrayType: { int val = atoi(UnicodeStringRef(buffer).asUTF8().c_str()); IntArrayFS intFS(fs); intFS.set( (size_t) arrayPos, val); break; } case internal::gs_tyFloatArrayType: { float val = atof(UnicodeStringRef(buffer).asUTF8().c_str()); FloatArrayFS floatFS(fs); floatFS.set( (size_t) arrayPos, val); break; } case internal::gs_tyStringArrayType: { //add the string int stringoffset = iv_cas->getHeap()->addString(buffer); //set the array value in fs heap lowlevel::TyFS stringref = iv_cas->getHeap()->getStringAsFS(stringoffset); lowlevel::TyHeapCell * fsarray = iv_cas->getHeap()->getCArrayFromFS(currentAddr); fsarray[arrayPos] = stringref; break; } case internal::gs_tyByteArrayType: { short intval = atoi(UnicodeStringRef(buffer).asUTF8().c_str()); char charval[2]; sprintf(charval,"%c",intval); ByteArrayFS byteFS(fs); byteFS.set( (size_t) arrayPos, charval[0]); break; } case internal::gs_tyBooleanArrayType: { string val = UnicodeStringRef(buffer).asUTF8(); BooleanArrayFS booleanFS(fs); if (val.compare("1")==0) { booleanFS.set( (size_t) arrayPos, true); //cout << "bool buffer " << buffer << " val= " << val << "set " << true << endl; } else { booleanFS.set ( (size_t) arrayPos, false); //cout << arrayPos << " bool buffer " << buffer << " val= " << val << "set " << false << endl; } break; } case internal::gs_tyShortArrayType: { short val; string strval; UnicodeStringRef(buffer).extractUTF8(strval); stringstream s; s << strval.c_str(); s >> val; ShortArrayFS shortFS(fs); shortFS.set( (size_t) arrayPos, val); break; } case internal::gs_tyLongArrayType: { INT64 val; stringstream s; s << UnicodeStringRef(buffer).asUTF8(); s >> val; LongArrayFS longFS(fs); longFS.set( (size_t) arrayPos, val); break; } case internal::gs_tyDoubleArrayType: { DoubleArrayFS doubleFS(fs); stringstream s; s << UnicodeStringRef(buffer).asUTF8(); long double doubleval; s >> doubleval; doubleFS.set((size_t) arrayPos, doubleval); break; } default: { //array of FSs lowlevel::TyFS fsid = atoi(UnicodeStringRef(buffer).asUTF8().c_str()); FeatureStructure fsitem(fsid, *iv_cas); ArrayFS fsArrayfs(fs); fsArrayfs.set((size_t) arrayPos, fsitem); } } ++arrayPos; } // Create a feature value from a string representation. void XCASDeserializerHandler::handleFeature(lowlevel::TyFS addr, icu::UnicodeString & featName, icu::UnicodeString & featVal, bool lenient) { lowlevel::TyFSType fstype = iv_casimpl.getHeap().getType(addr); Type type = uima::internal::FSPromoter::promoteType(fstype, iv_cas->getTypeSystem().getLowlevelTypeSystem()); handleFeature(type, addr, featName, featVal, lenient); } void XCASDeserializerHandler::handleFeature(Type & type, lowlevel::TyFS addr, icu::UnicodeString & featName, icu::UnicodeString & featVal, bool lenient) { char charFeatVal[10]; // handle v1.x format annotations, mapping int to ref values lowlevel::TyFSType fstype = iv_casimpl.getHeap().getType(addr); if (0==featName.compare("sofa") && iv_typesystem->subsumes(internal::gs_tyAnnotationBaseType, fstype)) { int ifeatval = atoi(UnicodeStringRef(featVal).asUTF8().c_str()); sprintf(charFeatVal, "%d", sofaRefMap[ifeatval]); featVal.setTo(icu::UnicodeString(charFeatVal)); } // handle v1.x sofanum values, remapping so that _InitialView always == 1 if (0==featName.compare(CAS::FEATURE_BASE_NAME_SOFAID) && sofaTypeCode == fstype) { int sofaNum = iv_casimpl.getHeap().getIntValue(addr, internal::gs_tySofaNumFeature); iv_casimpl.getHeap().setIntValue(addr, internal::gs_tySofaNumFeature, indexMap[sofaNum]); } icu::UnicodeString prefix(REF_PREFIX); if (featName.startsWith(REF_PREFIX)) { featName.remove(0,prefix.length()); // Delete prefix } FeatureStructure fs = uima::internal::FSPromoter::promoteFS(addr, *iv_cas); Feature feat = type.getFeatureByBaseName(featName); // System.out.println("DEBUG - Feature map result: " + featName + " = " + feat.getName()); if (!feat.isValid()) { //feature does not exist in typesystem; //Out of typesystem data not supported. //we skip this feature /**ErrorInfo errInfo; errInfo.setErrorId((TyErrorId)UIMA_ERR_RESOURCE_CORRUPTED); ErrorMessage msg(UIMA_MSG_ID_EXC_XML_SAXPARSE_FATALERROR); assertWithMsg(sizeof(XMLCh) == sizeof(UChar), "Port required"); msg.addParam(type.getName()); msg.addParam(featName); errInfo.setMessage(msg); errInfo.setSeverity(ErrorInfo::unrecoverable); ExcIllFormedInputError exc(errInfo); throw exc; **/ } else { Type rtype; feat.getRangeType(rtype); lowlevel::TyFSType rangeType = uima::internal::FSPromoter::demoteType(rtype); switch (rangeType) { case internal::gs_tyIntegerType: { if (featVal.length()>0) { fs.setIntValue(feat, atoi(UnicodeStringRef(featVal).asUTF8().c_str())); } break; } case internal::gs_tyFloatType: { if ( featVal.length() > 0) { fs.setFloatValue(feat, atof(UnicodeStringRef(featVal).asUTF8().c_str())); } break; } case internal::gs_tyStringType: { if (featVal.length() > 0) { fs.setStringValue(feat, featVal); } break; } case internal::gs_tyByteType: { if (featVal.length() > 0) { string val = UnicodeStringRef(featVal).asUTF8(); short intval = atoi(val.c_str()); char charval[2]; sprintf(charval,"%c",intval); fs.setByteValue(feat, charval[0] ); } break; } case internal::gs_tyBooleanType: { if (featVal.length() > 0) { string val = UnicodeStringRef(featVal).asUTF8(); if (val.compare("1")==0) fs.setBooleanValue(feat, true ); else fs.setBooleanValue(feat, false); } break; } case internal::gs_tyShortType: { if (featVal.length() > 0) { string strval = UnicodeStringRef(featVal).asUTF8(); short shortval; stringstream s; s << strval.c_str(); s >> shortval; fs.setShortValue(feat, shortval); } break; } case internal::gs_tyLongType: { if (featVal.length() > 0) { string strval = UnicodeStringRef(featVal).asUTF8(); INT64 longval; stringstream s; s << strval.c_str(); s >> longval; fs.setLongValue(feat, longval); } break; } case internal::gs_tyDoubleType: { if (featVal.length() > 0) { string strval = UnicodeStringRef(featVal).asUTF8(); long double doubleval; stringstream s; s << strval.c_str(); s >> doubleval; fs.setDoubleValue(feat, doubleval ); } break; } default: { if (rtype.isStringSubType()) { if (featVal.length() > 0) fs.setStringValue(feat, featVal); } else if (featVal.length() > 0) { lowlevel::TyFS val = (lowlevel::TyFS) atoi(UnicodeStringRef(featVal).asUTF8().c_str()); iv_casimpl.getHeap().setFeatureInternal(addr, uima::internal::FSPromoter::demoteFeature(feat), val); } break; } } } } void XCASDeserializerHandler::finalizeFS(FSInfo & fsInfo) { lowlevel::TyFS addr = fsInfo.addr; FeatureStructure fs = uima::internal::FSPromoter::promoteFS(addr, *iv_cas); Type type = fs.getType(); if (fsInfo.indexRep->size() >= 0) { // Now add FS to all specified index repositories for (int i = 0; i < (int)fsInfo.indexRep->size(); i++) { lowlevel::IndexRepository * pIndexRep; if (indexMap.size() == 1) { pIndexRep = indexRepositories[fsInfo.indexRep->at(i)]; } else { pIndexRep = indexRepositories[indexMap[fsInfo.indexRep->at(i)]]; } assert(EXISTS(pIndexRep)); pIndexRep->add(addr); } } if (iv_cas->getTypeSystem().isArrayType(uima::internal::FSPromoter::demoteType(type)) ) { finalizeArray(type, addr, fsInfo); return; } //update heap value of features that are references to other FS. vector<Feature> feats; type.getAppropriateFeatures(feats); FSInfo * fsValInfo; for (size_t i = 0; i < feats.size(); i++) { Feature feat = (Feature) feats[i]; Type rangeType; feat.getRangeType(rangeType); if (rangeType.isValid()) { lowlevel::TyFSType rangetypecode = uima::internal::FSPromoter::demoteType(rangeType); lowlevel::TyFSFeature featcode = uima::internal::FSPromoter::demoteFeature(feat); //if not primitive if (!iv_cas->getTypeSystem().isPrimitive(rangetypecode)) { //get the current feature value which is the id lowlevel::TyFS featVal = iv_casimpl.getHeap().getFeatureInternal(addr, featcode); //get the FSInfo object for that id fsValInfo = (FSInfo*) fsTree[featVal]; //if there is a FSInfo //set the feature value of this feature to the //address in FSInfo else set it to NULL; if (fsValInfo == NULL) { //nothing to do, reference value already = 0! //iv_casimpl.getHeap().setFSValue(addr, featcode, (lowlevel::TyFS) 0); } else { iv_casimpl.getHeap().setFSValue(addr, featcode, fsValInfo->addr); } } } } } void XCASDeserializerHandler::finalizeArray(Type & type, lowlevel::TyFS addr, FSInfo & fsInfo) { lowlevel::TyFSType typecode = uima::internal::FSPromoter::demoteType(type); if (!iv_cas->getTypeSystem().isFSArrayType(typecode)) { return; } // *** WARNING *** *** WARNING *** *** WARNING *** *** WARNING *** // if implementation of ArrayFS on the heap changes, this code will be invalid int size = (int)iv_cas->getHeap()->getHeap().getHeapValue(addr + 1); FSInfo * fsValInfo; for (int i=0; i<size; i++) { lowlevel::TyFS id = iv_cas->getHeap()->getHeap().getHeapValue(addr + 2 + i); fsValInfo = fsTree[id]; if (fsValInfo != NULL) { iv_cas->getHeap()->getHeap().setHeapValue(addr + 2 + i, fsValInfo->addr); } } } // --------------------------------------------------------------------------- // XCASDeserializerHandler: Overrides of the SAX ErrorHandler interface // --------------------------------------------------------------------------- void XCASDeserializerHandler::error(const SAXParseException& e) { ErrorInfo errInfo; errInfo.setErrorId((TyErrorId)UIMA_ERR_RESOURCE_CORRUPTED); ErrorMessage msg(UIMA_MSG_ID_EXC_XML_SAXPARSE_ERROR); assertWithMsg(sizeof(XMLCh) == sizeof(UChar), "Port required"); msg.addParam((UChar const *)e.getSystemId()); msg.addParam(e.getLineNumber()); msg.addParam(e.getColumnNumber()); msg.addParam((UChar const *) e.getMessage()); errInfo.setMessage(msg); errInfo.setSeverity(ErrorInfo::unrecoverable); ExcIllFormedInputError exc(errInfo); throw exc; } void XCASDeserializerHandler::fatalError(const SAXParseException& e) { ErrorInfo errInfo; errInfo.setErrorId((TyErrorId)UIMA_ERR_RESOURCE_CORRUPTED); ErrorMessage msg(UIMA_MSG_ID_EXC_XML_SAXPARSE_FATALERROR); assertWithMsg(sizeof(XMLCh) == sizeof(UChar), "Port required"); msg.addParam((UChar const *)e.getSystemId()); msg.addParam(e.getLineNumber()); msg.addParam(e.getColumnNumber()); msg.addParam((UChar const *) e.getMessage()); errInfo.setMessage(msg); errInfo.setSeverity(ErrorInfo::unrecoverable); ExcIllFormedInputError exc(errInfo); throw exc; } void XCASDeserializerHandler::warning(const SAXParseException& e) { ErrorInfo errInfo; errInfo.setErrorId((TyErrorId)UIMA_ERR_RESOURCE_CORRUPTED); ErrorMessage msg(UIMA_MSG_ID_EXC_XML_SAXPARSE_WARNING); assertWithMsg(sizeof(XMLCh) == sizeof(UChar), "Port required"); msg.addParam((UChar const *)e.getSystemId()); msg.addParam(e.getLineNumber()); msg.addParam(e.getColumnNumber()); msg.addParam((UChar const *) e.getMessage()); errInfo.setMessage(msg); errInfo.setSeverity(ErrorInfo::unrecoverable); ExcIllFormedInputError exc(errInfo); throw exc; } char const * XCASDeserializerHandler::CASTAGNAME = "CAS"; char const * XCASDeserializerHandler::DEFAULT_DOC_TYPE_NAME = "uima.tcas.Document"; char const * XCASDeserializerHandler::DEFAULT_DOC_TEXT_FEAT = "text"; char const * XCASDeserializerHandler::INDEXED_ATTR_NAME = "_indexed"; char const * XCASDeserializerHandler::REF_PREFIX = "_ref_"; char const * XCASDeserializerHandler::ID_ATTR_NAME = "_id"; char const * XCASDeserializerHandler::CONTENT_ATTR_NAME = "_content"; char const * XCASDeserializerHandler::ARRAY_SIZE_ATTR = "size"; char const * XCASDeserializerHandler::ARRAY_ELEMENT_TAG = "i"; char const * XCASDeserializerHandler::TRUE_VALUE = "true"; char const * XCASDeserializerHandler::DEFAULT_CONTENT_FEATURE = "value"; } // namespace uima