public void processDocuments()

in connectors/meridio/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/meridio/MeridioConnector.java [711:1350]


  /**
   * Process a batch of Meridio documents: compute each document's version string, and for every
   * document the framework reports as needing reindexing, fetch its metadata, security and content
   * and hand it to the framework for ingestion.
   *
   * <p>The work is done in three passes inside a retry loop:
   * <ol>
   *   <li>One search call looks up all requested ids.  Ids that are not found are deleted via
   *       {@code activities.deleteDocument}; for the rest a version string is composed from the
   *       metadata specification, forced acls, path mapping, the value returned by the search and
   *       {@code urlVersionBase}.</li>
   *   <li>A second search call (only made when metadata was requested and at least one document
   *       needs reindexing) retrieves the requested metadata properties.</li>
   *   <li>Each document needing reindexing is assembled into a {@code RepositoryDocument}
   *       (metadata, optional path attribute, acls, binary content) and ingested, or reported via
   *       {@code noDocument}/{@code deleteDocument} when its data or content cannot be obtained.</li>
   * </ol>
   * If Meridio reports an expired session (fault string containing {@code " 23031#"}) the session
   * handle is reset and the whole batch is retried.
   *
   * @param documentIdentifiers the document identifiers to process; each must parse as a decimal long.
   * @param statuses the existing version information (not consulted by this implementation).
   * @param spec the document specification; its "ReturnedMetadata", "AllMetadata",
   *        "pathnameattribute" and "pathmap" nodes are read here.
   * @param activities the framework callback object used to check versions, ingest and delete documents.
   * @param jobMode the job mode (not consulted by this implementation).
   * @param usesDefaultAuthority whether the default authority is in use (not consulted by this implementation).
   * @throws ManifoldCFException on unrecoverable errors (HTTP faults, I/O failures, interruption, malformed responses).
   * @throws ServiceInterruption when an unrecognized Axis fault occurs and the operation should be retried later.
   */
  public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
    IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
    throws ManifoldCFException, ServiceInterruption
  {
    // Get forced acls/security enable/disable
    String[] acls = getAcls(spec);
    // Sort it, in case it is needed.
    if (acls != null)
      java.util.Arrays.sort(acls);

    // Look at the metadata attributes.
    // So that the version strings are comparable, we will put them in an array first, and sort them.
    Set<String> holder = new HashSet<String>();

    String pathAttributeName = null;
    MatchMap matchMap = new MatchMap();
    boolean allMetadata = false;

    int i = 0;
    while (i < spec.getChildCount())
    {
      SpecificationNode n = spec.getChild(i++);
      if (n.getType().equals("ReturnedMetadata"))
      {
        // Metadata names are recorded as "category.property", or just "property" when there is no category.
        String category = n.getAttributeValue("category");
        String attributeName = n.getAttributeValue("property");
        String metadataName;
        if (category == null || category.length() == 0)
          metadataName = attributeName;
        else
          metadataName = category + "." + attributeName;
        holder.add(metadataName);
      }
      else if (n.getType().equals("AllMetadata"))
      {
        String value = n.getAttributeValue("value");
        if (value != null && value.equals("true"))
        {
          allMetadata = true;
        }
      }
      else if (n.getType().equals("pathnameattribute"))
        pathAttributeName = n.getAttributeValue("value");
      else if (n.getType().equals("pathmap"))
      {
        // Path mapping info also needs to be looked at, because it affects what is
        // ingested.
        String pathMatch = n.getAttributeValue("match");
        String pathReplace = n.getAttributeValue("replace");
        matchMap.appendMatchPair(pathMatch,pathReplace);
      }
    }

    // Retry loop: the only way back to the top is the session-expired fault handled below.
    while (true)
    {

      getSession();

      // The version string returned must include everything that could affect what is ingested.  In meridio's
      // case, this includes the date stamp, but it also includes the part of the specification that describes
      // the metadata desired.

      // The code here relies heavily on the search method to do its thing.  The search method originally
      // used the document specification to determine what metadata to return, which was problematic because that
      // meant this method had to modify the specification (not good practice), and was also wrong from the point
      // of view that we need to get the metadata specification appended to the version string in some way, and
      // use THAT data in processDocuments().  So I've broken all that up.

      try
      {
        // Put into an array
        ReturnMetadata[] categoryPropertyValues;
        String[] categoryPropertyStringValues;
        if (allMetadata)
        {
          categoryPropertyStringValues = getMeridioDocumentProperties();
        }
        else
        {
          categoryPropertyStringValues = new String[holder.size()];
          i = 0;
          for (String value : holder)
          {
            categoryPropertyStringValues[i++] = value;
          }
        }
        // Sort!
        java.util.Arrays.sort(categoryPropertyStringValues);
        categoryPropertyValues = new ReturnMetadata[categoryPropertyStringValues.length];
        i = 0;
        for (String value : categoryPropertyStringValues)
        {
          // Split "category.property" back apart at the first dot; no dot means no category.
          int dotIndex = value.indexOf(".");
          String categoryName = null;
          String propertyName;
          if (dotIndex == -1)
            propertyName = value;
          else
          {
            categoryName = value.substring(0,dotIndex);
            propertyName = value.substring(dotIndex+1);
          }

          categoryPropertyValues[i++] = new ReturnMetadata(categoryName,propertyName);
        }
        
        // Prepare the part of the version string that is decodeable
        StringBuilder decodeableString = new StringBuilder();

        // Add the metadata piece first
        packList(decodeableString,categoryPropertyStringValues,'+');
        
        // Now, put in the forced acls.
        // The version string needs only to contain the forced acls, since the version date captures changes
        // made to the acls that are actually associated with the document.
        if (acls == null)
          decodeableString.append('-');
        else
        {
          decodeableString.append('+');
          packList(decodeableString,acls,'+');
          decodeableString.append('+');
          pack(decodeableString,defaultAuthorityDenyToken,'+');
        }

        // Calculate the part of the version string that comes from path name and mapping.
        if (pathAttributeName != null)
        {
          decodeableString.append("+");
          pack(decodeableString,pathAttributeName,'+');
          pack(decodeableString,matchMap.toString(),'+');
        }
        else
          decodeableString.append("-");

        long[] docIds = new long[documentIdentifiers.length];
        for (i = 0; i < documentIdentifiers.length; i++)
        {
          docIds[i] = Long.parseLong(documentIdentifiers[i]);
        }
        
        /*=================================================================
        * Call the search, with the document specification and the list of
        * document ids - the search will never return more than exactly
        * one match per document id
        *
        * We are assuming that the maximum number of hits to return
        * should never be more than the maximum batch size set up for this
        * class
        *
        * We are just making one web service call (to the search API)
        * rather than iteratively calling a web service method for each
        * document passed in as part of the document array
        *
        * Additionally, re-using the same search method as for the
        * "getDocumentIdentifiers" method ensures that we are not
        * duplicating any logic which ensures that the document/records
        * in question match the search criteria or not.
        *================================================================*/
        DMSearchResults searchResults = documentSpecificationSearch(spec,
          0, 0, 1, this.getMaxDocumentRequest(), docIds, null);

        if (Logging.connectors.isDebugEnabled())
          Logging.connectors.debug("Found a total of <" + searchResults.totalHitsCount + "> hit(s) " +
          "and <" + searchResults.returnedHitsCount + "> were returned by the method call");

        // If we are searching based on document identifier, then it is possible that we will not
        // find a document we are looking for, if it was removed from the system between the time
        // it was put in the queue and when its version is obtained.  Documents where this happens
        // should return a version string of null.

        // Let's go through the search results and build a hash based on the document identifier.
        Map<Long,SEARCHRESULTS_DOCUMENTS> documentMap = new HashMap<Long,SEARCHRESULTS_DOCUMENTS>();
        if (searchResults.dsDM != null)
        {
          SEARCHRESULTS_DOCUMENTS [] srd = searchResults.dsDM.getSEARCHRESULTS_DOCUMENTS();
          for (i = 0; i < srd.length; i++)
          {
            documentMap.put(Long.valueOf(srd[i].getDocId()),srd[i]);
          }
        }

        // Now, walk through the individual documents.
        // Only documents that need reindexing end up in versionStrings; that map drives the later passes.
        Map<Long,String> versionStrings = new HashMap<Long,String>();
        for (int j = 0; j < docIds.length; j++)
        {
          String documentIdentifier = documentIdentifiers[j];
          long docId = docIds[j];
          Long docKey = Long.valueOf(docId);
          // Look up the record.
          SEARCHRESULTS_DOCUMENTS doc = documentMap.get(docKey);
          if (doc != null)
          {
            // Set the version string.  The parseable stuff goes first, so parsing is easy.
            String version = doc.getStr_value();
            StringBuilder composedVersion = new StringBuilder();
            composedVersion.append(decodeableString);
            composedVersion.append(version);
            // Added 9/7/2007
            composedVersion.append("_").append(urlVersionBase);
            //
            String versionString = composedVersion.toString();
            if (Logging.connectors.isDebugEnabled())
              Logging.connectors.debug("Meridio: Document "+docKey+" has version "+versionString);
            if (activities.checkDocumentNeedsReindexing(documentIdentifier,versionString))
              versionStrings.put(docKey,versionString);
          }
          else
          {
            if (Logging.connectors.isDebugEnabled())
              Logging.connectors.debug("Meridio: Document "+docKey+" is no longer in the search set, or has been deleted - removing.");
            activities.deleteDocument(documentIdentifier);
          }
        }

        // Now submit search requests for all the documents requiring fetch.
        
        Map<Long,Map<String,String>> documentPropertyMap = new HashMap<Long,Map<String,String>>();

        // Only look up metadata if we need some!
        if (versionStrings.size() > 0 && categoryPropertyValues.length > 0)
        {
          long[] fetchIds = new long[versionStrings.size()];
          i = 0;
          for (Long docKey : versionStrings.keySet())
          {
            fetchIds[i++] = docKey;
          }

          /*=================================================================
          * Call the search, with the document specification and the list of
          * document ids - the search will never return more than exactly
          * one match per document id
          *
          * This call will return all the metadata that was specified in the
          * document specification for all the documents and
          * records in one call.
          *================================================================*/
          searchResults = documentSpecificationSearch(spec,
            0, 0, 1, fetchIds.length,
            fetchIds, categoryPropertyValues);

          // If we ask for a document and it is no longer there, we should treat this as a deletion.
          // The activity in that case is to delete the document.  A similar thing should happen if
          // any of the other methods (like getting the document's content) also fail to find the
          // document.

          // Let's build a hash which contains all the document metadata returned.  The form of
          // the hash will be: key = the document identifier, value = another hash, which is keyed
          // by the metadata category/property, and which has a value that is the metadata value.

          // Per-document running index into categoryPropertyStringValues: the n-th row seen for a
          // given document id is taken to carry the n-th (sorted) requested property.
          Map<Long,MutableInteger> counterMap = new HashMap<Long,MutableInteger>();

          if (searchResults.dsDM != null)
          {
            SEARCHRESULTS_DOCUMENTS [] searchResultsDocuments = searchResults.dsDM.getSEARCHRESULTS_DOCUMENTS();
            for (SEARCHRESULTS_DOCUMENTS searchResultsDocument : searchResultsDocuments)
            {
              long docId = searchResultsDocument.getDocId();
              Long docKey = Long.valueOf(docId);
              MutableInteger counterMapItem = counterMap.get(docKey);
              if (counterMapItem == null)
              {
                counterMapItem = new MutableInteger();
                counterMap.put(docKey,counterMapItem);
              }

              String propertyName = categoryPropertyStringValues[counterMapItem.getValue()];
              counterMapItem.increment();
              // Read the value from the row being iterated.  (This previously indexed the array with
              // the stale counter 'i' left over from filling fetchIds, which picked the wrong row or
              // ran off the end of the array.)
              String propertyValue = searchResultsDocument.getStr_value();
              Map<String,String> propertyMap = documentPropertyMap.get(docKey);
              if (propertyMap == null)
              {
                propertyMap = new HashMap<String,String>();
                documentPropertyMap.put(docKey,propertyMap);
              }
              if (propertyValue != null && propertyValue.length() > 0)
                propertyMap.put(propertyName,propertyValue);
            }
          }
        }

        // Okay, we are ready now to go through the individual documents and do the ingestion or deletion.
        for (String documentIdentifier : documentIdentifiers)
        {
          Long docKey = Long.valueOf(documentIdentifier);
          long docId = docKey.longValue();
          String docVersion = versionStrings.get(docKey);
          // A null version means the document was either deleted above or does not need reindexing.
          if (docVersion != null)
          {
            if (Logging.connectors.isDebugEnabled())
              Logging.connectors.debug("Processing document identifier '" + documentIdentifier + "' " +
              "with version string '" + docVersion + "'");

            // For each document, be sure the job is still allowed to run.
            activities.checkJobStillActive();

            RepositoryDocument repositoryDocument = new RepositoryDocument();

            // Load the metadata items into the ingestion document object
            Map<String,String> docMetadataMap = documentPropertyMap.get(docKey);
            if (docMetadataMap != null)
            {
              for (String categoryPropertyName : categoryPropertyStringValues)
              {
                String propertyValue = docMetadataMap.get(categoryPropertyName);
                if (propertyValue != null && propertyValue.length() > 0)
                  repositoryDocument.addField(categoryPropertyName,propertyValue);
              }
            }

            /*=================================================================
            * Construct the URL to the object
            *
            * HTTP://HOST:PORT/meridio/browse/downloadcontent.aspx?documentId=<docId>&launchMode=1&launchAs=0
            *
            * I expect we need to add additional parameters to the configuration
            * specification
            *================================================================*/
            String fileURL = urlBase + Long.toString(docId);
            if (Logging.connectors.isDebugEnabled())
              Logging.connectors.debug("URL for document '" + Long.toString(docId) + "' is '" + fileURL + "'");

            /*=================================================================
            * Get the object's ACLs and owner information
            *================================================================*/
            DMDataSet documentData = null;
            documentData = meridio_.getDocumentData((int)docId, true, true, false, false,
              DmVersionInfo.LATEST, false, false, false);

            if (null == documentData)
            {
              if (Logging.connectors.isDebugEnabled())
                Logging.connectors.debug("Meridio: Could not retrieve document data for document id '" +
                Long.toString(docId) + "' in processDocuments method - deleting document.");
              activities.noDocument(documentIdentifier,docVersion);
              continue;
            }

            if (null == documentData.getDOCUMENTS() ||
              documentData.getDOCUMENTS().length != 1)
            {
              if (Logging.connectors.isDebugEnabled())
                Logging.connectors.debug("Meridio: Could not retrieve document owner for document id '" +
                Long.toString(docId) + "' in processDocuments method. No information or incorrect amount " +
                "of information was returned");
              activities.noDocument(documentIdentifier,docVersion);
              continue;
            }

            // Do path metadata
            if (pathAttributeName != null && pathAttributeName.length() > 0)
            {
              if (Logging.connectors.isDebugEnabled())
                Logging.connectors.debug("Meridio: Path attribute name is "+pathAttributeName);
              RMDataSet partList;
              int recordType = documentData.getDOCUMENTS()[0].getPROP_recordType();
              // Record types 0, 4 and 19 use the record part list; everything else the document part list.
              if (recordType == 0 || recordType == 4 || recordType == 19)
                partList = meridio_.getRecordPartList((int)docId, false, false);
              else
                partList = meridio_.getDocumentPartList((int)docId);
              if (partList != null)
              {
                if (Logging.connectors.isDebugEnabled())
                  Logging.connectors.debug("Meridio: Document '"+Long.toString(docId)+"' has a part list with "+Integer.toString(partList.getRm2vPart().length)+" values");

                for (int k = 0; k < partList.getRm2vPart().length; k++)
                {
                  repositoryDocument.addField(pathAttributeName,matchMap.translate(partList.getRm2vPart()[k].getParentTitlePath()));
                }
              }
              else
              {
                if (Logging.connectors.isDebugEnabled())
                  Logging.connectors.debug("Meridio: Document '"+Long.toString(docId)+"' has no part list, so no path attribute");
              }
            }

            // Process acls.  If there are forced acls, use those, otherwise get them from Meridio.
            String [] allowAcls;
            String [] denyAcls;

            // acls will be null if security is off, or nonzero length if security is on but hard-wired.
            // An empty (non-null) array is the case handled here: read the acls from Meridio itself.
            if (acls != null && acls.length == 0)
            {
              ACCESSCONTROL [] documentAcls = documentData.getACCESSCONTROL();
              List<String> allowAclsArrayList = new ArrayList<String>();
              List<String> denyAclsArrayList = new ArrayList<String>();

              // Allow a broken authority to disable all Meridio documents, even if the document is 'wide open', because
              // Meridio does not permit viewing of the document if the user does not exist (at least, I don't know of a way).
              denyAclsArrayList.add(defaultAuthorityDenyToken);

              if (documentAcls != null)
              {
                for (int j = 0; j < documentAcls.length; j++)
                {
                  if (Logging.connectors.isDebugEnabled())
                    Logging.connectors.debug(
                    "Object Id '" + documentAcls[j].getObjectId() + "' " +
                    "Object Type '" + documentAcls[j].getObjectType() + "' " +
                    "Permission '" + documentAcls[j].getPermission() + "' " +
                    "User Id '" + documentAcls[j].getUserId() + "' " +
                    "Group Id '" + documentAcls[j].getGroupId() + "'");

                  // Tokens are "G<groupId>" or "U<userId>"; a group id takes precedence over a user id.
                  if (documentAcls[j].getPermission() == 0)  // prohibit permission
                  {
                    if (documentAcls[j].getGroupId() > 0)
                    {
                      denyAclsArrayList.add("G" + documentAcls[j].getGroupId());
                    } else if (documentAcls[j].getUserId() > 0)
                    {
                      denyAclsArrayList.add("U" + documentAcls[j].getUserId());
                    }
                  }
                  else                                       // read, amend or manage
                  {
                    if (documentAcls[j].getGroupId() > 0)
                    {
                      allowAclsArrayList.add("G" + documentAcls[j].getGroupId());
                    } else if (documentAcls[j].getUserId() > 0)
                    {
                      allowAclsArrayList.add("U" + documentAcls[j].getUserId());
                    }
                  }
                }
              }

              DOCUMENTS document = documentData.getDOCUMENTS()[0];

              if (Logging.connectors.isDebugEnabled())
                Logging.connectors.debug("Document id '" + Long.toString(docId) + "' is owned by owner id '" +
                document.getPROP_ownerId() + "' having the owner name '" +
                document.getPROP_ownerName() + "' Record Type is '" +
                document.getPROP_recordType() + "'");

              // The owner is always granted access: for record types 4 and 19 the owner comes from
              // the record itself, otherwise from the document's owner id.
              if (document.getPROP_recordType() == 4 ||
                document.getPROP_recordType() == 19)
              {
                RMDataSet rmds = meridio_.getRecord((int)docId, false, false, false);
                Rm2vRecord record = rmds.getRm2vRecord()[0];

                if (Logging.connectors.isDebugEnabled())
                  Logging.connectors.debug("Record User Id Owner is '" + record.getOwnerID() +
                  "' Record Group Owner Id is '" + record.getGroupOwnerID() + "'");

                /*=================================================================
                * Either a group or a user owns a record, cannot be both and the
                * group takes priority if it is set
                *================================================================*/
                if (record.getGroupOwnerID() > 0)
                {
                  allowAclsArrayList.add("G" + record.getGroupOwnerID());
                } else if (record.getOwnerID() > 0)
                {
                  allowAclsArrayList.add("U" + record.getOwnerID());
                }
              }
              else
              {
                allowAclsArrayList.add("U" + document.getPROP_ownerId());
              }

              /*=================================================================
              * Set up the string arrays and then set the ACLs in the
              * repository document
              *================================================================*/
              allowAcls = new String[allowAclsArrayList.size()];
              for (int j = 0; j < allowAclsArrayList.size(); j++)
              {
                allowAcls[j] = allowAclsArrayList.get(j);
                if (Logging.connectors.isDebugEnabled())
                  Logging.connectors.debug("Meridio: Adding '" + allowAcls[j] + "' to allow ACLs");
              }

              denyAcls = new String[denyAclsArrayList.size()];
              for (int j = 0; j < denyAclsArrayList.size(); j++)
              {
                denyAcls[j] = denyAclsArrayList.get(j);
                if (Logging.connectors.isDebugEnabled())
                  Logging.connectors.debug("Meridio: Adding '" + denyAcls[j] + "' to deny ACLs");
              }
            }
            else
            {
              // Security off (null) or forced acls: use them as-is, adding the default deny token
              // only when security is on.
              allowAcls = acls;
              if (allowAcls == null)
                denyAcls = null;
              else
                denyAcls = new String[]{defaultAuthorityDenyToken};
            }

            repositoryDocument.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT,allowAcls,denyAcls);

            /*=================================================================
            * Get the object's content, and ingest the document
            *================================================================*/
            try
            {
              AttachmentPart ap = meridio_.getLatestVersionFile((int)docId);
              if (null == ap)
              {
                if (Logging.connectors.isDebugEnabled())
                  Logging.connectors.debug("Meridio: Failed to get content for document '" + Long.toString(docId) + "'");
                // No document.  Delete what's there
                activities.noDocument(documentIdentifier,docVersion);
                continue;
              }
              try
              {
                // Get the file name.
                String fileName = ap.getDataHandler().getName();
                // Log what we are about to do.
                if (Logging.connectors.isDebugEnabled())
                  Logging.connectors.debug("Meridio: File data is supposedly in "+fileName);
                File theTempFile = new File(fileName);
                if (theTempFile.isFile())
                {
                  long fileSize = theTempFile.length();                   // ap.getSize();
                  if (activities.checkLengthIndexable(fileSize))
                  {
                    InputStream is = new FileInputStream(theTempFile);      // ap.getDataHandler().getInputStream();
                    try
                    {
                      repositoryDocument.setBinary(is, fileSize);

                      activities.ingestDocumentWithException(documentIdentifier, docVersion,
                        fileURL, repositoryDocument);
                    }
                    finally
                    {
                      is.close();
                    }
                  }
                  else
                  {
                    activities.noDocument(documentIdentifier, docVersion);
                    continue;
                  }
                }
                else
                {
                  if (Logging.connectors.isDebugEnabled())
                    Logging.connectors.debug("Meridio: Expected temporary file was not present - skipping document '"+Long.toString(docId) + "'");
                  activities.deleteDocument(documentIdentifier);
                  continue;
                }
              }
              finally
              {
                // Always release the attachment, including on the 'continue' paths above.
                ap.dispose();
              }

            }
            catch (java.net.SocketTimeoutException ioex)
            {
              throw new ManifoldCFException("Socket timeout exception: "+ioex.getMessage(), ioex);
            }
            catch (ConnectTimeoutException ioex)
            {
              throw new ManifoldCFException("Connect timeout exception: "+ioex.getMessage(), ioex);
            }
            catch (InterruptedIOException e)
            {
              throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
            }
            catch (org.apache.axis.AxisFault e)
            {
              // Rethrown unchanged so the outer AxisFault handler (session expiry, retry) sees it
              // rather than the generic IOException handler below.
              throw e;
            }
            catch (RemoteException e)
            {
              // Likewise left for the outer RemoteException handler.
              throw e;
            }
            catch (SOAPException soapEx)
            {
              throw new ManifoldCFException("SOAP Exception encountered while retrieving document content: "+soapEx.getMessage(),
                soapEx);
            }
            catch (IOException ioex)
            {
              throw new ManifoldCFException("Input stream failure: "+ioex.getMessage(), ioex);
            }
          }
        }

        Logging.connectors.debug("Meridio: Exiting 'processDocuments' method");
        return;
      }
      catch (org.apache.axis.AxisFault e)
      {
        long currentTime = System.currentTimeMillis();
        if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://xml.apache.org/axis/","HTTP")))
        {
          org.w3c.dom.Element elem = e.lookupFaultDetail(new javax.xml.namespace.QName("http://xml.apache.org/axis/","HttpErrorCode"));
          if (elem != null)
          {
            elem.normalize();
            String httpErrorCode = elem.getFirstChild().getNodeValue().trim();
            throw new ManifoldCFException("Unexpected http error code "+httpErrorCode+" accessing Meridio: "+e.getMessage(),e);
          }
          throw new ManifoldCFException("Unknown http error occurred while getting doc versions: "+e.getMessage(),e);
        }
        if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://schemas.xmlsoap.org/soap/envelope/","Server.userException")))
        {
          String exceptionName = e.getFaultString();
          if (exceptionName.equals("java.lang.InterruptedException"))
            throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED);
        }
        if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://schemas.xmlsoap.org/soap/envelope/","Server")))
        {
          if (e.getFaultString().indexOf(" 23031#") != -1)
          {
            // This means that the session has expired, so reset it and retry
            meridio_ = null;
            continue;
          }
        }

        if (Logging.connectors.isDebugEnabled())
          Logging.connectors.debug("Meridio: Got an unknown remote exception getting doc versions - axis fault = "+e.getFaultCode().getLocalPart()+", detail = "+e.getFaultString()+" - retrying",e);
        // Unrecognized fault: ask the framework to retry in 5 minutes, giving up after 3 hours.
        throw new ServiceInterruption("Remote procedure exception: "+e.getMessage(),  e, currentTime + 300000L,
          currentTime + 3 * 60 * 60000L,-1,false);
      }
      catch (RemoteException remoteException)
      {
        throw new ManifoldCFException("Meridio: A remote exception occurred while getting doc versions: " +
          remoteException.getMessage(), remoteException);
      }
      catch (MeridioDataSetException meridioDataSetException)
      {
        throw new ManifoldCFException("Meridio: A problem occurred manipulating the Web " +
          "Service XML: "+meridioDataSetException.getMessage(), meridioDataSetException);
      }
    }
  }