in connectors/meridio/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/meridio/MeridioConnector.java [711:1350]
/** Process a batch of Meridio documents: compute a version string for each, decide
* (via the framework) which need reindexing, fetch metadata/ACLs/content for those,
* and ingest or delete accordingly.
*
* The version string is composed of a decodeable prefix (packed metadata spec, forced
* ACLs, path-attribute mapping) followed by the document's Meridio date stamp and the
* URL base, so any change to spec or configuration forces reindexing.
*
*@param documentIdentifiers the Meridio document ids (numeric strings) to process.
*@param statuses existing version information (consulted via activities).
*@param spec the job's document specification (metadata selection, ACLs, path mapping).
*@param activities callback interface for ingestion, deletion, and status checks.
*@param jobMode the job mode (unused here).
*@param usesDefaultAuthority whether the default authority is in use (unused here).
*@throws ManifoldCFException on fatal errors.
*@throws ServiceInterruption on transient remote failures (retry scheduled).
*/
public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
  IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
  throws ManifoldCFException, ServiceInterruption
{
  // Get forced acls/security enable/disable.
  // getAcls() returns null when security is off, a zero-length array when security is
  // on but ACLs come from Meridio, and a non-empty array for hard-wired ACLs.
  String[] acls = getAcls(spec);
  // Sort it, in case it is needed.
  if (acls != null)
    java.util.Arrays.sort(acls);

  // Look at the metadata attributes.
  // So that the version strings are comparable, we will put them in an array first, and sort them.
  Set<String> holder = new HashSet<String>();

  String pathAttributeName = null;
  MatchMap matchMap = new MatchMap();
  boolean allMetadata = false;

  int i = 0;
  while (i < spec.getChildCount())
  {
    SpecificationNode n = spec.getChild(i++);
    if (n.getType().equals("ReturnedMetadata"))
    {
      String category = n.getAttributeValue("category");
      String attributeName = n.getAttributeValue("property");
      String metadataName;
      if (category == null || category.length() == 0)
        metadataName = attributeName;
      else
        metadataName = category + "." + attributeName;
      holder.add(metadataName);
    }
    else if (n.getType().equals("AllMetadata"))
    {
      String value = n.getAttributeValue("value");
      if (value != null && value.equals("true"))
      {
        allMetadata = true;
      }
    }
    else if (n.getType().equals("pathnameattribute"))
      pathAttributeName = n.getAttributeValue("value");
    else if (n.getType().equals("pathmap"))
    {
      // Path mapping info also needs to be looked at, because it affects what is
      // ingested.
      String pathMatch = n.getAttributeValue("match");
      String pathReplace = n.getAttributeValue("replace");
      matchMap.appendMatchPair(pathMatch,pathReplace);
    }
  }

  // Retry loop: a Meridio session expiry (fault string containing " 23031#") resets
  // the session and retries; everything else either returns or throws.
  while (true)
  {
    getSession();

    // The version string returned must include everything that could affect what is ingested.  In meridio's
    // case, this includes the date stamp, but it also includes the part of the specification that describes
    // the metadata desired.

    // The code here relies heavily on the search method to do it's thing.  The search method originally
    // used the document specification to determine what metadata to return, which was problematic because that
    // meant this method had to modify the specification (not good practice), and was also wrong from the point
    // of view that we need to get the metadata specification appended to the version string in some way, and
    // use THAT data in processDocuments().  So I've broken all that up.

    try
    {
      // Put the requested metadata names into a sorted array, then parse each into
      // (category, property) pairs for the search call.
      ReturnMetadata[] categoryPropertyValues;
      String[] categoryPropertyStringValues;
      if (allMetadata)
      {
        categoryPropertyStringValues = getMeridioDocumentProperties();
      }
      else
      {
        categoryPropertyStringValues = new String[holder.size()];
        i = 0;
        for (String value : holder)
        {
          categoryPropertyStringValues[i++] = value;
        }
      }
      // Sort!  Sorting makes version strings comparable across runs.
      java.util.Arrays.sort(categoryPropertyStringValues);
      categoryPropertyValues = new ReturnMetadata[categoryPropertyStringValues.length];
      i = 0;
      for (String value : categoryPropertyStringValues)
      {
        // "category.property" form; a value with no dot is a bare property name.
        int dotIndex = value.indexOf(".");
        String categoryName = null;
        String propertyName;
        if (dotIndex == -1)
          propertyName = value;
        else
        {
          categoryName = value.substring(0,dotIndex);
          propertyName = value.substring(dotIndex+1);
        }
        categoryPropertyValues[i++] = new ReturnMetadata(categoryName,propertyName);
      }

      // Prepare the part of the version string that is decodeable.
      StringBuilder decodeableString = new StringBuilder();

      // Add the metadata piece first.
      packList(decodeableString,categoryPropertyStringValues,'+');

      // Now, put in the forced acls.
      // The version string needs only to contain the forced acls, since the version date captures changes
      // made to the acls that are actually associated with the document.
      if (acls == null)
        decodeableString.append('-');
      else
      {
        decodeableString.append('+');
        packList(decodeableString,acls,'+');
        decodeableString.append('+');
        pack(decodeableString,defaultAuthorityDenyToken,'+');
      }

      // Calculate the part of the version string that comes from path name and mapping.
      if (pathAttributeName != null)
      {
        decodeableString.append("+");
        pack(decodeableString,pathAttributeName,'+');
        pack(decodeableString,matchMap.toString(),'+');
      }
      else
        decodeableString.append("-");

      long[] docIds = new long[documentIdentifiers.length];
      for (i = 0; i < documentIdentifiers.length; i++)
      {
        docIds[i] = Long.parseLong(documentIdentifiers[i]);
      }

      /*=================================================================
      * Call the search, with the document specification and the list of
      * document ids - the search will never return more than exactly
      * one match per document id
      *
      * We are assuming that the maximum number of hits to return
      * should never be more than the maximum batch size set up for this
      * class
      *
      * We are just making one web service call (to the search API)
      * rather than iteratively calling a web service method for each
      * document passed in as part of the document array
      *
      * Additionally, re-using the same search method as for the
      * "getDocumentIdentifiers" method ensures that we are not
      * duplicating any logic which ensures that the document/records
      * in question match the search criteria or not.
      *================================================================*/
      DMSearchResults searchResults = documentSpecificationSearch(spec,
        0, 0, 1, this.getMaxDocumentRequest(), docIds, null);

      if (Logging.connectors.isDebugEnabled())
        Logging.connectors.debug("Found a total of <" + searchResults.totalHitsCount + "> hit(s) " +
        "and <" + searchResults.returnedHitsCount + "> were returned by the method call");

      // If we are searching based on document identifier, then it is possible that we will not
      // find a document we are looking for, if it was removed from the system between the time
      // it was put in the queue and when it's version is obtained.  Documents where this happens
      // should return a version string of null.

      // Let's go through the search results and build a hash based on the document identifier.
      Map<Long,SEARCHRESULTS_DOCUMENTS> documentMap = new HashMap<Long,SEARCHRESULTS_DOCUMENTS>();
      if (searchResults.dsDM != null)
      {
        SEARCHRESULTS_DOCUMENTS [] srd = searchResults.dsDM.getSEARCHRESULTS_DOCUMENTS();
        for (i = 0; i < srd.length; i++)
        {
          documentMap.put(Long.valueOf(srd[i].getDocId()),srd[i]);
        }
      }

      // Now, walk through the individual documents.  Documents that need (re)indexing
      // get an entry in versionStrings; documents missing from the search set are deleted.
      Map<Long,String> versionStrings = new HashMap<Long,String>();
      for (int j = 0; j < docIds.length; j++)
      {
        String documentIdentifier = documentIdentifiers[j];
        long docId = docIds[j];
        Long docKey = Long.valueOf(docId);
        // Look up the record.
        SEARCHRESULTS_DOCUMENTS doc = documentMap.get(docKey);
        if (doc != null)
        {
          // Set the version string.  The parseable stuff goes first, so parsing is easy.
          String version = doc.getStr_value();
          StringBuilder composedVersion = new StringBuilder();
          composedVersion.append(decodeableString);
          composedVersion.append(version);
          // Added 9/7/2007
          composedVersion.append("_").append(urlVersionBase);
          //
          String versionString = composedVersion.toString();
          if (Logging.connectors.isDebugEnabled())
            Logging.connectors.debug("Meridio: Document "+docKey+" has version "+versionString);
          if (activities.checkDocumentNeedsReindexing(documentIdentifier,versionString))
            versionStrings.put(docKey,versionString);
        }
        else
        {
          if (Logging.connectors.isDebugEnabled())
            Logging.connectors.debug("Meridio: Document "+docKey+" is no longer in the search set, or has been deleted - removing.");
          activities.deleteDocument(documentIdentifier);
        }
      }

      // Now submit search requests for all the documents requiring fetch.
      Map<Long,Map<String,String>> documentPropertyMap = new HashMap<Long,Map<String,String>>();

      // Only look up metadata if we need some!
      if (versionStrings.size() > 0 && categoryPropertyValues.length > 0)
      {
        long[] fetchIds = new long[versionStrings.size()];
        i = 0;
        for (Long docKey : versionStrings.keySet())
        {
          fetchIds[i++] = docKey;
        }

        /*=================================================================
        * Call the search, with the document specification and the list of
        * document ids - the search will never return more than exactly
        * one match per document id
        *
        * This call will return all the metadata that was specified in the
        * document specification for all the documents and
        * records in one call.
        *================================================================*/
        searchResults = documentSpecificationSearch(spec,
          0, 0, 1, fetchIds.length,
          fetchIds, categoryPropertyValues);

        // If we ask for a document and it is no longer there, we should treat this as a deletion.
        // The activity in that case is to delete the document.  A similar thing should happen if
        // any of the other methods (like getting the document's content) also fail to find the
        // document.

        // Let's build a hash which contains all the document metadata returned.  The form of
        // the hash will be: key = the document identifier, value = another hash, which is keyed
        // by the metadata category/property, and which has a value that is the metadata value.

        // The search returns one row per (document, property) pair; counterMap tracks,
        // per document, which sorted property name the next row corresponds to.
        Map<Long,MutableInteger> counterMap = new HashMap<Long,MutableInteger>();

        if (searchResults.dsDM != null)
        {
          SEARCHRESULTS_DOCUMENTS [] searchResultsDocuments = searchResults.dsDM.getSEARCHRESULTS_DOCUMENTS();
          for (SEARCHRESULTS_DOCUMENTS searchResultsDocument : searchResultsDocuments)
          {
            long docId = searchResultsDocument.getDocId();
            Long docKey = Long.valueOf(docId);
            MutableInteger counterMapItem = counterMap.get(docKey);
            if (counterMapItem == null)
            {
              counterMapItem = new MutableInteger();
              counterMap.put(docKey,counterMapItem);
            }

            String propertyName = categoryPropertyStringValues[counterMapItem.getValue()];
            counterMapItem.increment();
            // FIX: read the value from the current row (the loop variable).  The
            // original code indexed with the stale counter "i" left over from the
            // fetchIds fill loop above, so every row yielded the wrong (or an
            // out-of-bounds) element.
            String propertyValue = searchResultsDocument.getStr_value();
            Map<String,String> propertyMap = documentPropertyMap.get(docKey);
            if (propertyMap == null)
            {
              propertyMap = new HashMap<String,String>();
              documentPropertyMap.put(docKey,propertyMap);
            }
            if (propertyValue != null && propertyValue.length() > 0)
              propertyMap.put(propertyName,propertyValue);
          }
        }
      }

      // Okay, we are ready now to go through the individual documents and do the ingestion or deletion.
      for (String documentIdentifier : documentIdentifiers)
      {
        Long docKey = Long.valueOf(documentIdentifier);
        long docId = docKey.longValue();
        String docVersion = versionStrings.get(docKey);
        if (docVersion != null)
        {
          if (Logging.connectors.isDebugEnabled())
            Logging.connectors.debug("Processing document identifier '" + documentIdentifier + "' " +
            "with version string '" + docVersion + "'");

          // For each document, be sure the job is still allowed to run.
          activities.checkJobStillActive();

          RepositoryDocument repositoryDocument = new RepositoryDocument();

          // Load the metadata items into the ingestion document object
          Map<String,String> docMetadataMap = documentPropertyMap.get(docKey);
          if (docMetadataMap != null)
          {
            for (String categoryPropertyName : categoryPropertyStringValues)
            {
              String propertyValue = docMetadataMap.get(categoryPropertyName);
              if (propertyValue != null && propertyValue.length() > 0)
                repositoryDocument.addField(categoryPropertyName,propertyValue);
            }
          }

          /*=================================================================
          * Construct the URL to the object
          *
          * HTTP://HOST:PORT/meridio/browse/downloadcontent.aspx?documentId=<docId>&launchMode=1&launchAs=0
          *
          * I expect we need to add additional parameters to the configuration
          * specification
          *================================================================*/
          String fileURL = urlBase + docId;
          if (Logging.connectors.isDebugEnabled())
            Logging.connectors.debug("URL for document '" + docId + "' is '" + fileURL + "'");

          /*=================================================================
          * Get the object's ACLs and owner information
          *================================================================*/
          DMDataSet documentData = null;
          documentData = meridio_.getDocumentData((int)docId, true, true, false, false,
            DmVersionInfo.LATEST, false, false, false);

          if (null == documentData)
          {
            if (Logging.connectors.isDebugEnabled())
              Logging.connectors.debug("Meridio: Could not retrieve document data for document id '" +
              docId + "' in processDocuments method - deleting document.");
            activities.noDocument(documentIdentifier,docVersion);
            continue;
          }

          if (null == documentData.getDOCUMENTS() ||
            documentData.getDOCUMENTS().length != 1)
          {
            if (Logging.connectors.isDebugEnabled())
              Logging.connectors.debug("Meridio: Could not retrieve document owner for document id '" +
              docId + "' in processDocuments method. No information or incorrect amount " +
              "of information was returned");
            activities.noDocument(documentIdentifier,docVersion);
            continue;
          }

          // Do path metadata: add one path-attribute field per part-list entry,
          // run through the configured path map.
          if (pathAttributeName != null && pathAttributeName.length() > 0)
          {
            if (Logging.connectors.isDebugEnabled())
              Logging.connectors.debug("Meridio: Path attribute name is "+pathAttributeName);
            RMDataSet partList;
            int recordType = documentData.getDOCUMENTS()[0].getPROP_recordType();
            // Record types 0, 4, and 19 are records and use the record part list;
            // everything else uses the document part list.
            if (recordType == 0 || recordType == 4 || recordType == 19)
              partList = meridio_.getRecordPartList((int)docId, false, false);
            else
              partList = meridio_.getDocumentPartList((int)docId);
            if (partList != null)
            {
              if (Logging.connectors.isDebugEnabled())
                Logging.connectors.debug("Meridio: Document '"+docId+"' has a part list with "+Integer.toString(partList.getRm2vPart().length)+" values");

              for (int k = 0; k < partList.getRm2vPart().length; k++)
              {
                repositoryDocument.addField(pathAttributeName,matchMap.translate(partList.getRm2vPart()[k].getParentTitlePath()));
              }
            }
            else
            {
              if (Logging.connectors.isDebugEnabled())
                Logging.connectors.debug("Meridio: Document '"+docId+"' has no part list, so no path attribute");
            }
          }

          // Process acls.  If there are forced acls, use those, otherwise get them from Meridio.
          String [] allowAcls;
          String [] denyAcls;

          // forcedAcls will be null if security is off, or nonzero length if security is on but hard-wired
          if (acls != null && acls.length == 0)
          {
            ACCESSCONTROL [] documentAcls = documentData.getACCESSCONTROL();
            List<String> allowAclsArrayList = new ArrayList<String>();
            List<String> denyAclsArrayList = new ArrayList<String>();

            // Allow a broken authority to disable all Meridio documents, even if the document is 'wide open', because
            // Meridio does not permit viewing of the document if the user does not exist (at least, I don't know of a way).
            denyAclsArrayList.add(defaultAuthorityDenyToken);

            if (documentAcls != null)
            {
              for (int j = 0; j < documentAcls.length; j++)
              {
                if (Logging.connectors.isDebugEnabled())
                  Logging.connectors.debug(
                  "Object Id '" + documentAcls[j].getObjectId() + "' " +
                  "Object Type '" + documentAcls[j].getObjectType() + "' " +
                  "Permission '" + documentAcls[j].getPermission() + "' " +
                  "User Id '" + documentAcls[j].getUserId() + "' " +
                  "Group Id '" + documentAcls[j].getGroupId() + "'");

                if (documentAcls[j].getPermission() == 0) // prohibit permission
                {
                  if (documentAcls[j].getGroupId() > 0)
                  {
                    denyAclsArrayList.add("G" + documentAcls[j].getGroupId());
                  } else if (documentAcls[j].getUserId() > 0)
                  {
                    denyAclsArrayList.add("U" + documentAcls[j].getUserId());
                  }
                }
                else // read, amend or manage
                {
                  if (documentAcls[j].getGroupId() > 0)
                  {
                    allowAclsArrayList.add("G" + documentAcls[j].getGroupId());
                  } else if (documentAcls[j].getUserId() > 0)
                  {
                    allowAclsArrayList.add("U" + documentAcls[j].getUserId());
                  }
                }
              }
            }

            DOCUMENTS document = documentData.getDOCUMENTS()[0];

            if (Logging.connectors.isDebugEnabled())
              Logging.connectors.debug("Document id '" + docId + "' is owned by owner id '" +
              document.getPROP_ownerId() + "' having the owner name '" +
              document.getPROP_ownerName() + "' Record Type is '" +
              document.getPROP_recordType() + "'");

            if (document.getPROP_recordType() == 4 ||
              document.getPROP_recordType() == 19)
            {
              RMDataSet rmds = meridio_.getRecord((int)docId, false, false, false);
              Rm2vRecord record = rmds.getRm2vRecord()[0];

              if (Logging.connectors.isDebugEnabled())
                Logging.connectors.debug("Record User Id Owner is '" + record.getOwnerID() +
                "' Record Group Owner Id is '" + record.getGroupOwnerID() + "'");

              /*=================================================================
              * Either a group or a user owns a record, cannot be both and the
              * group takes priority if it is set
              *================================================================*/
              if (record.getGroupOwnerID() > 0)
              {
                allowAclsArrayList.add("G" + record.getGroupOwnerID());
              } else if (record.getOwnerID() > 0)
              {
                allowAclsArrayList.add("U" + record.getOwnerID());
              }
            }
            else
            {
              allowAclsArrayList.add("U" + document.getPROP_ownerId());
            }

            /*=================================================================
            * Set up the string arrays and then set the ACLs in the
            * repository document
            *================================================================*/
            allowAcls = new String[allowAclsArrayList.size()];
            for (int j = 0; j < allowAclsArrayList.size(); j++)
            {
              allowAcls[j] = allowAclsArrayList.get(j);
              if (Logging.connectors.isDebugEnabled())
                Logging.connectors.debug("Meridio: Adding '" + allowAcls[j] + "' to allow ACLs");
            }

            denyAcls = new String[denyAclsArrayList.size()];
            for (int j = 0; j < denyAclsArrayList.size(); j++)
            {
              denyAcls[j] = denyAclsArrayList.get(j);
              if (Logging.connectors.isDebugEnabled())
                Logging.connectors.debug("Meridio: Adding '" + denyAcls[j] + "' to deny ACLs");
            }
          }
          else
          {
            // Forced ACLs (or security off, in which case both arrays are null).
            allowAcls = acls;
            if (allowAcls == null)
              denyAcls = null;
            else
              denyAcls = new String[]{defaultAuthorityDenyToken};
          }

          repositoryDocument.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT,allowAcls,denyAcls);

          /*=================================================================
          * Get the object's content, and ingest the document
          *================================================================*/
          try
          {
            AttachmentPart ap = meridio_.getLatestVersionFile((int)docId);
            if (null == ap)
            {
              if (Logging.connectors.isDebugEnabled())
                Logging.connectors.debug("Meridio: Failed to get content for document '" + docId + "'");
              // No document.  Delete what's there
              activities.noDocument(documentIdentifier,docVersion);
              continue;
            }
            try
            {
              // Get the file name.  Axis writes the attachment to a local temp
              // file; we read the content from that file rather than from the
              // attachment stream.
              String fileName = ap.getDataHandler().getName();
              // Log what we are about to do.
              if (Logging.connectors.isDebugEnabled())
                Logging.connectors.debug("Meridio: File data is supposedly in "+fileName);
              File theTempFile = new File(fileName);
              if (theTempFile.isFile())
              {
                long fileSize = theTempFile.length();   // ap.getSize();
                if (activities.checkLengthIndexable(fileSize))
                {
                  InputStream is = new FileInputStream(theTempFile); // ap.getDataHandler().getInputStream();
                  try
                  {
                    repositoryDocument.setBinary(is, fileSize);
                    activities.ingestDocumentWithException(documentIdentifier, docVersion,
                      fileURL, repositoryDocument);
                  }
                  finally
                  {
                    is.close();
                  }
                }
                else
                {
                  // Document is too long to index; record that fact.
                  activities.noDocument(documentIdentifier, docVersion);
                  continue;
                }
              }
              else
              {
                if (Logging.connectors.isDebugEnabled())
                  Logging.connectors.debug("Meridio: Expected temporary file was not present - skipping document '"+docId + "'");
                activities.deleteDocument(documentIdentifier);
                continue;
              }
            }
            finally
            {
              // Always release the attachment's temp-file resources.
              ap.dispose();
            }
          }
          catch (java.net.SocketTimeoutException ioex)
          {
            throw new ManifoldCFException("Socket timeout exception: "+ioex.getMessage(), ioex);
          }
          catch (ConnectTimeoutException ioex)
          {
            throw new ManifoldCFException("Connect timeout exception: "+ioex.getMessage(), ioex);
          }
          catch (InterruptedIOException e)
          {
            throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
          }
          catch (org.apache.axis.AxisFault e)
          {
            // Let the outer AxisFault handler classify this (session expiry, http error, etc.)
            throw e;
          }
          catch (RemoteException e)
          {
            throw e;
          }
          catch (SOAPException soapEx)
          {
            throw new ManifoldCFException("SOAP Exception encountered while retrieving document content: "+soapEx.getMessage(),
              soapEx);
          }
          catch (IOException ioex)
          {
            throw new ManifoldCFException("Input stream failure: "+ioex.getMessage(), ioex);
          }
        }
      }
      Logging.connectors.debug("Meridio: Exiting 'processDocuments' method");
      return;
    }
    catch (org.apache.axis.AxisFault e)
    {
      long currentTime = System.currentTimeMillis();
      if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://xml.apache.org/axis/","HTTP")))
      {
        org.w3c.dom.Element elem = e.lookupFaultDetail(new javax.xml.namespace.QName("http://xml.apache.org/axis/","HttpErrorCode"));
        if (elem != null)
        {
          elem.normalize();
          String httpErrorCode = elem.getFirstChild().getNodeValue().trim();
          throw new ManifoldCFException("Unexpected http error code "+httpErrorCode+" accessing Meridio: "+e.getMessage(),e);
        }
        throw new ManifoldCFException("Unknown http error occurred while getting doc versions: "+e.getMessage(),e);
      }
      if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://schemas.xmlsoap.org/soap/envelope/","Server.userException")))
      {
        String exceptionName = e.getFaultString();
        if (exceptionName.equals("java.lang.InterruptedException"))
          throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED);
      }
      if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://schemas.xmlsoap.org/soap/envelope/","Server")))
      {
        if (e.getFaultString().indexOf(" 23031#") != -1)
        {
          // This means that the session has expired, so reset it and retry
          meridio_ = null;
          continue;
        }
      }

      if (Logging.connectors.isDebugEnabled())
        Logging.connectors.debug("Meridio: Got an unknown remote exception getting doc versions - axis fault = "+e.getFaultCode().getLocalPart()+", detail = "+e.getFaultString()+" - retrying",e);
      // Transient: schedule a retry in 5 minutes, give up after 3 hours.
      throw new ServiceInterruption("Remote procedure exception: "+e.getMessage(), e, currentTime + 300000L,
        currentTime + 3 * 60 * 60000L,-1,false);
    }
    catch (RemoteException remoteException)
    {
      throw new ManifoldCFException("Meridio: A remote exception occurred while getting doc versions: " +
        remoteException.getMessage(), remoteException);
    }
    catch (MeridioDataSetException meridioDataSetException)
    {
      throw new ManifoldCFException("Meridio: A problem occurred manipulating the Web " +
        "Service XML: "+meridioDataSetException.getMessage(), meridioDataSetException);
    }
  }
}