in connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java [973:1308]
/**
 * Processes a batch of Google Drive document identifiers.
 *
 * <p>For every identifier the file is fetched with {@code getObject}. Entries that are missing,
 * explicitly trashed, or shortcuts are removed through {@code activities.deleteDocument}. Regular
 * files get a version string built from the forced ACLs plus the RFC 3339 modified time; folders
 * get an empty version string, so they are processed on every pass.
 *
 * <p>Folders have their children enumerated on a background {@code GetChildrenThread} and queued
 * as child references. Files are checked against the length / URL / mime type / date filters and,
 * if accepted, streamed through a {@code DocumentReadingThread} and ingested. For files, an
 * activity record is written whenever an outcome code was set.
 *
 * @param documentIdentifiers the identifiers to process
 * @param statuses not referenced by this implementation
 * @param spec the specification from which the forced ACLs are read
 * @param activities the handle used to queue children, ingest, exclude, delete and record activity
 * @param jobMode not referenced by this implementation
 * @param usesDefaultAuthority not referenced by this implementation
 * @throws ManifoldCFException on failure; carries the {@code INTERRUPTED} code when the worker
 *     or one of its background threads was interrupted
 * @throws ServiceInterruption declared by the signature; presumably raised by
 *     {@code handleIOException} (defined outside this method - confirm there)
 */
public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
  IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
  throws ManifoldCFException, ServiceInterruption {
  // Forced acls, sorted so that the packed version string does not depend on their order.
  String[] acls = getAcls(spec);
  java.util.Arrays.sort(acls);

  for (String documentIdentifier : documentIdentifiers) {
    File googleFile = getObject(documentIdentifier);
    if (Logging.connectors.isDebugEnabled()) {
      Logging.connectors.debug("GOOGLEDRIVE: Processing document identifier '"
        + documentIdentifier + "'");
      // The file may be missing; only log its name when there is one.
      if (googleFile != null) {
        Logging.connectors.debug("GOOGLEDRIVE: have this file:\t" + googleFile.getName());
      }
    }

    if (googleFile == null
      || (googleFile.containsKey("explicitlyTrashed") && googleFile.getExplicitlyTrashed())
      || "application/vnd.google-apps.shortcut".equals(googleFile.getMimeType())) {
      // It is gone, trashed, or only a shortcut: remove it and move on.
      activities.deleteDocument(documentIdentifier);
      continue;
    }

    String versionString;
    if (!isDir(googleFile)) {
      // The modified time may be absent; treat that the same as an empty revision.
      DateTime modifiedTime = googleFile.getModifiedTime();
      String rev = (modifiedTime == null) ? null : modifiedTime.toStringRfc3339();
      if (StringUtils.isNotEmpty(rev)) {
        StringBuilder sb = new StringBuilder();
        // Acls
        packList(sb,acls,'+');
        if (acls.length > 0) {
          sb.append('+');
          pack(sb,defaultAuthorityDenyToken,'+');
        } else {
          sb.append('-');
        }
        sb.append(rev);
        versionString = sb.toString();
      } else {
        // A google document that doesn't contain versioning information will NEVER be processed.
        activities.deleteDocument(documentIdentifier);
        continue;
      }
    } else {
      // A google folder will always be processed.
      versionString = StringUtils.EMPTY;
    }

    if (versionString.length() == 0 || activities.checkDocumentNeedsReindexing(documentIdentifier,versionString)) {
      long startTime = System.currentTimeMillis();
      String errorCode = null;
      String errorDesc = StringUtils.EMPTY;
      Long fileSize = null;
      boolean doLog = false;
      String nodeId = documentIdentifier;
      String version = versionString;
      try {
        if (Logging.connectors.isDebugEnabled()) {
          Logging.connectors.debug("GOOGLEDRIVE: Processing document identifier '"
            + nodeId + "'");
          Logging.connectors.debug("GOOGLEDRIVE: have this file:\t" + googleFile.getName());
        }

        if ("application/vnd.google-apps.folder".equals(googleFile.getMimeType())) {
          // If directory, add its children.
          if (Logging.connectors.isDebugEnabled()) {
            Logging.connectors.debug("GOOGLEDRIVE: its a directory");
          }

          // Adding all the children + subdirs for a folder.
          getSession();
          GetChildrenThread t = new GetChildrenThread(nodeId);
          try {
            t.start();
            boolean wasInterrupted = false;
            try {
              XThreadStringBuffer childBuffer = t.getBuffer();
              // Pick up the paths, and add them to the activities, before we join with the child thread.
              while (true) {
                // The only kind of exceptions this can throw are going to shut the process down.
                String child = childBuffer.fetch();
                if (child == null)
                  break;
                // Add the child to the queue.
                activities.addDocumentReference(child, nodeId, RELATIONSHIP_CHILD);
              }
            } catch (InterruptedException e) {
              wasInterrupted = true;
              throw e;
            } catch (ManifoldCFException e) {
              if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
                wasInterrupted = true;
              throw e;
            } finally {
              // finishUp() is skipped after an interruption.
              if (!wasInterrupted)
                t.finishUp();
            }
          } catch (InterruptedException e) {
            t.interrupt();
            throw new ManifoldCFException("Interrupted: " + e.getMessage(), e,
              ManifoldCFException.INTERRUPTED);
          } catch (java.net.SocketTimeoutException e) {
            // Must precede InterruptedIOException: SocketTimeoutException is a subclass of it.
            Logging.connectors.warn("GOOGLEDRIVE: Socket timeout adding child documents: " + e.getMessage(), e);
            handleIOException(e);
          } catch (InterruptedIOException e) {
            t.interrupt();
            throw new ManifoldCFException("Interrupted: " + e.getMessage(), e,
              ManifoldCFException.INTERRUPTED);
          } catch (IOException e) {
            Logging.connectors.warn("GOOGLEDRIVE: Error adding child documents: " + e.getMessage(), e);
            handleIOException(e);
          }
        } else {
          // Its a file; from here on an activity record is written in the finally block.
          doLog = true;
          if (Logging.connectors.isDebugEnabled()) {
            Logging.connectors.debug("GOOGLEDRIVE: its a file");
          }

          // Get the file length; a missing size is treated as a zero-length document.
          Long sizeObject = googleFile.getSize();
          long fileLength = (sizeObject != null) ? sizeObject.longValue() : 0L;

          // Now do standard stuff
          String mimeType = googleFile.getMimeType();
          DateTime createdDateObject = googleFile.getCreatedTime();
          DateTime modifiedDateObject = googleFile.getModifiedTime();
          String extension = googleFile.getFileExtension();
          String title = cleanupFileFolderName(googleFile.getName());
          Date createdDate = (createdDateObject==null)?null:new Date(createdDateObject.getValue());
          Date modifiedDate = (modifiedDateObject==null)?null:new Date(modifiedDateObject.getValue());

          // Google spreadsheets, documents and presentations are requested in the matching
          // OpenXML format; every other mime type is requested as PDF.
          String documentURI = null;
          switch (mimeType) {
            case "application/vnd.google-apps.spreadsheet":
              documentURI = getUrl(googleFile, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
              break;
            case "application/vnd.google-apps.document":
              documentURI = getUrl(googleFile, "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
              break;
            case "application/vnd.google-apps.presentation":
              documentURI = getUrl(googleFile, "application/vnd.openxmlformats-officedocument.presentationml.presentation");
              break;
            default:
              documentURI = getUrl(googleFile, "application/pdf");
              break;
          }

          // Google native format documents may exist, but have 0 byte in size.
          // In cases like this, there is no way to export it, and because of that, it is going to be ignored
          if (documentURI == null) {
            errorCode = "NOLENGTH";
            errorDesc = "Document "+nodeId+" had no length; skipping";
            // Report the outcome like the other exclusion paths below do.
            activities.noDocument(nodeId,version);
            continue;
          }

          String fullContentPath = getDocumentContentPath(googleFile, documentURI);
          // Append the new parameters in the query string
          if (StringUtils.contains(documentURI, '?')) {
            documentURI = documentURI + "&" + CONTENT_PATH_PARAM + "=" + fullContentPath;
          } else {
            documentURI = documentURI + "?" + CONTENT_PATH_PARAM + "=" + fullContentPath;
          }

          if (!activities.checkLengthIndexable(fileLength)) {
            errorCode = activities.EXCLUDED_LENGTH;
            errorDesc = "Excluding document because of file length ('"+fileLength+"')";
            activities.noDocument(nodeId,version);
            continue;
          }

          if (!activities.checkURLIndexable(documentURI)) {
            errorCode = activities.EXCLUDED_URL;
            errorDesc = "Excluding document because of URL ('"+documentURI+"')";
            activities.noDocument(nodeId,version);
            continue;
          }

          if (!activities.checkMimeTypeIndexable(mimeType)) {
            errorCode = activities.EXCLUDED_MIMETYPE;
            errorDesc = "Excluding document because of mime type ("+mimeType+")";
            activities.noDocument(nodeId,version);
            continue;
          }

          if (!activities.checkDateIndexable(modifiedDate)) {
            errorCode = activities.EXCLUDED_DATE;
            errorDesc = "Excluding document because of date ("+modifiedDate+")";
            activities.noDocument(nodeId,version);
            continue;
          }

          RepositoryDocument rd = new RepositoryDocument();

          if (acls != null) {
            rd.setSecurityACL(RepositoryDocument.SECURITY_TYPE_DOCUMENT,acls);
            if (acls.length > 0) {
              String[] denyAclArray = new String[]{defaultAuthorityDenyToken};
              rd.setSecurityDenyACL(RepositoryDocument.SECURITY_TYPE_DOCUMENT,denyAclArray);
            }
          }

          if (mimeType != null)
            rd.setMimeType(getFixedMimeType(mimeType));
          if (createdDate != null)
            rd.setCreatedDate(createdDate);
          if (modifiedDate != null)
            rd.setModifiedDate(modifiedDate);

          // Build the file name: keep the title when it already ends with the extension,
          // otherwise append the extension (or one derived from the mime type when the file
          // has none), dropping a trailing dot if the result ends with one.
          if (title == null)
            title = "";
          String fileName;
          if (extension != null && StringUtils.endsWithIgnoreCase(title, "." + extension)) {
            fileName = title;
          } else {
            String suffix = (extension != null) ? extension : getExtensionByMimeType(mimeType);
            fileName = title + "." + suffix;
            if (StringUtils.endsWithIgnoreCase(fileName, ".")) {
              fileName = StringUtils.chomp(fileName, ".");
            }
          }
          rd.setFileName(fileName);

          // Get general document metadata; entries without a value are skipped.
          for (Entry<String, Object> entry : googleFile.entrySet()) {
            Object value = entry.getValue();
            if (value != null) {
              rd.addField(entry.getKey(), value.toString());
            }
          }

          // Fire up the document reading thread
          DocumentReadingThread t = new DocumentReadingThread(documentURI);
          try {
            t.start();
            boolean wasInterrupted = false;
            try {
              InputStream is = t.getSafeInputStream();
              try {
                // Can only index while background thread is running!
                List<String> sourcePath = new ArrayList<>();
                sourcePath.add(fullContentPath);
                rd.setSourcePath(sourcePath);
                // ingestion
                rd.setBinary(is, fileLength);
                activities.ingestDocumentWithException(nodeId, version, documentURI, rd);
              } finally {
                is.close();
              }
            } catch (ManifoldCFException e) {
              if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
                wasInterrupted = true;
              throw e;
            } catch (java.net.SocketTimeoutException e) {
              // Rethrown unchanged so the InterruptedIOException handler below (its
              // superclass) does not flag a plain timeout as an interruption.
              throw e;
            } catch (InterruptedIOException e) {
              wasInterrupted = true;
              throw e;
            } finally {
              // finishUp() is skipped after an interruption.
              if (!wasInterrupted)
                t.finishUp();
            }

            // No errors. Record the fact that we made it.
            fileSize = Long.valueOf(fileLength);
            errorCode = "OK";
          } catch (InterruptedException e) {
            t.interrupt();
            throw new ManifoldCFException("Interrupted: " + e.getMessage(), e,
              ManifoldCFException.INTERRUPTED);
          } catch (java.net.SocketTimeoutException e) {
            Logging.connectors.warn("GOOGLEDRIVE: Socket timeout reading document: " + e.getMessage(), e);
            handleIOException(e);
          } catch (InterruptedIOException e) {
            t.interrupt();
            throw new ManifoldCFException("Interrupted: " + e.getMessage(), e,
              ManifoldCFException.INTERRUPTED);
          } catch (IOException e) {
            errorCode = "IOEXCEPTION";
            errorDesc = e.getMessage();
            Logging.connectors.warn("GOOGLEDRIVE: Error reading document: " + e.getMessage(), e);
            handleIOException(e);
          }
        }
      } finally {
        // Also runs for the "continue" exits above, so exclusions are recorded too.
        if (doLog && errorCode != null)
          activities.recordActivity(Long.valueOf(startTime), ACTIVITY_READ,
            fileSize, nodeId, errorCode, errorDesc, null);
      }
    }
  }
}