in pdq/java/src/main/java/pdqhashing/tools/Clusterize256Tool.java [193:329]
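/**
 * Greedy single-pass ("snowball") clustering: each input hash is queried
 * against the MIH index for neighbors within distanceThreshold, the matches
 * are recorded as undirected edges keyed by metadata, and the rows of that
 * adjacency structure are folded into equivalence classes under a first-seen
 * representative. One line is printed per item, grouped by cluster.
 *
 * @param vectorOfPairs     hashes with their metadata (typically the same items indexed in mih)
 * @param mih               mutually-indexed-hashing structure used for neighbor lookup
 * @param separateClusters  if true, print a blank line between clusters
 * @param traceCount        if positive, print a progress marker every traceCount items
 * @param doBruteForceQuery if true, use a linear scan instead of the MIH lookup
 * @param distanceThreshold maximum Hamming distance for two hashes to count as a match
 */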
private static void snowballClusterize(
Vector<Hash256AndMetadata<String>> vectorOfPairs,
MIH256<String> mih,
boolean separateClusters,
int traceCount,
boolean doBruteForceQuery,
int distanceThreshold
) {
Map<String,Set<String>> adjacencyMatrix = new LinkedHashMap<String,Set<String>>();
Map<String,Hash256> metadataToHashes = new LinkedHashMap<String,Hash256>();
// INGEST DATA
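// For each input hash (the "needle"), find all indexed hashes within
// distanceThreshold and record the matches as edges in an adjacency
// structure keyed by metadata. Assuming the same items were inserted into
// the MIH, each needle matches at least itself (distance 0), so every item
// gets an adjacency row here and hence a representative below.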
int i = 0;
for (Hash256AndMetadata<String> hashAndMetadata : vectorOfPairs) {
Hash256 needleHash = hashAndMetadata.hash;
String needleMetadata = hashAndMetadata.metadata;
if (traceCount > 0) {
if ((i % traceCount) == 0) {
System.err.printf("o %d\n", i);
}
}
i++;
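// Collect every indexed hash within distanceThreshold of the needle,
// either via the MIH lookup or, if requested, a brute-force scan.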
Vector<Hash256AndMetadata<String>> matches = new Vector<Hash256AndMetadata<String>>();
try {
if (doBruteForceQuery) {
mih.bruteForceQueryAll(needleHash, distanceThreshold, matches);
} else {
mih.queryAll(needleHash, distanceThreshold, matches);
}
} catch (MIHDimensionExceededException e) {
System.err.printf("%s: %s\n", PROGNAME, e.getErrorMessage());
System.exit(1);
}
metadataToHashes.put(needleMetadata, needleHash);
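// Record each match as an undirected edge: needle -> haystack and
// haystack -> needle, creating adjacency rows on first use.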
for (Hash256AndMetadata<String> match : matches) {
Hash256 haystackHash = match.hash;
String haystackMetadata = match.metadata;
metadataToHashes.put(haystackMetadata, haystackHash);
if (adjacencyMatrix.get(needleMetadata) == null) {
adjacencyMatrix.put(needleMetadata, new LinkedHashSet<String>());
}
adjacencyMatrix.get(needleMetadata).add(haystackMetadata);
if (adjacencyMatrix.get(haystackMetadata) == null) {
adjacencyMatrix.put(haystackMetadata, new LinkedHashSet<String>());
}
adjacencyMatrix.get(haystackMetadata).add(needleMetadata);
}
}
// IDENTIFY CLUSTER REPRESENTATIVES
// For the sake of discussion suppose the item IDs are A, B, C, D, E.
// Input data includes the adjacency matrix
//
// A B C D E
// A * . * * .
// B . * . * .
// C * . * . .
// D * * . * .
// E . . . . *
//
// We expect to get [A,B,C,D] as one equivalence class and [E] as the other.
// Representatives are just the first-found, e.g. A and E respectively.
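// Note this is a single greedy pass over the rows, not a full transitive
// closure: each unvisited row is assigned either its own key or the first
// already-assigned representative found among its neighbors.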
Map<String,String> metadatasToClusterRepresentatives = new LinkedHashMap<String,String>();
// For each row of the adjacency matrix:
for (Map.Entry<String,Set<String>> row : adjacencyMatrix.entrySet()) {
String metadata_i = row.getKey();
Set<String> metadata_js = row.getValue();
// Skip items already visited, i.e. found via an off-diagonal entry on a previous row.
if (metadatasToClusterRepresentatives.get(metadata_i) != null) {
continue;
}
// Each row of the adjacency matrix contributes to an equivalence class:
// e.g. the top row of the above example gives [A,C,D]. The only question
// is whether that row stands alone or joins something already seen: the
// second row gives [B,D], but D was already seen on the first row.
// Find a representative for this item: an already-seen representative of
// some item in the row if there is one, else this row's own key.
String representative = metadata_i; // E.g. A on first row, B on second row
for (String metadata_j : metadata_js) {
String other = metadatasToClusterRepresentatives.get(metadata_j);
if (other != null) {
representative = other;
break;
}
}
// Mark all the items in the current row as having that representative
for (String metadata_j : metadata_js) {
metadatasToClusterRepresentatives.put(metadata_j, representative);
}
}
// FORM EQUIVALENCE CLASSES
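// Group every ingested item under its representative: map keys are the
// representatives, values are the members of each cluster.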
Map<String,Set<String>> equivalenceClasses = new LinkedHashMap<String,Set<String>>();
for (Map.Entry<String,Hash256> entry : metadataToHashes.entrySet()) {
String metadata = entry.getKey();
String representative = metadatasToClusterRepresentatives.get(metadata);
if (equivalenceClasses.get(representative) == null) {
equivalenceClasses.put(representative, new LinkedHashSet<String>());
}
equivalenceClasses.get(representative).add(metadata);
}
// OUTPUT
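// One line per item: clidx (1-based cluster index), clusz (cluster size),
// the item's hash, and its metadata. When separateClusters is set, clusters
// are separated by a blank line.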
int clusterIndex = 0;
for (Map.Entry<String,Set<String>> entry : equivalenceClasses.entrySet()) {
Set<String> equivalenceClass = entry.getValue();
clusterIndex++;
int clusterSize = equivalenceClass.size();
if (separateClusters && clusterIndex > 1) {
System.out.println();
}
for (String metadata : equivalenceClass) {
System.out.printf("clidx=%d,clusz=%d,hash=%s,%s\n",
clusterIndex,
clusterSize,
metadataToHashes.get(metadata).toString(),
metadata);
}
}
}
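// A minimal calling sketch (hypothetical values; the real tool derives its
// arguments from command-line options elsewhere in this file):
//
//   Vector<Hash256AndMetadata<String>> pairs = ...; // hashes+metadata read from input
//   MIH256<String> mih = ...;                       // the same pairs, indexed for MIH lookup
//   snowballClusterize(pairs, mih,
//     /*separateClusters=*/true,
//     /*traceCount=*/0,
//     /*doBruteForceQuery=*/false,
//     /*distanceThreshold=*/31); // e.g. the commonly used PDQ match threshold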