private static void snowballClusterize()

in pdq/java/src/main/java/pdqhashing/tools/Clusterize256Tool.java [193:329]


  /**
   * "Snowball" clustering: two items land in the same cluster if they are
   * connected by a chain of pairwise matches within distanceThreshold, i.e.
   * clusters are the connected components of the match graph. Prints one
   * line per item to stdout, tagged with its 1-based cluster index and the
   * cluster's size.
   *
   * @param vectorOfPairs hashes with their metadata (e.g. filenames); the
   *   index is assumed to already contain these hashes.
   * @param mih the mutually-indexed-hashing lookup structure to query.
   * @param separateClusters if true, emit a blank line between clusters.
   * @param traceCount if positive, print a progress marker to stderr every
   *   traceCount items during ingest.
   * @param doBruteForceQuery if true, use linear scan instead of the MIH
   *   index (useful for verification/benchmarking).
   * @param distanceThreshold maximum Hamming distance for two hashes to be
   *   considered a match.
   */
  private static void snowballClusterize(
    Vector<Hash256AndMetadata<String>> vectorOfPairs,
    MIH256<String> mih,
    boolean separateClusters,
    int traceCount,
    boolean doBruteForceQuery,
    int distanceThreshold
  ) {
    // metadata -> set of matched metadatas (symmetric adjacency list)
    Map<String,Set<String>> adjacencyMatrix = new LinkedHashMap<String,Set<String>>();
    // metadata -> its hash, for re-printing hashes in the output
    Map<String,Hash256> metadataToHashes = new LinkedHashMap<String,Hash256>();

    ingestMatches(vectorOfPairs, mih, traceCount, doBruteForceQuery,
      distanceThreshold, adjacencyMatrix, metadataToHashes);

    Map<String,String> metadatasToClusterRepresentatives =
      findClusterRepresentatives(adjacencyMatrix);

    Map<String,Set<String>> equivalenceClasses =
      formEquivalenceClasses(metadataToHashes, metadatasToClusterRepresentatives);

    printClusters(equivalenceClasses, metadataToHashes, separateClusters);
  }

  /**
   * Queries the index for every input hash and records (a) each metadata's
   * hash and (b) the symmetric match graph as an adjacency list. Self-matches
   * (distance 0) are expected to appear in the query results, so every
   * ingested item normally gets an adjacency row.
   *
   * Exits the process on an MIH dimension error, mirroring the tool's other
   * fatal-error handling.
   */
  private static void ingestMatches(
    Vector<Hash256AndMetadata<String>> vectorOfPairs,
    MIH256<String> mih,
    int traceCount,
    boolean doBruteForceQuery,
    int distanceThreshold,
    Map<String,Set<String>> adjacencyMatrix,
    Map<String,Hash256> metadataToHashes
  ) {
    int i = 0;
    for (Hash256AndMetadata<String> hashAndMetadata : vectorOfPairs) {
      Hash256 needleHash = hashAndMetadata.hash;
      String needleMetadata = hashAndMetadata.metadata;

      if (traceCount > 0 && (i % traceCount) == 0) {
        System.err.printf("o %d\n", i);
      }
      i++;

      Vector<Hash256AndMetadata<String>> matches = new Vector<Hash256AndMetadata<String>>();
      try {
        if (doBruteForceQuery) {
          mih.bruteForceQueryAll(needleHash, distanceThreshold, matches);
        } else {
          mih.queryAll(needleHash, distanceThreshold, matches);
        }
      } catch (MIHDimensionExceededException e) {
        System.err.printf("%s: %s\n", PROGNAME, e.getErrorMessage());
        System.exit(1);
      }

      metadataToHashes.put(needleMetadata, needleHash);
      for (Hash256AndMetadata<String> match : matches) {
        String haystackMetadata = match.metadata;
        metadataToHashes.put(haystackMetadata, match.hash);

        // Record the edge in both directions so the component walk can start
        // from either endpoint.
        Set<String> needleRow = adjacencyMatrix.get(needleMetadata);
        if (needleRow == null) {
          needleRow = new LinkedHashSet<String>();
          adjacencyMatrix.put(needleMetadata, needleRow);
        }
        needleRow.add(haystackMetadata);

        Set<String> haystackRow = adjacencyMatrix.get(haystackMetadata);
        if (haystackRow == null) {
          haystackRow = new LinkedHashSet<String>();
          adjacencyMatrix.put(haystackMetadata, haystackRow);
        }
        haystackRow.add(needleMetadata);
      }
    }
  }

  /**
   * Labels each item with its connected component's representative: the
   * first component member encountered in adjacency-list insertion order.
   *
   * For the sake of discussion suppose the item IDs are A, B, C, D, E with
   * adjacency matrix
   *
   *     A B C D E
   *   A * . * * .
   *   B . * . * .
   *   C * . * . .
   *   D * * . * .
   *   E . . . . *
   *
   * We expect [A,B,C,D] as one equivalence class and [E] as the other, with
   * representatives A and E respectively.
   *
   * A full traversal is used (rather than a single row-by-row pass) so that
   * a row which bridges two previously-visited rows cannot split a component
   * or reassign already-labeled items: every member of a component gets the
   * same representative.
   */
  private static Map<String,String> findClusterRepresentatives(
    Map<String,Set<String>> adjacencyMatrix
  ) {
    Map<String,String> metadatasToClusterRepresentatives = new LinkedHashMap<String,String>();

    for (String seed : adjacencyMatrix.keySet()) {
      if (metadatasToClusterRepresentatives.containsKey(seed)) {
        continue; // already labeled by an earlier component walk
      }

      // Depth-first walk of seed's component, with an explicit stack to
      // avoid deep recursion on large clusters. Vector is already the
      // file's collection of choice, so no new imports are needed.
      Vector<String> stack = new Vector<String>();
      stack.add(seed);
      metadatasToClusterRepresentatives.put(seed, seed);

      while (!stack.isEmpty()) {
        String current = stack.remove(stack.size() - 1);
        Set<String> neighbors = adjacencyMatrix.get(current);
        if (neighbors == null) {
          continue; // defensive: every stacked item should have a row
        }
        for (String neighbor : neighbors) {
          if (!metadatasToClusterRepresentatives.containsKey(neighbor)) {
            metadatasToClusterRepresentatives.put(neighbor, seed);
            stack.add(neighbor);
          }
        }
      }
    }
    return metadatasToClusterRepresentatives;
  }

  /**
   * Groups items by their cluster representative, preserving ingest order
   * both across clusters and within each cluster.
   */
  private static Map<String,Set<String>> formEquivalenceClasses(
    Map<String,Hash256> metadataToHashes,
    Map<String,String> metadatasToClusterRepresentatives
  ) {
    Map<String,Set<String>> equivalenceClasses = new LinkedHashMap<String,Set<String>>();
    for (String metadata : metadataToHashes.keySet()) {
      String representative = metadatasToClusterRepresentatives.get(metadata);
      if (representative == null) {
        // Not expected in practice: querying at threshold >= 0 should at
        // least self-match, giving every item an adjacency row. If it does
        // happen, make the item its own singleton cluster rather than
        // lumping all such items under a single null key.
        representative = metadata;
      }

      Set<String> equivalenceClass = equivalenceClasses.get(representative);
      if (equivalenceClass == null) {
        equivalenceClass = new LinkedHashSet<String>();
        equivalenceClasses.put(representative, equivalenceClass);
      }
      equivalenceClass.add(metadata);
    }
    return equivalenceClasses;
  }

  /**
   * Emits one "clidx=...,clusz=...,hash=...,metadata" line per item, with an
   * optional blank line between clusters.
   */
  private static void printClusters(
    Map<String,Set<String>> equivalenceClasses,
    Map<String,Hash256> metadataToHashes,
    boolean separateClusters
  ) {
    int clusterIndex = 0;
    for (Set<String> equivalenceClass : equivalenceClasses.values()) {
      clusterIndex++;
      int clusterSize = equivalenceClass.size();

      if (separateClusters && clusterIndex > 1) {
        System.out.println();
      }

      for (String metadata : equivalenceClass) {
        System.out.printf("clidx=%d,clusz=%d,hash=%s,%s\n",
          clusterIndex,
          clusterSize,
          metadataToHashes.get(metadata).toString(),
          metadata);
      }
    }
  }