def detect_duplicates()

in components/doc-registry/src/document_registry_service.py [0:0]


def detect_duplicates(folder_uri: str, registry_table: str) -> list:
    """Return the files in a folder that already exist in the document registry.

    Every document found under ``folder_uri`` is looked up in
    ``registry_table`` by its CRC32 checksum, compared in string form.

    Args:
        folder_uri: URI of the folder to scan; passed to ``GCSFolder``.
        registry_table: Registry table identifier; passed to
            ``look_up_document``.

    Returns:
        One dict per duplicated document, in folder listing order::

            {
                "doc": <URI of the document in the folder>,
                "existing_doc": {"uri": <registered URI>, "id": <registered id>},
            }

        The list is empty when no document in the folder matches the registry.
    """
    # List the folder exactly once and reuse the result: a second listing is
    # redundant work and could disagree with the first one (folder modified in
    # between, or a one-shot iterator that is already exhausted).
    documents = list(GCSFolder(folder_uri).get_documents_in_folder())
    crc32s = [str(doc.crc32) for doc in documents]

    # Key the matches by the string form of the checksum so the lookup below
    # agrees with the str() applied to each folder document's checksum.
    # If several registry rows share a checksum, the last one returned wins.
    registered = {
        str(row.crc32): row
        for row in look_up_document(registry_table, crc32s)
    }

    duplicates = []
    for doc, doc_crc32 in zip(documents, crc32s):
        if doc_crc32 not in registered:
            continue
        existing = registered[doc_crc32]
        duplicates.append(
            {
                "doc": doc.get_gcs_uri(),
                "existing_doc": {
                    "uri": existing.gcsUri,
                    "id": existing.id,
                },
            }
        )
    return duplicates