in components/doc-registry/src/document_registry_service.py [0:0]
def detect_duplicates(folder_uri: str, registry_table: str) -> list:
    """Return the files in a folder that already exist in the document registry.

    Each document found under ``folder_uri`` is compared, by the string form
    of its ``crc32`` attribute, against the rows returned by
    ``look_up_document`` for ``registry_table``.

    Args:
        folder_uri: URI of the folder to check; passed as-is to ``GCSFolder``.
        registry_table: Registry table identifier; passed as-is to
            ``look_up_document``.

    Returns:
        A list with one entry per document in the folder whose checksum
        matches a registry row. Each entry has the form::

            {
                "doc": <URI of the document in the folder>,
                "existing_doc": {"uri": <row.gcsUri>, "id": <row.id>},
            }

        The list is empty when nothing matches or the folder has no documents.
    """
    folder_to_check = GCSFolder(folder_uri)

    # List the folder exactly once and keep each document next to its checksum
    # string, so the lookup and the comparison below work on the same snapshot
    # of the folder and the listing is not repeated.
    docs_with_crc32 = [
        (doc, str(doc.crc32))
        for doc in folder_to_check.get_documents_in_folder()
    ]
    if not docs_with_crc32:
        # Nothing to compare: the result is necessarily empty, so skip the
        # registry lookup altogether.
        return []

    crc32s = [crc32 for _, crc32 in docs_with_crc32]
    matches_found = look_up_document(registry_table, crc32s)

    # If several registry rows share a checksum, the last one returned wins.
    match_dict = {row.crc32: row for row in matches_found}

    duplicates = []
    for doc, doc_crc32 in docs_with_crc32:
        existing = match_dict.get(doc_crc32)
        if existing is None:
            continue
        duplicates.append(
            {
                "doc": doc.get_gcs_uri(),
                "existing_doc": {
                    "uri": existing.gcsUri,
                    "id": existing.id,
                },
            }
        )
    return duplicates