hasher-matcher-actioner/hmalib/indexers/metadata.py (28 lines of code) (raw):

# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

"""
Defines metadata objects that are indexed.

All hash indexes allow querying for a hash. This hash can be of type PDQ, MD5,
TLSH, etc. The index returns a list of IndexMatch objects.

Classes exposed in this module define the 'type' of the metadata attribute of
an IndexMatch object. This allows us to ensure that all IndexMatch.metadata
objects have a common shape so we can write logic around it.
"""

import typing as t
from dataclasses import asdict, dataclass, field


class BaseIndexMetadata:
    """
    Common interface for metadata objects stored alongside an index entry.

    Subclasses in this module are dataclasses; they override `get_source()`
    and inherit (or specialize) `to_json()`.
    """

    def get_source(self) -> str:
        """
        Return the short code identifying where this metadata came from.

        Raises:
            NotImplementedError: always, on the base class. Subclasses must
                override this method.
        """
        # `NotImplemented` is a sentinel for binary-operator fallbacks, not an
        # exception; raising it yields a confusing TypeError. The exception
        # class is `NotImplementedError`.
        raise NotImplementedError

    def to_json(self) -> t.Dict[str, t.Any]:
        """
        Return the fields of this object as a plain dict.

        Relies on `dataclasses.asdict`, so `self` must be a dataclass
        instance; calling this on a bare `BaseIndexMetadata` raises TypeError.
        """
        return asdict(self)


# Note changing this value will change S3ThreatDataConfig.SOURCE_STR. Per my
# lookup, this will not affect how data gets stored, only logs, but should
# figure it out.
THREAT_EXCHANGE_SOURCE_SHORT_CODE = "te"
BANKS_SOURCE_SHORT_CODE = "bnk"


@dataclass
class ThreatExchangeIndicatorIndexMetadata(BaseIndexMetadata):
    """
    A row of data returned from ThreatExchange. Should correspond to a single
    descriptor.
    """

    # ThreatExchange Indicator ID.
    indicator_id: str

    # Actual value of the hash, use a string representation.
    signal_value: str

    # Privacy group this row is associated with, stored as a single string.
    # NOTE(review): an earlier comment described this as a set of privacy
    # groups (an indicator can be in several), but the field holds one value;
    # confirm whether one row per (indicator, privacy group) is intended.
    privacy_group: str

    # Tags reported by threatexchange for this privacy group. Will not contain
    # all tags, but a sub-set.
    tags: t.Set[str] = field(default_factory=set)

    def get_source(self) -> str:
        """Return the ThreatExchange source short code."""
        return THREAT_EXCHANGE_SOURCE_SHORT_CODE

    def to_json(self) -> t.Dict[str, t.Any]:
        """
        Return the fields as a dict, with `tags` converted from a set to a
        list (element order follows set iteration order and is not sorted).
        """
        result = asdict(self)
        result.update(tags=list(self.tags))
        return result


@dataclass
class BankedSignalIndexMetadata(BaseIndexMetadata):
    """
    A row of data stored as a bank member signal. A more compact view so that
    we can store along the index.
    """

    # Bank signal_id, this is roughly the same as an indicator id in
    # ThreatExchange. Separate bank_members which hash to the same signal value
    # will have the same signal_id.
    signal_id: str

    # Actual value of the hash, use a string representation.
    signal_value: str

    # Along with a signal_id, this should suffice as a uniqueness constraint.
    bank_member_id: str

    def get_source(self) -> str:
        """Return the banks source short code."""
        return BANKS_SOURCE_SHORT_CODE