hasher-matcher-actioner/hmalib/lambdas/fetcher.py
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Implementation of the "fetcher" module of HMA.
Fetching involves connecting to the ThreatExchange API and downloading
signals to synchronize a local copy of the database, which will then
be fed into various indices.
"""
import time
import os
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime
from functools import lru_cache

import boto3

from threatexchange.api import ThreatExchangeAPI
from threatexchange.signal_type.pdq import PdqSignal
from threatexchange.signal_type.md5 import VideoMD5Signal
from threatexchange.threat_updates import ThreatUpdateJSON

from hmalib.aws_secrets import AWSSecrets
from hmalib.common.config import HMAConfig
from hmalib.common.logging import get_logger
from hmalib.common.configs.fetcher import ThreatExchangeConfig
from hmalib.common.s3_adapters import ThreatUpdateS3Store

logger = get_logger(__name__)
dynamodb = boto3.resource("dynamodb")

# In one fetcher run, how many descriptor updates to fetch per privacy group
# using /threat_updates
MAX_DESCRIPTORS_UPDATED = 20000

# Log progress while polling /threat_updates at most once every this many seconds
PROGRESS_PRINT_INTERVAL_SEC = 20
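

# boto3 clients are comparatively expensive to construct; caching lets a warm
# Lambda container reuse a single client across invocations.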
@lru_cache(maxsize=None)
def get_s3_client():
return boto3.client("s3")


# Lambda init tricks
@lru_cache(maxsize=1)
def lambda_init_once():
    """
    Do some late initialization for required lambda components.

    Lambda initialization is weird: despite the existence of perfectly
    good constructions like __name__ == "__main__", there doesn't appear
    to be an easy way to split your lambda-specific logic from your
    module logic, except by splitting up the files and making your
    lambda entry as small as possible.

    TODO: Just refactor this file to separate the lambda and functional
    components.
    """
cfg = FetcherConfig.get()
HMAConfig.initialize(cfg.config_table_name)


@dataclass
class FetcherConfig:
    """
    Simple holder for getting typed environment variables.
    """
s3_bucket: str
s3_te_data_folder: str
config_table_name: str
data_store_table: str

    @classmethod
    @lru_cache(maxsize=None)  # probably overkill, but at least it's consistent
    def get(cls):
        # Values come straight from the environment; tests can override them by
        # setting these variables before the first (cached) call.
return cls(
s3_bucket=os.environ["THREAT_EXCHANGE_DATA_BUCKET_NAME"],
s3_te_data_folder=os.environ["THREAT_EXCHANGE_DATA_FOLDER"],
config_table_name=os.environ["CONFIG_TABLE_NAME"],
data_store_table=os.environ["DYNAMODB_DATASTORE_TABLE"],
)
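

# Illustrative usage (the environment variable names above are the real ones;
# the values here are made up):
#
#   os.environ["THREAT_EXCHANGE_DATA_BUCKET_NAME"] = "my-hma-te-data"
#   os.environ["THREAT_EXCHANGE_DATA_FOLDER"] = "threat_exchange_data/"
#   os.environ["CONFIG_TABLE_NAME"] = "HMAConfig"
#   os.environ["DYNAMODB_DATASTORE_TABLE"] = "HMADataStore"
#   cfg = FetcherConfig.get()  # cached: later calls return the same instance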


def is_int(int_string: str) -> bool:
    """
    Checks whether a string is convertible to an int.
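
    Doctest-style examples (illustrative values):

    >>> is_int("303636684709969")
    True
    >>> is_int("not-an-id")
    False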
"""
try:
int(int_string)
return True
except ValueError:
return False


class ProgressLogger:
    """
    Counts the number of items processed via /threat_updates, logging progress
    at most once every PROGRESS_PRINT_INTERVAL_SEC seconds. Instances are
    callable, so they can be passed anywhere a progress callback is expected.
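
    Usage (exactly as in lambda_handler below):

        delta.incremental_sync_from_threatexchange(
            api, limit=MAX_DESCRIPTORS_UPDATED, progress_fn=ProgressLogger()
        )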
"""

    def __init__(self):
self.processed = 0
self.last_update_time = None
        self.counts = defaultdict(int)
self.last_update_printed = 0

    def __call__(self, update: ThreatUpdateJSON):
self.processed += 1
self.counts[update.threat_type] += -1 if update.should_delete else 1
self.last_update_time = update.time
now = time.time()
if now - self.last_update_printed >= PROGRESS_PRINT_INTERVAL_SEC:
self.last_update_printed = now
logger.info("threat_updates/: processed %d descriptors.", self.processed)


def lambda_handler(_event, _context):
    """
    Run through ThreatExchange privacy groups and fetch updates to each. If
    this is the first fetch for a privacy group, fetch from the beginning of
    time; otherwise fetch only the updates since the last checkpoint.

    Note: since this is a scheduled job, we swallow all exceptions; we only
    log them and move on.
    """
lambda_init_once()
config = FetcherConfig.get()
collabs = ThreatExchangeConfig.get_all()
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
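    # Log at most five collab names, overwriting the fifth with "..." when the
    # list is longer.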
names = [collab.privacy_group_name for collab in collabs[:5]]
if len(names) < len(collabs):
names[-1] = "..."
data = f"Triggered at time {current_time}, found {len(collabs)} collabs: {', '.join(names)}"
logger.info(data)
api_token = AWSSecrets().te_api_token()
api = ThreatExchangeAPI(api_token)
for collab in collabs:
logger.info(
"Processing updates for collaboration %s", collab.privacy_group_name
)
if not is_int(collab.privacy_group_id):
logger.info(
f"Fetch skipped because privacy_group_id({collab.privacy_group_id}) is not an int"
)
continue
        if not collab.fetcher_active:
            logger.info(
                f"Fetch skipped because config has `fetcher_active` set to false for privacy_group_id({collab.privacy_group_id})"
            )
            continue
indicator_store = ThreatUpdateS3Store(
int(collab.privacy_group_id),
api.app_id,
s3_client=get_s3_client(),
s3_bucket_name=config.s3_bucket,
s3_te_data_folder=config.s3_te_data_folder,
data_store_table=config.data_store_table,
supported_signal_types=[VideoMD5Signal, PdqSignal],
)
        # `delta` must exist for the bookkeeping after the try block, even if
        # we fail before producing one.
        delta = None
        try:
indicator_store.load_checkpoint()
if indicator_store.stale:
logger.warning(
"Store for %s - %d stale! Resetting.",
collab.privacy_group_name,
int(collab.privacy_group_id),
)
indicator_store.reset()
            if indicator_store.fetch_checkpoint >= now.timestamp():
                # Checkpoint is already at or past this run's start time;
                # nothing new to fetch for this privacy group.
                continue
delta = indicator_store.next_delta
delta.incremental_sync_from_threatexchange(
api, limit=MAX_DESCRIPTORS_UPDATED, progress_fn=ProgressLogger()
)
        except Exception:  # pylint: disable=broad-except
            logger.exception(
                "Encountered exception while getting updates. Will attempt to save partial progress.."
            )
            if delta:
                # Force the delta to show finished so the partial fetch is saved
                delta.end = delta.current

        if delta:
            logger.info("Fetch complete, applying %d updates", len(delta.updates))
            indicator_store.apply_updates(
                delta, post_apply_fn=indicator_store.post_apply
            )
        else:
            logger.error("Failed before fetching any records")
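

# A minimal local smoke-test sketch (not part of the deployed lambda): it
# assumes the four environment variables read by FetcherConfig, AWS
# credentials, and a ThreatExchange API token secret are available locally.
# lambda_handler ignores its arguments, so None placeholders are fine.
if __name__ == "__main__":
    lambda_handler(None, None)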