python-threatexchange/threatexchange/extensions/text_tlsh/text_tlsh.py (36 lines of code) (raw):
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Wrapper around the TLSH text signal type.
"""
import re
import typing as t
from threatexchange.content_type.content_base import ContentType
from threatexchange.content_type.text import TextContent
from threatexchange.signal_type import signal_base
from threatexchange.signal_type.raw_text import RawTextSignal
import tlsh
# Default distance threshold used by TextTLSHSignal.compare_hash when the
# caller does not pass distance_threshold.
TLSH_CONFIDENT_MATCH_THRESHOLD = 30
# Expected length of a TLSH hash string: "T1" followed by 70 hexadecimal
# characters.
EXPECT_TLSH_HASH_LENGTH = 72
class TextTLSHSignal(signal_base.SimpleSignalType, signal_base.TextHasher):
    """
    Simple signal type for text using TLSH.

    Read about TLSH at https://github.com/trendmicro/tlsh
    """

    INDICATOR_TYPE = "HASH_TEXT_TLSH"

    @classmethod
    def get_content_types(cls) -> t.List[t.Type[ContentType]]:
        """Return the content types this signal applies to (text only)."""
        return [TextContent]

    @classmethod
    def validate_signal_str(cls, signal_str: str) -> str:
        """
        Check that signal_str is a well-formed TLSH hash and return it unchanged.

        A valid hash is 'T1' followed by 70 uppercase hexadecimal characters,
        for a total length of 72 characters.

        Raises:
            ValueError: if signal_str does not have that form.
        """
        # fullmatch instead of match + "$": "$" also matches just before a
        # trailing newline, which would let a hash with a stray newline through.
        if not re.fullmatch(r"T1[0-9A-F]{70}", signal_str):
            raise ValueError("invalid TLSH hash")
        return signal_str

    @classmethod
    def hash_from_str(cls, text: str) -> str:
        """
        Compute the TLSH hash of text.

        Returns the hash string, or an empty string when tlsh reports
        "TNULL" (no hash could be produced for this input).
        """
        hash_str = str(tlsh.hash(text.encode()))
        if hash_str == "TNULL":  # Likely too short
            return ""
        return hash_str

    @classmethod
    def compare_hash(
        cls, hash1: str, hash2: str, distance_threshold: t.Optional[int] = None
    ) -> signal_base.HashComparisonResult:
        """
        Compare two TLSH hashes.

        The distance is computed with tlsh.diffxlen and turned into a result
        by HashComparisonResult.from_dist. When distance_threshold is None,
        TLSH_CONFIDENT_MATCH_THRESHOLD is used.

        NOTE(review): hash_from_str can return "" for inputs it cannot hash;
        presumably tlsh.diffxlen rejects such a value - confirm callers never
        pass an empty hash here.
        """
        if distance_threshold is None:
            distance_threshold = TLSH_CONFIDENT_MATCH_THRESHOLD
        dist = tlsh.diffxlen(hash1, hash2)
        return signal_base.HashComparisonResult.from_dist(dist, distance_threshold)

    @staticmethod
    def get_examples() -> t.List[str]:
        """
        Return example hashes built from the RawTextSignal example texts.

        Texts for which hash_from_str yields no hash (an empty string) are
        skipped, so every returned example is an actual hash.
        """
        hashes = (
            TextTLSHSignal.hash_from_str(s) for s in RawTextSignal.get_examples()
        )
        return [h for h in hashes if h]