pdq/php/pdqhash.php (123 lines of code) (raw):

<?php // ================================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved // ================================================================ /** * A PDQ hash is simply 256 bits. The only string representation is 64 hex * digits. Bit-ordering is a concern, but is a non-issue as long as the * to-string and from-string methods in this class are consistent. */ class PDQHash { const PDQHASH_NUM_SLOTS = 16; const PDQHASH_HEX_LENGTH = 64; // 16 16-bit words: sliced this way for mutually indexed hashing (MIH). private $slots = array(); /** * The constructor is private since the only valid representation is * hex-string. Yet we have a PDQHash class, not just strings, due to lessons * learned from other projects. Separating ser/des format (string) from * internal representation helps avoid false negatives. */ private function __construct() { } public static function makeZeroesHash() { $hash = new PDQHash(); for ($i = 0; $i < self::PDQHASH_NUM_SLOTS; $i++) { $hash->slots[$i] = 0; } return $hash; } /** * Creates a new instance of the hash from a hexadecimal-formatted PDQ * hash string. Throws MalformedPDQHashException for improperly * formatted hashes. * * @param hex_string string representation of the hash in 'hex' format. * @return PDQHash newly created hash. * @throws MalformedPDQHashException if the given hash is not valid. */ public static function fromHexString( /*string*/ $hex_string )/*: PDQHash*/ { if (strlen($hex_string) !== self::PDQHASH_HEX_LENGTH) { throw new MalformedPDQHashException( $hex_string, sprintf( 'Expected hash to have length %d, received hash with length %d', self::PDQHASH_HEX_LENGTH, strlen($hex_string) ) ); } $hash = new PDQHash(); try { // Unpack goes 1-up not 0-up $slots = unpack('n*', hex2bin($hex_string)); } catch (ErrorException $_) { throw new MalformedPDQHashException( $hex_string, 'Length is 64 but is not pure hexadecimal.' ); } $hash->slots = array(); $k = self::PDQHASH_NUM_SLOTS - 1; foreach ($slots as $slot) { $hash->slots[$k] = $slot; $k--; } return $hash; } public function toHexString() { $string = ""; for ($i = self::PDQHASH_NUM_SLOTS - 1; $i >= 0; $i--) { // dechex doesn't zero-pad $string .= sprintf("%04x", $this->slots[$i]); } return $string; } public function to64BitStrings() { $strings = array(); for ($i = self::PDQHASH_NUM_SLOTS - 1; $i >= 0; $i-=4) { $strings[] = sprintf("%04x", $this->slots[$i]) . sprintf("%04x", $this->slots[$i-1]) . sprintf("%04x", $this->slots[$i-2]) . sprintf("%04x", $this->slots[$i-3]); } return $strings; } public function to32BitStrings() { $strings = array(); for ($i = self::PDQHASH_NUM_SLOTS - 1; $i >= 0; $i-=2) { $strings[] = sprintf("%04x", $this->slots[$i]) . sprintf("%04x", $this->slots[$i-1]); } return $strings; } public function to16BitStrings() { $strings = array(); for ($i = self::PDQHASH_NUM_SLOTS - 1; $i >= 0; $i--) { $strings[] = sprintf("%04x", $this->slots[$i]); } return $strings; } public function setBit(/*int*/ $bit_index) { $slot_index = (int)($bit_index / self::PDQHASH_NUM_SLOTS); $slot_bit_index = (int)($bit_index % self::PDQHASH_NUM_SLOTS); $this->slots[$slot_index] |= 1 << $slot_bit_index; } /** * The Hamming distance between this hash and another. * * @param The hash to compare this hash to. * @return int */ public function hammingDistanceTo(PDQHash $that) /*: int*/ { $sum = 0; for ($i = 0; $i < self::PDQHASH_NUM_SLOTS; $i++) { $sum += self::popCount16($this->slots[$i] ^ $that->slots[$i]); } return $sum; } public function isWithinHammingDistanceOf( /*PDQHash*/ $that, /*int*/ $threshold )/*: bool*/ { $current = 0; for ($i = 0; $i < self::PDQHASH_NUM_SLOTS; $i++) { $current += self::popCount16($this->slots[$i] ^ $that->slots[$i]); if ($current > $threshold) { return false; } } return true; } public function hammingNorm()/*: int*/ { $sum = 0; for ($i = 0; $i < self::PDQHASH_NUM_SLOTS; $i++) { $sum += self::popCount16($this->slots[$i]); } return $sum; } private static function popCount16(/*int */$n)/*: int*/ { $n -= (($n >> 1) & 0x5555); $n = ((($n >> 2) & 0x3333) + ($n & 0x3333)); $n = ((($n >> 4) + $n) & 0x0f0f); $n += ($n >> 8); return($n & 0x1f); } private static function popCount16Slow(/*int */$n)/*: int*/ { $count = 0; for ($count = 0; $n != 0; $count++, $n &= $n-1) { } return $count; } } // class PDQHash