pdq/php/pdqhash.php (123 lines of code) (raw):
<?php
// ================================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// ================================================================
/**
* A PDQ hash is simply 256 bits. The only string representation is 64 hex
* digits. Bit-ordering is a concern, but is a non-issue as long as the
* to-string and from-string methods in this class are consistent.
*/
class PDQHash {
const PDQHASH_NUM_SLOTS = 16;
const PDQHASH_HEX_LENGTH = 64;
// 16 16-bit words: sliced this way for mutually indexed hashing (MIH).
private $slots = array();
/**
* The constructor is private since the only valid representation is
* hex-string. Yet we have a PDQHash class, not just strings, due to lessons
* learned from other projects. Separating ser/des format (string) from
* internal representation helps avoid false negatives.
*/
private function __construct() {
}
public static function makeZeroesHash() {
$hash = new PDQHash();
for ($i = 0; $i < self::PDQHASH_NUM_SLOTS; $i++) {
$hash->slots[$i] = 0;
}
return $hash;
}
/**
* Creates a new instance of the hash from a hexadecimal-formatted PDQ
* hash string. Throws MalformedPDQHashException for improperly
* formatted hashes.
*
* @param hex_string string representation of the hash in 'hex' format.
* @return PDQHash newly created hash.
* @throws MalformedPDQHashException if the given hash is not valid.
*/
public static function fromHexString(
/*string*/ $hex_string
)/*: PDQHash*/ {
if (strlen($hex_string) !== self::PDQHASH_HEX_LENGTH) {
throw new MalformedPDQHashException(
$hex_string,
sprintf(
'Expected hash to have length %d, received hash with length %d',
self::PDQHASH_HEX_LENGTH,
strlen($hex_string)
)
);
}
$hash = new PDQHash();
try {
// Unpack goes 1-up not 0-up
$slots = unpack('n*', hex2bin($hex_string));
} catch (ErrorException $_) {
throw new MalformedPDQHashException(
$hex_string,
'Length is 64 but is not pure hexadecimal.'
);
}
$hash->slots = array();
$k = self::PDQHASH_NUM_SLOTS - 1;
foreach ($slots as $slot) {
$hash->slots[$k] = $slot;
$k--;
}
return $hash;
}
public function toHexString() {
$string = "";
for ($i = self::PDQHASH_NUM_SLOTS - 1; $i >= 0; $i--) {
// dechex doesn't zero-pad
$string .= sprintf("%04x", $this->slots[$i]);
}
return $string;
}
public function to64BitStrings() {
$strings = array();
for ($i = self::PDQHASH_NUM_SLOTS - 1; $i >= 0; $i-=4) {
$strings[] =
sprintf("%04x", $this->slots[$i]) .
sprintf("%04x", $this->slots[$i-1]) .
sprintf("%04x", $this->slots[$i-2]) .
sprintf("%04x", $this->slots[$i-3]);
}
return $strings;
}
public function to32BitStrings() {
$strings = array();
for ($i = self::PDQHASH_NUM_SLOTS - 1; $i >= 0; $i-=2) {
$strings[] =
sprintf("%04x", $this->slots[$i]) .
sprintf("%04x", $this->slots[$i-1]);
}
return $strings;
}
public function to16BitStrings() {
$strings = array();
for ($i = self::PDQHASH_NUM_SLOTS - 1; $i >= 0; $i--) {
$strings[] = sprintf("%04x", $this->slots[$i]);
}
return $strings;
}
public function setBit(/*int*/ $bit_index) {
$slot_index = (int)($bit_index / self::PDQHASH_NUM_SLOTS);
$slot_bit_index = (int)($bit_index % self::PDQHASH_NUM_SLOTS);
$this->slots[$slot_index] |= 1 << $slot_bit_index;
}
/**
* The Hamming distance between this hash and another.
*
* @param The hash to compare this hash to.
* @return int
*/
public function hammingDistanceTo(PDQHash $that) /*: int*/ {
$sum = 0;
for ($i = 0; $i < self::PDQHASH_NUM_SLOTS; $i++) {
$sum += self::popCount16($this->slots[$i] ^ $that->slots[$i]);
}
return $sum;
}
public function isWithinHammingDistanceOf(
/*PDQHash*/ $that,
/*int*/ $threshold
)/*: bool*/ {
$current = 0;
for ($i = 0; $i < self::PDQHASH_NUM_SLOTS; $i++) {
$current += self::popCount16($this->slots[$i] ^ $that->slots[$i]);
if ($current > $threshold) {
return false;
}
}
return true;
}
public function hammingNorm()/*: int*/ {
$sum = 0;
for ($i = 0; $i < self::PDQHASH_NUM_SLOTS; $i++) {
$sum += self::popCount16($this->slots[$i]);
}
return $sum;
}
private static function popCount16(/*int */$n)/*: int*/ {
$n -= (($n >> 1) & 0x5555);
$n = ((($n >> 2) & 0x3333) + ($n & 0x3333));
$n = ((($n >> 4) + $n) & 0x0f0f);
$n += ($n >> 8);
return($n & 0x1f);
}
private static function popCount16Slow(/*int */$n)/*: int*/ {
$count = 0;
for ($count = 0; $n != 0; $count++, $n &= $n-1) { }
return $count;
}
} // class PDQHash