sub _construct_hash_table()

in tools/PerfectHash.pm [207:397]


sub _construct_hash_table
{
	my ($keys_ref, $hash_mult1, $hash_mult2, $hash_seed1, $hash_seed2) = @_;
	my @keys = @{$keys_ref};

	# This algorithm is based on a graph whose edges correspond to the
	# keys and whose vertices correspond to entries of the mapping table.
	# A key's edge links the two vertices whose indexes are the outputs of
	# the two hash functions for that key.  For K keys, the mapping
	# table must have at least 2*K+1 entries, guaranteeing that there's at
	# least one unused entry.  (In principle, larger mapping tables make it
	# easier to find a workable hash and increase the number of inputs that
	# can be rejected due to touching unused hashtable entries.  In practice,
	# neither effect seems strong enough to justify using a larger table.)
	my $nedges = scalar @keys;       # number of edges
	my $nverts = 2 * $nedges + 1;    # number of vertices

	# However, it would be very bad if $nverts were exactly equal to either
	# $hash_mult1 or $hash_mult2: effectively, that hash function would be
	# sensitive to only the last byte of each key.  Cases where $nverts is a
	# multiple of either multiplier likewise lose information.  (But $nverts
	# can't actually divide them, if they've been intelligently chosen as
	# primes.)  We can avoid such problems by adjusting the table size.
	while ($nverts % $hash_mult1 == 0
		|| $nverts % $hash_mult2 == 0)
	{
		$nverts++;
	}

	# Initialize the array of edges.
	my @E = ();
	foreach my $kw (@keys)
	{
		# Calculate hashes for this key.
		# The hashes are immediately reduced modulo the mapping table size.
		my $hash1 = _calc_hash($kw, $hash_mult1, $hash_seed1) % $nverts;
		my $hash2 = _calc_hash($kw, $hash_mult2, $hash_seed2) % $nverts;

		# If the two hashes are the same for any key, we have to fail
		# since this edge would itself form a cycle in the graph.
		return () if $hash1 == $hash2;

		# Add the edge for this key.
		push @E, { left => $hash1, right => $hash2 };
	}

	# Initialize the array of vertices, giving them all empty lists
	# of associated edges.  (The lists will be hashes of edge numbers.)
	my @V = ();
	for (my $v = 0; $v < $nverts; $v++)
	{
		push @V, { edges => {} };
	}

	# Insert each edge in the lists of edges connected to its vertices.
	for (my $e = 0; $e < $nedges; $e++)
	{
		my $v = $E[$e]{left};
		$V[$v]{edges}->{$e} = 1;

		$v = $E[$e]{right};
		$V[$v]{edges}->{$e} = 1;
	}

	# Now we attempt to prove the graph acyclic.
	# A cycle-free graph is either empty or has some vertex of degree 1.
	# Removing the edge attached to that vertex doesn't change this property,
	# so doing that repeatedly will reduce the size of the graph.
	# If the graph is empty at the end of the process, it was acyclic.
	# We track the order of edge removal so that the next phase can process
	# them in reverse order of removal.
	my @output_order = ();

	# Consider each vertex as a possible starting point for edge-removal.
	for (my $startv = 0; $startv < $nverts; $startv++)
	{
		my $v = $startv;

		# If vertex v is of degree 1 (i.e. exactly 1 edge connects to it),
		# remove that edge, and then consider the edge's other vertex to see
		# if it is now of degree 1.  The inner loop repeats until reaching a
		# vertex not of degree 1.
		while (scalar(keys(%{ $V[$v]{edges} })) == 1)
		{
			# Unlink its only edge.
			my $e = (keys(%{ $V[$v]{edges} }))[0];
			delete($V[$v]{edges}->{$e});

			# Unlink the edge from its other vertex, too.
			my $v2 = $E[$e]{left};
			$v2 = $E[$e]{right} if ($v2 == $v);
			delete($V[$v2]{edges}->{$e});

			# Push e onto the front of the output-order list.
			unshift @output_order, $e;

			# Consider v2 on next iteration of inner loop.
			$v = $v2;
		}
	}

	# We succeeded only if all edges were removed from the graph.
	return () if (scalar(@output_order) != $nedges);

	# OK, build the hash table of size $nverts.
	my @hashtab = (0) x $nverts;
	# We need a "visited" flag array in this step, too.
	my @visited = (0) x $nverts;

	# The goal is that for any key, the sum of the hash table entries for
	# its first and second hash values is the desired output (i.e., the key
	# number).  By assigning hash table values in the selected edge order,
	# we can guarantee that that's true.  This works because the edge first
	# removed from the graph (and hence last to be visited here) must have
	# at least one vertex it shared with no other edge; hence it will have at
	# least one vertex (hashtable entry) still unvisited when we reach it here,
	# and we can assign that unvisited entry a value that makes the sum come
	# out as we wish.  By induction, the same holds for all the other edges.
	foreach my $e (@output_order)
	{
		my $l = $E[$e]{left};
		my $r = $E[$e]{right};
		if (!$visited[$l])
		{
			# $hashtab[$r] might be zero, or some previously assigned value.
			$hashtab[$l] = $e - $hashtab[$r];
		}
		else
		{
			die "oops, doubly used hashtab entry" if $visited[$r];
			# $hashtab[$l] might be zero, or some previously assigned value.
			$hashtab[$r] = $e - $hashtab[$l];
		}
		# Now freeze both of these hashtab entries.
		$visited[$l] = 1;
		$visited[$r] = 1;
	}

	# Detect range of values needed in hash table.
	my $hmin = $nedges;
	my $hmax = 0;
	for (my $v = 0; $v < $nverts; $v++)
	{
		$hmin = $hashtab[$v] if $hashtab[$v] < $hmin;
		$hmax = $hashtab[$v] if $hashtab[$v] > $hmax;
	}

	# Choose width of hashtable entries.  In addition to the actual values,
	# we need to be able to store a flag for unused entries, and we wish to
	# have the property that adding any other entry value to the flag gives
	# an out-of-range result (>= $nedges).
	my $elemtype;
	my $unused_flag;

	if (   $hmin >= -0x7F
		&& $hmax <= 0x7F
		&& $hmin + 0x7F >= $nedges)
	{
		# int8 will work
		$elemtype    = 'int8';
		$unused_flag = 0x7F;
	}
	elsif ($hmin >= -0x7FFF
		&& $hmax <= 0x7FFF
		&& $hmin + 0x7FFF >= $nedges)
	{
		# int16 will work
		$elemtype    = 'int16';
		$unused_flag = 0x7FFF;
	}
	elsif ($hmin >= -0x7FFFFFFF
		&& $hmax <= 0x7FFFFFFF
		&& $hmin + 0x3FFFFFFF >= $nedges)
	{
		# int32 will work
		$elemtype    = 'int32';
		$unused_flag = 0x3FFFFFFF;
	}
	else
	{
		die "hash table values too wide";
	}

	# Set any unvisited hashtable entries to $unused_flag.
	for (my $v = 0; $v < $nverts; $v++)
	{
		$hashtab[$v] = $unused_flag if !$visited[$v];
	}

	return ($elemtype, \@hashtab);
}