sub print_radix_table()

in src/backend/utils/mb/Unicode/convutils.pm [247:638]


sub print_radix_table
{
	my ($out, $tblname, $c) = @_;

	###
	### Build radix trees in memory, for 1-, 2-, 3- and 4-byte inputs. Each
	### radix tree is represented as a nested hash, each hash indexed by
	### input byte
	###
	my %b1map;
	my %b2map;
	my %b3map;
	my %b4map;
	foreach my $in (keys %$c)
	{
		my $out = $c->{$in};

		if ($in <= 0xff)
		{
			$b1map{$in} = $out;
		}
		elsif ($in <= 0xffff)
		{
			my $b1 = $in >> 8;
			my $b2 = $in & 0xff;

			$b2map{$b1}{$b2} = $out;
		}
		elsif ($in <= 0xffffff)
		{
			my $b1 = $in >> 16;
			my $b2 = ($in >> 8) & 0xff;
			my $b3 = $in & 0xff;

			$b3map{$b1}{$b2}{$b3} = $out;
		}
		elsif ($in <= 0xffffffff)
		{
			my $b1 = $in >> 24;
			my $b2 = ($in >> 16) & 0xff;
			my $b3 = ($in >> 8) & 0xff;
			my $b4 = $in & 0xff;

			$b4map{$b1}{$b2}{$b3}{$b4} = $out;
		}
		else
		{
			die sprintf("up to 4 byte code is supported: %x", $in);
		}
	}

	my @segments;

	###
	### Build a linear list of "segments", from the nested hashes.
	###
	### Each segment is a lookup table, keyed by the next byte in the input.
	### The segments are written out physically to one big array in the final
	### step, but logically, they form a radix tree. Or rather, four radix
	### trees: one for 1-byte inputs, another for 2-byte inputs, 3-byte
	### inputs, and 4-byte inputs.
	###
	### Each segment is represented by a hash with following fields:
	###
	### comment => <string to output as a comment>
	### label => <label that can be used to refer to this segment from elsewhere>
	### values => <a hash, keyed by byte, 0-0xff>
	###
	### Entries in 'values' can be integers (for leaf-level segments), or
	### string labels, pointing to a segment with that label. Any missing
	### values are treated as zeros. If 'values' hash is missing altogether,
	### it's treated as all-zeros.
	###
	### Subsequent steps will enrich the segments with more fields.
	###

	# Add the segments for the radix trees themselves.
	push @segments,
	  build_segments_from_tree("Single byte table", "1-byte", 1, \%b1map);
	push @segments,
	  build_segments_from_tree("Two byte table", "2-byte", 2, \%b2map);
	push @segments,
	  build_segments_from_tree("Three byte table", "3-byte", 3, \%b3map);
	push @segments,
	  build_segments_from_tree("Four byte table", "4-byte", 4, \%b4map);

	###
	### Find min and max index used in each level of each tree.
	###
	### These are stored separately, and we can then leave out the unused
	### parts of every segment. (When using the resulting tree, you must
	### check each input byte against the min and max.)
	###
	my %min_idx;
	my %max_idx;
	foreach my $seg (@segments)
	{
		my $this_min = $min_idx{ $seg->{depth} }->{ $seg->{level} };
		my $this_max = $max_idx{ $seg->{depth} }->{ $seg->{level} };

		foreach my $i (keys %{ $seg->{values} })
		{
			$this_min = $i if (!defined $this_min || $i < $this_min);
			$this_max = $i if (!defined $this_max || $i > $this_max);
		}

		$min_idx{ $seg->{depth} }{ $seg->{level} } = $this_min;
		$max_idx{ $seg->{depth} }{ $seg->{level} } = $this_max;
	}

	# Copy the mins and max's back to every segment, for convenience.
	foreach my $seg (@segments)
	{
		$seg->{min_idx} = $min_idx{ $seg->{depth} }{ $seg->{level} };
		$seg->{max_idx} = $max_idx{ $seg->{depth} }{ $seg->{level} };
	}

	###
	### Prepend a dummy all-zeros map to the beginning.
	###
	### A 0 is an invalid value anywhere in the table, and this allows us to
	### point to 0 offset from any table, to get a 0 result.
	###

	# Find the max range between min and max indexes in any of the segments.
	my $widest_range = 0;
	foreach my $seg (@segments)
	{
		my $this_range = $seg->{max_idx} - $seg->{min_idx};
		$widest_range = $this_range if ($this_range > $widest_range);
	}

	unshift @segments,
	  {
		header  => "Dummy map, for invalid values",
		min_idx => 0,
		max_idx => $widest_range,
		label   => "dummy map"
	  };

	###
	### Eliminate overlapping zeros
	###
	### For each segment, if there are zero values at the end of, and there
	### are also zero values at the beginning of the next segment, we can
	### overlay the tail of this segment with the head of next segment, to
	### save space.
	###
	### To achieve that, we subtract the 'max_idx' of each segment with the
	### amount of zeros that can be overlaid.
	###
	for (my $j = 0; $j < $#segments - 1; $j++)
	{
		my $seg     = $segments[$j];
		my $nextseg = $segments[ $j + 1 ];

		# Count the number of zero values at the end of this segment.
		my $this_trail_zeros = 0;
		for (
			my $i = $seg->{max_idx};
			$i >= $seg->{min_idx} && !$seg->{values}->{$i};
			$i--)
		{
			$this_trail_zeros++;
		}

		# Count the number of zeros at the beginning of next segment.
		my $next_lead_zeros = 0;
		for (
			my $i = $nextseg->{min_idx};
			$i <= $nextseg->{max_idx} && !$nextseg->{values}->{$i};
			$i++)
		{
			$next_lead_zeros++;
		}

		# How many zeros in common?
		my $overlaid_trail_zeros =
		  ($this_trail_zeros > $next_lead_zeros)
		  ? $next_lead_zeros
		  : $this_trail_zeros;

		$seg->{overlaid_trail_zeros} = $overlaid_trail_zeros;
		$seg->{max_idx} = $seg->{max_idx} - $overlaid_trail_zeros;
	}

	###
	### Replace label references with real offsets.
	###
	### So far, the non-leaf segments have referred to other segments by
	### their labels. Replace them with numerical offsets from the beginning
	### of the final array. You cannot move, add, or remove segments after
	### this step, as that would invalidate the offsets calculated here!
	###
	my $flatoff = 0;
	my %segmap;

	# First pass: assign offsets to each segment, and build hash
	# of label => offset.
	foreach my $seg (@segments)
	{
		$seg->{offset} = $flatoff;
		$segmap{ $seg->{label} } = $flatoff;
		$flatoff += $seg->{max_idx} - $seg->{min_idx} + 1;
	}
	my $tblsize = $flatoff;

	# Second pass: look up the offset of each label reference in the hash.
	foreach my $seg (@segments)
	{
		while (my ($i, $val) = each %{ $seg->{values} })
		{
			if (!($val =~ /^[0-9,.E]+$/))
			{
				my $segoff = $segmap{$val};
				if ($segoff)
				{
					$seg->{values}->{$i} = $segoff;
				}
				else
				{
					die "no segment with label $val";
				}
			}
		}
	}

	# Also look up the positions of the roots in the table.
	# Missing map represents dummy mapping.
	my $b1root = $segmap{"1-byte"} || 0;
	my $b2root = $segmap{"2-byte"} || 0;
	my $b3root = $segmap{"3-byte"} || 0;
	my $b4root = $segmap{"4-byte"} || 0;

	# And the lower-upper values of each level in each radix tree.
	# Missing values represent zero.
	my $b1_lower = $min_idx{1}{1} || 0;
	my $b1_upper = $max_idx{1}{1} || 0;

	my $b2_1_lower = $min_idx{2}{1} || 0;
	my $b2_1_upper = $max_idx{2}{1} || 0;
	my $b2_2_lower = $min_idx{2}{2} || 0;
	my $b2_2_upper = $max_idx{2}{2} || 0;

	my $b3_1_lower = $min_idx{3}{1} || 0;
	my $b3_1_upper = $max_idx{3}{1} || 0;
	my $b3_2_lower = $min_idx{3}{2} || 0;
	my $b3_2_upper = $max_idx{3}{2} || 0;
	my $b3_3_lower = $min_idx{3}{3} || 0;
	my $b3_3_upper = $max_idx{3}{3} || 0;

	my $b4_1_lower = $min_idx{4}{1} || 0;
	my $b4_1_upper = $max_idx{4}{1} || 0;
	my $b4_2_lower = $min_idx{4}{2} || 0;
	my $b4_2_upper = $max_idx{4}{2} || 0;
	my $b4_3_lower = $min_idx{4}{3} || 0;
	my $b4_3_upper = $max_idx{4}{3} || 0;
	my $b4_4_lower = $min_idx{4}{4} || 0;
	my $b4_4_upper = $max_idx{4}{4} || 0;

	###
	### Find the maximum value in the whole table, to determine if we can
	### use uint16 or if we need to use uint32.
	###
	my $max_val = 0;
	foreach my $seg (@segments)
	{
		foreach my $val (values %{ $seg->{values} })
		{
			$max_val = $val if ($val > $max_val);
		}
	}

	my $datatype = ($max_val <= 0xffff) ? "uint16" : "uint32";

	# For formatting, determine how many values we can fit on a single
	# line, and how wide each value needs to be to align nicely.
	my $vals_per_line;
	my $colwidth;

	if ($max_val <= 0xffff)
	{
		$vals_per_line = 8;
		$colwidth      = 4;
	}
	elsif ($max_val <= 0xffffff)
	{
		$vals_per_line = 4;
		$colwidth      = 6;
	}
	else
	{
		$vals_per_line = 4;
		$colwidth      = 8;
	}

	###
	### Print the struct and array.
	###
	printf $out "static const $datatype ${tblname}_table[$tblsize];\n";
	printf $out "\n";
	printf $out "static const pg_mb_radix_tree $tblname =\n";
	printf $out "{\n";
	if ($datatype eq "uint16")
	{
		print $out "  ${tblname}_table,\n";
		print $out "  NULL, /* 32-bit table not used */\n";
	}
	if ($datatype eq "uint32")
	{
		print $out "  NULL, /* 16-bit table not used */\n";
		print $out "  ${tblname}_table,\n";
	}
	printf $out "\n";
	printf $out "  0x%04x, /* offset of table for 1-byte inputs */\n",
	  $b1root;
	printf $out "  0x%02x, /* b1_lower */\n", $b1_lower;
	printf $out "  0x%02x, /* b1_upper */\n", $b1_upper;
	printf $out "\n";
	printf $out "  0x%04x, /* offset of table for 2-byte inputs */\n",
	  $b2root;
	printf $out "  0x%02x, /* b2_1_lower */\n", $b2_1_lower;
	printf $out "  0x%02x, /* b2_1_upper */\n", $b2_1_upper;
	printf $out "  0x%02x, /* b2_2_lower */\n", $b2_2_lower;
	printf $out "  0x%02x, /* b2_2_upper */\n", $b2_2_upper;
	printf $out "\n";
	printf $out "  0x%04x, /* offset of table for 3-byte inputs */\n",
	  $b3root;
	printf $out "  0x%02x, /* b3_1_lower */\n", $b3_1_lower;
	printf $out "  0x%02x, /* b3_1_upper */\n", $b3_1_upper;
	printf $out "  0x%02x, /* b3_2_lower */\n", $b3_2_lower;
	printf $out "  0x%02x, /* b3_2_upper */\n", $b3_2_upper;
	printf $out "  0x%02x, /* b3_3_lower */\n", $b3_3_lower;
	printf $out "  0x%02x, /* b3_3_upper */\n", $b3_3_upper;
	printf $out "\n";
	printf $out "  0x%04x, /* offset of table for 3-byte inputs */\n",
	  $b4root;
	printf $out "  0x%02x, /* b4_1_lower */\n", $b4_1_lower;
	printf $out "  0x%02x, /* b4_1_upper */\n", $b4_1_upper;
	printf $out "  0x%02x, /* b4_2_lower */\n", $b4_2_lower;
	printf $out "  0x%02x, /* b4_2_upper */\n", $b4_2_upper;
	printf $out "  0x%02x, /* b4_3_lower */\n", $b4_3_lower;
	printf $out "  0x%02x, /* b4_3_upper */\n", $b4_3_upper;
	printf $out "  0x%02x, /* b4_4_lower */\n", $b4_4_lower;
	printf $out "  0x%02x  /* b4_4_upper */\n", $b4_4_upper;
	print $out "};\n";
	print $out "\n";
	print $out "static const $datatype ${tblname}_table[$tblsize] =\n";
	print $out "{";
	my $off = 0;

	foreach my $seg (@segments)
	{
		printf $out "\n";
		printf $out "  /*** %s - offset 0x%05x ***/\n", $seg->{header}, $off;
		printf $out "\n";

		for (my $i = $seg->{min_idx}; $i <= $seg->{max_idx};)
		{

			# Print the next line's worth of values.
			# XXX pad to begin at a nice boundary
			printf $out "  /* %02x */ ", $i;
			for (my $j = 0;
				$j < $vals_per_line && $i <= $seg->{max_idx}; $j++)
			{
				# missing values represent zero.
				my $val = $seg->{values}->{$i} || 0;

				printf $out " 0x%0*x", $colwidth, $val;
				$off++;
				if ($off != $tblsize)
				{
					print $out ",";
				}
				$i++;
			}
			print $out "\n";
		}
		if ($seg->{overlaid_trail_zeros})
		{
			printf $out
			  "    /* $seg->{overlaid_trail_zeros} trailing zero values shared with next segment */\n";
		}
	}

	# Sanity check.
	if ($off != $tblsize) { die "table size didn't match!"; }

	print $out "};\n";
	return;
}