# experiments/jsonmaps.py
#
# NOTE (assumption): the imports below follow a plausible layout for the
# accompanying codebase and may need adjusting; craystack is the ANS
# library, while the helper modules are this project's own.
import numpy as np
import craystack as cs
from multiset_codec import codecs
from multiset_codec.msbst import build_multiset, to_sequence
from experiments.utils import (
    load_corpus, parse_json, calculate_state_bits, check_multiset_equality,
)


def run_single_experiment(seq_length, seed):
np.random.seed(seed)
    # Load the JSON maps and convert each to a list of (key, value) pairs
    # of UTF-8 bytes. All keys and values are cast to strings for
    # simplicity. Each list of pairs is sorted to give every map a
    # canonical ordering, which imposes an ordering between JSONs that
    # bits-back can use.
path = 'data/github-users.jsonl'
jsons_as_lists_of_tuples = [
sorted(parse_json(json).items())
for json in load_corpus(path)[:seq_length]
]
sequence = [[(str(k).encode('utf-8'), str(v).encode('utf-8'))
for k, v in json]
for json in jsons_as_lists_of_tuples]
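    # For illustration (hypothetical record, not taken from the corpus):
    # a map such as {"login": "alice", "id": 7} becomes, after sorting
    # and UTF-8 conversion, [(b'id', b'7'), (b'login', b'alice')].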
    # We assume all JSON maps are unique and have exactly 17 key-value
    # pairs. This generalizes trivially: the number of key-value pairs
    # could itself be encoded as the final step of encoding.
assert all(len(json) == 17 for json in sequence)
assert len(sequence) == seq_length
assert len({frozenset(json) for json in sequence}) == seq_length
    # Populate the outer and inner multisets via successive applications of
    # the insert operation. The inner multisets, each representing one JSON
    # map, are the elements stored at the nodes of the outer multiset.
multiset_outer = \
build_multiset([build_multiset(json) for json in sequence])
# Initialize codec for sampling without replacement
swor_codec = codecs.SamplingWithoutReplacement()
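    # Decoding with this codec samples an element x from a multiset M with
    # probability count(x)/|M|, removes it from M, and shortens the ANS
    # state by roughly log2(|M|/count(x)) bits. Consuming all of M this way
    # recovers log2 of the number of distinct orderings of M, which is
    # exactly the saving over encoding the elements as an ordered sequence.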
    # The ANS state is a compound data structure composed of a head and a
    # tail. It allows encode and decode operations to be performed in
    # parallel, as long as the head shape is at least as large as the
    # input symbol shape. The symbols are UTF-8 byte arrays (i.e. keys
    # and values), which vary in length. We initialize the ANS head shape
    # to 1000, meaning neither keys nor values may exceed 1000 UTF-8 bytes.
ans_state = cs.base_message(1000)
utf8_codec = codecs.ByteArray(1000)
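    # utf8_codec encodes/decodes a single variable-length byte array (one
    # key or one value) of up to 1000 bytes against the ANS head.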
# Encode
while multiset_outer:
# 1) Sample, without replacement, an inner multiset using ANS decode
ans_state, multiset_inner, multiset_outer = \
swor_codec.decode(ans_state, multiset_outer)
while multiset_inner:
# 2) Sample, without replacement, a key-value pair using ANS decode
ans_state, (key, value), multiset_inner = \
swor_codec.decode(ans_state, multiset_inner)
# 3) Encode the selected key-value pair onto the same ANS state.
(ans_state,) = utf8_codec.encode(ans_state, key)
(ans_state,) = utf8_codec.encode(ans_state, value)
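    # At this point ans_state holds the multiset: each iteration paid the
    # sequential cost of one (key, value) pair, but only after recovering
    # the bits spent selecting it, so no ordering information is stored.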
# Calculate the size of the compressed state in bits
# This is the number of bits used to compress the JSON maps as a multiset.
compressed_length_multiset = calculate_state_bits(ans_state)
# Decode
multiset_outer_decoded = ()
for _ in range(seq_length):
multiset_inner_decoded = ()
for _ in range(17):
# Decode key-value pair on top of stack (reverses step 3)
ans_state, value = utf8_codec.decode(ans_state)
ans_state, key = utf8_codec.decode(ans_state)
# Encode bits used to sample key-value pair (reverses step 2)
# This is the first bits-back step!
ans_state, multiset_inner_decoded = \
swor_codec\
.encode(ans_state, (key, value), multiset_inner_decoded)
# Encode bits used to sample inner multiset (reverses step 1)
        # Sorting the inner multiset is needed to properly account for the
        # ordering between JSON items imposed by the sorted() call where
        # the sequence is built above. Alternatively, we could create a
        # class for the inner multisets that overrides the comparison
        # operators __le__ and __ge__.
# This is the second bits-back step!
json = build_multiset(sorted(to_sequence(multiset_inner_decoded)))
ans_state, multiset_outer_decoded = \
swor_codec.encode(ans_state, json, multiset_outer_decoded)
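        # Note the strict LIFO discipline: each encode above reverses the
        # matching decode from the encoding phase in exact reverse order,
        # as required by ANS, which behaves as a stack.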
# Check if decoded correctly
# Rebuild the original multiset, as it was consumed during encoding
multiset_outer = \
build_multiset([build_multiset(json) for json in sequence])
assert check_multiset_equality(multiset_outer, multiset_outer_decoded)
    # Calculate the number of bits needed to compress the JSON maps as an
    # ordered sequence. Since all JSONs are assumed to have exactly 17
    # key-value pairs, encoding all key and value elements sequentially is
    # sufficient to reconstruct the JSON maps losslessly.
ans_state = cs.base_message(1000)
sequence_of_json_elements = \
[el for json in sequence
for key_value_pair in json
for el in key_value_pair]
(ans_state,) = codecs.Sequence(utf8_codec)\
.encode(ans_state, sequence_of_json_elements)
compressed_length_sequence = calculate_state_bits(ans_state)
# Calculate saved bits and theoretical limit. Assumes all maps are unique,
# so that the theoretical limit on savings is
# log2(seq_length!) + seq_length*log2(17!).
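    # For example, with seq_length = 1000: log2(1000!) is about 8530 bits
    # and log2(17!) is about 48.3 bits, so the savings limit is roughly
    # 8530 + 1000*48.3, i.e. about 57,000 bits (~7 KB).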
saved_bits = compressed_length_sequence - compressed_length_multiset
saved_bits_limit_unnested = \
np.sum(np.log2(np.arange(seq_length)+1))
saved_bits_limit = \
saved_bits_limit_unnested \
+ seq_length*np.sum(np.log2(np.arange(17)+1))
return {
'seq_length': seq_length,
'saved_bits': saved_bits,
'saved_bits_limit': saved_bits_limit,
'saved_bits_limit_unnested': saved_bits_limit_unnested,
'compressed_length_multiset': compressed_length_multiset,
'compressed_length_sequence': compressed_length_sequence,
}
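

# Minimal usage sketch (assumptions: run as a script from the repository
# root, with the corpus present at data/github-users.jsonl; the actual
# experiment driver may sweep seq_length and seed differently).
if __name__ == '__main__':
    results = run_single_experiment(seq_length=100, seed=0)
    for name, value in results.items():
        print(f'{name}: {value}')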