in chunking/chunkers/json_chunker.py [0:0]
def _recursive_chunk_json(self, obj):
    """
    Recursively partition a JSON value into pieces whose pretty-printed form
    fits within ``self.max_chunk_size`` tokens wherever it can be split.

    Size is measured by serialising a candidate with
    ``json.dumps(..., indent=2, ensure_ascii=False)`` and passing the text to
    ``self.token_estimator.estimate_tokens``.

    * list  -> consecutive items are packed greedily into sub-lists. An item
      that is itself an over-limit list/dict is split recursively and its
      pieces are added directly to the result (not re-wrapped in a list).
    * dict  -> consecutive key/value pairs are packed greedily into
      sub-dicts. A pair whose value is an over-limit list/dict is split
      recursively and each piece is emitted as ``{key: piece}``.
    * anything else -> returned unchanged as the single partition ``[obj]``.

    Input order is preserved and every element appears in exactly one
    partition.

    NOTE: values that cannot be split (e.g. a very long string) are emitted
    as-is even when over the limit, and the ``{key: piece}`` wrappers are not
    re-measured after wrapping, so a partition can still exceed the limit.

    Returns a list of JSON-compatible Python objects.
    """
    limit = self.max_chunk_size

    def token_count_of(data):
        dumped = json.dumps(data, indent=2, ensure_ascii=False)
        return self.token_estimator.estimate_tokens(dumped)

    # Lists: pack items greedily into sub-lists.
    if isinstance(obj, list):
        partitions = []
        current = []
        for item in obj:
            if token_count_of(current + [item]) <= limit:
                current.append(item)
                continue

            # The item does not fit: flush what has been collected so far.
            if current:
                if len(current) == 1 and token_count_of(current) > limit:
                    # A lone element that is over the limit by itself is
                    # handed to the recursive call instead of being wrapped.
                    partitions.extend(self._recursive_chunk_json(current[0]))
                else:
                    partitions.append(current)
                # Always start a fresh list. Reusing the flushed one would
                # let later items mutate an already-emitted partition and
                # cause it to be emitted a second time.
                current = []

            # An oversized container is split further; anything else opens
            # the next partition.
            if isinstance(item, (list, dict)) and token_count_of([item]) > limit:
                partitions.extend(self._recursive_chunk_json(item))
            else:
                current = [item]

        if current:
            partitions.append(current)
        return partitions

    # Dicts: pack key/value pairs greedily into sub-dicts.
    if isinstance(obj, dict):
        partitions = []
        current = {}
        for key, value in obj.items():
            if token_count_of({**current, key: value}) <= limit:
                current[key] = value
                continue

            # The pair does not fit: flush what has been collected so far.
            if current:
                if len(current) == 1 and token_count_of(current) > limit:
                    # A lone over-limit pair: split its value when possible.
                    (lone_key, lone_value), = current.items()
                    if isinstance(lone_value, (list, dict)):
                        partitions.extend(
                            {lone_key: sub}
                            for sub in self._recursive_chunk_json(lone_value)
                        )
                    else:
                        partitions.append(current)
                else:
                    partitions.append(current)
                # Fresh dict for the same reason as in the list branch.
                current = {}

            # An oversized container value is split and each piece is
            # re-keyed; anything else opens the next partition.
            if isinstance(value, (list, dict)) and token_count_of({key: value}) > limit:
                partitions.extend(
                    {key: sub} for sub in self._recursive_chunk_json(value)
                )
            else:
                current = {key: value}

        if current:
            partitions.append(current)
        return partitions

    # Primitives cannot be split any further.
    return [obj]