def _recursive_chunk_json()

in chunking/chunkers/json_chunker.py [0:0]


    def _recursive_chunk_json(self, obj):
        """
        Recursively partition a JSON value (list, dict or primitive) so that each
        partition's pretty-printed string aims to stay within
        self.max_chunk_size tokens.

        Lists and dicts are split greedily and in order: items (or key-value
        pairs) are accumulated into the current partition for as long as its
        ``json.dumps(..., indent=2, ensure_ascii=False)`` form, measured with
        ``self.token_estimator.estimate_tokens``, does not exceed the budget.
        A list or dict element that is too large on its own is partitioned
        recursively. For a dict, each resulting piece is re-wrapped under its
        original key; for a list, the pieces are emitted as they are.

        The budget is best-effort: a primitive that is too large cannot be
        split and is emitted anyway, and re-wrapping a piece under its key adds
        a little text on top of the piece that was measured.

        Args:
            obj: A JSON-compatible Python value.

        Returns a list of JSON-compatible Python objects. Empty lists and dicts
        produce an empty result; a primitive produces ``[obj]``.
        """
        limit = self.max_chunk_size

        def token_count_of(data):
            dumped = json.dumps(data, indent=2, ensure_ascii=False)
            return self.token_estimator.estimate_tokens(dumped)

        # Lists: partition the items.
        if isinstance(obj, list):
            partitions = []
            current = []
            for item in obj:
                if token_count_of(current + [item]) <= limit:
                    current.append(item)
                    continue

                # The item does not fit: flush what has been accumulated so far.
                if current:
                    if len(current) == 1 and token_count_of(current) > limit:
                        # Only an unsplittable primitive can sit alone and over
                        # budget here (oversized containers are never stored in
                        # `current`); the recursion hands it back unchanged, so
                        # it is emitted unwrapped.
                        partitions.extend(self._recursive_chunk_json(current[0]))
                    else:
                        partitions.append(current)

                if isinstance(item, (list, dict)) and token_count_of([item]) > limit:
                    # The item alone is too big: emit its own sub-partitions.
                    partitions.extend(self._recursive_chunk_json(item))
                    # Start from a fresh partition. The flushed list is already
                    # stored in `partitions`; reusing it would mutate that
                    # partition and emit it a second time.
                    current = []
                else:
                    current = [item]
            if current:
                partitions.append(current)
            return partitions

        # Dicts: partition the key-value pairs.
        if isinstance(obj, dict):
            partitions = []
            current = {}
            for key, value in obj.items():
                if token_count_of({**current, key: value}) <= limit:
                    current[key] = value
                    continue

                # The pair does not fit: flush what has been accumulated so far.
                # An oversized container value is never stored in `current`, so
                # there is nothing left to split at this point.
                if current:
                    partitions.append(current)

                if isinstance(value, (list, dict)) and token_count_of({key: value}) > limit:
                    # The pair alone is too big: split the value and keep every
                    # piece under its original key.
                    partitions.extend(
                        {key: sub} for sub in self._recursive_chunk_json(value)
                    )
                    # Same reasoning as for lists: never keep accumulating into
                    # a dict that has already been emitted.
                    current = {}
                else:
                    current = {key: value}
            if current:
                partitions.append(current)
            return partitions

        # Primitives cannot be split; return them as a single partition.
        return [obj]