def flatten_preserve_lists()

in visualizeConnectData/lambdas/modifyCTR/flatten_json.py [0:0]


def flatten_preserve_lists(nested_dict, separator="_",
                           root_keys_to_ignore=None,
                           max_list_index=3, max_depth=3):
    """
    Flattens a dictionary with nested structure to a list of flat
    dictionaries ("records") with no hierarchy.

    Nested dictionaries go into columns: the column name is built from the
    key path with ``_construct_key`` and ``separator``.
    Lists (and sets) go into rows, like in a relational database: every
    processed element after the first one adds a new group of records that
    starts as a copy of the records that existed when the list was entered.

    Consider ignoring keys that you are not interested in to prevent
    unnecessary processing
    This is specially true for very deep objects
    You can specify max_list_index and max_depth to limit processing

    Child elements with only one value inside (a value that is neither a
    dict nor a list) will be unwrapped and become parent's value.
    The root dictionary has no parent and is therefore never unwrapped.

    Every record is pre-populated with ``None`` for each column found by an
    unbounded first pass, so columns that are cut off by ``max_depth`` or
    ``max_list_index`` can still be present with a ``None`` value.

    :param nested_dict: dictionary we want to flatten
    :param separator: string to separate dictionary keys by
    :param root_keys_to_ignore: set of root keys to ignore from flattening
        (``None`` means "ignore nothing")
    :param max_list_index: list elements whose index is greater than or
        equal to this value are not processed
    :param max_depth: maximum nesting depth to process; anything nested
        deeper is stored as-is under the key reached so far
    :return: list of flattened dictionaries, ``[{}]`` for an empty input
    :raises AssertionError: if ``nested_dict`` is not a dict or
        ``separator`` is not a string
    """

    assert isinstance(nested_dict, dict), "flatten requires a dictionary input"
    assert isinstance(separator, six.string_types), \
        "separator must be a string"

    # A fresh set per call instead of a shared mutable default argument.
    if root_keys_to_ignore is None:
        root_keys_to_ignore = set()

    # Nothing to flatten. This matches what is returned when every root key
    # is ignored; previously an empty input was stored under the key None
    # and crashed the column-name pass below with a TypeError.
    if not nested_dict:
        return [{}]

    # This global dictionary stores the flattened keys and values of the
    # first pass; it is only used to discover the column names
    flattened_dict = dict()

    def _flatten(object_, key):
        """
        First pass: for dict, list and set objects_ calls itself on the
        elements and for other types assigns the object_ to
        the corresponding key in the global flattened_dict.
        List elements get their index appended to the key.

        :param object_: object to flatten
        :param key: carries the concatenated key for the object_,
            None for the root dictionary
        :return: None
        """

        # Empty object can't be iterated, take as is
        if not object_:
            flattened_dict[key] = object_

        # These object types support iteration
        # dict always go into columns
        elif isinstance(object_, dict):
            first_key = next(iter(object_))
            # if only 1 child value, and child value not a dict or list
            # flatten immediately. The root (key is None) has no parent key
            # to carry the value, so it always takes the loop below.
            if key is not None and len(object_) == 1 \
                    and not isinstance(object_[first_key], (dict, list)):
                flattened_dict[key] = object_[first_key]
            else:
                for object_key in object_:
                    if not (not key and object_key in root_keys_to_ignore):
                        _flatten(object_[object_key],
                                 _construct_key(key, separator, object_key)
                                 )

        elif isinstance(object_, (list, set)):
            for index, item in enumerate(object_):
                _flatten(item, _construct_key(key, separator, index))

        else:
            flattened_dict[key] = object_

    def _max_record_index():
        """
        Returns the largest bucket number of the global record dictionary.

        The bucket keys are strings, so they must be compared as integers:
        compared as strings '9' would be larger than '10'.
        """
        return max(int(record_key)
                   for record_key in list_prebuilt_flattened_dict)

    def _assign_to_latest(key, value):
        """
        Writes value under key into every record of the newest bucket.
        """
        newest = list_prebuilt_flattened_dict[str(_max_record_index())]
        for record in newest:
            record[key] = value

    def _flatten_low_entropy(object_, key, cur_depth, max_depth_inner):
        """
        Second pass: fills the global list_prebuilt_flattened_dict.
        Dicts are spread over columns, lists and sets are spread over
        records, everything else is written to the newest records as is.

        :param object_: object to flatten
        :param key: carries the concatenated key for the object_,
            None for the root dictionary
        :param cur_depth: nesting depth of the caller
        :param max_depth_inner: depth beyond which object_ is stored as is
        :return: None
        """
        # depth is passed by value, so it never has to be decreased again
        cur_depth = cur_depth + 1

        # write latest child as value if max_depth exceeded;
        # an empty object can't be iterated either, take as is
        if cur_depth > max_depth_inner or not object_:
            _assign_to_latest(key, object_)

        # dict always go into columns
        elif isinstance(object_, dict):
            first_key = next(iter(object_))
            # if only 1 child value, and child value
            # not a dict or list, flatten immediately (never for the root,
            # see _flatten)
            if key is not None and len(object_) == 1 \
                    and not isinstance(object_[first_key], (dict, list)):
                _assign_to_latest(key, object_[first_key])
            else:
                # children are visited ordered by type name and then by the
                # length of their string representation
                ordered_children = sorted(
                    object_.items(),
                    key=lambda x: (str(type(x[1])), len(str(x[1]))))
                for object_key, _ in ordered_children:
                    if not (not key and object_key in root_keys_to_ignore):
                        _flatten_low_entropy(object_[object_key],
                                             _construct_key(key,
                                                            separator,
                                                            object_key),
                                             cur_depth, max_depth_inner)

        # lists could go into rows, like in a relational database
        elif isinstance(object_, (list, set)):
            # need to remember global list state when we entered
            # this recursion
            start_index = _max_record_index()
            entry = copy.deepcopy(
                list_prebuilt_flattened_dict[str(start_index)])

            for index, item in enumerate(object_):
                # every following element is beyond the limit as well
                if index >= max_list_index:
                    break

                # an element is skipped when it is a dict whose first value
                # is a float NaN. The emptiness check avoids an IndexError
                # for empty dicts inside a list.
                # isnan is taken from the file's scope (presumably
                # math.isnan - TODO confirm).
                if isinstance(item, dict) and item:
                    first_value = next(iter(item.values()))
                    if isinstance(first_value, float) and isnan(first_value):
                        continue

                # start from second element, 1st element is like column
                if index > 0:
                    list_prebuilt_flattened_dict[
                        str(_max_record_index() + 1)
                    ] = copy.deepcopy(entry)

                _flatten_low_entropy(item, key, cur_depth,
                                     max_depth_inner)

            # collapse all buckets, in numeric order, into bucket '0'
            merged_records = [
                record
                for _, records in sorted(
                    list_prebuilt_flattened_dict.items(),
                    key=lambda bucket: int(bucket[0]))
                for record in records]
            list_prebuilt_flattened_dict.clear()
            list_prebuilt_flattened_dict['0'] = merged_records

        # Anything left take as is, assuming you hit the end of the line.
        else:
            # in this case, there may be
            # a list of prebuilt_flattened_dict by now
            # so need to update them all.
            _assign_to_latest(key, object_)

    _flatten(nested_dict, None)

    # get unique column names, without the integers
    # TODO: potential issue: what if column names have digits naturally?
    # NOTE(review): only "separator + single digit" is removed, so list
    # indices of 10 and above leave a digit behind in the column name, and
    # the separator is not regex-escaped. Kept as is because changing it
    # would change which columns show up in the result - confirm intent.
    reskeys = list(flattened_dict.keys())
    unique_integers = list({separator + char for key
                            in reskeys for char in key if char.isdigit()})
    regex = '|'.join(unique_integers)
    regex += "|" + regex.replace(".", "")
    unique_columns = list({re.sub("(" + regex + ")", "", key)
                           for key in reskeys})

    # create global dict, now with unique column names
    prebuilt_flattened_dict = {column: None for column in unique_columns}

    # initialize global record list: bucket number (as string) -> records
    list_prebuilt_flattened_dict = {'0': [prebuilt_flattened_dict]}

    _flatten_low_entropy(nested_dict, None, cur_depth=0,
                         max_depth_inner=max_depth)

    return list_prebuilt_flattened_dict['0']