def process_sequences()

in data_management/megadb/converters/cct_to_megadb.py [0:0]
140 lines of code
49 McCabe index (conditional complexity)

def _surface_annotations(im):
    """Move every sub-field of `im['annotations']` one level up, modifying `im` in place.

    - `bbox` items must carry `bbox_rel`; it is renamed to `bbox` and any `bbox_abs` is dropped.
      If an item lacks `bbox_rel`, the image entry is printed and the process exits with status 1.
    - `species` is renamed to `class`.

    Does nothing if the image entry has no `annotations` field.
    """
    if 'annotations' not in im:
        return

    for prop, prop_val in im['annotations'].items():
        im[prop] = prop_val

        if prop == 'bbox':
            for bbox_item in im['bbox']:
                if 'bbox_rel' not in bbox_item:
                    print('Missing relative coordinates for bbox! Exiting...')
                    print(im)
                    sys.exit(1)

                # only the relative coordinates are kept, under the plain `bbox` key
                bbox_item['bbox'] = bbox_item['bbox_rel']
                del bbox_item['bbox_rel']
                bbox_item.pop('bbox_abs', None)

        if prop == 'species':
            im['class'] = im['species']
            del im['species']

    del im['annotations']


def _lowercased_unique(labels):
    """Return the lower-cased labels with duplicates removed, in first-seen order.

    Lower-casing happens *before* de-duplication so that labels differing only in case
    (e.g. 'Deer' and 'deer') collapse into a single entry.
    """
    return list(dict.fromkeys(label.lower() for label in labels))


def _distinct_value_key(value):
    """Return a hashable stand-in for `value`, used to count distinct property values.

    Hashable values are returned unchanged; unhashable ones (lists, dicts, ...) are
    represented by a tagged `repr` so that they can still be stored in a dict or set.
    """
    try:
        hash(value)
    except TypeError:
        return ('__unhashable__', repr(value))
    return value


def _replace_nan_with_none(record):
    """Replace float NaN values among the top-level values of `record` with None, in place."""
    for prop_name, prop_value in record.items():
        if isinstance(prop_value, float) and np.isnan(prop_value):
            # replacing the value of an existing key is safe while iterating over items()
            record[prop_name] = None


def process_sequences(embedded_image_objects, dataset_name, deepcopy_embedded=True):
    """
    Combine the image entries in an embedded COCO Camera Trap json from make_cct_embedded()
    into sequence objects that can be ingested to the `sequences` table in MegaDB.

    Image-level properties that have the same value within every sequence (and are present
    on every image) are moved to the sequence level; sequence-level properties that have the
    same value across all sequences are removed, with a print-out describing what should be
    added to the `datasets` table instead (`season` is always kept on the sequence).

    All strings in the array for the `class` property are lower-cased and de-duplicated.
    Float NaN values at the top level of a sequence or image are replaced by None.

    Args:
        embedded_image_objects: array of image objects returned by make_cct_embedded()
        dataset_name: Make sure this is the desired name for the dataset
        deepcopy_embedded: True to work on a deep copy of `embedded_image_objects`;
            otherwise the objects passed in will be modified!

    Returns:
        an array of sequence objects (empty if no image objects were provided)

    Raises:
        AssertionError: if the images of one sequence do not all share the same `location`.
        SystemExit: if a bounding box annotation lacks relative coordinates (`bbox_rel`).
    """
    print('The dataset_name is set to {}. Please make sure this is correct!'.format(dataset_name))

    if deepcopy_embedded:
        print('Making a deep copy of docs...')
        docs = deepcopy(embedded_image_objects)
    else:
        docs = embedded_image_objects

    print('Putting {} images into sequences...'.format(
        len(docs)))
    img_level_properties = set()
    images_by_seq_id = defaultdict(list)

    # a dummy sequence ID will be generated if the image entry does not have a seq_id field
    # seq_id only needs to be unique within this dataset; MegaDB does not rely on it as the _id field
    for im in tqdm(docs):
        if 'seq_id' in im:
            seq_id = im.pop('seq_id')
        else:
            # if this will be sent for annotation, may need a sequence ID based on file name
            # to group potential sequences together
            seq_id = 'dummy_' + uuid.uuid4().hex
            img_level_properties.add('file')
            img_level_properties.add('image_id')

        for old_name, new_name in old_to_new_prop_name_mapping.items():
            if old_name in im:
                im[new_name] = im[old_name]
                del im[old_name]

        for obsolete_prop in ('seq_num_frames', 'width', 'height'):
            im.pop(obsolete_prop, None)

        # "annotations" fields are opened and have their sub-fields surfaced one level up
        _surface_annotations(im)

        images_by_seq_id[seq_id].append(im)

    # set the `dataset` property on each sequence to the provided dataset_name
    sequences = [
        {
            'seq_id': seq_id,
            'dataset': dataset_name,
            'images': images
        }
        for seq_id, images in images_by_seq_id.items()
    ]
    print('Number of sequences: {}'.format(len(sequences)))

    # check that the location field is the same for all images in a sequence
    print('Checking the location field...')
    for seq in sequences:
        locations = {im.get('location', '') for im in seq['images']}  # empty string if no location provided
        if len(locations) != 1:
            # raised explicitly (rather than via `assert`) so the check also runs under `python -O`
            raise AssertionError(
                'Location fields in images of the sequence {} are different.'.format(seq['seq_id']))

    # check which fields in a CCT image entry are sequence-level
    print('Checking which fields in a CCT image entry are sequence-level...')
    all_img_properties = set()
    for seq in sequences:
        image_properties = defaultdict(set)  # property name to stringified property values

        for im in seq['images']:
            for prop_name, prop_value in im.items():
                image_properties[prop_name].add(str(prop_value))  # make all hashable
                all_img_properties.add(prop_name)

        # a property taking more than one value within a sequence has to stay at the image level
        for prop_name, prop_values in image_properties.items():
            if len(prop_values) > 1:
                img_level_properties.add(prop_name)

    # image-level properties that really should be sequence-level
    seq_level_properties = all_img_properties - img_level_properties

    # need to add (misidentified) seq properties not present for each image in a sequence to img_level_properties
    # (some properties act like flags - all have the same value, but not present on each img)
    bool_img_level_properties = set()
    for seq in sequences:
        for im in seq['images']:
            bool_img_level_properties |= seq_level_properties.difference(im)
    seq_level_properties -= bool_img_level_properties
    img_level_properties |= bool_img_level_properties

    print('\nall_img_properties')
    print(all_img_properties)
    print('\nimg_level_properties')
    print(img_level_properties)
    print('\nimage-level properties that really should be sequence-level')
    print(seq_level_properties)
    print('')

    # add the sequence-level properties to the sequence objects
    for seq in sequences:
        images = seq['images']
        first_image = images[0]

        for seq_property in seq_level_properties:
            if seq_property in first_image:
                # get the value of this sequence-level property from the first image entry
                seq[seq_property] = first_image[seq_property]
                for im in images:
                    del im[seq_property]  # and remove it from the image level

    # check which fields are really dataset-level and should be included in the dataset table instead.
    never_dataset_level = ('dataset', 'seq_id', 'class', 'images', 'location', 'bbox')
    seq_level_prop_values = defaultdict(dict)  # property name -> {hashable key: one original value}
    for seq in sequences:
        for prop_name in seq:
            if prop_name not in never_dataset_level:
                value = seq[prop_name]
                # values may be unhashable (e.g. lists), so they are tracked via a hashable key
                seq_level_prop_values[prop_name].setdefault(_distinct_value_key(value), value)

    dataset_props = []
    for prop_name, values in seq_level_prop_values.items():
        if prop_name == 'season':
            continue  # always keep 'season'
        if len(values) == 1:
            dataset_props.append(prop_name)
            print('! Sequence-level property {} with value {} should be a dataset-level property. Removed from sequences.'.format(prop_name, next(iter(values.values()))))

    # delete sequence-level properties that should be dataset-level
    # make all `class` fields lower-case; cast `seq_id` to type string in case they're integers
    sequences_neat = []
    for seq in sequences:
        for dataset_prop in dataset_props:
            seq.pop(dataset_prop, None)

        seq['seq_id'] = str(seq['seq_id'])

        if 'class' in seq:
            seq['class'] = _lowercased_unique(seq['class'])
        for im in seq['images']:
            if 'class' in im:
                im['class'] = _lowercased_unique(im['class'])

        sequences_neat.append(sequences_schema_check.order_seq_properties(seq))

    # turn all float NaN values into None so it gets converted to null when serialized
    # this was an issue in the Snapshot Safari datasets
    for seq in sequences_neat:
        _replace_nan_with_none(seq)
        if 'images' in seq:
            for im in seq['images']:
                _replace_nan_with_none(im)

    print('Finished processing sequences.')

    # show the first and one randomly chosen sequence; skipped when there is nothing to show
    if sequences_neat:
        print('Example sequence items:')
        print()
        print(json.dumps(sequences_neat[0]))
        print()
        print(json.dumps(sample(sequences_neat, 1)[0]))
        print()

    return sequences_neat