in data_management/megadb/converters/cct_to_megadb.py [0:0]
def process_sequences(embedded_image_objects, dataset_name, deepcopy_embedded=True):
    """
    Combine the image entries in an embedded COCO Camera Trap json from make_cct_embedded()
    into sequence objects that can be ingested to the `sequences` table in MegaDB.

    Images are grouped by their `seq_id`; an image without one gets its own dummy sequence.
    Image-level properties that have the same value on every image of every sequence are
    moved to the sequence level; sequence-level properties that have the same value on all
    sequences are removed with a print-out describing what should be added to the
    `datasets` table instead (`season` is always kept).
    All strings in the array for the `class` property are lower-cased and de-duplicated.
    Top-level float NaN values on sequences and images are replaced by None.

    Args:
        embedded_image_objects: array of image objects returned by make_cct_embedded()
        dataset_name: Make sure this is the desired name for the dataset
        deepcopy_embedded: True to work on a deep copy of `embedded_image_objects`;
            otherwise the objects passed in will be modified!

    Returns:
        an array of sequence objects (empty if no images were passed in)

    Raises:
        AssertionError: if the images of one sequence do not share the same `location`.
    """
    print('The dataset_name is set to {}. Please make sure this is correct!'.format(dataset_name))

    if deepcopy_embedded:
        print('Making a deep copy of docs...')
        docs = deepcopy(embedded_image_objects)
    else:
        docs = embedded_image_objects

    print('Putting {} images into sequences...'.format(
        len(docs)))

    # `file` and `image_id` identify a single image, so they must never be hoisted to the
    # sequence level - not even when every sequence happens to contain only one image.
    img_level_properties = {'file', 'image_id'}
    images_by_seq_id = defaultdict(list)

    # a dummy sequence ID will be generated if the image entry does not have a seq_id field
    # seq_id only needs to be unique within this dataset; MegaDB does not rely on it as the _id field
    for im in tqdm(docs):
        if 'seq_id' in im:
            seq_id = im['seq_id']
            del im['seq_id']
        else:
            # if this will be sent for annotation, may need a sequence ID based on file name
            # to group potential sequences together
            seq_id = 'dummy_' + uuid.uuid4().hex

        _flatten_image_entry(im)
        images_by_seq_id[seq_id].append(im)

    # set the `dataset` property on each sequence to the provided dataset_name
    sequences = [
        {'seq_id': seq_id, 'dataset': dataset_name, 'images': images}
        for seq_id, images in images_by_seq_id.items()
    ]
    print('Number of sequences: {}'.format(len(sequences)))

    # check that the location field is the same for all images in a sequence
    print('Checking the location field...')
    for seq in sequences:
        # empty string if no location provided
        locations = {im.get('location', '') for im in seq['images']}
        if len(locations) != 1:
            # raised explicitly (not via `assert`) so the check also runs under `python -O`
            raise AssertionError(
                'Location fields in images of the sequence {} are different.'.format(seq['seq_id']))

    # check which fields in a CCT image entry are sequence-level
    print('Checking which fields in a CCT image entry are sequence-level...')
    all_img_properties = set()
    for seq in sequences:
        values_by_prop = defaultdict(set)  # property name to stringified property values
        for im in seq['images']:
            for prop_name, prop_value in im.items():
                values_by_prop[prop_name].add(str(prop_value))  # make all hashable
                all_img_properties.add(prop_name)
        # a property that varies within any one sequence has to stay on the images
        for prop_name, prop_values in values_by_prop.items():
            if len(prop_values) > 1:
                img_level_properties.add(prop_name)

    # image-level properties that really should be sequence-level
    seq_level_properties = all_img_properties - img_level_properties

    # need to add (misidentified) seq properties not present for each image in a sequence to img_level_properties
    # (some properties act like flags - all have the same value, but not present on each img)
    bool_img_level_properties = set()
    for seq in sequences:
        for im in seq['images']:
            for seq_property in seq_level_properties:
                if seq_property not in im:
                    bool_img_level_properties.add(seq_property)
    seq_level_properties -= bool_img_level_properties
    img_level_properties |= bool_img_level_properties

    print('\nall_img_properties')
    print(all_img_properties)
    print('\nimg_level_properties')
    print(img_level_properties)
    print('\nimage-level properties that really should be sequence-level')
    print(seq_level_properties)
    print('')

    # add the sequence-level properties to the sequence objects
    for seq in sequences:
        for seq_property in seq_level_properties:
            if seq_property in seq['images'][0]:
                # get the value of this sequence-level property from the first image entry
                seq[seq_property] = seq['images'][0][seq_property]
                for im in seq['images']:
                    del im[seq_property]  # and remove it from the image level

    # check which fields are really dataset-level and should be included in the dataset table instead.
    never_dataset_level = {'dataset', 'seq_id', 'class', 'images', 'location', 'bbox'}
    seq_level_prop_values = defaultdict(set)
    for seq in sequences:
        for prop_name, prop_value in seq.items():
            if prop_name not in never_dataset_level:
                # list/dict values cannot go into a set directly, so they are keyed by their string form
                seq_level_prop_values[prop_name].add(_as_hashable(prop_value))

    dataset_props = []
    for prop_name, values in seq_level_prop_values.items():
        if prop_name == 'season':
            continue  # always keep 'season'
        if len(values) == 1:
            dataset_props.append(prop_name)
            print('! Sequence-level property {} with value {} should be a dataset-level property. Removed from sequences.'.format(prop_name, list(values)[0]))

    # delete sequence-level properties that should be dataset-level
    # make all `class` fields lower-case; cast `seq_id` to type string in case they're integers
    sequences_neat = []
    for seq in sequences:
        for dataset_prop in dataset_props:
            seq.pop(dataset_prop, None)

        seq['seq_id'] = str(seq['seq_id'])

        if 'class' in seq:
            seq['class'] = _normalize_class_labels(seq['class'])
        for im in seq['images']:
            if 'class' in im:
                im['class'] = _normalize_class_labels(im['class'])

        sequences_neat.append(sequences_schema_check.order_seq_properties(seq))

    # turn all float NaN values into None so it gets converted to null when serialized
    # this was an issue in the Snapshot Safari datasets
    for seq in sequences_neat:
        _replace_nan_with_none(seq)
        if 'images' in seq:
            for im in seq['images']:
                _replace_nan_with_none(im)

    print('Finished processing sequences.')

    #%% validation
    # nothing to show (and nothing to index or sample from) when no images were passed in
    if sequences_neat:
        print('Example sequence items:')
        print()
        print(json.dumps(sequences_neat[0]))
        print()
        print(json.dumps(sample(sequences_neat, 1)[0]))
        print()

    return sequences_neat


def _flatten_image_entry(im):
    """Rename, prune and flatten the properties of one embedded CCT image entry, in place.

    Properties are renamed according to `old_to_new_prop_name_mapping`, the obsolete
    `seq_num_frames`, `width` and `height` are dropped, and every sub-field of
    `annotations` is surfaced one level up. Within `bbox` items, `bbox_rel` becomes `bbox`
    and `bbox_abs` is discarded; `species` is renamed to `class`.

    Prints the offending entry and exits the process with status 1 if a bbox item has no
    `bbox_rel` field.
    """
    for old_name, new_name in old_to_new_prop_name_mapping.items():
        if old_name in im:
            im[new_name] = im[old_name]
            del im[old_name]

    for obsolete_prop in ('seq_num_frames', 'width', 'height'):
        im.pop(obsolete_prop, None)

    if 'annotations' not in im:
        return

    for prop, prop_val in im['annotations'].items():
        im[prop] = prop_val

        if prop == 'bbox':
            for bbox_item in im['bbox']:
                if 'bbox_rel' not in bbox_item:
                    print('Missing relative coordinates for bbox! Exiting...')
                    print(im)
                    sys.exit(1)
                # only the relative coordinates are kept, under the plain `bbox` name
                bbox_item['bbox'] = bbox_item['bbox_rel']
                del bbox_item['bbox_rel']
                bbox_item.pop('bbox_abs', None)

        if prop == 'species':
            im['class'] = im['species']
            del im['species']

    del im['annotations']


def _normalize_class_labels(labels):
    """Return the labels lower-cased and de-duplicated, keeping first-seen order.

    Lower-casing happens before de-duplication so that labels differing only in case
    collapse into a single entry.
    """
    return list(dict.fromkeys(label.lower() for label in labels))


def _as_hashable(value):
    """Return `value` itself if it can be put in a set, otherwise its string form."""
    try:
        hash(value)
    except TypeError:
        return str(value)
    return value


def _replace_nan_with_none(record):
    """Replace every top-level float NaN value of the mapping `record` with None, in place."""
    for prop_name, prop_value in record.items():
        if isinstance(prop_value, float) and np.isnan(prop_value):
            # assigning to an existing key does not resize the mapping, so this is safe mid-iteration
            record[prop_name] = None