in data_management/importers/idaho-camera-traps.py [0:0]
def csv_to_sequences(csv_file):
    """
    Parse one annotation .csv file into a list of image sequences.

    The file is loaded from os.path.join(input_base, csv_file).  A location
    name is derived from the (backslash-separated) relative path, rows are
    sorted chronologically and grouped into sequences, and the set of species
    present is determined for each sequence from the "*present" columns, the
    count columns, the 'opstate' column, and the free-text 'otherwhat' /
    'comment' columns.

    A new sequence starts at the first row, and whenever the gap to the
    previous row exceeds max_gap_within_sequence seconds.

    This function relies on names defined elsewhere in this module:
    input_base, valid_opstates, required_columns, expected_count_columns,
    expected_presence_columns, presence_to_count_columns, opstate_mappings,
    max_gap_within_sequence, and list_is_sorted.

    Args:
        csv_file (str): path of the .csv file, relative to input_base, using
            backslashes as separators.  The path is expected to contain one
            of 'St.Joe_elk', 'Beaverhead_elk', or 'ClearCreek_mustelids'.

    Returns:
        list of dict: one dict per sequence, with keys:
            'csv_source' (the [csv_file] argument),
            'sequence_id' (location name + '_seq_' + first timestamp),
            'images' (list of dicts with 'file_name' and 'datetime'),
            'species_present' (list of str),
            'location' (str).

    Raises:
        AssertionError: if any validation step fails (unrecognized survey
            folder, invalid opstate, missing required column, unexpected or
            missing presence column, image year outside 2015-2019,
            non-adjacent rows within a sequence, or non-zero counts that
            can't be mapped to a presence column).
    """

    print('Processing {}'.format(csv_file))

    csv_file_absolute = os.path.join(input_base,csv_file)

    sequences = []

    ## Derive the location name from the path

    # Sample paths from which we need to derive locations:
    #
    # St.Joe_elk\AM99\Trip 1\100RECNX\TimelapseData.csv
    # Beaverhead_elk\AM34\Trip 1\100RECNX\TimelapseData.csv
    #
    # ClearCreek_mustelids\Winter2015-16\FS-001-P\FS-001-P.csv
    # ClearCreek_mustelids\Summer2015\FS-001\FS-001.csv
    # ClearCreek_mustelids\Summer2016\IDFG-016\IDFG-016.csv
    #
    # I:\idfg-images\ClearCreek_mustelids\Summer2016\IDFG-017b
    # I:\idfg-images\ClearCreek_mustelids\Summer2016\IDFG-017a
    if 'St.Joe_elk' in csv_file or 'Beaverhead_elk' in csv_file:
        # [survey]_[camera], e.g. "St.Joe_elk_AM99"
        location_name = '_'.join(csv_file.split('\\')[0:2]).replace(' ','')
    else:
        assert 'ClearCreek_mustelids' in csv_file
        tokens = csv_file.split('\\')
        assert 'FS-' in tokens[2] or 'IDFG-' in tokens[2]
        # [survey]_[station], dropping the season folder and any "-P" suffix
        location_name = '_'.join([tokens[0],tokens[2]]).replace('-P','')
        # Folders 017a and 017b are treated as the same location
        if location_name.endswith('017a') or location_name.endswith('017b'):
            location_name = location_name[:-1]

    ## Load and validate the .csv file

    df = pd.read_csv(csv_file_absolute)
    df['datetime'] = None
    df['seq_id'] = None
    df['synthetic_frame_number'] = None

    # Validate the opstate column; non-string values (e.g. NaN for empty
    # cells) and blank strings are tolerated.
    opstates = set(df['opstate'])
    for s in opstates:
        if isinstance(s,str):
            s = s.strip()
            if len(s) > 0:
                assert s in valid_opstates,'Invalid opstate: {}'.format(s)

    column_names = list(df.columns)

    for s in required_columns:
        assert s in column_names

    count_columns = [s for s in column_names if s in expected_count_columns]

    presence_columns = [s for s in column_names if s.endswith('present')]

    # These checks were previously written as "assert '<message>'", which is
    # always true (a non-empty string is truthy), so they never fired.
    for s in presence_columns:
        assert s in expected_presence_columns, \
            'Unexpected presence column {} in {}'.format(s,csv_file)
    for s in expected_presence_columns:
        assert s in presence_columns, \
            'Missing presence column {} in {}'.format(s,csv_file)

    # Disabled diagnostic: report count columns that are absent from this file
    if False:
        for s in expected_count_columns:
            if s not in count_columns:
                print('Missing count column {} in {}'.format(s,csv_file))

    ## Create datetimes

    # i_row = 0; row = df.iloc[i_row]
    for i_row,row in df.iterrows():
        date = row['Date']
        time = row['Time']
        datestring = date + ' ' + time
        dt = dateutil.parser.parse(datestring)
        assert dt.year >= 2015 and dt.year <= 2019
        df.loc[i_row,'datetime'] = dt

    # Make sure data are sorted chronologically
    #
    # In odd circumstances, they are not... so sort them first, but warn
    datetimes = list(df['datetime'])
    if not list_is_sorted(datetimes):
        print('Datetimes not sorted for {}'.format(csv_file))

    # Use a stable sort, so rows that share a timestamp keep their order from
    # the .csv file (the default quicksort may reorder ties arbitrarily).
    df = df.sort_values('datetime',kind='mergesort')

    # After this, index labels are 0..N-1 in chronological order, so the
    # labels yielded by iterrows() below are also valid positions for iloc.
    df.reset_index(drop=True, inplace=True)
    datetimes = list(df['datetime'])
    assert list_is_sorted(datetimes)

    # Debugging when I was trying to see what was up with the unsorted dates
    if False:
        for i in range(0,len(datetimes)-1):
            dt = datetimes[i+1]
            prev_dt = datetimes[i]
            delta = dt - prev_dt
            assert delta >= datetime.timedelta(0)

    ## Parse into sequences

    current_sequence_id = None
    next_frame_number = 0
    previous_datetime = None

    sequence_id_to_rows = defaultdict(list)

    # i_row = 0; row = df.iloc[i_row]
    for i_row,row in df.iterrows():

        dt = row['datetime']
        assert dt is not None and isinstance(dt,datetime.datetime)

        # Start a new sequence if:
        #
        # * We have no previous image timestamp (i.e., this is the first row)
        # * The gap since the previous image exceeds max_gap_within_sequence
        #
        if previous_datetime is None:
            delta = None
        else:
            delta = (dt - previous_datetime).total_seconds()

        # Start a new sequence if necessary
        if delta is None or delta > max_gap_within_sequence:
            next_frame_number = 0
            current_sequence_id = location_name + '_seq_' + str(dt) # str(uuid.uuid1())

        assert current_sequence_id is not None

        sequence_id_to_rows[current_sequence_id].append(i_row)
        df.loc[i_row,'seq_id'] = current_sequence_id
        df.loc[i_row,'synthetic_frame_number'] = next_frame_number
        next_frame_number = next_frame_number + 1
        previous_datetime = dt

    # ...for each row

    location_sequences = list(set(list(df['seq_id'])))
    location_sequences.sort()

    # Sequences in which some presence column is not constant across images;
    # this list is collected but not currently returned or reported.
    inconsistent_sequences = []

    ## Parse labels for each sequence

    # sequence_id = location_sequences[0]
    for sequence_id in location_sequences:

        sequence_row_indices = sequence_id_to_rows[sequence_id]
        assert len(sequence_row_indices) > 0

        # Row indices in a sequence should be adjacent
        if len(sequence_row_indices) > 1:
            d = np.diff(sequence_row_indices)
            assert(all(d==1))

        # sequence_df = df[df['seq_id']==sequence_id]
        sequence_df = df.iloc[sequence_row_indices]

        ## Determine what's present

        presence_columns_marked = []
        other_species = []

        opstates = set(sequence_df['opstate'])

        # Stringify the values for the message: the set may contain floats
        # (allowed by the condition itself), on which str.join() would raise
        # a TypeError and hide the assertion failure.
        assert all([ ( (isinstance(s,float)) or (len(s.strip())== 0) or (s.strip() in valid_opstates)) for s in opstates]),\
            'Invalid optstate in: {}'.format(' | '.join([str(s) for s in opstates]))

        for presence_column in presence_columns:

            presence_values = list(sequence_df[presence_column])

            # The presence columns are *almost* always identical for all images in a sequence
            single_presence_value = (len(set(presence_values)) == 1)
            # assert single_presence_value
            if not single_presence_value:
                # print('Warning: presence value for {} is inconsistent for {}'.format(presence_column,sequence_id))
                inconsistent_sequences.append(sequence_id)

            # A column counts as marked if any image in the sequence marks it
            if any(presence_values):
                presence_columns_marked.append(presence_column)

        # ...for each presence column

        # If no presence columns are marked, all counts should be zero
        #
        # This reconciliation has to happen *before* the survey species are
        # tallied below; otherwise a survey species recovered from its counts
        # would never make it into the output.
        if len(presence_columns_marked) == 0:

            # count_column = count_columns[0]
            for count_column in count_columns:

                values = list(set(list(sequence_df[count_column])))

                # Occasionally a count gets entered (correctly) without the presence column being marked
                # assert len(values) == 1 and values[0] == 0, 'Non-zero counts with no presence columns marked for sequence {}'.format(sequence_id)
                if (not(len(values) == 1 and values[0] == 0)):

                    print('Warning: presence and counts are inconsistent for {}'.format(sequence_id))

                    # Handle this by virtually checking the "right" box
                    for presence_column in presence_to_count_columns.keys():
                        count_columns_this_species = presence_to_count_columns[presence_column]
                        if count_column in count_columns_this_species:
                            if presence_column not in presence_columns_marked:
                                presence_columns_marked.append(presence_column)

                    # Make sure we found a match
                    assert len(presence_columns_marked) > 0

            # ...for each count column

        # ...if no presence columns were marked

        # Tally up the standard (survey) species
        survey_species = [s.replace('present','') for s in presence_columns_marked if s != 'otherpresent']

        # Non-'normal' operating states are reported as species too, after
        # translation through opstate_mappings.
        #
        # Be conservative; assume humans are present in all maintenance images
        #
        # Iterate in sorted order so the output doesn't depend on set ordering.
        for opstate in sorted([s for s in opstates if isinstance(s,str)]):
            opstate = opstate.strip()
            if len(opstate) == 0:
                continue
            if opstate in opstate_mappings:
                opstate = opstate_mappings[opstate]
            if (opstate != 'normal') and (opstate not in survey_species):
                survey_species.append(opstate)

        # Handle 'other' tags
        if 'otherpresent' in presence_columns_marked:

            sequence_otherwhats = set()
            sequence_comments = set()

            # Collect the distinct, non-blank free-text annotations
            for i,r in sequence_df.iterrows():
                otherwhat = r['otherwhat']
                if isinstance(otherwhat,str):
                    otherwhat = otherwhat.strip()
                    if len(otherwhat) > 0:
                        sequence_otherwhats.add(otherwhat)
                comment = r['comment']
                if isinstance(comment,str):
                    comment = comment.strip()
                    if len(comment) > 0:
                        sequence_comments.add(comment)

            # Sorted, so the output order is reproducible across runs
            freetext_species = sorted(sequence_otherwhats) + sorted(sequence_comments)

            counted_species = []

            otherpresent_columns = presence_to_count_columns['otherpresent']

            # column_name = otherpresent_columns[0]
            for column_name in otherpresent_columns:
                if column_name in sequence_df and column_name != 'other':
                    column_counts = list(sequence_df[column_name])
                    column_count_positive = any([c > 0 for c in column_counts])
                    if column_count_positive:
                        # print('Found non-survey counted species column: {}'.format(column_name))
                        counted_species.append(column_name)

            # ...for each non-survey count column

            # Very rarely, the "otherpresent" column is checked, but no more detail is available
            if not ( (len(freetext_species) > 0) or (len(counted_species) > 0) ):
                other_species.append('unknown')

            other_species += freetext_species
            other_species += counted_species

        # ...handling non-survey species

        all_species = other_species + survey_species

        ## Build the sequence data

        images = []

        # i_row = 0; row = sequence_df.iloc[i_row]
        for i_row,row in sequence_df.iterrows():

            im = {}

            # Only one folder used a single .csv file for two subfolders
            if ('RelativePath' in row) and (isinstance(row['RelativePath'],str)) and (len(row['RelativePath'].strip()) > 0):
                assert 'IDFG-028' in location_name
                im['file_name'] = os.path.join(row['RelativePath'],row['File'])
            else:
                im['file_name'] = row['File']
            im['datetime'] = row['datetime']
            images.append(im)

        # ...for each image in this sequence

        sequence = {}
        sequence['csv_source'] = csv_file
        sequence['sequence_id'] = sequence_id
        sequence['images'] = images
        sequence['species_present'] = all_species
        sequence['location'] = location_name

        sequences.append(sequence)

    # ...for each sequence

    return sequences