def csv_to_sequences()

in data_management/importers/idaho-camera-traps.py [0:0]
173 lines of code
40 McCabe index (conditional complexity)

def csv_to_sequences(csv_file):
    """
    Load one .csv file and convert its rows into a list of image sequences.
    
    The rows are timestamped from their 'Date' and 'Time' columns, sorted 
    chronologically, and split into sequences: a new sequence starts at the 
    first row and whenever the gap (in seconds) to the previous row exceeds 
    the module-level max_gap_within_sequence.  For each sequence, the species 
    present are derived from the "*present" columns, the 'opstate' column, the 
    count columns, and (for 'otherpresent') the 'otherwhat' and 'comment' 
    free-text columns.
    
    Relies on names defined elsewhere in this module: input_base, 
    valid_opstates, required_columns, expected_count_columns, 
    expected_presence_columns, max_gap_within_sequence, opstate_mappings, 
    presence_to_count_columns, and list_is_sorted.
    
    Args:
        csv_file: path to the .csv file, relative to input_base.  The location
            name is derived from the folder names in this path (see the sample 
            paths in the body); both backslash and forward-slash separators are 
            accepted for that purpose.
    
    Returns:
        A list of dicts, one per sequence, each with the keys 'csv_source' 
        (the csv_file argument), 'sequence_id', 'images' (a list of dicts with 
        'file_name' and 'datetime'), 'species_present' (a list of strings), and 
        'location'.
    
    Raises:
        AssertionError: if the path does not match one of the known surveys, a
            required column is missing, the set of presence columns is not 
            exactly the expected set, an opstate value is not valid, a 
            timestamp falls outside 2015-2019, or non-zero counts cannot be 
            matched to a presence column.
    """
    
    print('Processing {}'.format(csv_file))
    
    csv_file_absolute = os.path.join(input_base,csv_file)
    # os.startfile(csv_file_absolute)
    
    sequences = []
    
    
    ## Derive the location name from the relative path
    
    # Sample paths from which we need to derive locations:
    #
    # St.Joe_elk\AM99\Trip 1\100RECNX\TimelapseData.csv
    # Beaverhead_elk\AM34\Trip 1\100RECNX\TimelapseData.csv
    #
    # ClearCreek_mustelids\Winter2015-16\FS-001-P\FS-001-P.csv
    # ClearCreek_mustelids\Summer2015\FS-001\FS-001.csv
    # ClearCreek_mustelids\Summer2016\IDFG-016\IDFG-016.csv
    #
    # I:\idfg-images\ClearCreek_mustelids\Summer2016\IDFG-017b
    # I:\idfg-images\ClearCreek_mustelids\Summer2016\IDFG-017a
    
    # Normalize separators so the tokenization doesn't depend on the OS that
    # produced the relative path
    path_tokens = csv_file.replace('/','\\').split('\\')
    
    if 'St.Joe_elk' in csv_file or 'Beaverhead_elk' in csv_file:
        location_name = '_'.join(path_tokens[0:2]).replace(' ','')
    else:
        assert 'ClearCreek_mustelids' in csv_file
        assert 'FS-' in path_tokens[2] or 'IDFG-' in path_tokens[2]
        location_name = '_'.join([path_tokens[0],path_tokens[2]]).replace('-P','')
        # The 017a and 017b folders are merged into a single '...017' location
        if location_name.endswith(('017a','017b')):
            location_name = location_name[:-1]
    
    
    ## Load and validate the .csv file
    
    df = pd.read_csv(csv_file_absolute)
    df['datetime'] = None
    df['seq_id'] = None
    df['synthetic_frame_number'] = None
    
    # Validate the opstate column; non-string values (e.g. NaN for empty 
    # cells) and blank strings are allowed.
    for s in set(df['opstate']):
        if isinstance(s,str):
            s = s.strip()
            if len(s) > 0:
                assert s in valid_opstates,'Invalid opstate: {}'.format(s)
    
    column_names = list(df.columns)
    
    for s in required_columns:
        assert s in column_names
    
    # Missing count columns are not treated as an error; we only keep the ones
    # that are present in this file.
    count_columns = [s for s in column_names if s in expected_count_columns]
    
    presence_columns = [s for s in column_names if s.endswith('present')]
    
    # The set of presence columns must match the expected set exactly.
    #
    # These used to be written as "assert '<message>'", which is always true 
    # (a non-empty string is truthy), so the checks never fired.
    for s in presence_columns:
        assert s in expected_presence_columns, \
            'Unexpected presence column {} in {}'.format(s,csv_file)
    for s in expected_presence_columns:
        assert s in presence_columns, \
            'Missing presence column {} in {}'.format(s,csv_file)
    
    
    ## Create datetimes
    
    # print('Creating datetimes')
    
    # i_row = 0; row = df.iloc[i_row]
    for i_row,row in df.iterrows():
        
        datestring = row['Date'] + ' ' + row['Time']
        dt = dateutil.parser.parse(datestring)
        assert 2015 <= dt.year <= 2019
        df.loc[i_row,'datetime'] = dt
        
    # Make sure data are sorted chronologically
    #
    # In odd circumstances, they are not... so sort them first, but warn
    if not list_is_sorted(list(df['datetime'])):
        print('Datetimes not sorted for {}'.format(csv_file))
    
    # Use a stable sort, so rows with identical timestamps keep the order in 
    # which they appear in the file.
    df = df.sort_values('datetime',kind='mergesort')
    
    # After this, row labels are equal to row positions, which is what allows
    # the labels from iterrows() to be passed to iloc[] below.
    df.reset_index(drop=True, inplace=True)
    assert list_is_sorted(list(df['datetime']))
    
    
    ## Parse into sequences    
    
    # print('Creating sequences')
    
    current_sequence_id = None
    next_frame_number = 0
    previous_datetime = None
        
    sequence_id_to_rows = defaultdict(list)
    
    # i_row = 0; row = df.iloc[i_row]    
    for i_row,row in df.iterrows():
        
        dt = row['datetime']
        assert dt is not None and isinstance(dt,datetime.datetime)
        
        # Start a new sequence if:
        #
        # * We have no previous image timestamp (i.e., this is the first row)
        # * The gap since the previous image exceeds max_gap_within_sequence
        #
        if previous_datetime is None:
            delta = None
        else:
            delta = (dt - previous_datetime).total_seconds()
        
        if delta is None or delta > max_gap_within_sequence:
            next_frame_number = 0
            current_sequence_id = location_name + '_seq_' + str(dt) # str(uuid.uuid1())
            
        assert current_sequence_id is not None
        
        sequence_id_to_rows[current_sequence_id].append(i_row)
        df.loc[i_row,'seq_id'] = current_sequence_id
        df.loc[i_row,'synthetic_frame_number'] = next_frame_number
        next_frame_number = next_frame_number + 1
        previous_datetime = dt
        
    # ...for each row
    
    location_sequences = sorted(set(df['seq_id']))
    
    # IDs of sequences whose presence values differ between images.  This is 
    # collected but neither reported nor returned; a sequence is appended once
    # per inconsistent presence column.
    inconsistent_sequences = []
    
    
    ## Parse labels for each sequence
    
    # sequence_id = location_sequences[0]
    for sequence_id in location_sequences:
        
        sequence_row_indices = sequence_id_to_rows[sequence_id]
        assert len(sequence_row_indices) > 0
        
        # Row indices in a sequence should be adjacent
        if len(sequence_row_indices) > 1:
            d = np.diff(sequence_row_indices)
            assert(all(d==1))
        
        # sequence_df = df[df['seq_id']==sequence_id]
        sequence_df = df.iloc[sequence_row_indices]
        
        
        ## Determine what's present
        
        presence_columns_marked = []
        other_species = []
        
        # Floats (NaN, i.e. empty cells) and blank strings are allowed.  
        # 
        # Values are passed through str() in the message, because join() would
        # otherwise raise a TypeError on NaN and hide the real error.
        opstates = set(sequence_df['opstate'])
        assert all([ ( (isinstance(s,float)) or (len(s.strip())== 0) or (s.strip() in valid_opstates)) for s in opstates]),\
            'Invalid optstate in: {}'.format(' | '.join([str(s) for s in opstates]))
        
        for presence_column in presence_columns:
                    
            presence_values = list(sequence_df[presence_column])
            
            # The presence columns are *almost* always identical for all images in a sequence        
            single_presence_value = (len(set(presence_values)) == 1)
            # assert single_presence_value
            if not single_presence_value:
                # print('Warning: presence value for {} is inconsistent for {}'.format(presence_column,sequence_id))
                inconsistent_sequences.append(sequence_id)                
            
            # A column counts as marked if it's truthy for any image in the sequence
            if any(presence_values):
                presence_columns_marked.append(presence_column)                
                
        # ...for each presence column
        
        # If no presence columns are marked, all counts should be zero
        if len(presence_columns_marked) == 0:
            
            # count_column = count_columns[0]
            for count_column in count_columns:
                
                values = list(set(sequence_df[count_column]))
                
                if len(values) == 1 and values[0] == 0:
                    continue
                
                # Occasionally a count gets entered (correctly) without the presence column being marked
                print('Warning: presence and counts are inconsistent for {}'.format(sequence_id))
                
                # Handle this by virtually checking the "right" box
                for presence_column in presence_to_count_columns.keys():
                    count_columns_this_species = presence_to_count_columns[presence_column]
                    if count_column in count_columns_this_species:
                        if presence_column not in presence_columns_marked:
                            presence_columns_marked.append(presence_column)
                
                # Make sure we found a match
                assert len(presence_columns_marked) > 0
                
            # ...for each count column
            
        # Tally up the standard (survey) species.
        #
        # This has to happen *after* the count check above; otherwise a survey
        # species whose box was only "virtually" checked would never make it 
        # into the output.
        survey_species = [s.replace('present','') for s in presence_columns_marked if s != 'otherpresent']
        
        # Non-normal operational states are reported alongside the species.  
        #
        # Original note: "Be conservative; assume humans are present in all 
        # maintenance images"; presumably this is what opstate_mappings (defined
        # elsewhere) implements - TODO confirm.
        for opstate in opstates:
            if not isinstance(opstate,str):
                continue
            opstate = opstate.strip()
            if len(opstate) == 0:
                continue
            if opstate in opstate_mappings:
                opstate = opstate_mappings[opstate]                
            if (opstate != 'normal') and (opstate not in survey_species):
                survey_species.append(opstate)
            
        # Handle 'other' tags
        if 'otherpresent' in presence_columns_marked:
            
            sequence_otherwhats = set()
            sequence_comments = set()
            
            # Collect the distinct, non-blank free-text values in this sequence
            for i,r in sequence_df.iterrows():            
                otherwhat = r['otherwhat']
                if isinstance(otherwhat,str):
                    otherwhat = otherwhat.strip()
                    if len(otherwhat) > 0:
                        sequence_otherwhats.add(otherwhat)
                comment = r['comment']
                if isinstance(comment,str):
                    comment = comment.strip()
                    if len(comment) > 0:
                        sequence_comments.add(comment)
                
            freetext_species = list(sequence_otherwhats) + list(sequence_comments)
            
            counted_species = []
            
            otherpresent_columns = presence_to_count_columns['otherpresent']
            
            # column_name = otherpresent_columns[0]
            for column_name in otherpresent_columns:
            
                # The generic 'other' column doesn't name a species, so skip it
                if column_name in sequence_df and column_name != 'other':
            
                    column_counts = list(sequence_df[column_name])
                    column_count_positive = any([c > 0 for c in column_counts])
                    
                    if column_count_positive:
                        # print('Found non-survey counted species column: {}'.format(column_name))
                        counted_species.append(column_name)
            
            # ...for each count column associated with 'otherpresent'
        
            # Very rarely, the "otherpresent" column is checked, but no more detail is available
            if not ( (len(freetext_species) > 0) or (len(counted_species) > 0) ):
                other_species.append('unknown')
                
            other_species += freetext_species
            other_species += counted_species
            
        # ...handling non-survey species
        
        all_species = other_species + survey_species
                 
        
        ## Build the sequence data
        
        images = []
        # i_row = 0; row = sequence_df.iloc[i_row]
        for i_row,row in sequence_df.iterrows():
            im = {}
            # Only one folder used a single .csv file for two subfolders
            if ('RelativePath' in row) and (isinstance(row['RelativePath'],str)) and (len(row['RelativePath'].strip()) > 0):
                assert 'IDFG-028' in location_name
                im['file_name'] = os.path.join(row['RelativePath'],row['File'])
            else:
                im['file_name'] = row['File']
            im['datetime'] = row['datetime']
            images.append(im)
            
        sequence = {}
        sequence['csv_source'] = csv_file
        sequence['sequence_id'] = sequence_id
        sequence['images'] = images
        sequence['species_present'] = all_species
        sequence['location'] = location_name
        
        sequences.append(sequence)
        
    # ...for each sequence

    return sequences