def select_videos()

in contentselection/oracle.py [0:0]


def select_videos(df, target_hours=4500, *, rng=None):
    """Pick videos totalling at most ``target_hours``, balanced across categories.

    The time budget is split evenly over the distinct non-missing values of
    ``inferred_category``.  Categories are processed in order of first
    appearance; inside each one, rows are visited by ``user_activity_score``
    descending, then ``channel_follower_count`` ascending, until the
    category's share is reached.  After each category, whatever budget is left
    (possibly negative after an overshoot) is re-divided among the categories
    that have not been processed yet.

    Each additional video from the same channel within a category is accepted
    with a probability that drops by 0.1 per video already taken from that
    channel, so a channel contributes at most 10 videos per category.

    Finally the picked rows are ordered by ``duration_seconds`` ascending and
    the longest prefix whose cumulative duration fits in the budget is
    returned.

    Args:
        df: DataFrame with the columns ``inferred_category``, ``channel``,
            ``user_activity_score``, ``channel_follower_count`` and
            ``duration_seconds``.  It is not modified.
        target_hours: Total duration budget, in hours.
        rng: Optional ``numpy.random.Generator`` used for the channel-penalty
            draws.  When omitted, NumPy's global random state is used, so the
            result is only reproducible if the caller seeds it.

    Returns:
        DataFrame holding the chosen rows, with the same columns as ``df``,
        sorted by ``duration_seconds`` ascending.  Index labels refer to
        positions in the internally shuffled copy, not to ``df``'s own index.
        The frame is empty (columns preserved) when nothing can be selected.
    """
    target_seconds = target_hours * 3600  # Convert hours to seconds

    # Rows without a category can never match an equality filter, so a
    # missing value must not be counted as a category that needs budget.
    inferred_categories = df['inferred_category'].dropna().unique()
    total_categories = len(inferred_categories)
    if total_categories == 0:
        # Nothing to balance over; also avoids dividing by zero below.
        return df.iloc[0:0].copy()

    # Initial, even share of the budget for every category
    target_seconds_per_category = target_seconds / total_categories

    # Shuffle rows to mix categories and channels (fixed seed => same order)
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Incremental penalty for each additional video from the same channel
    channel_penalty_increment = 0.1
    draw = np.random.rand if rng is None else rng.random

    # Collect row labels only; building the frame once at the end avoids the
    # quadratic cost of concatenating inside the loop and keeps column dtypes.
    selected_labels = []
    selected_seconds = 0

    for processed, inferred_category in enumerate(inferred_categories, start=1):
        category_df = df[df['inferred_category'] == inferred_category]

        # Highest activity first; among equals, smaller channels first
        category_df = category_df.sort_values(
            by=['user_activity_score', 'channel_follower_count'],
            ascending=[False, True],
        )

        current_duration = 0
        channel_counter = {}
        candidates = zip(
            category_df.index,
            category_df['channel'],
            category_df['duration_seconds'],
        )

        for label, channel, duration in candidates:
            if current_duration >= target_seconds_per_category:
                break

            # Acceptance probability shrinks with every video already taken
            # from this channel and is clamped so it never goes negative.
            already_taken = channel_counter.get(channel, 0)
            penalty_factor = max(
                1 - (already_taken * channel_penalty_increment), 0
            )

            if draw() < penalty_factor:
                selected_labels.append(label)
                current_duration += duration
                channel_counter[channel] = already_taken + 1

        selected_seconds += current_duration

        # Re-divide what is left among the categories still to be processed
        remaining_categories = total_categories - processed
        if remaining_categories > 0:
            remaining_seconds = target_seconds - selected_seconds
            target_seconds_per_category = remaining_seconds / remaining_categories

    # Shortest videos first; a stable sort keeps tie order reproducible
    ordered = df.loc[selected_labels].sort_values(
        by='duration_seconds', ascending=True, kind='stable'
    )

    # Because durations are ascending, once one video does not fit no later
    # one can, so the answer is always a prefix of the ordered rows.
    kept = 0
    total_duration = 0
    for duration in ordered['duration_seconds']:
        if not total_duration + duration <= target_seconds:
            break
        total_duration += duration
        kept += 1

    return ordered.iloc[:kept].copy()