in contentselection/oracle.py [0:0]
def select_videos(df, target_hours=4500):
    """Select a category-balanced subset of videos within a duration budget.

    The budget (``target_hours`` converted to seconds) is split evenly
    across the distinct non-null values of ``inferred_category``. Within
    each category, videos are visited from highest ``user_activity_score``
    downwards and accepted until that category's share is reached. Each
    video already accepted from the same channel (within the category)
    lowers the acceptance probability of the next one by 10 percentage
    points. After each category, the unspent budget is re-divided among
    the categories that have not been processed yet. Finally the selection
    is sorted by duration and trimmed so its total never exceeds the budget.

    Args:
        df: DataFrame with the columns ``inferred_category``, ``channel``,
            ``user_activity_score``, ``channel_follower_count`` and
            ``duration_seconds``.
        target_hours: Total duration budget in hours.

    Returns:
        A DataFrame of the selected rows (same columns as ``df``), sorted
        by ``duration_seconds`` ascending and indexed by each row's
        position in the internally shuffled frame. It is empty, but keeps
        the columns of ``df``, when nothing can be selected.

    Raises:
        KeyError: If ``df`` lacks one of the required columns.

    Note:
        The row shuffle is seeded, but the channel penalty draws from
        NumPy's global random state (``np.random.rand``), so results are
        reproducible only if the caller seeds that state or no channel
        repeats within a category.
    """
    target_seconds = target_hours * 3600  # Convert hours to seconds

    # NaN categories can never match the equality filter below, so they are
    # excluded up front instead of being given a share of the budget.
    inferred_categories = df['inferred_category'].dropna().unique()
    total_categories = len(inferred_categories)
    if total_categories == 0:
        # Nothing to choose from; also avoids dividing the budget by zero.
        return df.iloc[0:0].copy()

    # Initial, even share of the budget for every category.
    target_seconds_per_category = target_seconds / total_categories

    # Shuffle rows to mix categories and channels. The index is reset, so
    # index labels below double as row positions in the shuffled frame.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Acceptance probability lost per video already taken from a channel.
    channel_penalty_increment = 0.1

    # Accepted rows are tracked by position and materialised once at the
    # end; concatenating one-row frames inside the loop is quadratic.
    selected_positions = []
    selected_seconds = 0

    for processed, inferred_category in enumerate(inferred_categories, start=1):
        category_df = df[df['inferred_category'] == inferred_category]
        # Highest activity score first; ties go to the channel with the
        # fewest followers.
        # NOTE(review): the original comment said "reverse order" for both
        # keys, but follower count has always been sorted ascending here.
        # Confirm which ordering is intended.
        category_df = category_df.sort_values(
            by=['user_activity_score', 'channel_follower_count'],
            ascending=[False, True],
        )

        current_duration = 0
        channel_counter = {}
        candidates = zip(
            category_df.index,
            category_df['channel'],
            category_df['duration_seconds'],
        )
        for position, channel, duration in candidates:
            if current_duration >= target_seconds_per_category:
                break
            already_taken = channel_counter.get(channel, 0)
            # The probability shrinks linearly with repeats and is floored
            # at zero, so a channel is cut off after ten accepted videos.
            penalty_factor = max(1 - already_taken * channel_penalty_increment, 0)
            if np.random.rand() < penalty_factor:
                selected_positions.append(position)
                current_duration += duration
                selected_seconds += duration
                channel_counter[channel] = already_taken + 1

        # Re-divide what is left among the categories still to be processed.
        # Counting processed categories (rather than categories that ended
        # up with at least one video) keeps this correct when a category
        # contributes nothing.
        remaining_categories = total_categories - processed
        if remaining_categories > 0:
            remaining_seconds = target_seconds - selected_seconds
            target_seconds_per_category = remaining_seconds / remaining_categories

    # Each category may overshoot its share by one video. Keep the shortest
    # videos whose running total still fits the overall budget. Rows are
    # sorted ascending, so once one row does not fit no later row can, and
    # a cumulative-sum prefix matches a row-by-row greedy pass.
    selected_videos = df.iloc[selected_positions].sort_values(
        by='duration_seconds', ascending=True
    )
    fits_budget = selected_videos['duration_seconds'].cumsum() <= target_seconds
    return selected_videos[fits_budget]