# contentselection/oracle.py
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import json
#
# Given a pandas DataFrame with a list of videos and the metadata extracted from YT-Commons,
# this script creates a new DataFrame with the video IDs that target the hours of video that we want to collect.
#
### CONFIG ###
input_pkl = 'path_to_your_current_videos_df.pkl'
output_pkl = 'path_to_your_output_df.pkl'
taxonomy_path = 'content_taxonomy.json'
target_hours = 4500
###
# Step 1: Preprocess the Data
def preprocess_df(df):
    # Fill NaNs with 0 so missing engagement metadata doesn't break the scoring
    df['comment_count'] = df['comment_count'].fillna(0)
    df['view_count'] = df['view_count'].fillna(0)
    df['like_count'] = df['like_count'].fillna(0)
    df['channel_follower_count'] = df['channel_follower_count'].fillna(0)
    df['duration_seconds'] = df['duration_seconds'].fillna(0)
    # Normalize the engagement columns to [0, 1] for fair weighting
    scaler = MinMaxScaler()
    df[['comment_count', 'view_count', 'like_count']] = scaler.fit_transform(
        df[['comment_count', 'view_count', 'like_count']]
    )
    return df
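
# Illustrative sketch (not part of the original pipeline; the values are made up,
# only the column names match the YT-Commons metadata used above): shows what
# preprocess_df does to a toy frame. This helper is never called by the script.
def _demo_preprocess():
    toy = pd.DataFrame({
        'comment_count': [10, None],
        'view_count': [1000, 500],
        'like_count': [None, 50],
        'channel_follower_count': [None, 200],
        'duration_seconds': [60, None],
    })
    # NaNs become 0 and each engagement column is min-max scaled to [0, 1]
    # (with only 2 rows, every scaled value is exactly 0 or 1).
    return preprocess_df(toy)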
# Step 2: Compute User Activity Score
def compute_user_activity(df, weights=(0.2, 0.5, 0.3)):
    # Weights: 0.2 for comments, 0.5 for views, 0.3 for likes
    df['user_activity_score'] = (
        weights[0] * df['comment_count'] +
        weights[1] * df['view_count'] +
        weights[2] * df['like_count']
    )
    return df
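
# Worked example (illustrative values, not from the real dataset): with the
# default weights (0.2, 0.5, 0.3), a video whose normalized (comment, view,
# like) counts are (0.0, 1.0, 0.5) scores 0.2*0.0 + 0.5*1.0 + 0.3*0.5 = 0.65.
# This helper is never called by the script.
def _demo_activity_score():
    toy = pd.DataFrame({'comment_count': [0.0], 'view_count': [1.0], 'like_count': [0.5]})
    return compute_user_activity(toy)  # user_activity_score == 0.65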
# Step 3: Map Inferred Categories to Higher Taxonomy Levels
# Note: this was not used in the final version of the content selection algorithm, but it is useful data that we keep in the dataset.
def map_to_parent_categories(df, taxonomy):
    """
    Maps each inferred category in the DataFrame to its top-level parent category
    in the hierarchical taxonomy.

    :param df: DataFrame containing video data with an 'inferred_category' column.
    :param taxonomy: A nested dictionary representing the hierarchical taxonomy.
    :return: DataFrame with an added 'parent_category' column holding the top-level parent category.
    """
    # Helper function to find the top-level parent category
    def find_top_parent_category(leaf_name, taxonomy):
        """
        Finds the top-level parent category of a given leaf in the hierarchical taxonomy.

        :param leaf_name: A string representing the leaf node to search for.
        :param taxonomy: A dictionary representing the full hierarchical taxonomy.
        :return: The top-level parent category of the given leaf if found, else None.
        """
        def recursive_search(taxonomy, leaf_name, current_top_category):
            for category, subcategories in taxonomy.items():
                if category == leaf_name:
                    # Found the leaf node; return the top-level category
                    return current_top_category
                if isinstance(subcategories, dict):
                    # Continue searching deeper
                    found_category = recursive_search(subcategories, leaf_name, current_top_category)
                    if found_category:
                        return found_category
            return None

        # Start the search with top-level categories. Note that the search begins one
        # level below the top, so a leaf_name that is itself a top-level category
        # (or any unknown category) maps to None.
        for top_category, subcategories in taxonomy.items():
            result = recursive_search(subcategories, leaf_name, top_category)
            if result:
                return result
        return None

    # Map each inferred category to its top-level parent category
    df['parent_category'] = df['inferred_category'].apply(lambda x: find_top_parent_category(x, taxonomy))
    return df
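
# Illustrative sketch (hypothetical taxonomy, not the real content_taxonomy.json):
# shows how a leaf category is resolved to its top-level ancestor, and how an
# unknown category maps to None. This helper is never called by the script.
def _demo_parent_mapping():
    toy_taxonomy = {
        'Science': {'Physics': {'Quantum Mechanics': {}}},
        'Arts': {'Music': {}},
    }
    toy_df = pd.DataFrame({'inferred_category': ['Quantum Mechanics', 'Music', 'Unknown']})
    # Resulting parent_category column: ['Science', 'Arts', None]
    return map_to_parent_categories(toy_df, toy_taxonomy)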
# Step 4: Select Videos for Diversity and Total Duration
def select_videos(df, target_hours=4500):
    target_seconds = target_hours * 3600  # Convert hours to seconds
    selected_videos = pd.DataFrame()
    # Calculate the total number of inferred categories
    inferred_categories = df['inferred_category'].unique()
    total_categories = len(inferred_categories)
    # Calculate the initial target seconds per inferred category
    target_seconds_per_category = target_seconds / total_categories
    # Shuffle rows to mix categories and channels
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    # Initialize dictionary to keep track of selected durations per inferred category
    category_durations = {category: 0 for category in inferred_categories}
    # Define a progressive penalty for repeated channels
    channel_penalty_increment = 0.1  # Incremental penalty for each additional video from the same channel
    # Process each inferred category
    for inferred_category in inferred_categories:
        category_df = df[df['inferred_category'] == inferred_category]
        # Sort by user activity score (descending) and channel follower count
        # (ascending, so smaller channels win ties)
        category_df = category_df.sort_values(
            by=['user_activity_score', 'channel_follower_count'],
            ascending=[False, True]
        )
        current_duration = 0
        channel_counter = {}
        for _, row in category_df.iterrows():
            if current_duration >= target_seconds_per_category:
                break
            channel = row['channel']
            # Calculate the penalty based on the number of videos already selected from this channel
            penalty_factor = 1 - (channel_counter.get(channel, 0) * channel_penalty_increment)
            penalty_factor = max(penalty_factor, 0)  # Ensure the penalty factor doesn't go negative
            # Apply the penalty as an acceptance probability: the more videos already
            # taken from this channel, the less likely another one is accepted
            if np.random.rand() < penalty_factor:
                selected_videos = pd.concat([selected_videos, pd.DataFrame([row])])
                current_duration += row['duration_seconds']
                category_durations[inferred_category] += row['duration_seconds']
                channel_counter[channel] = channel_counter.get(channel, 0) + 1
        # Update the per-category target if some categories can't meet it
        # (the emptiness guard avoids a KeyError when nothing has been selected yet)
        if not selected_videos.empty:
            remaining_seconds = target_seconds - selected_videos['duration_seconds'].sum()
            remaining_categories = total_categories - selected_videos['inferred_category'].nunique()
            if remaining_categories > 0:
                target_seconds_per_category = remaining_seconds / remaining_categories
    # Adjust to match the target duration exactly, or as close as possible
    selected_videos = selected_videos.sort_values(by='duration_seconds', ascending=True)
    final_selected = pd.DataFrame()
    total_duration = 0
    for _, row in selected_videos.iterrows():
        if total_duration + row['duration_seconds'] <= target_seconds:
            final_selected = pd.concat([final_selected, pd.DataFrame([row])])
            total_duration += row['duration_seconds']
    return final_selected
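
# Illustrative sketch (made-up rows): a minimal call to select_videos. With a
# 2-hour budget and two categories, each category initially targets 1 hour.
# Channel acceptance is probabilistic (np.random.rand is not seeded), so the
# exact rows chosen can vary run to run. This helper is never called by the script.
def _demo_select():
    toy = pd.DataFrame({
        'inferred_category': ['Music', 'Music', 'Science'],
        'user_activity_score': [0.9, 0.4, 0.7],
        'channel_follower_count': [100, 5000, 300],
        'channel': ['chA', 'chA', 'chB'],
        'duration_seconds': [1800, 2400, 3600],
    })
    return select_videos(toy, target_hours=2)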
def main_algorithm(df, taxonomy_file, target_hours=4500):
    df = preprocess_df(df)
    df = compute_user_activity(df)
    # Load taxonomy from JSON file
    with open(taxonomy_file, 'r') as file:
        taxonomy = json.load(file)
    # Map inferred categories to their parent categories
    df = map_to_parent_categories(df, taxonomy)
    # Select videos based on the updated criteria
    selected_videos = select_videos(df, target_hours=target_hours)
    print(f"Total selected videos: {len(selected_videos)}")
    print(f"Total duration (seconds): {selected_videos['duration_seconds'].sum()}")
    return selected_videos
# Run the algorithm
if __name__ == '__main__':
    df = pd.read_pickle(input_pkl)
    selected_videos_df = main_algorithm(df, taxonomy_path, target_hours=target_hours)
    selected_videos_df.to_pickle(output_pkl)