# dynamicfilters/worddensityfiltering.py
import pandas as pd
#
# Given a pandas dataframe with a list of videos and the metadata extracted from YT-Commons,
# this script adds the columns duration_seconds and word_density so that word density can be
# studied across the dataset. Finally, it drops all entries whose word density is 0.5 or lower.
#
### CONFIG ###
input_pkl = 'path_to_your_input_df.pkl'
output_pkl = 'path_to_your_output_df.pkl'
visualize = False  # Toggle to True to inspect samples with word_density close to 1 and to 0.5.
###
df = pd.read_pickle(input_pkl)
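# The input dataframe is expected to provide at least the columns used below:
# 'duration_string' (e.g. "hh:mm:ss"), 'word_count', and, for the optional
# visualization, 'video_id' and 'title'.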
# Add duration_seconds and word_density to the dataframe
def duration_to_seconds(duration):
    if pd.isnull(duration):
        return 0  # or np.nan, or another default
    parts = duration.split(':')
    parts = [int(p) for p in parts]
    if len(parts) == 3:  # hh:mm:ss
        return parts[0] * 3600 + parts[1] * 60 + parts[2]
    elif len(parts) == 2:  # mm:ss
        return parts[0] * 60 + parts[1]
    elif len(parts) == 1:  # ss
        return parts[0]
    else:
        return 0  # or np.nan if the format is unrecognized
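# Illustrative examples (hypothetical values, not taken from the dataset):
#   duration_to_seconds('1:02:03') -> 3723
#   duration_to_seconds('4:20')    -> 260
#   duration_to_seconds('45')      -> 45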
# Apply the conversion function to the 'duration_string' column
df['duration_seconds'] = df['duration_string'].apply(duration_to_seconds)
# Calculate word density
# Word density is the number of words per second, so we divide word_count by duration_seconds
df['word_density'] = df.apply(
    lambda row: row['word_count'] / row['duration_seconds'] if row['duration_seconds'] > 0 else 0,
    axis=1,
)
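# A vectorized equivalent would be (a sketch, assuming 'word_count' has no missing values):
# df['word_density'] = (df['word_count'] / df['duration_seconds'].where(df['duration_seconds'] > 0)).fillna(0)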
if visualize:
    from tabulate import tabulate

    # Visualizing some results
    def get_samples_near_target(df, target, range_width=0.1, num_samples=3):
        """
        Get samples from the DataFrame that have 'word_density' close to the target value.

        :param df: DataFrame to sample from.
        :param target: The target word density to find samples around.
        :param range_width: The width of the range around the target value.
        :param num_samples: Number of samples to return.
        :return: A DataFrame with samples close to the target density.
        """
        # Define the range around the target
        lower_bound = target - range_width
        upper_bound = target + range_width
        # Filter, then sample (capped at the number of matching rows so .sample() cannot raise)
        in_range = df[(df['word_density'] >= lower_bound) & (df['word_density'] <= upper_bound)]
        samples = in_range.sample(n=min(num_samples, len(in_range)), random_state=1)
        return samples
    close_to_1 = get_samples_near_target(df, 1, num_samples=100)[['video_id', 'duration_string', 'title']]
    print(tabulate(close_to_1, headers='keys', tablefmt='pretty', showindex=False))
    close_to_05 = get_samples_near_target(df, 0.5, num_samples=100)[['video_id', 'duration_string', 'title']]
    print(tabulate(close_to_05, headers='keys', tablefmt='pretty', showindex=False))
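
    # Optional: a quick look at the overall distribution (a sketch; assumes matplotlib is
    # available, and the cap of 5 words/s is an arbitrary choice for readability).
    # import matplotlib.pyplot as plt
    # df['word_density'].clip(upper=5).hist(bins=100)
    # plt.xlabel('word_density (words per second)')
    # plt.ylabel('number of videos')
    # plt.show()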
# Apply the cut: keep only videos with word density above 0.5
df = df.loc[df['word_density'] > 0.5]
print(f"Total videos after filtering: {len(df)}")
df.to_pickle(output_pkl)