analysis/count_words_in_dataset.py (14 lines of code) (raw):

from collections import Counter
import string

# Translation table mapping every ASCII punctuation character to a space.
# Built once at import time instead of once per row.
_PUNCTUATION_TO_SPACE = str.maketrans(
    string.punctuation, ' ' * len(string.punctuation)
)


def count_words(df, column_name, top_n=100):
    """Count words per row and tally the most frequent words overall.

    Each text has ASCII punctuation replaced by spaces, is lower-cased and
    is split on whitespace. Because punctuation becomes a space, a token
    such as ``it's`` is counted as the two words ``it`` and ``s``.

    Args:
        df: Table-like object supporting ``df[column_name]`` iteration and
            ``df['word_count'] = ...`` assignment (e.g. a pandas DataFrame).
            It is modified in place: a ``'word_count'`` column is added or
            overwritten.
        column_name: Name of the column holding the texts.
        top_n: How many of the most frequent words to return. ``None``
            returns every distinct word. Defaults to 100.

    Returns:
        A ``(df, most_common_words)`` tuple, where ``df`` is the same object
        that was passed in and ``most_common_words`` is a list of
        ``(word, count)`` pairs ordered from most to least frequent.
    """
    overall_counter = Counter()
    word_counts = []

    for text in df[column_name]:
        if isinstance(text, str):
            words = text.translate(_PUNCTUATION_TO_SPACE).lower().split()
        else:
            # Missing or non-text cells (None, NaN, ...) contribute no words
            # instead of raising AttributeError on ``.translate``.
            words = []
        word_counts.append(len(words))
        overall_counter.update(words)

    # Mutates the caller's object; the same object is returned for chaining.
    df['word_count'] = word_counts

    return df, overall_counter.most_common(top_n)


# Example usage:
# df_texts, most_common_words = count_words(df_texts, 'question')