in sourcecode/scoring/post_selection_similarity_old.py [0:0]
def _get_pair_tuples_optimized(ratings, windowMillis):
  """Collect pairs of raters whose ratings on the same note are close in time.

  Ratings are ordered by note and then by creation time.  Each note is scanned
  with a sliding window so that every rating is matched only against the earlier
  ratings on that note created at most windowMillis before it.

  Args:
    ratings: DataFrame providing the c.noteIdKey, c.createdAtMillisKey,
      c.raterParticipantIdKey and c.tweetIdKey columns.
    windowMillis: largest allowed gap, in the units of c.createdAtMillisKey,
      between two ratings of one note for the raters to be paired.
      NOTE(review): presumably non-negative -- a negative value would let the
      window start run past the current rating; confirm against callers.

  Returns:
    List of (leftRater, rightRater, tweetId) tuples, one per qualifying pair of
    ratings made by two different raters.  The two rater ids are placed in sorted
    order so a given pair always appears the same way round.  tweetId is read
    from the first (earliest) rating row of the note.  Repeated pairs are not
    de-duplicated.
  """
  ordered = ratings.sort_values([c.noteIdKey, c.createdAtMillisKey])
  pairs = []
  # sort=False keeps the groups in the order produced by the sort above.
  for _, noteRatings in ordered.groupby(c.noteIdKey, sort=False):
    # Plain arrays make the repeated positional lookups below cheap.
    createdAt = noteRatings[c.createdAtMillisKey].values
    raterIds = noteRatings[c.raterParticipantIdKey].values
    tweetId = noteRatings[c.tweetIdKey].iloc[0]
    oldest = 0  # Position of the earliest rating still inside the window
    for current in range(len(noteRatings)):
      # Timestamps are ascending, so the window start only ever moves forward.
      while createdAt[current] - createdAt[oldest] > windowMillis:
        oldest += 1
      # Pair the current rating with every in-window predecessor by another rater.
      pairs.extend(
        (*sorted((raterIds[current], raterIds[earlier])), tweetId)
        for earlier in range(oldest, current)
        if raterIds[current] != raterIds[earlier]
      )
  return pairs