In [None]:
# Developed by Energize AI, Inc. (energize.ai)
# For OpenAI, Democratic AI Grant
# Aggregation Module
# October 13, 2023

In [None]:
from datetime import datetime
# Capture and print the wall-clock time of this run so the saved output records when it was executed.
current_datetime = datetime.now()  # called without a tz argument, so this is a naive local timestamp
print("Current Date and Time:", current_datetime)

Current Date and Time: 2023-09-25 17:30:16.939333


In [3]:
# Install the third-party packages for this notebook into the active environment.
# NOTE(review): no versions are pinned, and python-decouple is not imported in any
# cell visible here — confirm whether it is still needed.
!pip install jax
!pip install jaxlib
!pip install python-decouple

Collecting jax
  Using cached jax-0.3.25-py3-none-any.whl
Collecting opt-einsum (from jax)
  Using cached opt_einsum-3.3.0-py3-none-any.whl (65 kB)
Installing collected packages: opt-einsum, jax
Successfully installed jax-0.3.25 opt-einsum-3.3.0
Collecting jaxlib
  Using cached jaxlib-0.3.25-cp37-cp37m-manylinux2014_x86_64.whl (71.2 MB)
Installing collected packages: jaxlib
Successfully installed jaxlib-0.3.25
Collecting python-decouple
  Using cached python_decouple-3.8-py3-none-any.whl (9.9 kB)
Installing collected packages: python-decouple
Successfully installed python-decouple-3.8


Load the latest user and guideline data from Energize.

In [4]:
import os

import numpy as np
import pandas as pd
import requests

# ENV selects the backend: "PROD" targets the production API, any other value the dev API.
ENV = "PROD"
BASE = "https://dev.api.energize.ai/api/openapi" if ENV != "PROD" else "https://api.energize.ai/api/openapi"

# Prefer a key supplied through the ENERGIZE_API_KEY environment variable so the
# secret never has to be saved inside the notebook; the original placeholder is
# kept as the fallback, so editing the literal here still works.
API_KEY = os.environ.get("ENERGIZE_API_KEY", "add energize api key")

headers = {
    "authorization": "Bearer " + API_KEY,
    "Content-Type": "application/json"
}

# Sections of the export to request from the API.
params = {'topics':True, 'guidelines': True, 'ratings': True, 'users': True, 'mu': True }

# Make the request
export_response = requests.get(
    f"{BASE}/exports",
    headers=headers,
    params=params,
    timeout=60,  # do not hang forever if the server stops responding
)
# Surface HTTP errors (bad key, server failure) here instead of as a confusing
# KeyError / JSON error further down.
export_response.raise_for_status()
data = export_response.json()

# "mu" is pulled out of the payload before the rest is tabulated; it is the
# starting value for the global intercept used by the training cell.
mu = data.pop("mu")
# One row per remaining top-level key of the export.
full_df = pd.DataFrame.from_dict(data, orient = "index")


The loss function is based on the Twitter Community Notes rating algorithm. You can read their open-sourced documentation [here](https://communitynotes.twitter.com/guide/en/under-the-hood/ranking-notes).

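For each user/guideline pair, the model predicts whether the user rated the guideline as helpful using the following:

$$\hat{y}_{ug} = \mu + i_u + i_g + f_u \cdot f_g$$

Here $\hat{y}_{ug}$ is the predicted rating of guideline $g$ by user $u$. The model learns three kinds of parameters: embeddings for the users and guidelines ($f_u$, $f_g$), user- and guideline-specific intercepts ($i_u$, $i_g$), and a global intercept ($\mu$).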

In [5]:
import jax
import jax.numpy as jnp
"""
Calculate the loss of a set of parameters given a response matrix.

Inputs:
  mu (float): intercept adjustment across all users
  user_embeddings (array): vector embeddings of each user
  guideline_embeddings (float array): vector embeddings of each guideline, in the same space as user_embeddings
  user_intercepts (array): user-specific intercept adjustment
  guideline_intercepts (array): guideline-specific intercept adjustment
  response_matrix (array): sparse matrix, with each row corresponding to a user and each column to a guideline.
                           consists of 0,1,and nan corresponding to disliked, liked, and not rated.
  lambda_i (float): regularization weighing of the intercepts
  lambda_f (float): regularization weighing of the embeddings.
Outputs:
  (float) loss given parameters
"""
def cn_loss(mu,user_embeddings,guideline_embeddings,user_intercepts,guideline_intercepts, response_matrix,
            lambda_i = .15, lambda_f = .03):
    ypred = mu+ user_embeddings @ (guideline_embeddings.T) + (user_intercepts.reshape(-1,1) + guideline_intercepts) # calculated the predicted responses
    answered = 1-jnp.isnan(response_matrix) # mask array to determine if a user rated a guideline

    squared_error = jnp.sum(((jnp.nan_to_num(response_matrix) - ypred)* answered )**2 ) # calculate mse then mask using answered before summing
    user_frequences = jnp.sum(answered, axis=1) # number of responses each user made
    guideline_frequences = jnp.sum(answered, axis=0)  #number of ratings each guideline has

    # calculate l2 regularization on the intercepts using frequencies and on mu using answered
    intercept_regularization = jnp.sum(user_frequences * (user_intercepts **2)) + jnp.sum((answered * mu)**2) + jnp.sum(guideline_frequences * (guideline_intercepts **2))
    # calculate l1 regularization on the embeddings frequencies
    vector_regularization = jnp.sum(jnp.abs(user_embeddings) * user_frequences.reshape((-1,1)) ) + jnp.sum(jnp.abs(guideline_embeddings) * guideline_frequences.reshape((-1,1)))

    #loss: (mse + regularization terms weighted by lambdas) / total number of responses
    return (squared_error + lambda_i * intercept_regularization + lambda_f * vector_regularization) / jnp.sum(answered)


In [6]:
from tqdm import tqdm


"""
Learn parameters and embeddings from scratch based on a loss function until convergence or an iteration cap.

Inputs:
  loss (function): loss function to use to train
  data (array): sparse response matrix of response data
  iteration_cap (int): maximum number of gradient descent iterations
  pbar (bool): whether to show a tqdm progress bar while training.
  convergence_threshold (float): upper bound on change in loss between iterations for convergence
  ndim (int): embedding dimensionality.
Output:
  (dict) dictionary with the learned parameters.
"""

def train_model(loss, data, iteration_cap=10000, pbar = False, lr = .1, convergence_threshold= 10e-9, ndim = 1):
    mu = 0.0
    user_embeddings = np.random.random((data.shape[0],ndim))
    guideline_embeddings = np.random.random((data.shape[1],ndim))
    user_intercepts = np.random.random(data.shape[0])
    guideline_intercepts = np.random.random(data.shape[1])
    losses = [] # we append losses to this through the training process

    # use jax to produce a gradient of the loss function. Pass argnums to not also differentiate over the data.
    loss_gradient = jax.grad(loss, argnums=range(0,5))

    #we define generator for the training loop
    def generator():
        while (len(losses)  < iteration_cap) and (len(losses) < 100 or not np.isclose(losses[-1], losses[-2], atol = convergence_threshold)):
            yield
    iterator = generator()
    if pbar: iterator = tqdm(iterator) #if pbar is passed as a parameter then tqdm the generator


    for _ in iterator:
        # calculate loss then the gradient of that loss
        losses.append(loss(mu,user_embeddings, guideline_embeddings, user_intercepts, guideline_intercepts, data))
        lg = loss_gradient(mu,user_embeddings, guideline_embeddings, user_intercepts, guideline_intercepts, data)

        # gradient descent
        mu -= (lg[0] * lr)
        user_embeddings -= (lg[1] * lr)
        guideline_embeddings -= (lg[2] * lr)
        user_intercepts -= (lg[3] * lr)
        guideline_intercepts -= (lg[4] * lr)

    return {"mu": mu,
            "user_embeddings": user_embeddings,
            "guideline_embeddings": guideline_embeddings,
            "user_intercepts": user_intercepts,
            "guideline_intercepts": guideline_intercepts,
            "training_losses": losses
            }
"""
Learn parameters and embeddings from prexisting parameters and embeddings based on a loss function until convergence or an iteration cap.

Inputs:
  loss (function): loss function to use to train
  mu (float): previous mu
  response_data (DataFrame): user by guideline response matrix
  guideline_data (DataFrame): dataframe with prior guideline embeddings and intercepts
  user_data (DataFrame): dataframe with prior user embeddings and intercepts
  iteration_cap (int): maximum number of gradient descent iterations
  pbar (bool): whether to show a tqdm progress bar while training.
  convergence_threshold (float): upper bound on change in loss between iterations for convergence
  ndim (int): embedding dimensionality.
Output:
  (dict) dictionary with the learned parameters.
"""

def train_model_initialized(loss, mu, response_data, guideline_data, user_data,
                            iteration_cap=10000, pbar = False, lr = .1, convergence_threshold= 10e-9, ndim = 1):
    # clean data
    user_data["embeddings"] = user_data["embeddings"].apply(lambda x: np.nan if isinstance(x, list) and not x else x)
    user_data = user_data.dropna()
    guideline_data["embeddings"] = guideline_data["embeddings"].apply(lambda x: np.nan if isinstance(x, list) and not x else x)
    guideline_data = guideline_data.dropna()
    # get ids that show up in the response matrix
    data = response_data.values
    author_ids = ratings_to_response_matrix(response_df, asArray = False).index
    guideline_ids = ratings_to_response_matrix(response_df, asArray = False).columns

    # these one-liners use the previous embedding if provided or randomly initialize if not
    user_embeddings = np.array([np.array(user_data[user_data["id"] == a]["embeddings"])[0] if a in user_data["id"].tolist() else np.random.random((ndim)) for a in author_ids])
    guideline_embeddings = np.array([np.array(guideline_data[guideline_data["id"] == a]["embeddings"])[0] if a in guideline_data["id"].tolist() else np.random.random((ndim)) for a in guideline_ids])
    user_intercepts = np.array([np.array(user_data[user_data["id"] == a]["intercept"])[0] if a in user_data["id"].tolist() else np.random.random() for a in author_ids])
    guideline_intercepts = np.array([np.array(guideline_data[guideline_data["id"] == a]["intercept"])[0] if a in guideline_data["id"].tolist() else np.random.random() for a in guideline_ids])

    losses = [] # we append losses to this through the training process

    # use jax to produce a gradient of the loss function. Pass argnums to not also differentiate over the data.
    loss_gradient = jax.grad(loss, argnums=range(0,5))

    #we define generator for the training loop
    def generator():
        while (len(losses)  < iteration_cap) and (len(losses) < 100 or not np.isclose(losses[-1], losses[-2], atol = convergence_threshold)):
            yield
    iterator = generator()
    if pbar: iterator = tqdm(iterator)


    for _ in iterator: # training loop
        losses.append(loss(mu,user_embeddings, guideline_embeddings, user_intercepts, guideline_intercepts, data)) # calcuate loss
        lg = loss_gradient(mu,user_embeddings, guideline_embeddings, user_intercepts, guideline_intercepts, data) # gradient of loss

        # gradient descent
        mu -= (lg[0] * lr)
        user_embeddings -= (lg[1] * lr)
        guideline_embeddings -= (lg[2] * lr)
        user_intercepts -= (lg[3] * lr)
        guideline_intercepts -= (lg[4] * lr)

    return {"mu": mu,
            "user_embeddings": dict(map(lambda i,j : (i,j.tolist()) , author_ids,user_embeddings)),
            "guideline_embeddings": dict(map(lambda i,j : (i,j.tolist()) , guideline_ids, guideline_embeddings)),
            "user_intercepts": dict(map(lambda i,j : (i,float(j)) , author_ids,user_intercepts)),
            "guideline_intercepts": dict(map(lambda i,j : (i,float(j)) , guideline_ids,guideline_intercepts)),
            "training_losses": losses
            }

In [7]:
"""
Helper functions for working with Energize data.
"""

"""
Loads a dataframe of responses and produces a sparse matrix.
Inputs:
  df (Dataframe): response df generated from Energize data.
  asArray (Bool): whether the output should be returned as an array or dataframe.
                 defaults True, corresponding to array output.
Output:
  Depending on asArray: Either sparse array or sparse dataframe of the response matrix.
"""
def ratings_to_response_matrix(df, asArray = True):
    df["nrating"] = (df["rating"]=="helpful").astype(float)
    out =  df[["nrating", "guidelineId", "authorId"]].pivot(columns = "guidelineId", index= "authorId")
    if asArray:
        return out.values
    else:
        return out["nrating"]

"""
Loads a dataframe of guidelines and returns a dictionary mapping guideline ids to guideline texts.
Input:
  df (DataFrame): guideline df generated from Energize data.
Output:
  (dict) guidelineId-guideline mapping.
"""
def make_guideline_map(df):
    d = dict()
    df.apply(lambda x: d.update({x["id"]:x["value"]}), axis=1)
    return d

def get_guideline_data(df):
  """Unfinished helper: evaluates a guidelines expression and returns None.

  NOTE(review): the `df` argument is never read; the body uses the module-level
  `full_df` instead, stores the result in a local that is not used afterwards,
  and the bare `return` yields None. The lambda also indexes with "embedding",
  whereas other cells in this notebook select an "embeddings" column — confirm
  the intended key and return value, or remove this function if it is dead code.
  """
  guideline_embeddings = full_df.loc["guidelines"].dropna().apply(lambda x: x[["embedding", "intercept"]])
  return




In [8]:
# Ratings section of the export -> one row per rating record (null entries dropped first).
response_df = pd.DataFrame.from_records(full_df.loc["ratings"].dropna())
# Previously stored guideline / user parameters; only id, embeddings and intercept are kept.
guideline_df = pd.DataFrame.from_records(full_df.loc["guidelines"].dropna())[["id", "embeddings", "intercept"]]
user_df = pd.DataFrame.from_records(full_df.loc["users"].dropna())[["id", "embeddings", "intercept"]]
# authorId x guidelineId DataFrame of 1.0 ("helpful") / 0.0 (any other rating).
response_matrix = ratings_to_response_matrix(response_df, asArray = False)

# Fit the model with cn_loss, starting from the exported mu and the stored parameters.
training_result = train_model_initialized(cn_loss,  mu, response_matrix, guideline_df, user_df, pbar = True)


100it [00:09, 10.82it/s]


In [9]:
"""
Using the data from the model training, produce a constitution.

Inputs:
  training_result (dict): dictionary produced from the train_model function, including all embeddings and intercepts.
  response_df (DataFrame): response df generated from Energize data.
  guideline_df (DataFrame): response df generated from Energize data.
  response_matrix (DataFrame): user-guideline response matrix
  threshold (float): lower bound on the intercept for a guideline to make it to the constitution.
  min_ratings (int): lower bound on number of ratings for a response to be accepted
  consensus_min (float): lower bound of raw support for acceptance
Output (dict): dictionary containing accepted guideline ids and the corresponding consensus scores
"""

def produce_constitution (training_result, response_df, guideline_df, response_matrix, threshold = .375, min_ratings = 5, consensus_min= .8):
    # find subset which will be part of constitution
    accepted = {key:training_result["guideline_intercepts"][key] for key in training_result["guideline_intercepts"] if training_result["guideline_intercepts"][key] > threshold}

    df=pd.DataFrame(np.sum((~np.isnan(response_matrix)).astype(int), axis=0))
    df["raw_rates"] =  np.apply_along_axis(lambda x: np.mean(x[~np.isnan(x)]), 0, response_matrix)
    df = df.loc[(df[0] >= min_ratings) * (df["raw_rates"] >= consensus_min)] # only accept guidelines with at least min_ratings responses and <= 80% support
    df = df.loc[[i in accepted.keys() for i in df.index]] # subset by accepted
    df["intercepts"] = [accepted[id] for id in df.index] # fill intercepts
    df["consensus_score"] = (1.1-df["raw_rates"]) * df["intercepts"] + df["raw_rates"] - .1

    accepted_consensus_scores = df["consensus_score"].to_dict()
    accepted_ids = list(accepted_consensus_scores.keys())

    return {"accepted_ids": accepted_ids,
            "accepted_consensus_scores": accepted_consensus_scores,
    }

# Frame passed below as the `guideline_df` argument; produce_constitution accepts that
# argument but its body does not read it.
# NOTE(review): this hands from_records the per-guideline "value" entries rather than
# whole records — confirm the resulting frame is what was intended.
guideline_map_df = pd.DataFrame.from_records(full_df.loc["guidelines"].dropna().apply(lambda x: x["value"]))
# Select the guidelines that make it into the constitution, with their consensus scores.
constitution = produce_constitution(training_result, response_df, guideline_map_df, response_matrix)

In [10]:
# Payload for the constitution endpoint: accepted guidelines plus every learned parameter.
# NOTE: this rebinds `data`, which an earlier cell used for the raw API export.
data = {
            "accepted_guideline_ids": constitution["accepted_ids"],
            "guideline_intercepts": training_result["guideline_intercepts"],
            "guideline_embeddings": training_result["guideline_embeddings"],
            "user_embeddings": training_result["user_embeddings"],
            "user_intercepts": training_result["user_intercepts"],
            "guideline_scores": constitution["accepted_consensus_scores"],
            # presumably mu is array-valued after training, hence .tolist() — TODO confirm it
            # is never still a plain float here (e.g. if no training step ran)
            "mu": training_result["mu"].tolist()
          }

In [11]:
def post_to_public(constitution, training_result, data, timeout=60):
    """
    POST the aggregation results to the `{BASE}/constitution` endpoint.

    Inputs:
      constitution (dict): accepted for call compatibility; not read by this function.
      training_result (dict): accepted for call compatibility; not read by this function.
      data (dict): JSON-serializable payload sent as the request body.
      timeout (float): seconds to wait for the server before giving up, so a stalled
                       connection cannot hang the notebook indefinitely.
    Output:
      (requests.Response) the raw response; the caller decides how to handle its status.
    """
    headers = {
        "authorization": "Bearer " + API_KEY,
        "Content-Type": "application/json"
    }
    url = f"{BASE}/constitution"
    return requests.post(url, json=data, headers=headers, timeout=timeout)

# Publish the payload; as the cell's last expression, the decoded JSON reply is displayed.
post_to_public(constitution, training_result, data).json()

{'success': True}

In [None]:
# Update demographic data (optional)
# This cell lists Prolific studies; later code in the cell exports each one and uploads the result.

# Placeholder: replace with a real Prolific API token before running.
prolific_api_key = "add prolific api key"

# get all prolific study ids

base_url = "https://api.prolific.co/api/v1/studies/"
# NOTE: rebinds `params` and `headers`, which earlier cells defined for the Energize API.
params = {"state": "(active|completed)"}

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Token {prolific_api_key}"
}

response = requests.get(base_url, params=params, headers=headers)

if response.status_code == 200:
    study_data = response.json()
    # NOTE(review): only the 'results' of this single response are read; if the
    # endpoint paginates, later pages are not fetched — confirm.
    study_ids = [study['id'] for study in study_data['results']]
else:
    # On any non-200 status, report it and continue with no studies.
    print(f"Error: {response.status_code}")
    study_ids = []

print(study_ids)

import csv
from io import StringIO

def csv_to_json(csv_content: str):
    """Parse CSV text into a list of dicts, one per data row, keyed by the header row."""
    rows = []
    for record in csv.DictReader(StringIO(csv_content)):
        rows.append(dict(record))
    return rows

# Maps each study id to its exported rows (a list of dicts, one per CSV row).
study_id_to_demo_data = dict()

for study_id in study_ids:
  demo_url = f"https://api.prolific.co/api/v1/studies/{study_id}/export/"

  print(demo_url)

  headers = {
      "Content-Type": "text/csv",
      "Authorization": f"Token {prolific_api_key}"
  }

  response = requests.get(demo_url, headers=headers, stream=True)

  if response.status_code == 200:
      # Gather the raw bytes first and decode once: decoding each 8 KiB chunk on its
      # own raises UnicodeDecodeError whenever a multi-byte UTF-8 character happens
      # to straddle a chunk boundary.
      demo_data = b''.join(response.iter_content(chunk_size=8192)).decode('utf-8')

      parsed_data = csv_to_json(demo_data)

      study_id_to_demo_data[study_id] = parsed_data

  else:
      print(response.text)
      print(f"Error: {response.status_code}")

      # Stop at the first failed export; studies fetched so far stay in the dict.
      break


# Define the API endpoint and headers
# (back to the Energize API: `headers` is rebound from the Prolific token to the Energize bearer key)
headers = {
    "authorization": "Bearer " + API_KEY,
    "Content-Type": "application/json"
}
url = f"{BASE}/prolific/demographics"

# NOTE: rebinds `data`. If the export loop above stopped early on an error, this
# still uploads whatever studies had been collected up to that point.
data = {
    'studyIdToDemoData': study_id_to_demo_data,
}

# Send the POST request
response = requests.post(url, json=data, headers=headers)

# Handle the response: only the status code is reported; the body is not inspected.
if response.status_code == 200:
    print("Successfully sent data!")
    # print(response.json())  # Assuming the response is a JSON object
else:
    print(f"Failed to send data. Status code: {response.status_code}")
    # print(response.text)