retail/recommendation-system/bqml-scann/sql_scripts/sp_ComputePMI.sql (85 lines of code) (raw):

CREATE OR REPLACE PROCEDURE @DATASET_NAME.sp_ComputePMI( IN min_item_frequency INT64, IN max_group_size INT64 ) BEGIN DECLARE total INT64; # Get items with minimum frequency CREATE OR REPLACE TABLE @DATASET_NAME.valid_item_groups AS # Create valid item set WITH valid_items AS ( SELECT item_Id, COUNT(group_Id) AS item_frequency FROM @DATASET_NAME.vw_item_groups GROUP BY item_Id HAVING item_frequency >= min_item_frequency ), # Create valid group set valid_groups AS ( SELECT group_Id, COUNT(item_Id) AS group_size FROM @DATASET_NAME.vw_item_groups WHERE item_Id IN (SELECT item_Id FROM valid_items) GROUP BY group_Id HAVING group_size BETWEEN 2 AND max_group_size ) SELECT item_Id, group_Id FROM @DATASET_NAME.vw_item_groups WHERE item_Id IN (SELECT item_Id FROM valid_items) AND group_Id IN (SELECT group_Id FROM valid_groups); # Compute pairwise cooc CREATE OR REPLACE TABLE @DATASET_NAME.item_cooc AS SELECT item1_Id, item2_Id, SUM(cooc) AS cooc FROM ( SELECT a.item_Id item1_Id, b.item_Id item2_Id, 1 as cooc FROM @DATASET_NAME.valid_item_groups a JOIN @DATASET_NAME.valid_item_groups b ON a.group_Id = b.group_Id AND a.item_Id < b.item_Id ) GROUP BY item1_Id, item2_Id; ################################### # Compute item frequencies CREATE OR REPLACE TABLE @DATASET_NAME.item_frequency AS SELECT item_Id, COUNT(group_Id) AS frequency FROM @DATASET_NAME.valid_item_groups GROUP BY item_Id; ################################### # Compute total frequency |D| SET total = ( SELECT SUM(frequency) AS total FROM @DATASET_NAME.item_frequency ); ################################### # Add mirror item-pair cooc and same item frequency as cooc CREATE OR REPLACE TABLE @DATASET_NAME.item_cooc AS SELECT item1_Id, item2_Id, cooc FROM @DATASET_NAME.item_cooc UNION ALL SELECT item2_Id as item1_Id, item1_Id AS item2_Id, cooc FROM @DATASET_NAME.item_cooc UNION ALL SELECT item_Id as item1_Id, item_Id AS item2_Id, frequency as cooc FROM @DATASET_NAME.item_frequency; ################################### # Compute PMI CREATE OR REPLACE TABLE @DATASET_NAME.item_cooc AS SELECT a.item1_Id, a.item2_Id, a.cooc, LOG(a.cooc, 2) - LOG(b.frequency, 2) - LOG(c.frequency, 2) + LOG(total, 2) AS pmi FROM @DATASET_NAME.item_cooc a JOIN @DATASET_NAME.item_frequency b ON a.item1_Id = b.item_Id JOIN @DATASET_NAME.item_frequency c ON a.item2_Id = c.item_Id; END