perf_dashboard/posts_stats.py (40 lines of code) (raw):

# Copyright 2017 Google Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """A script to collect the number of StackOverflow posts related to Python and Google Cloud Platform.""" import datetime import os import sys import time import uuid from collections import Counter from google.cloud import bigquery import bq_utils GCLOUD_PROJECT_ENV = 'GCLOUD_PROJECT' DATASET_NAME = 'stackoverflow' TAG_COUNT_TABLE_NAME = 'tag_count_timestamp' UNANSWERED_POSTS_TABLE_NAME = 'unanswered_posts' def get_stackoverflow_tags_count(): """Get all the tags contains python and cloud key words""" query = """ SELECT SPLIT(tags, '|') tags FROM `bigquery-public-data.stackoverflow.posts_questions` WHERE tags LIKE '%python%' AND (tags LIKE '%google-cloud-platform%' OR tags LIKE '%gcp%') """ results = bq_utils.execute_query(query) rows = [row[0] for row in results] return rows def get_posts_list_unanswered(): # Get the list of posts that are unanswered query = """ SELECT id, title, tags FROM `bigquery-public-data.stackoverflow.posts_questions` WHERE tags LIKE '%python%' AND (tags LIKE '%google-cloud-platform%' OR tags LIKE '%gcp%') AND accepted_answer_id is NULL AND answer_count = 0; """ results = bq_utils.execute_query(query) # Add current timestamp to the rows date_time = datetime.datetime.now() rows = [(date_time,) + row for row in results] return rows def count_unique_tags(data): flattened_tag_list = [tag for tag_list in data for tag in tag_list] tag_count = Counter(flattened_tag_list) # Add current timestamp to the rows date_time = datetime.datetime.now() time_tag_count = [(date_time,) + item for item in tag_count.items()] return time_tag_count def main(): project = os.environ.get(GCLOUD_PROJECT_ENV) # Get the posts count for each tag rows = get_stackoverflow_tags_count() tag_count = count_unique_tags(rows) bq_utils.insert_rows( project, DATASET_NAME, TAG_COUNT_TABLE_NAME, tag_count) # Get the list of unanswered posts unanswered_posts = get_posts_list_unanswered() bq_utils.insert_rows( project, DATASET_NAME, UNANSWERED_POSTS_TABLE_NAME, unanswered_posts) if __name__ == '__main__': main()