aoo-stats/detail-by-day.py (82 lines of code) (raw):

################################################################
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
################################################################

# This script queries the SourceForge REST API for download statistics for
# sets of files on SourceForge, for a range of dates, in ISO format (YYYY-MM-DD)
# passed in as command line arguments.  The data, in CSV format, is written to
# stdout; progress and error messages are written to stderr.

import datetime
import json
import re
import sys
import urllib

try:
    # Python 2 layout.
    from urllib import urlencode
    from urllib import urlopen
except ImportError:
    # Python 3 moved these into submodules.
    from urllib.parse import urlencode
    from urllib.request import urlopen

# Number of times a single stats request is tried before giving up.
MAX_ATTEMPTS = 3

# Columns named "count_<digits>" track exactly one release each.
_VERSION_COLUMN = re.compile(r"count_\d+$")


def _log(message):
    """Write one diagnostic line to stderr, keeping stdout pure CSV."""
    sys.stderr.write(message + "\n")


def getSourceForgeStats(download, startDate, endDate):
    """Fetch the raw JSON statistics for one file over a date range.

    download  -- URL of the file; "/stats/json" is appended to it.
    startDate -- first day of the range, as a YYYY-MM-DD string.
    endDate   -- last day of the range, as a YYYY-MM-DD string.

    Returns the response body exactly as read from the connection, or ""
    when all MAX_ATTEMPTS attempts failed.  Failures are reported on stderr
    and never raised.
    """
    url = download + "/stats/json?start_date=" + startDate + "&end_date=" + endDate
    attempts = 0
    while attempts < MAX_ATTEMPTS:
        try:
            conn = urlopen(url)
            try:
                return conn.read()
            finally:
                # Release the connection even when read() fails.
                conn.close()
        except Exception:
            # Best effort: any failure of this request is logged and retried.
            # Unlike a bare "except:", Ctrl-C still interrupts the script.
            attempts += 1
            _log(url)
            _log("error " + download + "(" + str(attempts) + ")")
    return ""


# columns of interest
columns = [
    "count_total",
    "count_340", "count_341", "count_400", "count_401",
    "count_410", "count_411", "count_412", "count_413", "count_414",
    "count_415", "count_416", "count_417", "count_418", "count_419",
    "count_4110", "count_4111", "count_4112", "count_4113", "count_4114",
    "count_4115",
    "windows", "mac", "linux", "linux32", "linux64", "deb", "rpm",
    "ar", "ast", "bg", "ca", "ca-XR", "ca-XV", "cs", "da", "de", "el",
    "en-GB", "en-US", "es", "eu", "fi", "fr", "gd", "gl", "he", "hi",
    "hu", "it", "ja", "km", "ko", "lt", "nb", "nl", "pl", "pt", "pt-BR",
    "ru", "sk", "sl", "sr", "sv", "ta", "th", "tr", "vi", "zh-CN", "zh-TW"]

# Column counters are updated if the download name contains a matching pattern.
# The dictionary below maps the column names to these search patterns.
# If there is no entry for a column then the pattern for language columns is assumed.
patternDict = {
    "count_total": "",
    "count_340": "3.4.0",
    "count_341": "3.4.1",
    "count_400": "4.0.0",
    "count_401": "4.0.1",
    "count_410": "4.1.0",
    "count_411": "4.1.1",
    "count_412": "4.1.2",
    "count_413": "4.1.3",
    "count_414": "4.1.4",
    "count_415": "4.1.5",
    "count_416": "4.1.6",
    "count_417": "4.1.7",
    "count_418": "4.1.8",
    "count_419": "4.1.9",
    "count_4110": "4.1.10",
    "count_4111": "4.1.11",
    "count_4112": "4.1.12",
    "count_4113": "4.1.13",
    "count_4114": "4.1.14",
    "count_4115": "4.1.15",
    "windows": "Win_x86",
    "mac": "MacOS",
    "linux": "Linux",
    "linux32": "Linux_x86_",
    "linux64": "Linux_x86-64_",
    "deb": "install-deb_",
    "rpm": "install-rpm_"
}


def column_matches(column, download):
    """Return True when the download URL should be counted in the column.

    Columns without an entry in patternDict use the language pattern
    "_<column>.".  Version columns must match the whole version number:
    a plain substring search would find "4.1.1" inside "4.1.10" and count
    4.1.10 - 4.1.15 downloads under count_411 as well.
    """
    pattern = patternDict.get(column, "_%s." % column)
    if _VERSION_COLUMN.match(column):
        whole_version = r"(?<!\d)" + re.escape(pattern) + r"(?!\d)"
        return re.search(whole_version, download) is not None
    return pattern in download


def add_download(counts, download, day_count):
    """Add day_count to every entry of counts whose column matches download."""
    for column in columns:
        if column_matches(column, download):
            counts[column] += day_count


def parse_total(raw):
    """Extract the "total" value from a raw stats reply.

    Returns None when the reply is empty, is not valid JSON, or has no
    usable "total" entry, so the caller can skip that download instead of
    aborting the whole run.
    """
    try:
        if isinstance(raw, bytes):
            # Decode explicitly; older Python 3 json.loads rejects bytes.
            raw = raw.decode("utf-8")
        return json.loads(raw)["total"]
    except (ValueError, KeyError, TypeError):
        return None


def read_downloads(path):
    """Read the list of file URLs, one per line, ignoring blank lines."""
    with open(path) as listing:
        stripped = (line.strip() for line in listing)
        return [url for url in stripped if url]


def date_range(first, last):
    """Yield every day from first to last, both inclusive."""
    one_day = datetime.timedelta(days=1)
    current = first
    while current <= last:
        yield current
        current += one_day


def csv_header():
    """Return the quoted CSV header line."""
    return '"date","' + '","'.join(columns) + '"'


def csv_row(date_string, counts):
    """Return one CSV data line for the given day."""
    return date_string + ',' + ','.join([str(counts[c]) for c in columns])


def main(argv=None):
    """Run the script; returns the process exit status.

    argv -- argument vector including the program name; defaults to sys.argv.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 4:
        # Usage goes to stderr so it can never be mistaken for CSV output.
        _log("syntax: python detail-by-day.py <urls.lst> <start-date> <end-date>")
        _log("where <urls.lst> is a list of files URLs to gather stats on, and <start-date> and <end-date> are in YYYY-MM-DD format.")
        return -1

    downloads = read_downloads(argv[1])
    start_date = datetime.datetime.strptime(argv[2], '%Y-%m-%d')
    end_date = datetime.datetime.strptime(argv[3], '%Y-%m-%d')

    print(csv_header())

    for day in date_range(start_date, end_date):
        date_string = day.strftime("%Y-%m-%d")
        _log(date_string)  # progress indicator

        counts = dict.fromkeys(columns, 0)
        for download in downloads:
            # One request per file per day: start and end date are the same.
            day_count = parse_total(
                getSourceForgeStats(download, date_string, date_string))
            if day_count is None:
                continue
            add_download(counts, download, day_count)

        print(csv_row(date_string, counts))
    return 0


if __name__ == "__main__":
    sys.exit(main())