filtering_notebooks/blindspots.ipynb (2,462 lines of code) (raw):
{
"cells": [
{
"cell_type": "code",
"execution_count": 50,
"id": "b25e0643",
"metadata": {},
"outputs": [],
"source": [
"import jsonlines\n",
"import pandas as pd\n",
"import pickle\n",
"import pprint\n",
"\n",
"from collections import Counter\n",
"from datasets import load_from_disk\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "33879ae4",
"metadata": {},
"outputs": [],
"source": [
"pp = pprint.PrettyPrinter(indent=2)\n",
"pd.set_option(\"display.max_rows\", 500)"
]
},
{
"cell_type": "code",
"execution_count": 74,
"id": "4bcbc961",
"metadata": {},
"outputs": [],
"source": [
"def get_domain(uri):\n",
" return uri.split(\"/\")[2]\n",
"\n",
"\n",
"def get_df(domains_count):\n",
" domains = []\n",
" counts = []\n",
" for domain, count in domains_count.most_common():\n",
" domains.append(domain)\n",
" counts.append(count)\n",
"\n",
" total = sum(counts)\n",
" ratio = [(100 * i) / total for i in counts]\n",
" return pd.DataFrame.from_dict({\"domain\": domains, \"count\": counts, \"ratio\": ratio})"
]
},
{
"cell_type": "code",
"execution_count": 75,
"id": "3c7a42e7",
"metadata": {},
"outputs": [],
"source": [
"with open(\"/home/piktus_huggingface_co/1tt/data/wiki_domains.pkl\", \"rb\") as handle:\n",
" wiki_domains = pickle.load(handle)"
]
},
{
"cell_type": "code",
"execution_count": 97,
"id": "931edb4c",
"metadata": {},
"outputs": [],
"source": [
"wiki_df = get_df(wiki_domains)\n",
"wiki_df = wiki_df[:100]"
]
},
{
"cell_type": "code",
"execution_count": 77,
"id": "075cb7ec",
"metadata": {},
"outputs": [],
"source": [
"with open(\"/home/piktus_huggingface_co/1tt/data/oscar_domains.pkl\", \"rb\") as handle:\n",
" oscar_domains = pickle.load(handle)"
]
},
{
"cell_type": "code",
"execution_count": 78,
"id": "6cc1109e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>domain</th>\n",
" <th>count</th>\n",
" <th>ratio</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>pubmed.ncbi.nlm.nih.gov</td>\n",
" <td>164199</td>\n",
" <td>0.038010</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>www.theguardian.com</td>\n",
" <td>102966</td>\n",
" <td>0.023835</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>unistore.www.microsoft.com</td>\n",
" <td>67582</td>\n",
" <td>0.015644</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>us.vestiairecollective.com</td>\n",
" <td>64876</td>\n",
" <td>0.015018</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>imgur.com</td>\n",
" <td>62244</td>\n",
" <td>0.014409</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>www.reuters.com</td>\n",
" <td>61665</td>\n",
" <td>0.014275</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>espas.secure.europarl.europa.eu</td>\n",
" <td>60999</td>\n",
" <td>0.014120</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>www.forbes.com</td>\n",
" <td>58727</td>\n",
" <td>0.013594</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" domain count ratio\n",
"0 pubmed.ncbi.nlm.nih.gov 164199 0.038010\n",
"1 www.theguardian.com 102966 0.023835\n",
"2 unistore.www.microsoft.com 67582 0.015644\n",
"3 us.vestiairecollective.com 64876 0.015018\n",
"4 imgur.com 62244 0.014409\n",
"5 www.reuters.com 61665 0.014275\n",
"6 espas.secure.europarl.europa.eu 60999 0.014120\n",
"7 www.forbes.com 58727 0.013594"
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"oscar_df = get_df(oscar_domains)\n",
"oscar_df[:8]"
]
},
{
"cell_type": "code",
"execution_count": 98,
"id": "4131dc3a",
"metadata": {},
"outputs": [],
"source": [
"both = wiki_df.join(\n",
" oscar_df.set_index(\"domain\"),\n",
" on=\"domain\",\n",
" how=\"left\",\n",
" lsuffix=\"_wiki\",\n",
" rsuffix=\"_oscar\",\n",
").fillna(0)"
]
},
{
"cell_type": "code",
"execution_count": 99,
"id": "290bf0cd",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>domain</th>\n",
" <th>count_wiki</th>\n",
" <th>ratio_wiki</th>\n",
" <th>count_oscar</th>\n",
" <th>ratio_oscar</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>4321781</td>\n",
" <td>14.761861</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>books.google.com</td>\n",
" <td>923239</td>\n",
" <td>3.153498</td>\n",
" <td>11530.0</td>\n",
" <td>2.669027e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>archive.org</td>\n",
" <td>422327</td>\n",
" <td>1.442538</td>\n",
" <td>2138.0</td>\n",
" <td>4.949158e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>www.nytimes.com</td>\n",
" <td>286414</td>\n",
" <td>0.978301</td>\n",
" <td>2090.0</td>\n",
" <td>4.838045e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>www.ncbi.nlm.nih.gov</td>\n",
" <td>282416</td>\n",
" <td>0.964645</td>\n",
" <td>1811.0</td>\n",
" <td>4.192201e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>www.bbc.co.uk</td>\n",
" <td>222875</td>\n",
" <td>0.761272</td>\n",
" <td>6305.0</td>\n",
" <td>1.459516e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>www.theguardian.com</td>\n",
" <td>192091</td>\n",
" <td>0.656123</td>\n",
" <td>102966.0</td>\n",
" <td>2.383513e-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>news.bbc.co.uk</td>\n",
" <td>166381</td>\n",
" <td>0.568306</td>\n",
" <td>41252.0</td>\n",
" <td>9.549236e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>www.billboard.com</td>\n",
" <td>165065</td>\n",
" <td>0.563811</td>\n",
" <td>3915.0</td>\n",
" <td>9.062654e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>www.census.gov</td>\n",
" <td>161907</td>\n",
" <td>0.553024</td>\n",
" <td>16765.0</td>\n",
" <td>3.880853e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>www.youtube.com</td>\n",
" <td>149568</td>\n",
" <td>0.510878</td>\n",
" <td>35.0</td>\n",
" <td>8.101990e-06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>news.google.com</td>\n",
" <td>127045</td>\n",
" <td>0.433946</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>www.newspapers.com</td>\n",
" <td>126038</td>\n",
" <td>0.430507</td>\n",
" <td>9259.0</td>\n",
" <td>2.143323e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>www.allmusic.com</td>\n",
" <td>99734</td>\n",
" <td>0.340660</td>\n",
" <td>3159.0</td>\n",
" <td>7.312624e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>nla.gov.au</td>\n",
" <td>84899</td>\n",
" <td>0.289989</td>\n",
" <td>151.0</td>\n",
" <td>3.495430e-05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>www.washingtonpost.com</td>\n",
" <td>82463</td>\n",
" <td>0.281668</td>\n",
" <td>14755.0</td>\n",
" <td>3.415567e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>www.telegraph.co.uk</td>\n",
" <td>82355</td>\n",
" <td>0.281299</td>\n",
" <td>37889.0</td>\n",
" <td>8.770751e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>www.espncricinfo.com</td>\n",
" <td>64787</td>\n",
" <td>0.221292</td>\n",
" <td>9020.0</td>\n",
" <td>2.087998e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>www.imdb.com</td>\n",
" <td>64419</td>\n",
" <td>0.220035</td>\n",
" <td>4381.0</td>\n",
" <td>1.014138e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>www.independent.co.uk</td>\n",
" <td>62139</td>\n",
" <td>0.212248</td>\n",
" <td>30507.0</td>\n",
" <td>7.061926e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>www.sports-reference.com</td>\n",
" <td>59964</td>\n",
" <td>0.204818</td>\n",
" <td>57.0</td>\n",
" <td>1.319467e-05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>itunes.apple.com</td>\n",
" <td>59719</td>\n",
" <td>0.203982</td>\n",
" <td>8590.0</td>\n",
" <td>1.988460e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>tvbythenumbers.zap2it.com</td>\n",
" <td>58393</td>\n",
" <td>0.199452</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>timesofindia.indiatimes.com</td>\n",
" <td>57124</td>\n",
" <td>0.195118</td>\n",
" <td>3485.0</td>\n",
" <td>8.067267e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>www.baseball-reference.com</td>\n",
" <td>54084</td>\n",
" <td>0.184734</td>\n",
" <td>1767.0</td>\n",
" <td>4.090347e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>www.bbc.com</td>\n",
" <td>53718</td>\n",
" <td>0.183484</td>\n",
" <td>5123.0</td>\n",
" <td>1.185900e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>www.stat.gov.pl</td>\n",
" <td>51732</td>\n",
" <td>0.176700</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>variety.com</td>\n",
" <td>51519</td>\n",
" <td>0.175973</td>\n",
" <td>1841.0</td>\n",
" <td>4.261646e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>twitter.com</td>\n",
" <td>48608</td>\n",
" <td>0.166030</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>www.animenewsnetwork.com</td>\n",
" <td>48513</td>\n",
" <td>0.165705</td>\n",
" <td>1446.0</td>\n",
" <td>3.347279e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>www.hollywoodreporter.com</td>\n",
" <td>48282</td>\n",
" <td>0.164916</td>\n",
" <td>1651.0</td>\n",
" <td>3.821824e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>www.reuters.com</td>\n",
" <td>46416</td>\n",
" <td>0.158543</td>\n",
" <td>61665.0</td>\n",
" <td>1.427455e-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>www.thehindu.com</td>\n",
" <td>45338</td>\n",
" <td>0.154861</td>\n",
" <td>2269.0</td>\n",
" <td>5.252404e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>cricketarchive.com</td>\n",
" <td>43944</td>\n",
" <td>0.150099</td>\n",
" <td>492.0</td>\n",
" <td>1.138908e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>articles.latimes.com</td>\n",
" <td>43628</td>\n",
" <td>0.149020</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35</th>\n",
" <td>www.discogs.com</td>\n",
" <td>42528</td>\n",
" <td>0.145262</td>\n",
" <td>2377.0</td>\n",
" <td>5.502408e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36</th>\n",
" <td>deadline.com</td>\n",
" <td>41989</td>\n",
" <td>0.143421</td>\n",
" <td>16324.0</td>\n",
" <td>3.778768e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td>www.officialcharts.com</td>\n",
" <td>40620</td>\n",
" <td>0.138745</td>\n",
" <td>127.0</td>\n",
" <td>2.939865e-05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>www.metacritic.com</td>\n",
" <td>40061</td>\n",
" <td>0.136836</td>\n",
" <td>5457.0</td>\n",
" <td>1.263216e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39</th>\n",
" <td>www.abc.net.au</td>\n",
" <td>39787</td>\n",
" <td>0.135900</td>\n",
" <td>18879.0</td>\n",
" <td>4.370213e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40</th>\n",
" <td>books.google.co.uk</td>\n",
" <td>38707</td>\n",
" <td>0.132211</td>\n",
" <td>15336.0</td>\n",
" <td>3.550060e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41</th>\n",
" <td>web.archive.org</td>\n",
" <td>38564</td>\n",
" <td>0.131723</td>\n",
" <td>1.0</td>\n",
" <td>2.314854e-07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>42</th>\n",
" <td>www.facebook.com</td>\n",
" <td>37873</td>\n",
" <td>0.129362</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>43</th>\n",
" <td>www.cbc.ca</td>\n",
" <td>37385</td>\n",
" <td>0.127696</td>\n",
" <td>26628.0</td>\n",
" <td>6.163994e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44</th>\n",
" <td>www.amazon.com</td>\n",
" <td>37280</td>\n",
" <td>0.127337</td>\n",
" <td>2721.0</td>\n",
" <td>6.298718e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45</th>\n",
" <td>www.espn.com</td>\n",
" <td>36758</td>\n",
" <td>0.125554</td>\n",
" <td>39528.0</td>\n",
" <td>9.150155e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>46</th>\n",
" <td>www.latimes.com</td>\n",
" <td>36353</td>\n",
" <td>0.124171</td>\n",
" <td>26274.0</td>\n",
" <td>6.082048e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>47</th>\n",
" <td>www.usatoday.com</td>\n",
" <td>35919</td>\n",
" <td>0.122688</td>\n",
" <td>5631.0</td>\n",
" <td>1.303494e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>48</th>\n",
" <td>www.rollingstone.com</td>\n",
" <td>35602</td>\n",
" <td>0.121605</td>\n",
" <td>7996.0</td>\n",
" <td>1.850957e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>49</th>\n",
" <td>www.smh.com.au</td>\n",
" <td>33701</td>\n",
" <td>0.115112</td>\n",
" <td>41687.0</td>\n",
" <td>9.649933e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50</th>\n",
" <td>int.soccerway.com</td>\n",
" <td>32654</td>\n",
" <td>0.111536</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>51</th>\n",
" <td>www.forbes.com</td>\n",
" <td>32440</td>\n",
" <td>0.110805</td>\n",
" <td>58727.0</td>\n",
" <td>1.359444e-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>52</th>\n",
" <td>www.cnn.com</td>\n",
" <td>32378</td>\n",
" <td>0.110593</td>\n",
" <td>18908.0</td>\n",
" <td>4.376926e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>53</th>\n",
" <td>espn.go.com</td>\n",
" <td>31913</td>\n",
" <td>0.109005</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>54</th>\n",
" <td>www.huffingtonpost.com</td>\n",
" <td>30221</td>\n",
" <td>0.103226</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>55</th>\n",
" <td>www.pro-football-reference.com</td>\n",
" <td>30088</td>\n",
" <td>0.102771</td>\n",
" <td>34.0</td>\n",
" <td>7.870504e-06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>56</th>\n",
" <td>www.bloomberg.com</td>\n",
" <td>30028</td>\n",
" <td>0.102566</td>\n",
" <td>1.0</td>\n",
" <td>2.314854e-07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>57</th>\n",
" <td>www.mtv.com</td>\n",
" <td>29337</td>\n",
" <td>0.100206</td>\n",
" <td>20034.0</td>\n",
" <td>4.637579e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>58</th>\n",
" <td>babel.hathitrust.org</td>\n",
" <td>28413</td>\n",
" <td>0.097050</td>\n",
" <td>1.0</td>\n",
" <td>2.314854e-07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>59</th>\n",
" <td>www.nba.com</td>\n",
" <td>27308</td>\n",
" <td>0.093276</td>\n",
" <td>14693.0</td>\n",
" <td>3.401215e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>60</th>\n",
" <td>pitchfork.com</td>\n",
" <td>26749</td>\n",
" <td>0.091366</td>\n",
" <td>17701.0</td>\n",
" <td>4.097523e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>61</th>\n",
" <td>www.ign.com</td>\n",
" <td>26743</td>\n",
" <td>0.091346</td>\n",
" <td>8201.0</td>\n",
" <td>1.898412e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>62</th>\n",
" <td>www.itis.gov</td>\n",
" <td>26056</td>\n",
" <td>0.088999</td>\n",
" <td>3256.0</td>\n",
" <td>7.537165e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>63</th>\n",
" <td>www.wsj.com</td>\n",
" <td>25622</td>\n",
" <td>0.087517</td>\n",
" <td>3534.0</td>\n",
" <td>8.180695e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>64</th>\n",
" <td>www.gbif.org</td>\n",
" <td>25432</td>\n",
" <td>0.086868</td>\n",
" <td>87.0</td>\n",
" <td>2.013923e-05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>65</th>\n",
" <td>geonames.usgs.gov</td>\n",
" <td>24953</td>\n",
" <td>0.085232</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>66</th>\n",
" <td>www.showbuzzdaily.com</td>\n",
" <td>24882</td>\n",
" <td>0.084989</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>67</th>\n",
" <td>www.uefa.com</td>\n",
" <td>24811</td>\n",
" <td>0.084747</td>\n",
" <td>2204.0</td>\n",
" <td>5.101939e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>68</th>\n",
" <td>www.npr.org</td>\n",
" <td>24231</td>\n",
" <td>0.082766</td>\n",
" <td>49011.0</td>\n",
" <td>1.134533e-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69</th>\n",
" <td>bugguide.net</td>\n",
" <td>23956</td>\n",
" <td>0.081826</td>\n",
" <td>503.0</td>\n",
" <td>1.164372e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>70</th>\n",
" <td>www.gamespot.com</td>\n",
" <td>23634</td>\n",
" <td>0.080726</td>\n",
" <td>2539.0</td>\n",
" <td>5.877415e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71</th>\n",
" <td>gaonchart.co.kr</td>\n",
" <td>23462</td>\n",
" <td>0.080139</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>72</th>\n",
" <td>www.nhl.com</td>\n",
" <td>23405</td>\n",
" <td>0.079944</td>\n",
" <td>3464.0</td>\n",
" <td>8.018655e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73</th>\n",
" <td>www.oricon.co.jp</td>\n",
" <td>23205</td>\n",
" <td>0.079261</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74</th>\n",
" <td>www.basketball-reference.com</td>\n",
" <td>23185</td>\n",
" <td>0.079193</td>\n",
" <td>15.0</td>\n",
" <td>3.472281e-06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75</th>\n",
" <td>www.rottentomatoes.com</td>\n",
" <td>23133</td>\n",
" <td>0.079015</td>\n",
" <td>4244.0</td>\n",
" <td>9.824241e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>76</th>\n",
" <td>www.rsssf.com</td>\n",
" <td>22979</td>\n",
" <td>0.078489</td>\n",
" <td>2187.0</td>\n",
" <td>5.062586e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>77</th>\n",
" <td>timesmachine.nytimes.com</td>\n",
" <td>22558</td>\n",
" <td>0.077051</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>78</th>\n",
" <td>www.nfl.com</td>\n",
" <td>21358</td>\n",
" <td>0.072952</td>\n",
" <td>16389.0</td>\n",
" <td>3.793814e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>79</th>\n",
" <td>www.irishtimes.com</td>\n",
" <td>21144</td>\n",
" <td>0.072221</td>\n",
" <td>1419.0</td>\n",
" <td>3.284778e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>80</th>\n",
" <td>www.stuff.co.nz</td>\n",
" <td>20794</td>\n",
" <td>0.071026</td>\n",
" <td>71.0</td>\n",
" <td>1.643546e-05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>81</th>\n",
" <td>www2.census.gov</td>\n",
" <td>20790</td>\n",
" <td>0.071012</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>82</th>\n",
" <td>www.bizjournals.com</td>\n",
" <td>20359</td>\n",
" <td>0.069540</td>\n",
" <td>6598.0</td>\n",
" <td>1.527341e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>83</th>\n",
" <td>www.digitalspy.co.uk</td>\n",
" <td>20143</td>\n",
" <td>0.068802</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>84</th>\n",
" <td>www.highbeam.com</td>\n",
" <td>19882</td>\n",
" <td>0.067911</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>85</th>\n",
" <td>www.boxofficemojo.com</td>\n",
" <td>19814</td>\n",
" <td>0.067678</td>\n",
" <td>1216.0</td>\n",
" <td>2.814863e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>86</th>\n",
" <td>www.kicker.de</td>\n",
" <td>19362</td>\n",
" <td>0.066135</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>87</th>\n",
" <td>paperspast.natlib.govt.nz</td>\n",
" <td>19175</td>\n",
" <td>0.065496</td>\n",
" <td>482.0</td>\n",
" <td>1.115760e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>88</th>\n",
" <td>www.wwe.com</td>\n",
" <td>18859</td>\n",
" <td>0.064416</td>\n",
" <td>1200.0</td>\n",
" <td>2.777825e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>89</th>\n",
" <td>trove.nla.gov.au</td>\n",
" <td>18661</td>\n",
" <td>0.063740</td>\n",
" <td>4024.0</td>\n",
" <td>9.314973e-04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>90</th>\n",
" <td>www.independent.ie</td>\n",
" <td>18544</td>\n",
" <td>0.063341</td>\n",
" <td>16140.0</td>\n",
" <td>3.736175e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>91</th>\n",
" <td>www.nzherald.co.nz</td>\n",
" <td>18470</td>\n",
" <td>0.063088</td>\n",
" <td>19532.0</td>\n",
" <td>4.521373e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>92</th>\n",
" <td>www.researchgate.net</td>\n",
" <td>18352</td>\n",
" <td>0.062685</td>\n",
" <td>34.0</td>\n",
" <td>7.870504e-06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>93</th>\n",
" <td>www.britishnewspaperarchive.co.uk</td>\n",
" <td>17905</td>\n",
" <td>0.061158</td>\n",
" <td>58.0</td>\n",
" <td>1.342615e-05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>94</th>\n",
" <td>query.nytimes.com</td>\n",
" <td>17876</td>\n",
" <td>0.061059</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95</th>\n",
" <td>www.britannica.com</td>\n",
" <td>17675</td>\n",
" <td>0.060372</td>\n",
" <td>31253.0</td>\n",
" <td>7.234614e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>sports.espn.go.com</td>\n",
" <td>17661</td>\n",
" <td>0.060324</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>www.nydailynews.com</td>\n",
" <td>17332</td>\n",
" <td>0.059201</td>\n",
" <td>15879.0</td>\n",
" <td>3.675757e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>www.sfgate.com</td>\n",
" <td>17269</td>\n",
" <td>0.058986</td>\n",
" <td>21692.0</td>\n",
" <td>5.021382e-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>factfinder.census.gov</td>\n",
" <td>17213</td>\n",
" <td>0.058794</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" domain count_wiki ratio_wiki count_oscar \\\n",
"0 0 4321781 14.761861 0.0 \n",
"1 books.google.com 923239 3.153498 11530.0 \n",
"2 archive.org 422327 1.442538 2138.0 \n",
"3 www.nytimes.com 286414 0.978301 2090.0 \n",
"4 www.ncbi.nlm.nih.gov 282416 0.964645 1811.0 \n",
"5 www.bbc.co.uk 222875 0.761272 6305.0 \n",
"6 www.theguardian.com 192091 0.656123 102966.0 \n",
"7 news.bbc.co.uk 166381 0.568306 41252.0 \n",
"8 www.billboard.com 165065 0.563811 3915.0 \n",
"9 www.census.gov 161907 0.553024 16765.0 \n",
"10 www.youtube.com 149568 0.510878 35.0 \n",
"11 news.google.com 127045 0.433946 0.0 \n",
"12 www.newspapers.com 126038 0.430507 9259.0 \n",
"13 www.allmusic.com 99734 0.340660 3159.0 \n",
"14 nla.gov.au 84899 0.289989 151.0 \n",
"15 www.washingtonpost.com 82463 0.281668 14755.0 \n",
"16 www.telegraph.co.uk 82355 0.281299 37889.0 \n",
"17 www.espncricinfo.com 64787 0.221292 9020.0 \n",
"18 www.imdb.com 64419 0.220035 4381.0 \n",
"19 www.independent.co.uk 62139 0.212248 30507.0 \n",
"20 www.sports-reference.com 59964 0.204818 57.0 \n",
"21 itunes.apple.com 59719 0.203982 8590.0 \n",
"22 tvbythenumbers.zap2it.com 58393 0.199452 0.0 \n",
"23 timesofindia.indiatimes.com 57124 0.195118 3485.0 \n",
"24 www.baseball-reference.com 54084 0.184734 1767.0 \n",
"25 www.bbc.com 53718 0.183484 5123.0 \n",
"26 www.stat.gov.pl 51732 0.176700 0.0 \n",
"27 variety.com 51519 0.175973 1841.0 \n",
"28 twitter.com 48608 0.166030 0.0 \n",
"29 www.animenewsnetwork.com 48513 0.165705 1446.0 \n",
"30 www.hollywoodreporter.com 48282 0.164916 1651.0 \n",
"31 www.reuters.com 46416 0.158543 61665.0 \n",
"32 www.thehindu.com 45338 0.154861 2269.0 \n",
"33 cricketarchive.com 43944 0.150099 492.0 \n",
"34 articles.latimes.com 43628 0.149020 0.0 \n",
"35 www.discogs.com 42528 0.145262 2377.0 \n",
"36 deadline.com 41989 0.143421 16324.0 \n",
"37 www.officialcharts.com 40620 0.138745 127.0 \n",
"38 www.metacritic.com 40061 0.136836 5457.0 \n",
"39 www.abc.net.au 39787 0.135900 18879.0 \n",
"40 books.google.co.uk 38707 0.132211 15336.0 \n",
"41 web.archive.org 38564 0.131723 1.0 \n",
"42 www.facebook.com 37873 0.129362 0.0 \n",
"43 www.cbc.ca 37385 0.127696 26628.0 \n",
"44 www.amazon.com 37280 0.127337 2721.0 \n",
"45 www.espn.com 36758 0.125554 39528.0 \n",
"46 www.latimes.com 36353 0.124171 26274.0 \n",
"47 www.usatoday.com 35919 0.122688 5631.0 \n",
"48 www.rollingstone.com 35602 0.121605 7996.0 \n",
"49 www.smh.com.au 33701 0.115112 41687.0 \n",
"50 int.soccerway.com 32654 0.111536 0.0 \n",
"51 www.forbes.com 32440 0.110805 58727.0 \n",
"52 www.cnn.com 32378 0.110593 18908.0 \n",
"53 espn.go.com 31913 0.109005 0.0 \n",
"54 www.huffingtonpost.com 30221 0.103226 0.0 \n",
"55 www.pro-football-reference.com 30088 0.102771 34.0 \n",
"56 www.bloomberg.com 30028 0.102566 1.0 \n",
"57 www.mtv.com 29337 0.100206 20034.0 \n",
"58 babel.hathitrust.org 28413 0.097050 1.0 \n",
"59 www.nba.com 27308 0.093276 14693.0 \n",
"60 pitchfork.com 26749 0.091366 17701.0 \n",
"61 www.ign.com 26743 0.091346 8201.0 \n",
"62 www.itis.gov 26056 0.088999 3256.0 \n",
"63 www.wsj.com 25622 0.087517 3534.0 \n",
"64 www.gbif.org 25432 0.086868 87.0 \n",
"65 geonames.usgs.gov 24953 0.085232 0.0 \n",
"66 www.showbuzzdaily.com 24882 0.084989 0.0 \n",
"67 www.uefa.com 24811 0.084747 2204.0 \n",
"68 www.npr.org 24231 0.082766 49011.0 \n",
"69 bugguide.net 23956 0.081826 503.0 \n",
"70 www.gamespot.com 23634 0.080726 2539.0 \n",
"71 gaonchart.co.kr 23462 0.080139 0.0 \n",
"72 www.nhl.com 23405 0.079944 3464.0 \n",
"73 www.oricon.co.jp 23205 0.079261 0.0 \n",
"74 www.basketball-reference.com 23185 0.079193 15.0 \n",
"75 www.rottentomatoes.com 23133 0.079015 4244.0 \n",
"76 www.rsssf.com 22979 0.078489 2187.0 \n",
"77 timesmachine.nytimes.com 22558 0.077051 0.0 \n",
"78 www.nfl.com 21358 0.072952 16389.0 \n",
"79 www.irishtimes.com 21144 0.072221 1419.0 \n",
"80 www.stuff.co.nz 20794 0.071026 71.0 \n",
"81 www2.census.gov 20790 0.071012 0.0 \n",
"82 www.bizjournals.com 20359 0.069540 6598.0 \n",
"83 www.digitalspy.co.uk 20143 0.068802 0.0 \n",
"84 www.highbeam.com 19882 0.067911 0.0 \n",
"85 www.boxofficemojo.com 19814 0.067678 1216.0 \n",
"86 www.kicker.de 19362 0.066135 0.0 \n",
"87 paperspast.natlib.govt.nz 19175 0.065496 482.0 \n",
"88 www.wwe.com 18859 0.064416 1200.0 \n",
"89 trove.nla.gov.au 18661 0.063740 4024.0 \n",
"90 www.independent.ie 18544 0.063341 16140.0 \n",
"91 www.nzherald.co.nz 18470 0.063088 19532.0 \n",
"92 www.researchgate.net 18352 0.062685 34.0 \n",
"93 www.britishnewspaperarchive.co.uk 17905 0.061158 58.0 \n",
"94 query.nytimes.com 17876 0.061059 0.0 \n",
"95 www.britannica.com 17675 0.060372 31253.0 \n",
"96 sports.espn.go.com 17661 0.060324 0.0 \n",
"97 www.nydailynews.com 17332 0.059201 15879.0 \n",
"98 www.sfgate.com 17269 0.058986 21692.0 \n",
"99 factfinder.census.gov 17213 0.058794 0.0 \n",
"\n",
" ratio_oscar \n",
"0 0.000000e+00 \n",
"1 2.669027e-03 \n",
"2 4.949158e-04 \n",
"3 4.838045e-04 \n",
"4 4.192201e-04 \n",
"5 1.459516e-03 \n",
"6 2.383513e-02 \n",
"7 9.549236e-03 \n",
"8 9.062654e-04 \n",
"9 3.880853e-03 \n",
"10 8.101990e-06 \n",
"11 0.000000e+00 \n",
"12 2.143323e-03 \n",
"13 7.312624e-04 \n",
"14 3.495430e-05 \n",
"15 3.415567e-03 \n",
"16 8.770751e-03 \n",
"17 2.087998e-03 \n",
"18 1.014138e-03 \n",
"19 7.061926e-03 \n",
"20 1.319467e-05 \n",
"21 1.988460e-03 \n",
"22 0.000000e+00 \n",
"23 8.067267e-04 \n",
"24 4.090347e-04 \n",
"25 1.185900e-03 \n",
"26 0.000000e+00 \n",
"27 4.261646e-04 \n",
"28 0.000000e+00 \n",
"29 3.347279e-04 \n",
"30 3.821824e-04 \n",
"31 1.427455e-02 \n",
"32 5.252404e-04 \n",
"33 1.138908e-04 \n",
"34 0.000000e+00 \n",
"35 5.502408e-04 \n",
"36 3.778768e-03 \n",
"37 2.939865e-05 \n",
"38 1.263216e-03 \n",
"39 4.370213e-03 \n",
"40 3.550060e-03 \n",
"41 2.314854e-07 \n",
"42 0.000000e+00 \n",
"43 6.163994e-03 \n",
"44 6.298718e-04 \n",
"45 9.150155e-03 \n",
"46 6.082048e-03 \n",
"47 1.303494e-03 \n",
"48 1.850957e-03 \n",
"49 9.649933e-03 \n",
"50 0.000000e+00 \n",
"51 1.359444e-02 \n",
"52 4.376926e-03 \n",
"53 0.000000e+00 \n",
"54 0.000000e+00 \n",
"55 7.870504e-06 \n",
"56 2.314854e-07 \n",
"57 4.637579e-03 \n",
"58 2.314854e-07 \n",
"59 3.401215e-03 \n",
"60 4.097523e-03 \n",
"61 1.898412e-03 \n",
"62 7.537165e-04 \n",
"63 8.180695e-04 \n",
"64 2.013923e-05 \n",
"65 0.000000e+00 \n",
"66 0.000000e+00 \n",
"67 5.101939e-04 \n",
"68 1.134533e-02 \n",
"69 1.164372e-04 \n",
"70 5.877415e-04 \n",
"71 0.000000e+00 \n",
"72 8.018655e-04 \n",
"73 0.000000e+00 \n",
"74 3.472281e-06 \n",
"75 9.824241e-04 \n",
"76 5.062586e-04 \n",
"77 0.000000e+00 \n",
"78 3.793814e-03 \n",
"79 3.284778e-04 \n",
"80 1.643546e-05 \n",
"81 0.000000e+00 \n",
"82 1.527341e-03 \n",
"83 0.000000e+00 \n",
"84 0.000000e+00 \n",
"85 2.814863e-04 \n",
"86 0.000000e+00 \n",
"87 1.115760e-04 \n",
"88 2.777825e-04 \n",
"89 9.314973e-04 \n",
"90 3.736175e-03 \n",
"91 4.521373e-03 \n",
"92 7.870504e-06 \n",
"93 1.342615e-05 \n",
"94 0.000000e+00 \n",
"95 7.234614e-03 \n",
"96 0.000000e+00 \n",
"97 3.675757e-03 \n",
"98 5.021382e-03 \n",
"99 0.000000e+00 "
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"both"
]
},
{
"cell_type": "code",
"execution_count": 100,
"id": "06bab2c3",
"metadata": {},
"outputs": [],
"source": [
"MAX = 13131313\n",
"tot[\"ratio\"] = tot.apply(\n",
" lambda row: MAX if row[\"ratio_oscar\"] == 0 else row[\"ratio_wiki\"] / row[\"ratio_oscar\"], axis=1\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 103,
"id": "415da410",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>domain</th>\n",
" <th>count_wiki</th>\n",
" <th>ratio_wiki</th>\n",
" <th>count_oscar</th>\n",
" <th>ratio_oscar</th>\n",
" <th>ratio</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>4321781</td>\n",
" <td>14.761861</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.313131e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>news.google.com</td>\n",
" <td>127045</td>\n",
" <td>0.433946</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.313131e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>tvbythenumbers.zap2it.com</td>\n",
" <td>58393</td>\n",
" <td>0.199452</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.313131e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>www.stat.gov.pl</td>\n",
" <td>51732</td>\n",
" <td>0.176700</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.313131e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>twitter.com</td>\n",
" <td>48608</td>\n",
" <td>0.166030</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.313131e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>articles.latimes.com</td>\n",
" <td>43628</td>\n",
" <td>0.149020</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.313131e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>42</th>\n",
" <td>www.facebook.com</td>\n",
" <td>37873</td>\n",
" <td>0.129362</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.313131e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50</th>\n",
" <td>int.soccerway.com</td>\n",
" <td>32654</td>\n",
" <td>0.111536</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.313131e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>53</th>\n",
" <td>espn.go.com</td>\n",
" <td>31913</td>\n",
" <td>0.109005</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.313131e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>54</th>\n",
" <td>www.huffingtonpost.com</td>\n",
" <td>30221</td>\n",
" <td>0.103226</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.313131e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>65</th>\n",
" <td>geonames.usgs.gov</td>\n",
" <td>24953</td>\n",
" <td>0.085232</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.313131e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>66</th>\n",
" <td>www.showbuzzdaily.com</td>\n",
" <td>24882</td>\n",
" <td>0.084989</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.313131e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71</th>\n",
" <td>gaonchart.co.kr</td>\n",
" <td>23462</td>\n",
" <td>0.080139</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.313131e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73</th>\n",
" <td>www.oricon.co.jp</td>\n",
" <td>23205</td>\n",
" <td>0.079261</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.313131e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>77</th>\n",
" <td>timesmachine.nytimes.com</td>\n",
" <td>22558</td>\n",
" <td>0.077051</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.313131e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>81</th>\n",
" <td>www2.census.gov</td>\n",
" <td>20790</td>\n",
" <td>0.071012</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.313131e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>83</th>\n",
" <td>www.digitalspy.co.uk</td>\n",
" <td>20143</td>\n",
" <td>0.068802</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.313131e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>84</th>\n",
" <td>www.highbeam.com</td>\n",
" <td>19882</td>\n",
" <td>0.067911</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.313131e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>86</th>\n",
" <td>www.kicker.de</td>\n",
" <td>19362</td>\n",
" <td>0.066135</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.313131e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>94</th>\n",
" <td>query.nytimes.com</td>\n",
" <td>17876</td>\n",
" <td>0.061059</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.313131e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>sports.espn.go.com</td>\n",
" <td>17661</td>\n",
" <td>0.060324</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.313131e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>factfinder.census.gov</td>\n",
" <td>17213</td>\n",
" <td>0.058794</td>\n",
" <td>0.0</td>\n",
" <td>0.000000e+00</td>\n",
" <td>1.313131e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41</th>\n",
" <td>web.archive.org</td>\n",
" <td>38564</td>\n",
" <td>0.131723</td>\n",
" <td>1.0</td>\n",
" <td>2.314854e-07</td>\n",
" <td>5.690322e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>56</th>\n",
" <td>www.bloomberg.com</td>\n",
" <td>30028</td>\n",
" <td>0.102566</td>\n",
" <td>1.0</td>\n",
" <td>2.314854e-07</td>\n",
" <td>4.430790e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>58</th>\n",
" <td>babel.hathitrust.org</td>\n",
" <td>28413</td>\n",
" <td>0.097050</td>\n",
" <td>1.0</td>\n",
" <td>2.314854e-07</td>\n",
" <td>4.192488e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>www.youtube.com</td>\n",
" <td>149568</td>\n",
" <td>0.510878</td>\n",
" <td>35.0</td>\n",
" <td>8.101990e-06</td>\n",
" <td>6.305585e+04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74</th>\n",
" <td>www.basketball-reference.com</td>\n",
" <td>23185</td>\n",
" <td>0.079193</td>\n",
" <td>15.0</td>\n",
" <td>3.472281e-06</td>\n",
" <td>2.280713e+04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>www.sports-reference.com</td>\n",
" <td>59964</td>\n",
" <td>0.204818</td>\n",
" <td>57.0</td>\n",
" <td>1.319467e-05</td>\n",
" <td>1.552281e+04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>55</th>\n",
" <td>www.pro-football-reference.com</td>\n",
" <td>30088</td>\n",
" <td>0.102771</td>\n",
" <td>34.0</td>\n",
" <td>7.870504e-06</td>\n",
" <td>1.305777e+04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>nla.gov.au</td>\n",
" <td>84899</td>\n",
" <td>0.289989</td>\n",
" <td>151.0</td>\n",
" <td>3.495430e-05</td>\n",
" <td>8.296222e+03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>92</th>\n",
" <td>www.researchgate.net</td>\n",
" <td>18352</td>\n",
" <td>0.062685</td>\n",
" <td>34.0</td>\n",
" <td>7.870504e-06</td>\n",
" <td>7.964513e+03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td>www.officialcharts.com</td>\n",
" <td>40620</td>\n",
" <td>0.138745</td>\n",
" <td>127.0</td>\n",
" <td>2.939865e-05</td>\n",
" <td>4.719445e+03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>93</th>\n",
" <td>www.britishnewspaperarchive.co.uk</td>\n",
" <td>17905</td>\n",
" <td>0.061158</td>\n",
" <td>58.0</td>\n",
" <td>1.342615e-05</td>\n",
" <td>4.555133e+03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>80</th>\n",
" <td>www.stuff.co.nz</td>\n",
" <td>20794</td>\n",
" <td>0.071026</td>\n",
" <td>71.0</td>\n",
" <td>1.643546e-05</td>\n",
" <td>4.321499e+03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>64</th>\n",
" <td>www.gbif.org</td>\n",
" <td>25432</td>\n",
" <td>0.086868</td>\n",
" <td>87.0</td>\n",
" <td>2.013923e-05</td>\n",
" <td>4.313363e+03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>archive.org</td>\n",
" <td>422327</td>\n",
" <td>1.442538</td>\n",
" <td>2138.0</td>\n",
" <td>4.949158e-04</td>\n",
" <td>2.914714e+03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>www.ncbi.nlm.nih.gov</td>\n",
" <td>282416</td>\n",
" <td>0.964645</td>\n",
" <td>1811.0</td>\n",
" <td>4.192201e-04</td>\n",
" <td>2.301047e+03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>www.nytimes.com</td>\n",
" <td>286414</td>\n",
" <td>0.978301</td>\n",
" <td>2090.0</td>\n",
" <td>4.838045e-04</td>\n",
" <td>2.022100e+03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>cricketarchive.com</td>\n",
" <td>43944</td>\n",
" <td>0.150099</td>\n",
" <td>492.0</td>\n",
" <td>1.138908e-04</td>\n",
" <td>1.317921e+03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>books.google.com</td>\n",
" <td>923239</td>\n",
" <td>3.153498</td>\n",
" <td>11530.0</td>\n",
" <td>2.669027e-03</td>\n",
" <td>1.181516e+03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69</th>\n",
" <td>bugguide.net</td>\n",
" <td>23956</td>\n",
" <td>0.081826</td>\n",
" <td>503.0</td>\n",
" <td>1.164372e-04</td>\n",
" <td>7.027503e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>www.billboard.com</td>\n",
" <td>165065</td>\n",
" <td>0.563811</td>\n",
" <td>3915.0</td>\n",
" <td>9.062654e-04</td>\n",
" <td>6.221254e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>87</th>\n",
" <td>paperspast.natlib.govt.nz</td>\n",
" <td>19175</td>\n",
" <td>0.065496</td>\n",
" <td>482.0</td>\n",
" <td>1.115760e-04</td>\n",
" <td>5.870067e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>www.bbc.co.uk</td>\n",
" <td>222875</td>\n",
" <td>0.761272</td>\n",
" <td>6305.0</td>\n",
" <td>1.459516e-03</td>\n",
" <td>5.215921e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>www.animenewsnetwork.com</td>\n",
" <td>48513</td>\n",
" <td>0.165705</td>\n",
" <td>1446.0</td>\n",
" <td>3.347279e-04</td>\n",
" <td>4.950449e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>www.allmusic.com</td>\n",
" <td>99734</td>\n",
" <td>0.340660</td>\n",
" <td>3159.0</td>\n",
" <td>7.312624e-04</td>\n",
" <td>4.658524e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>www.baseball-reference.com</td>\n",
" <td>54084</td>\n",
" <td>0.184734</td>\n",
" <td>1767.0</td>\n",
" <td>4.090347e-04</td>\n",
" <td>4.516344e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>www.hollywoodreporter.com</td>\n",
" <td>48282</td>\n",
" <td>0.164916</td>\n",
" <td>1651.0</td>\n",
" <td>3.821824e-04</td>\n",
" <td>4.315120e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>variety.com</td>\n",
" <td>51519</td>\n",
" <td>0.175973</td>\n",
" <td>1841.0</td>\n",
" <td>4.261646e-04</td>\n",
" <td>4.129223e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>www.thehindu.com</td>\n",
" <td>45338</td>\n",
" <td>0.154861</td>\n",
" <td>2269.0</td>\n",
" <td>5.252404e-04</td>\n",
" <td>2.948374e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35</th>\n",
" <td>www.discogs.com</td>\n",
" <td>42528</td>\n",
" <td>0.145262</td>\n",
" <td>2377.0</td>\n",
" <td>5.502408e-04</td>\n",
" <td>2.639979e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>timesofindia.indiatimes.com</td>\n",
" <td>57124</td>\n",
" <td>0.195118</td>\n",
" <td>3485.0</td>\n",
" <td>8.067267e-04</td>\n",
" <td>2.418636e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>85</th>\n",
" <td>www.boxofficemojo.com</td>\n",
" <td>19814</td>\n",
" <td>0.067678</td>\n",
" <td>1216.0</td>\n",
" <td>2.814863e-04</td>\n",
" <td>2.404326e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>88</th>\n",
" <td>www.wwe.com</td>\n",
" <td>18859</td>\n",
" <td>0.064416</td>\n",
" <td>1200.0</td>\n",
" <td>2.777825e-04</td>\n",
" <td>2.318954e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>79</th>\n",
" <td>www.irishtimes.com</td>\n",
" <td>21144</td>\n",
" <td>0.072221</td>\n",
" <td>1419.0</td>\n",
" <td>3.284778e-04</td>\n",
" <td>2.198667e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>www.imdb.com</td>\n",
" <td>64419</td>\n",
" <td>0.220035</td>\n",
" <td>4381.0</td>\n",
" <td>1.014138e-03</td>\n",
" <td>2.169679e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44</th>\n",
" <td>www.amazon.com</td>\n",
" <td>37280</td>\n",
" <td>0.127337</td>\n",
" <td>2721.0</td>\n",
" <td>6.298718e-04</td>\n",
" <td>2.021632e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>www.newspapers.com</td>\n",
" <td>126038</td>\n",
" <td>0.430507</td>\n",
" <td>9259.0</td>\n",
" <td>2.143323e-03</td>\n",
" <td>2.008594e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>67</th>\n",
" <td>www.uefa.com</td>\n",
" <td>24811</td>\n",
" <td>0.084747</td>\n",
" <td>2204.0</td>\n",
" <td>5.101939e-04</td>\n",
" <td>1.661068e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>76</th>\n",
" <td>www.rsssf.com</td>\n",
" <td>22979</td>\n",
" <td>0.078489</td>\n",
" <td>2187.0</td>\n",
" <td>5.062586e-04</td>\n",
" <td>1.550376e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>www.bbc.com</td>\n",
" <td>53718</td>\n",
" <td>0.183484</td>\n",
" <td>5123.0</td>\n",
" <td>1.185900e-03</td>\n",
" <td>1.547213e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>www.census.gov</td>\n",
" <td>161907</td>\n",
" <td>0.553024</td>\n",
" <td>16765.0</td>\n",
" <td>3.880853e-03</td>\n",
" <td>1.425006e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>70</th>\n",
" <td>www.gamespot.com</td>\n",
" <td>23634</td>\n",
" <td>0.080726</td>\n",
" <td>2539.0</td>\n",
" <td>5.877415e-04</td>\n",
" <td>1.373502e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>62</th>\n",
" <td>www.itis.gov</td>\n",
" <td>26056</td>\n",
" <td>0.088999</td>\n",
" <td>3256.0</td>\n",
" <td>7.537165e-04</td>\n",
" <td>1.180805e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>www.metacritic.com</td>\n",
" <td>40061</td>\n",
" <td>0.136836</td>\n",
" <td>5457.0</td>\n",
" <td>1.263216e-03</td>\n",
" <td>1.083235e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>63</th>\n",
" <td>www.wsj.com</td>\n",
" <td>25622</td>\n",
" <td>0.087517</td>\n",
" <td>3534.0</td>\n",
" <td>8.180695e-04</td>\n",
" <td>1.069797e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>www.espncricinfo.com</td>\n",
" <td>64787</td>\n",
" <td>0.221292</td>\n",
" <td>9020.0</td>\n",
" <td>2.087998e-03</td>\n",
" <td>1.059830e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>itunes.apple.com</td>\n",
" <td>59719</td>\n",
" <td>0.203982</td>\n",
" <td>8590.0</td>\n",
" <td>1.988460e-03</td>\n",
" <td>1.025827e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>72</th>\n",
" <td>www.nhl.com</td>\n",
" <td>23405</td>\n",
" <td>0.079944</td>\n",
" <td>3464.0</td>\n",
" <td>8.018655e-04</td>\n",
" <td>9.969778e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>47</th>\n",
" <td>www.usatoday.com</td>\n",
" <td>35919</td>\n",
" <td>0.122688</td>\n",
" <td>5631.0</td>\n",
" <td>1.303494e-03</td>\n",
" <td>9.412250e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>www.washingtonpost.com</td>\n",
" <td>82463</td>\n",
" <td>0.281668</td>\n",
" <td>14755.0</td>\n",
" <td>3.415567e-03</td>\n",
" <td>8.246595e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75</th>\n",
" <td>www.rottentomatoes.com</td>\n",
" <td>23133</td>\n",
" <td>0.079015</td>\n",
" <td>4244.0</td>\n",
" <td>9.824241e-04</td>\n",
" <td>8.042875e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>89</th>\n",
" <td>trove.nla.gov.au</td>\n",
" <td>18661</td>\n",
" <td>0.063740</td>\n",
" <td>4024.0</td>\n",
" <td>9.314973e-04</td>\n",
" <td>6.842766e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>48</th>\n",
" <td>www.rollingstone.com</td>\n",
" <td>35602</td>\n",
" <td>0.121605</td>\n",
" <td>7996.0</td>\n",
" <td>1.850957e-03</td>\n",
" <td>6.569863e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>news.bbc.co.uk</td>\n",
" <td>166381</td>\n",
" <td>0.568306</td>\n",
" <td>41252.0</td>\n",
" <td>9.549236e-03</td>\n",
" <td>5.951322e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>61</th>\n",
" <td>www.ign.com</td>\n",
" <td>26743</td>\n",
" <td>0.091346</td>\n",
" <td>8201.0</td>\n",
" <td>1.898412e-03</td>\n",
" <td>4.811695e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>82</th>\n",
" <td>www.bizjournals.com</td>\n",
" <td>20359</td>\n",
" <td>0.069540</td>\n",
" <td>6598.0</td>\n",
" <td>1.527341e-03</td>\n",
" <td>4.553013e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36</th>\n",
" <td>deadline.com</td>\n",
" <td>41989</td>\n",
" <td>0.143421</td>\n",
" <td>16324.0</td>\n",
" <td>3.778768e-03</td>\n",
" <td>3.795454e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40</th>\n",
" <td>books.google.co.uk</td>\n",
" <td>38707</td>\n",
" <td>0.132211</td>\n",
" <td>15336.0</td>\n",
" <td>3.550060e-03</td>\n",
" <td>3.724193e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>www.telegraph.co.uk</td>\n",
" <td>82355</td>\n",
" <td>0.281299</td>\n",
" <td>37889.0</td>\n",
" <td>8.770751e-03</td>\n",
" <td>3.207241e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39</th>\n",
" <td>www.abc.net.au</td>\n",
" <td>39787</td>\n",
" <td>0.135900</td>\n",
" <td>18879.0</td>\n",
" <td>4.370213e-03</td>\n",
" <td>3.109689e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>www.independent.co.uk</td>\n",
" <td>62139</td>\n",
" <td>0.212248</td>\n",
" <td>30507.0</td>\n",
" <td>7.061926e-03</td>\n",
" <td>3.005519e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>www.theguardian.com</td>\n",
" <td>192091</td>\n",
" <td>0.656123</td>\n",
" <td>102966.0</td>\n",
" <td>2.383513e-02</td>\n",
" <td>2.752757e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>59</th>\n",
" <td>www.nba.com</td>\n",
" <td>27308</td>\n",
" <td>0.093276</td>\n",
" <td>14693.0</td>\n",
" <td>3.401215e-03</td>\n",
" <td>2.742421e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>52</th>\n",
" <td>www.cnn.com</td>\n",
" <td>32378</td>\n",
" <td>0.110593</td>\n",
" <td>18908.0</td>\n",
" <td>4.376926e-03</td>\n",
" <td>2.526732e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>60</th>\n",
" <td>pitchfork.com</td>\n",
" <td>26749</td>\n",
" <td>0.091366</td>\n",
" <td>17701.0</td>\n",
" <td>4.097523e-03</td>\n",
" <td>2.229793e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>57</th>\n",
" <td>www.mtv.com</td>\n",
" <td>29337</td>\n",
" <td>0.100206</td>\n",
" <td>20034.0</td>\n",
" <td>4.637579e-03</td>\n",
" <td>2.160741e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>43</th>\n",
" <td>www.cbc.ca</td>\n",
" <td>37385</td>\n",
" <td>0.127696</td>\n",
" <td>26628.0</td>\n",
" <td>6.163994e-03</td>\n",
" <td>2.071637e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>46</th>\n",
" <td>www.latimes.com</td>\n",
" <td>36353</td>\n",
" <td>0.124171</td>\n",
" <td>26274.0</td>\n",
" <td>6.082048e-03</td>\n",
" <td>2.041591e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>78</th>\n",
" <td>www.nfl.com</td>\n",
" <td>21358</td>\n",
" <td>0.072952</td>\n",
" <td>16389.0</td>\n",
" <td>3.793814e-03</td>\n",
" <td>1.922927e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>90</th>\n",
" <td>www.independent.ie</td>\n",
" <td>18544</td>\n",
" <td>0.063341</td>\n",
" <td>16140.0</td>\n",
" <td>3.736175e-03</td>\n",
" <td>1.695331e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>www.nydailynews.com</td>\n",
" <td>17332</td>\n",
" <td>0.059201</td>\n",
" <td>15879.0</td>\n",
" <td>3.675757e-03</td>\n",
" <td>1.610572e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>91</th>\n",
" <td>www.nzherald.co.nz</td>\n",
" <td>18470</td>\n",
" <td>0.063088</td>\n",
" <td>19532.0</td>\n",
" <td>4.521373e-03</td>\n",
" <td>1.395324e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45</th>\n",
" <td>www.espn.com</td>\n",
" <td>36758</td>\n",
" <td>0.125554</td>\n",
" <td>39528.0</td>\n",
" <td>9.150155e-03</td>\n",
" <td>1.372151e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>49</th>\n",
" <td>www.smh.com.au</td>\n",
" <td>33701</td>\n",
" <td>0.115112</td>\n",
" <td>41687.0</td>\n",
" <td>9.649933e-03</td>\n",
" <td>1.192880e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>www.sfgate.com</td>\n",
" <td>17269</td>\n",
" <td>0.058986</td>\n",
" <td>21692.0</td>\n",
" <td>5.021382e-03</td>\n",
" <td>1.174687e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>www.reuters.com</td>\n",
" <td>46416</td>\n",
" <td>0.158543</td>\n",
" <td>61665.0</td>\n",
" <td>1.427455e-02</td>\n",
" <td>1.110667e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95</th>\n",
" <td>www.britannica.com</td>\n",
" <td>17675</td>\n",
" <td>0.060372</td>\n",
" <td>31253.0</td>\n",
" <td>7.234614e-03</td>\n",
" <td>8.344925e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>51</th>\n",
" <td>www.forbes.com</td>\n",
" <td>32440</td>\n",
" <td>0.110805</td>\n",
" <td>58727.0</td>\n",
" <td>1.359444e-02</td>\n",
" <td>8.150754e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>68</th>\n",
" <td>www.npr.org</td>\n",
" <td>24231</td>\n",
" <td>0.082766</td>\n",
" <td>49011.0</td>\n",
" <td>1.134533e-02</td>\n",
" <td>7.295121e+00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" domain count_wiki ratio_wiki count_oscar \\\n",
"0 0 4321781 14.761861 0.0 \n",
"11 news.google.com 127045 0.433946 0.0 \n",
"22 tvbythenumbers.zap2it.com 58393 0.199452 0.0 \n",
"26 www.stat.gov.pl 51732 0.176700 0.0 \n",
"28 twitter.com 48608 0.166030 0.0 \n",
"34 articles.latimes.com 43628 0.149020 0.0 \n",
"42 www.facebook.com 37873 0.129362 0.0 \n",
"50 int.soccerway.com 32654 0.111536 0.0 \n",
"53 espn.go.com 31913 0.109005 0.0 \n",
"54 www.huffingtonpost.com 30221 0.103226 0.0 \n",
"65 geonames.usgs.gov 24953 0.085232 0.0 \n",
"66 www.showbuzzdaily.com 24882 0.084989 0.0 \n",
"71 gaonchart.co.kr 23462 0.080139 0.0 \n",
"73 www.oricon.co.jp 23205 0.079261 0.0 \n",
"77 timesmachine.nytimes.com 22558 0.077051 0.0 \n",
"81 www2.census.gov 20790 0.071012 0.0 \n",
"83 www.digitalspy.co.uk 20143 0.068802 0.0 \n",
"84 www.highbeam.com 19882 0.067911 0.0 \n",
"86 www.kicker.de 19362 0.066135 0.0 \n",
"94 query.nytimes.com 17876 0.061059 0.0 \n",
"96 sports.espn.go.com 17661 0.060324 0.0 \n",
"99 factfinder.census.gov 17213 0.058794 0.0 \n",
"41 web.archive.org 38564 0.131723 1.0 \n",
"56 www.bloomberg.com 30028 0.102566 1.0 \n",
"58 babel.hathitrust.org 28413 0.097050 1.0 \n",
"10 www.youtube.com 149568 0.510878 35.0 \n",
"74 www.basketball-reference.com 23185 0.079193 15.0 \n",
"20 www.sports-reference.com 59964 0.204818 57.0 \n",
"55 www.pro-football-reference.com 30088 0.102771 34.0 \n",
"14 nla.gov.au 84899 0.289989 151.0 \n",
"92 www.researchgate.net 18352 0.062685 34.0 \n",
"37 www.officialcharts.com 40620 0.138745 127.0 \n",
"93 www.britishnewspaperarchive.co.uk 17905 0.061158 58.0 \n",
"80 www.stuff.co.nz 20794 0.071026 71.0 \n",
"64 www.gbif.org 25432 0.086868 87.0 \n",
"2 archive.org 422327 1.442538 2138.0 \n",
"4 www.ncbi.nlm.nih.gov 282416 0.964645 1811.0 \n",
"3 www.nytimes.com 286414 0.978301 2090.0 \n",
"33 cricketarchive.com 43944 0.150099 492.0 \n",
"1 books.google.com 923239 3.153498 11530.0 \n",
"69 bugguide.net 23956 0.081826 503.0 \n",
"8 www.billboard.com 165065 0.563811 3915.0 \n",
"87 paperspast.natlib.govt.nz 19175 0.065496 482.0 \n",
"5 www.bbc.co.uk 222875 0.761272 6305.0 \n",
"29 www.animenewsnetwork.com 48513 0.165705 1446.0 \n",
"13 www.allmusic.com 99734 0.340660 3159.0 \n",
"24 www.baseball-reference.com 54084 0.184734 1767.0 \n",
"30 www.hollywoodreporter.com 48282 0.164916 1651.0 \n",
"27 variety.com 51519 0.175973 1841.0 \n",
"32 www.thehindu.com 45338 0.154861 2269.0 \n",
"35 www.discogs.com 42528 0.145262 2377.0 \n",
"23 timesofindia.indiatimes.com 57124 0.195118 3485.0 \n",
"85 www.boxofficemojo.com 19814 0.067678 1216.0 \n",
"88 www.wwe.com 18859 0.064416 1200.0 \n",
"79 www.irishtimes.com 21144 0.072221 1419.0 \n",
"18 www.imdb.com 64419 0.220035 4381.0 \n",
"44 www.amazon.com 37280 0.127337 2721.0 \n",
"12 www.newspapers.com 126038 0.430507 9259.0 \n",
"67 www.uefa.com 24811 0.084747 2204.0 \n",
"76 www.rsssf.com 22979 0.078489 2187.0 \n",
"25 www.bbc.com 53718 0.183484 5123.0 \n",
"9 www.census.gov 161907 0.553024 16765.0 \n",
"70 www.gamespot.com 23634 0.080726 2539.0 \n",
"62 www.itis.gov 26056 0.088999 3256.0 \n",
"38 www.metacritic.com 40061 0.136836 5457.0 \n",
"63 www.wsj.com 25622 0.087517 3534.0 \n",
"17 www.espncricinfo.com 64787 0.221292 9020.0 \n",
"21 itunes.apple.com 59719 0.203982 8590.0 \n",
"72 www.nhl.com 23405 0.079944 3464.0 \n",
"47 www.usatoday.com 35919 0.122688 5631.0 \n",
"15 www.washingtonpost.com 82463 0.281668 14755.0 \n",
"75 www.rottentomatoes.com 23133 0.079015 4244.0 \n",
"89 trove.nla.gov.au 18661 0.063740 4024.0 \n",
"48 www.rollingstone.com 35602 0.121605 7996.0 \n",
"7 news.bbc.co.uk 166381 0.568306 41252.0 \n",
"61 www.ign.com 26743 0.091346 8201.0 \n",
"82 www.bizjournals.com 20359 0.069540 6598.0 \n",
"36 deadline.com 41989 0.143421 16324.0 \n",
"40 books.google.co.uk 38707 0.132211 15336.0 \n",
"16 www.telegraph.co.uk 82355 0.281299 37889.0 \n",
"39 www.abc.net.au 39787 0.135900 18879.0 \n",
"19 www.independent.co.uk 62139 0.212248 30507.0 \n",
"6 www.theguardian.com 192091 0.656123 102966.0 \n",
"59 www.nba.com 27308 0.093276 14693.0 \n",
"52 www.cnn.com 32378 0.110593 18908.0 \n",
"60 pitchfork.com 26749 0.091366 17701.0 \n",
"57 www.mtv.com 29337 0.100206 20034.0 \n",
"43 www.cbc.ca 37385 0.127696 26628.0 \n",
"46 www.latimes.com 36353 0.124171 26274.0 \n",
"78 www.nfl.com 21358 0.072952 16389.0 \n",
"90 www.independent.ie 18544 0.063341 16140.0 \n",
"97 www.nydailynews.com 17332 0.059201 15879.0 \n",
"91 www.nzherald.co.nz 18470 0.063088 19532.0 \n",
"45 www.espn.com 36758 0.125554 39528.0 \n",
"49 www.smh.com.au 33701 0.115112 41687.0 \n",
"98 www.sfgate.com 17269 0.058986 21692.0 \n",
"31 www.reuters.com 46416 0.158543 61665.0 \n",
"95 www.britannica.com 17675 0.060372 31253.0 \n",
"51 www.forbes.com 32440 0.110805 58727.0 \n",
"68 www.npr.org 24231 0.082766 49011.0 \n",
"\n",
" ratio_oscar ratio \n",
"0 0.000000e+00 1.313131e+07 \n",
"11 0.000000e+00 1.313131e+07 \n",
"22 0.000000e+00 1.313131e+07 \n",
"26 0.000000e+00 1.313131e+07 \n",
"28 0.000000e+00 1.313131e+07 \n",
"34 0.000000e+00 1.313131e+07 \n",
"42 0.000000e+00 1.313131e+07 \n",
"50 0.000000e+00 1.313131e+07 \n",
"53 0.000000e+00 1.313131e+07 \n",
"54 0.000000e+00 1.313131e+07 \n",
"65 0.000000e+00 1.313131e+07 \n",
"66 0.000000e+00 1.313131e+07 \n",
"71 0.000000e+00 1.313131e+07 \n",
"73 0.000000e+00 1.313131e+07 \n",
"77 0.000000e+00 1.313131e+07 \n",
"81 0.000000e+00 1.313131e+07 \n",
"83 0.000000e+00 1.313131e+07 \n",
"84 0.000000e+00 1.313131e+07 \n",
"86 0.000000e+00 1.313131e+07 \n",
"94 0.000000e+00 1.313131e+07 \n",
"96 0.000000e+00 1.313131e+07 \n",
"99 0.000000e+00 1.313131e+07 \n",
"41 2.314854e-07 5.690322e+05 \n",
"56 2.314854e-07 4.430790e+05 \n",
"58 2.314854e-07 4.192488e+05 \n",
"10 8.101990e-06 6.305585e+04 \n",
"74 3.472281e-06 2.280713e+04 \n",
"20 1.319467e-05 1.552281e+04 \n",
"55 7.870504e-06 1.305777e+04 \n",
"14 3.495430e-05 8.296222e+03 \n",
"92 7.870504e-06 7.964513e+03 \n",
"37 2.939865e-05 4.719445e+03 \n",
"93 1.342615e-05 4.555133e+03 \n",
"80 1.643546e-05 4.321499e+03 \n",
"64 2.013923e-05 4.313363e+03 \n",
"2 4.949158e-04 2.914714e+03 \n",
"4 4.192201e-04 2.301047e+03 \n",
"3 4.838045e-04 2.022100e+03 \n",
"33 1.138908e-04 1.317921e+03 \n",
"1 2.669027e-03 1.181516e+03 \n",
"69 1.164372e-04 7.027503e+02 \n",
"8 9.062654e-04 6.221254e+02 \n",
"87 1.115760e-04 5.870067e+02 \n",
"5 1.459516e-03 5.215921e+02 \n",
"29 3.347279e-04 4.950449e+02 \n",
"13 7.312624e-04 4.658524e+02 \n",
"24 4.090347e-04 4.516344e+02 \n",
"30 3.821824e-04 4.315120e+02 \n",
"27 4.261646e-04 4.129223e+02 \n",
"32 5.252404e-04 2.948374e+02 \n",
"35 5.502408e-04 2.639979e+02 \n",
"23 8.067267e-04 2.418636e+02 \n",
"85 2.814863e-04 2.404326e+02 \n",
"88 2.777825e-04 2.318954e+02 \n",
"79 3.284778e-04 2.198667e+02 \n",
"18 1.014138e-03 2.169679e+02 \n",
"44 6.298718e-04 2.021632e+02 \n",
"12 2.143323e-03 2.008594e+02 \n",
"67 5.101939e-04 1.661068e+02 \n",
"76 5.062586e-04 1.550376e+02 \n",
"25 1.185900e-03 1.547213e+02 \n",
"9 3.880853e-03 1.425006e+02 \n",
"70 5.877415e-04 1.373502e+02 \n",
"62 7.537165e-04 1.180805e+02 \n",
"38 1.263216e-03 1.083235e+02 \n",
"63 8.180695e-04 1.069797e+02 \n",
"17 2.087998e-03 1.059830e+02 \n",
"21 1.988460e-03 1.025827e+02 \n",
"72 8.018655e-04 9.969778e+01 \n",
"47 1.303494e-03 9.412250e+01 \n",
"15 3.415567e-03 8.246595e+01 \n",
"75 9.824241e-04 8.042875e+01 \n",
"89 9.314973e-04 6.842766e+01 \n",
"48 1.850957e-03 6.569863e+01 \n",
"7 9.549236e-03 5.951322e+01 \n",
"61 1.898412e-03 4.811695e+01 \n",
"82 1.527341e-03 4.553013e+01 \n",
"36 3.778768e-03 3.795454e+01 \n",
"40 3.550060e-03 3.724193e+01 \n",
"16 8.770751e-03 3.207241e+01 \n",
"39 4.370213e-03 3.109689e+01 \n",
"19 7.061926e-03 3.005519e+01 \n",
"6 2.383513e-02 2.752757e+01 \n",
"59 3.401215e-03 2.742421e+01 \n",
"52 4.376926e-03 2.526732e+01 \n",
"60 4.097523e-03 2.229793e+01 \n",
"57 4.637579e-03 2.160741e+01 \n",
"43 6.163994e-03 2.071637e+01 \n",
"46 6.082048e-03 2.041591e+01 \n",
"78 3.793814e-03 1.922927e+01 \n",
"90 3.736175e-03 1.695331e+01 \n",
"97 3.675757e-03 1.610572e+01 \n",
"91 4.521373e-03 1.395324e+01 \n",
"45 9.150155e-03 1.372151e+01 \n",
"49 9.649933e-03 1.192880e+01 \n",
"98 5.021382e-03 1.174687e+01 \n",
"31 1.427455e-02 1.110667e+01 \n",
"95 7.234614e-03 8.344925e+00 \n",
"51 1.359444e-02 8.150754e+00 \n",
"68 1.134533e-02 7.295121e+00 "
]
},
"execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tot.sort_values([\"ratio\", \"count_wiki\"], ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9d888605",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}