filtering_notebooks/blindspots.ipynb (2,462 lines of code) (raw):

{ "cells": [ { "cell_type": "code", "execution_count": 50, "id": "b25e0643", "metadata": {}, "outputs": [], "source": [ "import jsonlines\n", "import pandas as pd\n", "import pickle\n", "import pprint\n", "\n", "from collections import Counter\n", "from datasets import load_from_disk\n", "from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": 51, "id": "33879ae4", "metadata": {}, "outputs": [], "source": [ "pp = pprint.PrettyPrinter(indent=2)\n", "pd.set_option(\"display.max_rows\", 500)" ] }, { "cell_type": "code", "execution_count": 74, "id": "4bcbc961", "metadata": {}, "outputs": [], "source": [ "def get_domain(uri):\n", " return uri.split(\"/\")[2]\n", "\n", "\n", "def get_df(domains_count):\n", " domains = []\n", " counts = []\n", " for domain, count in domains_count.most_common():\n", " domains.append(domain)\n", " counts.append(count)\n", "\n", " total = sum(counts)\n", " ratio = [(100 * i) / total for i in counts]\n", " return pd.DataFrame.from_dict({\"domain\": domains, \"count\": counts, \"ratio\": ratio})" ] }, { "cell_type": "code", "execution_count": 75, "id": "3c7a42e7", "metadata": {}, "outputs": [], "source": [ "with open(\"/home/piktus_huggingface_co/1tt/data/wiki_domains.pkl\", \"rb\") as handle:\n", " wiki_domains = pickle.load(handle)" ] }, { "cell_type": "code", "execution_count": 97, "id": "931edb4c", "metadata": {}, "outputs": [], "source": [ "wiki_df = get_df(wiki_domains)\n", "wiki_df = wiki_df[:100]" ] }, { "cell_type": "code", "execution_count": 77, "id": "075cb7ec", "metadata": {}, "outputs": [], "source": [ "with open(\"/home/piktus_huggingface_co/1tt/data/oscar_domains.pkl\", \"rb\") as handle:\n", " oscar_domains = pickle.load(handle)" ] }, { "cell_type": "code", "execution_count": 78, "id": "6cc1109e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>domain</th>\n", " <th>count</th>\n", " <th>ratio</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>pubmed.ncbi.nlm.nih.gov</td>\n", " <td>164199</td>\n", " <td>0.038010</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>www.theguardian.com</td>\n", " <td>102966</td>\n", " <td>0.023835</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>unistore.www.microsoft.com</td>\n", " <td>67582</td>\n", " <td>0.015644</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>us.vestiairecollective.com</td>\n", " <td>64876</td>\n", " <td>0.015018</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>imgur.com</td>\n", " <td>62244</td>\n", " <td>0.014409</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>www.reuters.com</td>\n", " <td>61665</td>\n", " <td>0.014275</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>espas.secure.europarl.europa.eu</td>\n", " <td>60999</td>\n", " <td>0.014120</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>www.forbes.com</td>\n", " <td>58727</td>\n", " <td>0.013594</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " domain count ratio\n", "0 pubmed.ncbi.nlm.nih.gov 164199 0.038010\n", "1 www.theguardian.com 102966 0.023835\n", "2 unistore.www.microsoft.com 67582 0.015644\n", "3 us.vestiairecollective.com 64876 0.015018\n", "4 imgur.com 62244 0.014409\n", "5 www.reuters.com 61665 0.014275\n", "6 espas.secure.europarl.europa.eu 60999 0.014120\n", "7 www.forbes.com 58727 0.013594" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "oscar_df = get_df(oscar_domains)\n", "oscar_df[:8]" ] }, { "cell_type": "code", "execution_count": 98, "id": "4131dc3a", "metadata": {}, "outputs": [], "source": [ "both = wiki_df.join(\n", " oscar_df.set_index(\"domain\"),\n", " on=\"domain\",\n", " how=\"left\",\n", " lsuffix=\"_wiki\",\n", " rsuffix=\"_oscar\",\n", ").fillna(0)" ] }, { "cell_type": "code", "execution_count": 99, "id": "290bf0cd", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>domain</th>\n", " <th>count_wiki</th>\n", " <th>ratio_wiki</th>\n", " <th>count_oscar</th>\n", " <th>ratio_oscar</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0</td>\n", " <td>4321781</td>\n", " <td>14.761861</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>books.google.com</td>\n", " <td>923239</td>\n", " <td>3.153498</td>\n", " <td>11530.0</td>\n", " <td>2.669027e-03</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>archive.org</td>\n", " <td>422327</td>\n", " <td>1.442538</td>\n", " <td>2138.0</td>\n", " <td>4.949158e-04</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>www.nytimes.com</td>\n", " <td>286414</td>\n", " <td>0.978301</td>\n", " <td>2090.0</td>\n", " <td>4.838045e-04</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>www.ncbi.nlm.nih.gov</td>\n", " <td>282416</td>\n", " <td>0.964645</td>\n", " <td>1811.0</td>\n", " <td>4.192201e-04</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>www.bbc.co.uk</td>\n", " <td>222875</td>\n", " <td>0.761272</td>\n", " <td>6305.0</td>\n", " <td>1.459516e-03</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>www.theguardian.com</td>\n", " <td>192091</td>\n", " <td>0.656123</td>\n", " <td>102966.0</td>\n", " <td>2.383513e-02</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>news.bbc.co.uk</td>\n", " <td>166381</td>\n", " <td>0.568306</td>\n", " <td>41252.0</td>\n", " <td>9.549236e-03</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>www.billboard.com</td>\n", " <td>165065</td>\n", " <td>0.563811</td>\n", " <td>3915.0</td>\n", " <td>9.062654e-04</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>www.census.gov</td>\n", " <td>161907</td>\n", " <td>0.553024</td>\n", " <td>16765.0</td>\n", " <td>3.880853e-03</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>www.youtube.com</td>\n", " <td>149568</td>\n", " <td>0.510878</td>\n", " <td>35.0</td>\n", " <td>8.101990e-06</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>news.google.com</td>\n", " <td>127045</td>\n", " <td>0.433946</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>www.newspapers.com</td>\n", " <td>126038</td>\n", " <td>0.430507</td>\n", " <td>9259.0</td>\n", " <td>2.143323e-03</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>www.allmusic.com</td>\n", " <td>99734</td>\n", " <td>0.340660</td>\n", " <td>3159.0</td>\n", " <td>7.312624e-04</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>nla.gov.au</td>\n", " <td>84899</td>\n", " <td>0.289989</td>\n", " <td>151.0</td>\n", " <td>3.495430e-05</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>www.washingtonpost.com</td>\n", " <td>82463</td>\n", " <td>0.281668</td>\n", " <td>14755.0</td>\n", " <td>3.415567e-03</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>www.telegraph.co.uk</td>\n", " <td>82355</td>\n", " <td>0.281299</td>\n", " <td>37889.0</td>\n", " <td>8.770751e-03</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>www.espncricinfo.com</td>\n", " <td>64787</td>\n", " <td>0.221292</td>\n", " <td>9020.0</td>\n", " <td>2.087998e-03</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>www.imdb.com</td>\n", " <td>64419</td>\n", " <td>0.220035</td>\n", " <td>4381.0</td>\n", " <td>1.014138e-03</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>www.independent.co.uk</td>\n", " <td>62139</td>\n", " <td>0.212248</td>\n", " <td>30507.0</td>\n", " <td>7.061926e-03</td>\n", " </tr>\n", " <tr>\n", " <th>20</th>\n", " <td>www.sports-reference.com</td>\n", " <td>59964</td>\n", " <td>0.204818</td>\n", " <td>57.0</td>\n", " <td>1.319467e-05</td>\n", " </tr>\n", " <tr>\n", " <th>21</th>\n", " <td>itunes.apple.com</td>\n", " <td>59719</td>\n", " <td>0.203982</td>\n", " <td>8590.0</td>\n", " <td>1.988460e-03</td>\n", " </tr>\n", " <tr>\n", " <th>22</th>\n", " <td>tvbythenumbers.zap2it.com</td>\n", " <td>58393</td>\n", " <td>0.199452</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " </tr>\n", " <tr>\n", " <th>23</th>\n", " <td>timesofindia.indiatimes.com</td>\n", " <td>57124</td>\n", " <td>0.195118</td>\n", " <td>3485.0</td>\n", " <td>8.067267e-04</td>\n", " </tr>\n", " <tr>\n", " <th>24</th>\n", " <td>www.baseball-reference.com</td>\n", " <td>54084</td>\n", " <td>0.184734</td>\n", " <td>1767.0</td>\n", " <td>4.090347e-04</td>\n", " </tr>\n", " <tr>\n", " <th>25</th>\n", " <td>www.bbc.com</td>\n", " <td>53718</td>\n", " <td>0.183484</td>\n", " <td>5123.0</td>\n", " <td>1.185900e-03</td>\n", " </tr>\n", " <tr>\n", " <th>26</th>\n", " <td>www.stat.gov.pl</td>\n", " <td>51732</td>\n", " <td>0.176700</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " </tr>\n", " <tr>\n", " <th>27</th>\n", " <td>variety.com</td>\n", " <td>51519</td>\n", " <td>0.175973</td>\n", " <td>1841.0</td>\n", " <td>4.261646e-04</td>\n", " </tr>\n", " <tr>\n", " <th>28</th>\n", " <td>twitter.com</td>\n", " <td>48608</td>\n", " <td>0.166030</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " </tr>\n", " <tr>\n", " <th>29</th>\n", " <td>www.animenewsnetwork.com</td>\n", " <td>48513</td>\n", " <td>0.165705</td>\n", " <td>1446.0</td>\n", " <td>3.347279e-04</td>\n", " </tr>\n", " <tr>\n", " <th>30</th>\n", " <td>www.hollywoodreporter.com</td>\n", " <td>48282</td>\n", " <td>0.164916</td>\n", " <td>1651.0</td>\n", " <td>3.821824e-04</td>\n", " </tr>\n", " <tr>\n", " <th>31</th>\n", " <td>www.reuters.com</td>\n", " <td>46416</td>\n", " <td>0.158543</td>\n", " <td>61665.0</td>\n", " <td>1.427455e-02</td>\n", " </tr>\n", " <tr>\n", " <th>32</th>\n", " <td>www.thehindu.com</td>\n", " <td>45338</td>\n", " <td>0.154861</td>\n", " <td>2269.0</td>\n", " <td>5.252404e-04</td>\n", " </tr>\n", " <tr>\n", " <th>33</th>\n", " <td>cricketarchive.com</td>\n", " <td>43944</td>\n", " <td>0.150099</td>\n", " <td>492.0</td>\n", " <td>1.138908e-04</td>\n", " </tr>\n", " <tr>\n", " <th>34</th>\n", " <td>articles.latimes.com</td>\n", " <td>43628</td>\n", " <td>0.149020</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " </tr>\n", " <tr>\n", " <th>35</th>\n", " <td>www.discogs.com</td>\n", " <td>42528</td>\n", " <td>0.145262</td>\n", " <td>2377.0</td>\n", " <td>5.502408e-04</td>\n", " </tr>\n", " <tr>\n", " <th>36</th>\n", " <td>deadline.com</td>\n", " <td>41989</td>\n", " <td>0.143421</td>\n", " <td>16324.0</td>\n", " <td>3.778768e-03</td>\n", " </tr>\n", " <tr>\n", " <th>37</th>\n", " <td>www.officialcharts.com</td>\n", " <td>40620</td>\n", " <td>0.138745</td>\n", " <td>127.0</td>\n", " <td>2.939865e-05</td>\n", " </tr>\n", " <tr>\n", " <th>38</th>\n", " <td>www.metacritic.com</td>\n", " <td>40061</td>\n", " <td>0.136836</td>\n", " <td>5457.0</td>\n", " <td>1.263216e-03</td>\n", " </tr>\n", " <tr>\n", " <th>39</th>\n", " <td>www.abc.net.au</td>\n", " <td>39787</td>\n", " <td>0.135900</td>\n", " <td>18879.0</td>\n", " <td>4.370213e-03</td>\n", " </tr>\n", " <tr>\n", " <th>40</th>\n", " <td>books.google.co.uk</td>\n", " <td>38707</td>\n", " <td>0.132211</td>\n", " <td>15336.0</td>\n", " <td>3.550060e-03</td>\n", " </tr>\n", " <tr>\n", " <th>41</th>\n", " <td>web.archive.org</td>\n", " <td>38564</td>\n", " <td>0.131723</td>\n", " <td>1.0</td>\n", " <td>2.314854e-07</td>\n", " </tr>\n", " <tr>\n", " <th>42</th>\n", " <td>www.facebook.com</td>\n", " <td>37873</td>\n", " <td>0.129362</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " </tr>\n", " <tr>\n", " <th>43</th>\n", " <td>www.cbc.ca</td>\n", " <td>37385</td>\n", " <td>0.127696</td>\n", " <td>26628.0</td>\n", " <td>6.163994e-03</td>\n", " </tr>\n", " <tr>\n", " <th>44</th>\n", " <td>www.amazon.com</td>\n", " <td>37280</td>\n", " <td>0.127337</td>\n", " <td>2721.0</td>\n", " <td>6.298718e-04</td>\n", " </tr>\n", " <tr>\n", " <th>45</th>\n", " <td>www.espn.com</td>\n", " <td>36758</td>\n", " <td>0.125554</td>\n", " <td>39528.0</td>\n", " <td>9.150155e-03</td>\n", " </tr>\n", " <tr>\n", " <th>46</th>\n", " <td>www.latimes.com</td>\n", " <td>36353</td>\n", " <td>0.124171</td>\n", " <td>26274.0</td>\n", " <td>6.082048e-03</td>\n", " </tr>\n", " <tr>\n", " <th>47</th>\n", " <td>www.usatoday.com</td>\n", " <td>35919</td>\n", " <td>0.122688</td>\n", " <td>5631.0</td>\n", " <td>1.303494e-03</td>\n", " </tr>\n", " <tr>\n", " <th>48</th>\n", " <td>www.rollingstone.com</td>\n", " <td>35602</td>\n", " <td>0.121605</td>\n", " <td>7996.0</td>\n", " <td>1.850957e-03</td>\n", " </tr>\n", " <tr>\n", " <th>49</th>\n", " <td>www.smh.com.au</td>\n", " <td>33701</td>\n", " <td>0.115112</td>\n", " <td>41687.0</td>\n", " <td>9.649933e-03</td>\n", " </tr>\n", " <tr>\n", " <th>50</th>\n", " <td>int.soccerway.com</td>\n", " <td>32654</td>\n", " <td>0.111536</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " </tr>\n", " <tr>\n", " <th>51</th>\n", " <td>www.forbes.com</td>\n", " <td>32440</td>\n", " <td>0.110805</td>\n", " <td>58727.0</td>\n", " <td>1.359444e-02</td>\n", " </tr>\n", " <tr>\n", " <th>52</th>\n", " <td>www.cnn.com</td>\n", " <td>32378</td>\n", " <td>0.110593</td>\n", " <td>18908.0</td>\n", " <td>4.376926e-03</td>\n", " </tr>\n", " <tr>\n", " <th>53</th>\n", " <td>espn.go.com</td>\n", " <td>31913</td>\n", " <td>0.109005</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " </tr>\n", " <tr>\n", " <th>54</th>\n", " <td>www.huffingtonpost.com</td>\n", " <td>30221</td>\n", " <td>0.103226</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " </tr>\n", " <tr>\n", " <th>55</th>\n", " <td>www.pro-football-reference.com</td>\n", " <td>30088</td>\n", " <td>0.102771</td>\n", " <td>34.0</td>\n", " <td>7.870504e-06</td>\n", " </tr>\n", " <tr>\n", " <th>56</th>\n", " <td>www.bloomberg.com</td>\n", " <td>30028</td>\n", " <td>0.102566</td>\n", " <td>1.0</td>\n", " <td>2.314854e-07</td>\n", " </tr>\n", " <tr>\n", " <th>57</th>\n", " <td>www.mtv.com</td>\n", " <td>29337</td>\n", " <td>0.100206</td>\n", " <td>20034.0</td>\n", " <td>4.637579e-03</td>\n", " </tr>\n", " <tr>\n", " <th>58</th>\n", " <td>babel.hathitrust.org</td>\n", " <td>28413</td>\n", " <td>0.097050</td>\n", " <td>1.0</td>\n", " <td>2.314854e-07</td>\n", " </tr>\n", " <tr>\n", " <th>59</th>\n", " <td>www.nba.com</td>\n", " <td>27308</td>\n", " <td>0.093276</td>\n", " <td>14693.0</td>\n", " <td>3.401215e-03</td>\n", " </tr>\n", " <tr>\n", " <th>60</th>\n", " <td>pitchfork.com</td>\n", " <td>26749</td>\n", " <td>0.091366</td>\n", " <td>17701.0</td>\n", " <td>4.097523e-03</td>\n", " </tr>\n", " <tr>\n", " <th>61</th>\n", " <td>www.ign.com</td>\n", " <td>26743</td>\n", " <td>0.091346</td>\n", " <td>8201.0</td>\n", " <td>1.898412e-03</td>\n", " </tr>\n", " <tr>\n", " <th>62</th>\n", " <td>www.itis.gov</td>\n", " <td>26056</td>\n", " <td>0.088999</td>\n", " <td>3256.0</td>\n", " <td>7.537165e-04</td>\n", " </tr>\n", " <tr>\n", " <th>63</th>\n", " <td>www.wsj.com</td>\n", " <td>25622</td>\n", " <td>0.087517</td>\n", " <td>3534.0</td>\n", " <td>8.180695e-04</td>\n", " </tr>\n", " <tr>\n", " <th>64</th>\n", " <td>www.gbif.org</td>\n", " <td>25432</td>\n", " <td>0.086868</td>\n", " <td>87.0</td>\n", " <td>2.013923e-05</td>\n", " </tr>\n", " <tr>\n", " <th>65</th>\n", " <td>geonames.usgs.gov</td>\n", " <td>24953</td>\n", " <td>0.085232</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " </tr>\n", " <tr>\n", " <th>66</th>\n", " <td>www.showbuzzdaily.com</td>\n", " <td>24882</td>\n", " <td>0.084989</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " </tr>\n", " <tr>\n", " <th>67</th>\n", " <td>www.uefa.com</td>\n", " <td>24811</td>\n", " <td>0.084747</td>\n", " <td>2204.0</td>\n", " <td>5.101939e-04</td>\n", " </tr>\n", " <tr>\n", " <th>68</th>\n", " <td>www.npr.org</td>\n", " <td>24231</td>\n", " <td>0.082766</td>\n", " <td>49011.0</td>\n", " <td>1.134533e-02</td>\n", " </tr>\n", " <tr>\n", " <th>69</th>\n", " <td>bugguide.net</td>\n", " <td>23956</td>\n", " <td>0.081826</td>\n", " <td>503.0</td>\n", " <td>1.164372e-04</td>\n", " </tr>\n", " <tr>\n", " <th>70</th>\n", " <td>www.gamespot.com</td>\n", " <td>23634</td>\n", " <td>0.080726</td>\n", " <td>2539.0</td>\n", " <td>5.877415e-04</td>\n", " </tr>\n", " <tr>\n", " <th>71</th>\n", " <td>gaonchart.co.kr</td>\n", " <td>23462</td>\n", " <td>0.080139</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " </tr>\n", " <tr>\n", " <th>72</th>\n", " <td>www.nhl.com</td>\n", " <td>23405</td>\n", " <td>0.079944</td>\n", " <td>3464.0</td>\n", " <td>8.018655e-04</td>\n", " </tr>\n", " <tr>\n", " <th>73</th>\n", " <td>www.oricon.co.jp</td>\n", " <td>23205</td>\n", " <td>0.079261</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " </tr>\n", " <tr>\n", " <th>74</th>\n", " <td>www.basketball-reference.com</td>\n", " <td>23185</td>\n", " <td>0.079193</td>\n", " <td>15.0</td>\n", " <td>3.472281e-06</td>\n", " </tr>\n", " <tr>\n", " <th>75</th>\n", " <td>www.rottentomatoes.com</td>\n", " <td>23133</td>\n", " <td>0.079015</td>\n", " <td>4244.0</td>\n", " <td>9.824241e-04</td>\n", " </tr>\n", " <tr>\n", " <th>76</th>\n", " <td>www.rsssf.com</td>\n", " <td>22979</td>\n", " <td>0.078489</td>\n", " <td>2187.0</td>\n", " <td>5.062586e-04</td>\n", " </tr>\n", " <tr>\n", " <th>77</th>\n", " <td>timesmachine.nytimes.com</td>\n", " <td>22558</td>\n", " <td>0.077051</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " </tr>\n", " <tr>\n", " <th>78</th>\n", " <td>www.nfl.com</td>\n", " <td>21358</td>\n", " <td>0.072952</td>\n", " <td>16389.0</td>\n", " <td>3.793814e-03</td>\n", " </tr>\n", " <tr>\n", " <th>79</th>\n", " <td>www.irishtimes.com</td>\n", " <td>21144</td>\n", " <td>0.072221</td>\n", " <td>1419.0</td>\n", " <td>3.284778e-04</td>\n", " </tr>\n", " <tr>\n", " <th>80</th>\n", " <td>www.stuff.co.nz</td>\n", " <td>20794</td>\n", " <td>0.071026</td>\n", " <td>71.0</td>\n", " <td>1.643546e-05</td>\n", " </tr>\n", " <tr>\n", " <th>81</th>\n", " <td>www2.census.gov</td>\n", " <td>20790</td>\n", " <td>0.071012</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " </tr>\n", " <tr>\n", " <th>82</th>\n", " <td>www.bizjournals.com</td>\n", " <td>20359</td>\n", " <td>0.069540</td>\n", " <td>6598.0</td>\n", " <td>1.527341e-03</td>\n", " </tr>\n", " <tr>\n", " <th>83</th>\n", " <td>www.digitalspy.co.uk</td>\n", " <td>20143</td>\n", " <td>0.068802</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " </tr>\n", " <tr>\n", " <th>84</th>\n", " <td>www.highbeam.com</td>\n", " <td>19882</td>\n", " <td>0.067911</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " </tr>\n", " <tr>\n", " <th>85</th>\n", " <td>www.boxofficemojo.com</td>\n", " <td>19814</td>\n", " <td>0.067678</td>\n", " <td>1216.0</td>\n", " <td>2.814863e-04</td>\n", " </tr>\n", " <tr>\n", " <th>86</th>\n", " <td>www.kicker.de</td>\n", " <td>19362</td>\n", " <td>0.066135</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " </tr>\n", " <tr>\n", " <th>87</th>\n", " <td>paperspast.natlib.govt.nz</td>\n", " <td>19175</td>\n", " <td>0.065496</td>\n", " <td>482.0</td>\n", " <td>1.115760e-04</td>\n", " </tr>\n", " <tr>\n", " <th>88</th>\n", " <td>www.wwe.com</td>\n", " <td>18859</td>\n", " <td>0.064416</td>\n", " <td>1200.0</td>\n", " <td>2.777825e-04</td>\n", " </tr>\n", " <tr>\n", " <th>89</th>\n", " <td>trove.nla.gov.au</td>\n", " <td>18661</td>\n", " <td>0.063740</td>\n", " <td>4024.0</td>\n", " <td>9.314973e-04</td>\n", " </tr>\n", " <tr>\n", " <th>90</th>\n", " <td>www.independent.ie</td>\n", " <td>18544</td>\n", " <td>0.063341</td>\n", " <td>16140.0</td>\n", " <td>3.736175e-03</td>\n", " </tr>\n", " <tr>\n", " <th>91</th>\n", " <td>www.nzherald.co.nz</td>\n", " <td>18470</td>\n", " <td>0.063088</td>\n", " <td>19532.0</td>\n", " <td>4.521373e-03</td>\n", " </tr>\n", " <tr>\n", " <th>92</th>\n", " <td>www.researchgate.net</td>\n", " <td>18352</td>\n", " <td>0.062685</td>\n", " <td>34.0</td>\n", " <td>7.870504e-06</td>\n", " </tr>\n", " <tr>\n", " <th>93</th>\n", " <td>www.britishnewspaperarchive.co.uk</td>\n", " <td>17905</td>\n", " <td>0.061158</td>\n", " <td>58.0</td>\n", " <td>1.342615e-05</td>\n", " </tr>\n", " <tr>\n", " <th>94</th>\n", " <td>query.nytimes.com</td>\n", " <td>17876</td>\n", " <td>0.061059</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " </tr>\n", " <tr>\n", " <th>95</th>\n", " <td>www.britannica.com</td>\n", " <td>17675</td>\n", " <td>0.060372</td>\n", " <td>31253.0</td>\n", " <td>7.234614e-03</td>\n", " </tr>\n", " <tr>\n", " <th>96</th>\n", " <td>sports.espn.go.com</td>\n", " <td>17661</td>\n", " <td>0.060324</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " </tr>\n", " <tr>\n", " <th>97</th>\n", " <td>www.nydailynews.com</td>\n", " <td>17332</td>\n", " <td>0.059201</td>\n", " <td>15879.0</td>\n", " <td>3.675757e-03</td>\n", " </tr>\n", " <tr>\n", " <th>98</th>\n", " <td>www.sfgate.com</td>\n", " <td>17269</td>\n", " <td>0.058986</td>\n", " <td>21692.0</td>\n", " <td>5.021382e-03</td>\n", " </tr>\n", " <tr>\n", " <th>99</th>\n", " <td>factfinder.census.gov</td>\n", " <td>17213</td>\n", " <td>0.058794</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " domain count_wiki ratio_wiki count_oscar \\\n", "0 0 4321781 14.761861 0.0 \n", "1 books.google.com 923239 3.153498 11530.0 \n", "2 archive.org 422327 1.442538 2138.0 \n", "3 www.nytimes.com 286414 0.978301 2090.0 \n", "4 www.ncbi.nlm.nih.gov 282416 0.964645 1811.0 \n", "5 www.bbc.co.uk 222875 0.761272 6305.0 \n", "6 www.theguardian.com 192091 0.656123 102966.0 \n", "7 news.bbc.co.uk 166381 0.568306 41252.0 \n", "8 www.billboard.com 165065 0.563811 3915.0 \n", "9 www.census.gov 161907 0.553024 16765.0 \n", "10 www.youtube.com 149568 0.510878 35.0 \n", "11 news.google.com 127045 0.433946 0.0 \n", "12 www.newspapers.com 126038 0.430507 9259.0 \n", "13 www.allmusic.com 99734 0.340660 3159.0 \n", "14 nla.gov.au 84899 0.289989 151.0 \n", "15 www.washingtonpost.com 82463 0.281668 14755.0 \n", "16 www.telegraph.co.uk 82355 0.281299 37889.0 \n", "17 www.espncricinfo.com 64787 0.221292 9020.0 \n", "18 www.imdb.com 64419 0.220035 4381.0 \n", "19 www.independent.co.uk 62139 0.212248 30507.0 \n", "20 www.sports-reference.com 59964 0.204818 57.0 \n", "21 itunes.apple.com 59719 0.203982 8590.0 \n", "22 tvbythenumbers.zap2it.com 58393 0.199452 0.0 \n", "23 timesofindia.indiatimes.com 57124 0.195118 3485.0 \n", "24 www.baseball-reference.com 54084 0.184734 1767.0 \n", "25 www.bbc.com 53718 0.183484 5123.0 \n", "26 www.stat.gov.pl 51732 0.176700 0.0 \n", "27 variety.com 51519 0.175973 1841.0 \n", "28 twitter.com 48608 0.166030 0.0 \n", "29 www.animenewsnetwork.com 48513 0.165705 1446.0 \n", "30 www.hollywoodreporter.com 48282 0.164916 1651.0 \n", "31 www.reuters.com 46416 0.158543 61665.0 \n", "32 www.thehindu.com 45338 0.154861 2269.0 \n", "33 cricketarchive.com 43944 0.150099 492.0 \n", "34 articles.latimes.com 43628 0.149020 0.0 \n", "35 www.discogs.com 42528 0.145262 2377.0 \n", "36 deadline.com 41989 0.143421 16324.0 \n", "37 www.officialcharts.com 40620 0.138745 127.0 \n", "38 www.metacritic.com 40061 0.136836 5457.0 \n", "39 www.abc.net.au 39787 0.135900 18879.0 \n", "40 books.google.co.uk 38707 0.132211 15336.0 \n", "41 web.archive.org 38564 0.131723 1.0 \n", "42 www.facebook.com 37873 0.129362 0.0 \n", "43 www.cbc.ca 37385 0.127696 26628.0 \n", "44 www.amazon.com 37280 0.127337 2721.0 \n", "45 www.espn.com 36758 0.125554 39528.0 \n", "46 www.latimes.com 36353 0.124171 26274.0 \n", "47 www.usatoday.com 35919 0.122688 5631.0 \n", "48 www.rollingstone.com 35602 0.121605 7996.0 \n", "49 www.smh.com.au 33701 0.115112 41687.0 \n", "50 int.soccerway.com 32654 0.111536 0.0 \n", "51 www.forbes.com 32440 0.110805 58727.0 \n", "52 www.cnn.com 32378 0.110593 18908.0 \n", "53 espn.go.com 31913 0.109005 0.0 \n", "54 www.huffingtonpost.com 30221 0.103226 0.0 \n", "55 www.pro-football-reference.com 30088 0.102771 34.0 \n", "56 www.bloomberg.com 30028 0.102566 1.0 \n", "57 www.mtv.com 29337 0.100206 20034.0 \n", "58 babel.hathitrust.org 28413 0.097050 1.0 \n", "59 www.nba.com 27308 0.093276 14693.0 \n", "60 pitchfork.com 26749 0.091366 17701.0 \n", "61 www.ign.com 26743 0.091346 8201.0 \n", "62 www.itis.gov 26056 0.088999 3256.0 \n", "63 www.wsj.com 25622 0.087517 3534.0 \n", "64 www.gbif.org 25432 0.086868 87.0 \n", "65 geonames.usgs.gov 24953 0.085232 0.0 \n", "66 www.showbuzzdaily.com 24882 0.084989 0.0 \n", "67 www.uefa.com 24811 0.084747 2204.0 \n", "68 www.npr.org 24231 0.082766 49011.0 \n", "69 bugguide.net 23956 0.081826 503.0 \n", "70 www.gamespot.com 23634 0.080726 2539.0 \n", "71 gaonchart.co.kr 23462 0.080139 0.0 \n", "72 www.nhl.com 23405 0.079944 3464.0 \n", "73 www.oricon.co.jp 23205 0.079261 0.0 \n", "74 www.basketball-reference.com 23185 0.079193 15.0 \n", "75 www.rottentomatoes.com 23133 0.079015 4244.0 \n", "76 www.rsssf.com 22979 0.078489 2187.0 \n", "77 timesmachine.nytimes.com 22558 0.077051 0.0 \n", "78 www.nfl.com 21358 0.072952 16389.0 \n", "79 www.irishtimes.com 21144 0.072221 1419.0 \n", "80 www.stuff.co.nz 20794 0.071026 71.0 \n", "81 www2.census.gov 20790 0.071012 0.0 \n", "82 www.bizjournals.com 20359 0.069540 6598.0 \n", "83 www.digitalspy.co.uk 20143 0.068802 0.0 \n", "84 www.highbeam.com 19882 0.067911 0.0 \n", "85 www.boxofficemojo.com 19814 0.067678 1216.0 \n", "86 www.kicker.de 19362 0.066135 0.0 \n", "87 paperspast.natlib.govt.nz 19175 0.065496 482.0 \n", "88 www.wwe.com 18859 0.064416 1200.0 \n", "89 trove.nla.gov.au 18661 0.063740 4024.0 \n", "90 www.independent.ie 18544 0.063341 16140.0 \n", "91 www.nzherald.co.nz 18470 0.063088 19532.0 \n", "92 www.researchgate.net 18352 0.062685 34.0 \n", "93 www.britishnewspaperarchive.co.uk 17905 0.061158 58.0 \n", "94 query.nytimes.com 17876 0.061059 0.0 \n", "95 www.britannica.com 17675 0.060372 31253.0 \n", "96 sports.espn.go.com 17661 0.060324 0.0 \n", "97 www.nydailynews.com 17332 0.059201 15879.0 \n", "98 www.sfgate.com 17269 0.058986 21692.0 \n", "99 factfinder.census.gov 17213 0.058794 0.0 \n", "\n", " ratio_oscar \n", "0 0.000000e+00 \n", "1 2.669027e-03 \n", "2 4.949158e-04 \n", "3 4.838045e-04 \n", "4 4.192201e-04 \n", "5 1.459516e-03 \n", "6 2.383513e-02 \n", "7 9.549236e-03 \n", "8 9.062654e-04 \n", "9 3.880853e-03 \n", "10 8.101990e-06 \n", "11 0.000000e+00 \n", "12 2.143323e-03 \n", "13 7.312624e-04 \n", "14 3.495430e-05 \n", "15 3.415567e-03 \n", "16 8.770751e-03 \n", "17 2.087998e-03 \n", "18 1.014138e-03 \n", "19 7.061926e-03 \n", "20 1.319467e-05 \n", "21 1.988460e-03 \n", "22 0.000000e+00 \n", "23 8.067267e-04 \n", "24 4.090347e-04 \n", "25 1.185900e-03 \n", "26 0.000000e+00 \n", "27 4.261646e-04 \n", "28 0.000000e+00 \n", "29 3.347279e-04 \n", "30 3.821824e-04 \n", "31 1.427455e-02 \n", "32 5.252404e-04 \n", "33 1.138908e-04 \n", "34 0.000000e+00 \n", "35 5.502408e-04 \n", "36 3.778768e-03 \n", "37 2.939865e-05 \n", "38 1.263216e-03 \n", "39 4.370213e-03 \n", "40 3.550060e-03 \n", "41 2.314854e-07 \n", "42 0.000000e+00 \n", "43 6.163994e-03 \n", "44 6.298718e-04 \n", "45 9.150155e-03 \n", "46 6.082048e-03 \n", "47 1.303494e-03 \n", "48 1.850957e-03 \n", "49 9.649933e-03 \n", "50 0.000000e+00 \n", "51 1.359444e-02 \n", "52 4.376926e-03 \n", "53 0.000000e+00 \n", "54 0.000000e+00 \n", "55 7.870504e-06 \n", "56 2.314854e-07 \n", "57 4.637579e-03 \n", "58 2.314854e-07 \n", "59 3.401215e-03 \n", "60 4.097523e-03 \n", "61 1.898412e-03 \n", "62 7.537165e-04 \n", "63 8.180695e-04 \n", "64 2.013923e-05 \n", "65 0.000000e+00 \n", "66 0.000000e+00 \n", "67 5.101939e-04 \n", "68 1.134533e-02 \n", "69 1.164372e-04 \n", "70 5.877415e-04 \n", "71 0.000000e+00 \n", "72 8.018655e-04 \n", "73 0.000000e+00 \n", "74 3.472281e-06 \n", "75 9.824241e-04 \n", "76 5.062586e-04 \n", "77 0.000000e+00 \n", "78 3.793814e-03 \n", "79 3.284778e-04 \n", "80 1.643546e-05 \n", "81 0.000000e+00 \n", "82 1.527341e-03 \n", "83 0.000000e+00 \n", "84 0.000000e+00 \n", "85 2.814863e-04 \n", "86 0.000000e+00 \n", "87 1.115760e-04 \n", "88 2.777825e-04 \n", "89 9.314973e-04 \n", "90 3.736175e-03 \n", "91 4.521373e-03 \n", "92 7.870504e-06 \n", "93 1.342615e-05 \n", "94 0.000000e+00 \n", "95 7.234614e-03 \n", "96 0.000000e+00 \n", "97 3.675757e-03 \n", "98 5.021382e-03 \n", "99 0.000000e+00 " ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "both" ] }, { "cell_type": "code", "execution_count": 100, "id": "06bab2c3", "metadata": {}, "outputs": [], "source": [ "MAX = 13131313\n", "tot[\"ratio\"] = tot.apply(\n", " lambda row: MAX if row[\"ratio_oscar\"] == 0 else row[\"ratio_wiki\"] / row[\"ratio_oscar\"], axis=1\n", ")" ] }, { "cell_type": "code", "execution_count": 103, "id": "415da410", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>domain</th>\n", " <th>count_wiki</th>\n", " <th>ratio_wiki</th>\n", " <th>count_oscar</th>\n", " <th>ratio_oscar</th>\n", " <th>ratio</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0</td>\n", " <td>4321781</td>\n", " <td>14.761861</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " <td>1.313131e+07</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>news.google.com</td>\n", " <td>127045</td>\n", " <td>0.433946</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " <td>1.313131e+07</td>\n", " </tr>\n", " <tr>\n", " <th>22</th>\n", " <td>tvbythenumbers.zap2it.com</td>\n", " <td>58393</td>\n", " <td>0.199452</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " <td>1.313131e+07</td>\n", " </tr>\n", " <tr>\n", " <th>26</th>\n", " <td>www.stat.gov.pl</td>\n", " <td>51732</td>\n", " <td>0.176700</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " <td>1.313131e+07</td>\n", " </tr>\n", " <tr>\n", " <th>28</th>\n", " <td>twitter.com</td>\n", " <td>48608</td>\n", " <td>0.166030</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " <td>1.313131e+07</td>\n", " </tr>\n", " <tr>\n", " <th>34</th>\n", " <td>articles.latimes.com</td>\n", " <td>43628</td>\n", " <td>0.149020</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " <td>1.313131e+07</td>\n", " </tr>\n", " <tr>\n", " <th>42</th>\n", " <td>www.facebook.com</td>\n", " <td>37873</td>\n", " <td>0.129362</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " <td>1.313131e+07</td>\n", " </tr>\n", " <tr>\n", " <th>50</th>\n", " <td>int.soccerway.com</td>\n", " <td>32654</td>\n", " <td>0.111536</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " <td>1.313131e+07</td>\n", " </tr>\n", " <tr>\n", " <th>53</th>\n", " <td>espn.go.com</td>\n", " <td>31913</td>\n", " <td>0.109005</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " <td>1.313131e+07</td>\n", " </tr>\n", " <tr>\n", " <th>54</th>\n", " <td>www.huffingtonpost.com</td>\n", " <td>30221</td>\n", " <td>0.103226</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " <td>1.313131e+07</td>\n", " </tr>\n", " <tr>\n", " <th>65</th>\n", " <td>geonames.usgs.gov</td>\n", " <td>24953</td>\n", " <td>0.085232</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " <td>1.313131e+07</td>\n", " </tr>\n", " <tr>\n", " <th>66</th>\n", " <td>www.showbuzzdaily.com</td>\n", " <td>24882</td>\n", " <td>0.084989</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " <td>1.313131e+07</td>\n", " </tr>\n", " <tr>\n", " <th>71</th>\n", " <td>gaonchart.co.kr</td>\n", " <td>23462</td>\n", " <td>0.080139</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " <td>1.313131e+07</td>\n", " </tr>\n", " <tr>\n", " <th>73</th>\n", " <td>www.oricon.co.jp</td>\n", " <td>23205</td>\n", " <td>0.079261</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " <td>1.313131e+07</td>\n", " </tr>\n", " <tr>\n", " <th>77</th>\n", " <td>timesmachine.nytimes.com</td>\n", " <td>22558</td>\n", " <td>0.077051</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " <td>1.313131e+07</td>\n", " </tr>\n", " <tr>\n", " <th>81</th>\n", " <td>www2.census.gov</td>\n", " <td>20790</td>\n", " <td>0.071012</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " <td>1.313131e+07</td>\n", " </tr>\n", " <tr>\n", " <th>83</th>\n", " <td>www.digitalspy.co.uk</td>\n", " <td>20143</td>\n", " <td>0.068802</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " <td>1.313131e+07</td>\n", " </tr>\n", " <tr>\n", " <th>84</th>\n", " <td>www.highbeam.com</td>\n", " <td>19882</td>\n", " <td>0.067911</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " <td>1.313131e+07</td>\n", " </tr>\n", " <tr>\n", " <th>86</th>\n", " <td>www.kicker.de</td>\n", " <td>19362</td>\n", " <td>0.066135</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " <td>1.313131e+07</td>\n", " </tr>\n", " <tr>\n", " <th>94</th>\n", " <td>query.nytimes.com</td>\n", " <td>17876</td>\n", " <td>0.061059</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " <td>1.313131e+07</td>\n", " </tr>\n", " <tr>\n", " <th>96</th>\n", " <td>sports.espn.go.com</td>\n", " <td>17661</td>\n", " <td>0.060324</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " <td>1.313131e+07</td>\n", " </tr>\n", " <tr>\n", " <th>99</th>\n", " <td>factfinder.census.gov</td>\n", " <td>17213</td>\n", " <td>0.058794</td>\n", " <td>0.0</td>\n", " <td>0.000000e+00</td>\n", " <td>1.313131e+07</td>\n", " </tr>\n", " <tr>\n", " <th>41</th>\n", " <td>web.archive.org</td>\n", " <td>38564</td>\n", " <td>0.131723</td>\n", " <td>1.0</td>\n", " <td>2.314854e-07</td>\n", " <td>5.690322e+05</td>\n", " </tr>\n", " <tr>\n", " <th>56</th>\n", " <td>www.bloomberg.com</td>\n", " <td>30028</td>\n", " <td>0.102566</td>\n", " <td>1.0</td>\n", " <td>2.314854e-07</td>\n", " <td>4.430790e+05</td>\n", " </tr>\n", " <tr>\n", " <th>58</th>\n", " <td>babel.hathitrust.org</td>\n", " <td>28413</td>\n", " <td>0.097050</td>\n", " <td>1.0</td>\n", " <td>2.314854e-07</td>\n", " <td>4.192488e+05</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>www.youtube.com</td>\n", " <td>149568</td>\n", " <td>0.510878</td>\n", " <td>35.0</td>\n", " <td>8.101990e-06</td>\n", " <td>6.305585e+04</td>\n", " </tr>\n", " <tr>\n", " <th>74</th>\n", " <td>www.basketball-reference.com</td>\n", " <td>23185</td>\n", " <td>0.079193</td>\n", " <td>15.0</td>\n", " <td>3.472281e-06</td>\n", " <td>2.280713e+04</td>\n", " </tr>\n", " <tr>\n", " <th>20</th>\n", " <td>www.sports-reference.com</td>\n", " <td>59964</td>\n", " <td>0.204818</td>\n", " <td>57.0</td>\n", " <td>1.319467e-05</td>\n", " <td>1.552281e+04</td>\n", " </tr>\n", " <tr>\n", " <th>55</th>\n", " <td>www.pro-football-reference.com</td>\n", " <td>30088</td>\n", " <td>0.102771</td>\n", " <td>34.0</td>\n", " <td>7.870504e-06</td>\n", " <td>1.305777e+04</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>nla.gov.au</td>\n", " <td>84899</td>\n", " <td>0.289989</td>\n", " <td>151.0</td>\n", " <td>3.495430e-05</td>\n", " <td>8.296222e+03</td>\n", " </tr>\n", " <tr>\n", " <th>92</th>\n", " <td>www.researchgate.net</td>\n", " <td>18352</td>\n", " <td>0.062685</td>\n", " <td>34.0</td>\n", " <td>7.870504e-06</td>\n", " <td>7.964513e+03</td>\n", " </tr>\n", " <tr>\n", " <th>37</th>\n", " <td>www.officialcharts.com</td>\n", " <td>40620</td>\n", " <td>0.138745</td>\n", " <td>127.0</td>\n", " <td>2.939865e-05</td>\n", " <td>4.719445e+03</td>\n", " </tr>\n", " <tr>\n", " <th>93</th>\n", " <td>www.britishnewspaperarchive.co.uk</td>\n", " <td>17905</td>\n", " <td>0.061158</td>\n", " <td>58.0</td>\n", " <td>1.342615e-05</td>\n", " <td>4.555133e+03</td>\n", " </tr>\n", " <tr>\n", " <th>80</th>\n", " <td>www.stuff.co.nz</td>\n", " <td>20794</td>\n", " <td>0.071026</td>\n", " <td>71.0</td>\n", " <td>1.643546e-05</td>\n", " <td>4.321499e+03</td>\n", " </tr>\n", " <tr>\n", " <th>64</th>\n", " <td>www.gbif.org</td>\n", " <td>25432</td>\n", " <td>0.086868</td>\n", " <td>87.0</td>\n", " <td>2.013923e-05</td>\n", " <td>4.313363e+03</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>archive.org</td>\n", " <td>422327</td>\n", " <td>1.442538</td>\n", " <td>2138.0</td>\n", " <td>4.949158e-04</td>\n", " <td>2.914714e+03</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>www.ncbi.nlm.nih.gov</td>\n", " <td>282416</td>\n", " <td>0.964645</td>\n", " <td>1811.0</td>\n", " <td>4.192201e-04</td>\n", " <td>2.301047e+03</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>www.nytimes.com</td>\n", " <td>286414</td>\n", " <td>0.978301</td>\n", " <td>2090.0</td>\n", " <td>4.838045e-04</td>\n", " <td>2.022100e+03</td>\n", " </tr>\n", " <tr>\n", " <th>33</th>\n", " <td>cricketarchive.com</td>\n", " <td>43944</td>\n", " <td>0.150099</td>\n", " <td>492.0</td>\n", " <td>1.138908e-04</td>\n", " <td>1.317921e+03</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>books.google.com</td>\n", " <td>923239</td>\n", " <td>3.153498</td>\n", " <td>11530.0</td>\n", " <td>2.669027e-03</td>\n", " <td>1.181516e+03</td>\n", " </tr>\n", " <tr>\n", " <th>69</th>\n", " <td>bugguide.net</td>\n", " <td>23956</td>\n", " <td>0.081826</td>\n", " <td>503.0</td>\n", " <td>1.164372e-04</td>\n", " <td>7.027503e+02</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>www.billboard.com</td>\n", " <td>165065</td>\n", " <td>0.563811</td>\n", " <td>3915.0</td>\n", " <td>9.062654e-04</td>\n", " <td>6.221254e+02</td>\n", " </tr>\n", " <tr>\n", " <th>87</th>\n", " <td>paperspast.natlib.govt.nz</td>\n", " <td>19175</td>\n", " <td>0.065496</td>\n", " <td>482.0</td>\n", " <td>1.115760e-04</td>\n", " <td>5.870067e+02</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>www.bbc.co.uk</td>\n", " <td>222875</td>\n", " <td>0.761272</td>\n", " <td>6305.0</td>\n", " <td>1.459516e-03</td>\n", " <td>5.215921e+02</td>\n", " </tr>\n", " <tr>\n", " <th>29</th>\n", " <td>www.animenewsnetwork.com</td>\n", " <td>48513</td>\n", " <td>0.165705</td>\n", " <td>1446.0</td>\n", " <td>3.347279e-04</td>\n", " <td>4.950449e+02</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>www.allmusic.com</td>\n", " <td>99734</td>\n", " <td>0.340660</td>\n", " <td>3159.0</td>\n", " <td>7.312624e-04</td>\n", " <td>4.658524e+02</td>\n", " </tr>\n", " <tr>\n", " <th>24</th>\n", " <td>www.baseball-reference.com</td>\n", " <td>54084</td>\n", " <td>0.184734</td>\n", " <td>1767.0</td>\n", " <td>4.090347e-04</td>\n", " <td>4.516344e+02</td>\n", " </tr>\n", " <tr>\n", " <th>30</th>\n", " <td>www.hollywoodreporter.com</td>\n", " <td>48282</td>\n", " <td>0.164916</td>\n", " <td>1651.0</td>\n", " <td>3.821824e-04</td>\n", " <td>4.315120e+02</td>\n", " </tr>\n", " <tr>\n", " <th>27</th>\n", " <td>variety.com</td>\n", " <td>51519</td>\n", " <td>0.175973</td>\n", " <td>1841.0</td>\n", " <td>4.261646e-04</td>\n", " <td>4.129223e+02</td>\n", " </tr>\n", " <tr>\n", " <th>32</th>\n", " <td>www.thehindu.com</td>\n", " <td>45338</td>\n", " <td>0.154861</td>\n", " <td>2269.0</td>\n", " <td>5.252404e-04</td>\n", " <td>2.948374e+02</td>\n", " </tr>\n", " <tr>\n", " <th>35</th>\n", " <td>www.discogs.com</td>\n", " <td>42528</td>\n", " <td>0.145262</td>\n", " <td>2377.0</td>\n", " <td>5.502408e-04</td>\n", " <td>2.639979e+02</td>\n", " </tr>\n", " <tr>\n", " <th>23</th>\n", " <td>timesofindia.indiatimes.com</td>\n", " <td>57124</td>\n", " <td>0.195118</td>\n", " <td>3485.0</td>\n", " <td>8.067267e-04</td>\n", " <td>2.418636e+02</td>\n", " </tr>\n", " <tr>\n", " <th>85</th>\n", " <td>www.boxofficemojo.com</td>\n", " <td>19814</td>\n", " <td>0.067678</td>\n", " <td>1216.0</td>\n", " <td>2.814863e-04</td>\n", " <td>2.404326e+02</td>\n", " </tr>\n", " <tr>\n", " <th>88</th>\n", " <td>www.wwe.com</td>\n", " <td>18859</td>\n", " <td>0.064416</td>\n", " <td>1200.0</td>\n", " <td>2.777825e-04</td>\n", " <td>2.318954e+02</td>\n", " </tr>\n", " <tr>\n", " <th>79</th>\n", " <td>www.irishtimes.com</td>\n", " <td>21144</td>\n", " <td>0.072221</td>\n", " <td>1419.0</td>\n", " <td>3.284778e-04</td>\n", " <td>2.198667e+02</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>www.imdb.com</td>\n", " <td>64419</td>\n", " <td>0.220035</td>\n", " <td>4381.0</td>\n", " <td>1.014138e-03</td>\n", " <td>2.169679e+02</td>\n", " </tr>\n", " <tr>\n", " <th>44</th>\n", " <td>www.amazon.com</td>\n", " <td>37280</td>\n", " <td>0.127337</td>\n", " <td>2721.0</td>\n", " <td>6.298718e-04</td>\n", " <td>2.021632e+02</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>www.newspapers.com</td>\n", " <td>126038</td>\n", " <td>0.430507</td>\n", " <td>9259.0</td>\n", " <td>2.143323e-03</td>\n", " <td>2.008594e+02</td>\n", " </tr>\n", " <tr>\n", " <th>67</th>\n", " <td>www.uefa.com</td>\n", " <td>24811</td>\n", " <td>0.084747</td>\n", " <td>2204.0</td>\n", " <td>5.101939e-04</td>\n", " <td>1.661068e+02</td>\n", " </tr>\n", " <tr>\n", " <th>76</th>\n", " <td>www.rsssf.com</td>\n", " <td>22979</td>\n", " <td>0.078489</td>\n", " <td>2187.0</td>\n", " <td>5.062586e-04</td>\n", " <td>1.550376e+02</td>\n", " </tr>\n", " <tr>\n", " <th>25</th>\n", " <td>www.bbc.com</td>\n", " <td>53718</td>\n", " <td>0.183484</td>\n", " <td>5123.0</td>\n", " <td>1.185900e-03</td>\n", " <td>1.547213e+02</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>www.census.gov</td>\n", " <td>161907</td>\n", " <td>0.553024</td>\n", " <td>16765.0</td>\n", " <td>3.880853e-03</td>\n", " <td>1.425006e+02</td>\n", " </tr>\n", " <tr>\n", " <th>70</th>\n", " <td>www.gamespot.com</td>\n", " <td>23634</td>\n", " <td>0.080726</td>\n", " <td>2539.0</td>\n", " <td>5.877415e-04</td>\n", " <td>1.373502e+02</td>\n", " </tr>\n", " <tr>\n", " <th>62</th>\n", " <td>www.itis.gov</td>\n", " <td>26056</td>\n", " <td>0.088999</td>\n", " <td>3256.0</td>\n", " <td>7.537165e-04</td>\n", " <td>1.180805e+02</td>\n", " </tr>\n", " <tr>\n", " <th>38</th>\n", " <td>www.metacritic.com</td>\n", " <td>40061</td>\n", " <td>0.136836</td>\n", " <td>5457.0</td>\n", " <td>1.263216e-03</td>\n", " <td>1.083235e+02</td>\n", " </tr>\n", " <tr>\n", " <th>63</th>\n", " <td>www.wsj.com</td>\n", " <td>25622</td>\n", " <td>0.087517</td>\n", " <td>3534.0</td>\n", " <td>8.180695e-04</td>\n", " <td>1.069797e+02</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>www.espncricinfo.com</td>\n", " <td>64787</td>\n", " <td>0.221292</td>\n", " <td>9020.0</td>\n", " <td>2.087998e-03</td>\n", " <td>1.059830e+02</td>\n", " </tr>\n", " <tr>\n", " <th>21</th>\n", " <td>itunes.apple.com</td>\n", " <td>59719</td>\n", " <td>0.203982</td>\n", " <td>8590.0</td>\n", " <td>1.988460e-03</td>\n", " <td>1.025827e+02</td>\n", " </tr>\n", " <tr>\n", " <th>72</th>\n", " <td>www.nhl.com</td>\n", " <td>23405</td>\n", " <td>0.079944</td>\n", " <td>3464.0</td>\n", " <td>8.018655e-04</td>\n", " <td>9.969778e+01</td>\n", " </tr>\n", " <tr>\n", " <th>47</th>\n", " <td>www.usatoday.com</td>\n", " <td>35919</td>\n", " <td>0.122688</td>\n", " <td>5631.0</td>\n", " <td>1.303494e-03</td>\n", " <td>9.412250e+01</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>www.washingtonpost.com</td>\n", " <td>82463</td>\n", " <td>0.281668</td>\n", " <td>14755.0</td>\n", " <td>3.415567e-03</td>\n", " <td>8.246595e+01</td>\n", " </tr>\n", " <tr>\n", " <th>75</th>\n", " <td>www.rottentomatoes.com</td>\n", " <td>23133</td>\n", " <td>0.079015</td>\n", " <td>4244.0</td>\n", " <td>9.824241e-04</td>\n", " <td>8.042875e+01</td>\n", " </tr>\n", " <tr>\n", " <th>89</th>\n", " <td>trove.nla.gov.au</td>\n", " <td>18661</td>\n", " <td>0.063740</td>\n", " <td>4024.0</td>\n", " <td>9.314973e-04</td>\n", " <td>6.842766e+01</td>\n", " </tr>\n", " <tr>\n", " <th>48</th>\n", " <td>www.rollingstone.com</td>\n", " <td>35602</td>\n", " <td>0.121605</td>\n", " <td>7996.0</td>\n", " <td>1.850957e-03</td>\n", " <td>6.569863e+01</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>news.bbc.co.uk</td>\n", " <td>166381</td>\n", " <td>0.568306</td>\n", " <td>41252.0</td>\n", " <td>9.549236e-03</td>\n", " <td>5.951322e+01</td>\n", " </tr>\n", " <tr>\n", " <th>61</th>\n", " <td>www.ign.com</td>\n", " <td>26743</td>\n", " <td>0.091346</td>\n", " <td>8201.0</td>\n", " <td>1.898412e-03</td>\n", " <td>4.811695e+01</td>\n", " </tr>\n", " <tr>\n", " <th>82</th>\n", " <td>www.bizjournals.com</td>\n", " <td>20359</td>\n", " <td>0.069540</td>\n", " <td>6598.0</td>\n", " <td>1.527341e-03</td>\n", " <td>4.553013e+01</td>\n", " </tr>\n", " <tr>\n", " <th>36</th>\n", " <td>deadline.com</td>\n", " <td>41989</td>\n", " <td>0.143421</td>\n", " <td>16324.0</td>\n", " <td>3.778768e-03</td>\n", " <td>3.795454e+01</td>\n", " </tr>\n", " <tr>\n", " <th>40</th>\n", " <td>books.google.co.uk</td>\n", " <td>38707</td>\n", " <td>0.132211</td>\n", " <td>15336.0</td>\n", " <td>3.550060e-03</td>\n", " <td>3.724193e+01</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>www.telegraph.co.uk</td>\n", " <td>82355</td>\n", " <td>0.281299</td>\n", " <td>37889.0</td>\n", " <td>8.770751e-03</td>\n", " <td>3.207241e+01</td>\n", " </tr>\n", " <tr>\n", " <th>39</th>\n", " <td>www.abc.net.au</td>\n", " <td>39787</td>\n", " <td>0.135900</td>\n", " <td>18879.0</td>\n", " <td>4.370213e-03</td>\n", " <td>3.109689e+01</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>www.independent.co.uk</td>\n", " <td>62139</td>\n", " <td>0.212248</td>\n", " <td>30507.0</td>\n", " <td>7.061926e-03</td>\n", " <td>3.005519e+01</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>www.theguardian.com</td>\n", " <td>192091</td>\n", " <td>0.656123</td>\n", " <td>102966.0</td>\n", " <td>2.383513e-02</td>\n", " <td>2.752757e+01</td>\n", " </tr>\n", " <tr>\n", " <th>59</th>\n", " <td>www.nba.com</td>\n", " <td>27308</td>\n", " <td>0.093276</td>\n", " <td>14693.0</td>\n", " <td>3.401215e-03</td>\n", " <td>2.742421e+01</td>\n", " </tr>\n", " <tr>\n", " <th>52</th>\n", " <td>www.cnn.com</td>\n", " <td>32378</td>\n", " <td>0.110593</td>\n", " <td>18908.0</td>\n", " <td>4.376926e-03</td>\n", " <td>2.526732e+01</td>\n", " </tr>\n", " <tr>\n", " <th>60</th>\n", " <td>pitchfork.com</td>\n", " <td>26749</td>\n", " <td>0.091366</td>\n", " <td>17701.0</td>\n", " <td>4.097523e-03</td>\n", " <td>2.229793e+01</td>\n", " </tr>\n", " <tr>\n", " <th>57</th>\n", " <td>www.mtv.com</td>\n", " <td>29337</td>\n", " <td>0.100206</td>\n", " <td>20034.0</td>\n", " <td>4.637579e-03</td>\n", " <td>2.160741e+01</td>\n", " </tr>\n", " <tr>\n", " <th>43</th>\n", " <td>www.cbc.ca</td>\n", " <td>37385</td>\n", " <td>0.127696</td>\n", " <td>26628.0</td>\n", " <td>6.163994e-03</td>\n", " <td>2.071637e+01</td>\n", " </tr>\n", " <tr>\n", " <th>46</th>\n", " <td>www.latimes.com</td>\n", " <td>36353</td>\n", " <td>0.124171</td>\n", " <td>26274.0</td>\n", " <td>6.082048e-03</td>\n", " <td>2.041591e+01</td>\n", " </tr>\n", " <tr>\n", " <th>78</th>\n", " <td>www.nfl.com</td>\n", " <td>21358</td>\n", " <td>0.072952</td>\n", " <td>16389.0</td>\n", " <td>3.793814e-03</td>\n", " <td>1.922927e+01</td>\n", " </tr>\n", " <tr>\n", " <th>90</th>\n", " <td>www.independent.ie</td>\n", " <td>18544</td>\n", " <td>0.063341</td>\n", " <td>16140.0</td>\n", " <td>3.736175e-03</td>\n", " <td>1.695331e+01</td>\n", " </tr>\n", " <tr>\n", " <th>97</th>\n", " <td>www.nydailynews.com</td>\n", " <td>17332</td>\n", " <td>0.059201</td>\n", " <td>15879.0</td>\n", " <td>3.675757e-03</td>\n", " <td>1.610572e+01</td>\n", " </tr>\n", " <tr>\n", " <th>91</th>\n", " <td>www.nzherald.co.nz</td>\n", " <td>18470</td>\n", " <td>0.063088</td>\n", " <td>19532.0</td>\n", " <td>4.521373e-03</td>\n", " <td>1.395324e+01</td>\n", " </tr>\n", " <tr>\n", " <th>45</th>\n", " <td>www.espn.com</td>\n", " <td>36758</td>\n", " <td>0.125554</td>\n", " <td>39528.0</td>\n", " <td>9.150155e-03</td>\n", " <td>1.372151e+01</td>\n", " </tr>\n", " <tr>\n", " <th>49</th>\n", " <td>www.smh.com.au</td>\n", " <td>33701</td>\n", " <td>0.115112</td>\n", " <td>41687.0</td>\n", " <td>9.649933e-03</td>\n", " <td>1.192880e+01</td>\n", " </tr>\n", " <tr>\n", " <th>98</th>\n", " <td>www.sfgate.com</td>\n", " <td>17269</td>\n", " <td>0.058986</td>\n", " <td>21692.0</td>\n", " <td>5.021382e-03</td>\n", " <td>1.174687e+01</td>\n", " </tr>\n", " <tr>\n", " <th>31</th>\n", " <td>www.reuters.com</td>\n", " <td>46416</td>\n", " <td>0.158543</td>\n", " <td>61665.0</td>\n", " <td>1.427455e-02</td>\n", " <td>1.110667e+01</td>\n", " </tr>\n", " <tr>\n", " <th>95</th>\n", " <td>www.britannica.com</td>\n", " <td>17675</td>\n", " <td>0.060372</td>\n", " <td>31253.0</td>\n", " <td>7.234614e-03</td>\n", " <td>8.344925e+00</td>\n", " </tr>\n", " <tr>\n", " <th>51</th>\n", " <td>www.forbes.com</td>\n", " <td>32440</td>\n", " <td>0.110805</td>\n", " <td>58727.0</td>\n", " <td>1.359444e-02</td>\n", " <td>8.150754e+00</td>\n", " </tr>\n", " <tr>\n", " <th>68</th>\n", " <td>www.npr.org</td>\n", " <td>24231</td>\n", " <td>0.082766</td>\n", " <td>49011.0</td>\n", " <td>1.134533e-02</td>\n", " <td>7.295121e+00</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " domain count_wiki ratio_wiki count_oscar \\\n", "0 0 4321781 14.761861 0.0 \n", "11 news.google.com 127045 0.433946 0.0 \n", "22 tvbythenumbers.zap2it.com 58393 0.199452 0.0 \n", "26 www.stat.gov.pl 51732 0.176700 0.0 \n", "28 twitter.com 48608 0.166030 0.0 \n", "34 articles.latimes.com 43628 0.149020 0.0 \n", "42 www.facebook.com 37873 0.129362 0.0 \n", "50 int.soccerway.com 32654 0.111536 0.0 \n", "53 espn.go.com 31913 0.109005 0.0 \n", "54 www.huffingtonpost.com 30221 0.103226 0.0 \n", "65 geonames.usgs.gov 24953 0.085232 0.0 \n", "66 www.showbuzzdaily.com 24882 0.084989 0.0 \n", "71 gaonchart.co.kr 23462 0.080139 0.0 \n", "73 www.oricon.co.jp 23205 0.079261 0.0 \n", "77 timesmachine.nytimes.com 22558 0.077051 0.0 \n", "81 www2.census.gov 20790 0.071012 0.0 \n", "83 www.digitalspy.co.uk 20143 0.068802 0.0 \n", "84 www.highbeam.com 19882 0.067911 0.0 \n", "86 www.kicker.de 19362 0.066135 0.0 \n", "94 query.nytimes.com 17876 0.061059 0.0 \n", "96 sports.espn.go.com 17661 0.060324 0.0 \n", "99 factfinder.census.gov 17213 0.058794 0.0 \n", "41 web.archive.org 38564 0.131723 1.0 \n", "56 www.bloomberg.com 30028 0.102566 1.0 \n", "58 babel.hathitrust.org 28413 0.097050 1.0 \n", "10 www.youtube.com 149568 0.510878 35.0 \n", "74 www.basketball-reference.com 23185 0.079193 15.0 \n", "20 www.sports-reference.com 59964 0.204818 57.0 \n", "55 www.pro-football-reference.com 30088 0.102771 34.0 \n", "14 nla.gov.au 84899 0.289989 151.0 \n", "92 www.researchgate.net 18352 0.062685 34.0 \n", "37 www.officialcharts.com 40620 0.138745 127.0 \n", "93 www.britishnewspaperarchive.co.uk 17905 0.061158 58.0 \n", "80 www.stuff.co.nz 20794 0.071026 71.0 \n", "64 www.gbif.org 25432 0.086868 87.0 \n", "2 archive.org 422327 1.442538 2138.0 \n", "4 www.ncbi.nlm.nih.gov 282416 0.964645 1811.0 \n", "3 www.nytimes.com 286414 0.978301 2090.0 \n", "33 cricketarchive.com 43944 0.150099 492.0 \n", "1 books.google.com 923239 3.153498 11530.0 \n", "69 bugguide.net 23956 0.081826 503.0 \n", "8 www.billboard.com 165065 0.563811 3915.0 \n", "87 paperspast.natlib.govt.nz 19175 0.065496 482.0 \n", "5 www.bbc.co.uk 222875 0.761272 6305.0 \n", "29 www.animenewsnetwork.com 48513 0.165705 1446.0 \n", "13 www.allmusic.com 99734 0.340660 3159.0 \n", "24 www.baseball-reference.com 54084 0.184734 1767.0 \n", "30 www.hollywoodreporter.com 48282 0.164916 1651.0 \n", "27 variety.com 51519 0.175973 1841.0 \n", "32 www.thehindu.com 45338 0.154861 2269.0 \n", "35 www.discogs.com 42528 0.145262 2377.0 \n", "23 timesofindia.indiatimes.com 57124 0.195118 3485.0 \n", "85 www.boxofficemojo.com 19814 0.067678 1216.0 \n", "88 www.wwe.com 18859 0.064416 1200.0 \n", "79 www.irishtimes.com 21144 0.072221 1419.0 \n", "18 www.imdb.com 64419 0.220035 4381.0 \n", "44 www.amazon.com 37280 0.127337 2721.0 \n", "12 www.newspapers.com 126038 0.430507 9259.0 \n", "67 www.uefa.com 24811 0.084747 2204.0 \n", "76 www.rsssf.com 22979 0.078489 2187.0 \n", "25 www.bbc.com 53718 0.183484 5123.0 \n", "9 www.census.gov 161907 0.553024 16765.0 \n", "70 www.gamespot.com 23634 0.080726 2539.0 \n", "62 www.itis.gov 26056 0.088999 3256.0 \n", "38 www.metacritic.com 40061 0.136836 5457.0 \n", "63 www.wsj.com 25622 0.087517 3534.0 \n", "17 www.espncricinfo.com 64787 0.221292 9020.0 \n", "21 itunes.apple.com 59719 0.203982 8590.0 \n", "72 www.nhl.com 23405 0.079944 3464.0 \n", "47 www.usatoday.com 35919 0.122688 5631.0 \n", "15 www.washingtonpost.com 82463 0.281668 14755.0 \n", "75 www.rottentomatoes.com 23133 0.079015 4244.0 \n", "89 trove.nla.gov.au 18661 0.063740 4024.0 \n", "48 www.rollingstone.com 35602 0.121605 7996.0 \n", "7 news.bbc.co.uk 166381 0.568306 41252.0 \n", "61 www.ign.com 26743 0.091346 8201.0 \n", "82 www.bizjournals.com 20359 0.069540 6598.0 \n", "36 deadline.com 41989 0.143421 16324.0 \n", "40 books.google.co.uk 38707 0.132211 15336.0 \n", "16 www.telegraph.co.uk 82355 0.281299 37889.0 \n", "39 www.abc.net.au 39787 0.135900 18879.0 \n", "19 www.independent.co.uk 62139 0.212248 30507.0 \n", "6 www.theguardian.com 192091 0.656123 102966.0 \n", "59 www.nba.com 27308 0.093276 14693.0 \n", "52 www.cnn.com 32378 0.110593 18908.0 \n", "60 pitchfork.com 26749 0.091366 17701.0 \n", "57 www.mtv.com 29337 0.100206 20034.0 \n", "43 www.cbc.ca 37385 0.127696 26628.0 \n", "46 www.latimes.com 36353 0.124171 26274.0 \n", "78 www.nfl.com 21358 0.072952 16389.0 \n", "90 www.independent.ie 18544 0.063341 16140.0 \n", "97 www.nydailynews.com 17332 0.059201 15879.0 \n", "91 www.nzherald.co.nz 18470 0.063088 19532.0 \n", "45 www.espn.com 36758 0.125554 39528.0 \n", "49 www.smh.com.au 33701 0.115112 41687.0 \n", "98 www.sfgate.com 17269 0.058986 21692.0 \n", "31 www.reuters.com 46416 0.158543 61665.0 \n", "95 www.britannica.com 17675 0.060372 31253.0 \n", "51 www.forbes.com 32440 0.110805 58727.0 \n", "68 www.npr.org 24231 0.082766 49011.0 \n", "\n", " ratio_oscar ratio \n", "0 0.000000e+00 1.313131e+07 \n", "11 0.000000e+00 1.313131e+07 \n", "22 0.000000e+00 1.313131e+07 \n", "26 0.000000e+00 1.313131e+07 \n", "28 0.000000e+00 1.313131e+07 \n", "34 0.000000e+00 1.313131e+07 \n", "42 0.000000e+00 1.313131e+07 \n", "50 0.000000e+00 1.313131e+07 \n", "53 0.000000e+00 1.313131e+07 \n", "54 0.000000e+00 1.313131e+07 \n", "65 0.000000e+00 1.313131e+07 \n", "66 0.000000e+00 1.313131e+07 \n", "71 0.000000e+00 1.313131e+07 \n", "73 0.000000e+00 1.313131e+07 \n", "77 0.000000e+00 1.313131e+07 \n", "81 0.000000e+00 1.313131e+07 \n", "83 0.000000e+00 1.313131e+07 \n", "84 0.000000e+00 1.313131e+07 \n", "86 0.000000e+00 1.313131e+07 \n", "94 0.000000e+00 1.313131e+07 \n", "96 0.000000e+00 1.313131e+07 \n", "99 0.000000e+00 1.313131e+07 \n", "41 2.314854e-07 5.690322e+05 \n", "56 2.314854e-07 4.430790e+05 \n", "58 2.314854e-07 4.192488e+05 \n", "10 8.101990e-06 6.305585e+04 \n", "74 3.472281e-06 2.280713e+04 \n", "20 1.319467e-05 1.552281e+04 \n", "55 7.870504e-06 1.305777e+04 \n", "14 3.495430e-05 8.296222e+03 \n", "92 7.870504e-06 7.964513e+03 \n", "37 2.939865e-05 4.719445e+03 \n", "93 1.342615e-05 4.555133e+03 \n", "80 1.643546e-05 4.321499e+03 \n", "64 2.013923e-05 4.313363e+03 \n", "2 4.949158e-04 2.914714e+03 \n", "4 4.192201e-04 2.301047e+03 \n", "3 4.838045e-04 2.022100e+03 \n", "33 1.138908e-04 1.317921e+03 \n", "1 2.669027e-03 1.181516e+03 \n", "69 1.164372e-04 7.027503e+02 \n", "8 9.062654e-04 6.221254e+02 \n", "87 1.115760e-04 5.870067e+02 \n", "5 1.459516e-03 5.215921e+02 \n", "29 3.347279e-04 4.950449e+02 \n", "13 7.312624e-04 4.658524e+02 \n", "24 4.090347e-04 4.516344e+02 \n", "30 3.821824e-04 4.315120e+02 \n", "27 4.261646e-04 4.129223e+02 \n", "32 5.252404e-04 2.948374e+02 \n", "35 5.502408e-04 2.639979e+02 \n", "23 8.067267e-04 2.418636e+02 \n", "85 2.814863e-04 2.404326e+02 \n", "88 2.777825e-04 2.318954e+02 \n", "79 3.284778e-04 2.198667e+02 \n", "18 1.014138e-03 2.169679e+02 \n", "44 6.298718e-04 2.021632e+02 \n", "12 2.143323e-03 2.008594e+02 \n", "67 5.101939e-04 1.661068e+02 \n", "76 5.062586e-04 1.550376e+02 \n", "25 1.185900e-03 1.547213e+02 \n", "9 3.880853e-03 1.425006e+02 \n", "70 5.877415e-04 1.373502e+02 \n", "62 7.537165e-04 1.180805e+02 \n", "38 1.263216e-03 1.083235e+02 \n", "63 8.180695e-04 1.069797e+02 \n", "17 2.087998e-03 1.059830e+02 \n", "21 1.988460e-03 1.025827e+02 \n", "72 8.018655e-04 9.969778e+01 \n", "47 1.303494e-03 9.412250e+01 \n", "15 3.415567e-03 8.246595e+01 \n", "75 9.824241e-04 8.042875e+01 \n", "89 9.314973e-04 6.842766e+01 \n", "48 1.850957e-03 6.569863e+01 \n", "7 9.549236e-03 5.951322e+01 \n", "61 1.898412e-03 4.811695e+01 \n", "82 1.527341e-03 4.553013e+01 \n", "36 3.778768e-03 3.795454e+01 \n", "40 3.550060e-03 3.724193e+01 \n", "16 8.770751e-03 3.207241e+01 \n", "39 4.370213e-03 3.109689e+01 \n", "19 7.061926e-03 3.005519e+01 \n", "6 2.383513e-02 2.752757e+01 \n", "59 3.401215e-03 2.742421e+01 \n", "52 4.376926e-03 2.526732e+01 \n", "60 4.097523e-03 2.229793e+01 \n", "57 4.637579e-03 2.160741e+01 \n", "43 6.163994e-03 2.071637e+01 \n", "46 6.082048e-03 2.041591e+01 \n", "78 3.793814e-03 1.922927e+01 \n", "90 3.736175e-03 1.695331e+01 \n", "97 3.675757e-03 1.610572e+01 \n", "91 4.521373e-03 1.395324e+01 \n", "45 9.150155e-03 1.372151e+01 \n", "49 9.649933e-03 1.192880e+01 \n", "98 5.021382e-03 1.174687e+01 \n", "31 1.427455e-02 1.110667e+01 \n", "95 7.234614e-03 8.344925e+00 \n", "51 1.359444e-02 8.150754e+00 \n", "68 1.134533e-02 7.295121e+00 " ] }, "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tot.sort_values([\"ratio\", \"count_wiki\"], ascending=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "9d888605", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.8" } }, "nbformat": 4, "nbformat_minor": 5 }