def read_and_soup()

in modules/url_content_fetcher.py [0:0]


    def read_and_soup(self):
        """
        Fetch ``self.url`` and parse the response body into ``self.soup``.

        The request is sent with a browser-like ``User-Agent`` picked from a
        fixed pool by hashing the URL's netloc + path, so a given URL maps to
        the same entry for the lifetime of the interpreter process.

        On success this sets ``self.soup`` (the tree built by
        ``BeautifulSoup`` with ``self.parser``), ``self.running_time``
        (seconds spent on the request and the parse) and
        ``self.success = True``.

        If the request or the parse raises, the error is logged,
        ``self.success`` is set to ``False`` and a description is stored in
        ``self.message``; ``self.soup`` and ``self.running_time`` are left
        untouched in that case.
        """
        # Adjacent string literals are joined by the parser, so every entry
        # is one single-spaced header value. (A backslash continuation inside
        # a single literal would embed the next line's indentation in the
        # User-Agent that is sent.)
        user_agent_list = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/35.0.1916.47 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/60.0.3112.113 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/60.0.3112.90 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/44.0.2403.157 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/60.0.3112.113 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/57.0.2987.133 Safari/537.36',
        ]

        parsed = urlparse.urlparse(self.url)
        # Python's % with a positive modulus is never negative, so the index
        # is valid even when hash() returns a negative number.
        agent_index = hash(parsed.netloc + parsed.path) % len(user_agent_list)
        headers = {
            "User-Agent": user_agent_list[agent_index],
            "X-Requested-With": "XMLHttpRequest",
            "Accept-Encoding": "gzip",
        }
        try:
            start_time = time.time()
            r = requests.get(
                self.url,
                headers=headers,
                timeout=self.timeout,
                stream=True,
                proxies=self.proxies
            )
            # Bytes that are not valid UTF-8 are dropped instead of raising.
            url_data = r.content.decode('utf-8', 'ignore')
            soup = BeautifulSoup(url_data, self.parser)
            end_time = time.time()
            self.running_time = end_time - start_time
            self.soup = soup
            self.success = True
        except Exception as e:
            # Deliberately broad: any failure is reported through
            # self.success / self.message instead of propagating.
            logging.error("%r, url: %s", e, self.url)
            self.success = False
            self.message = "Modified URL error: " + str(e)