in modules/url_content_fetcher.py [0:0]
def read_and_soup(self):
    """
    Fetch ``self.url`` and parse the response body into a BeautifulSoup tree.

    On success, sets ``self.soup`` to the parsed document,
    ``self.running_time`` to the seconds spent fetching + parsing, and
    ``self.success`` to True.  On any failure (network error, timeout,
    decode or parse error), logs the exception, sets ``self.success`` to
    False and stores a human-readable message in ``self.message``.

    Reads: ``self.url``, ``self.timeout``, ``self.proxies``, ``self.parser``.
    """
    # Small pool of desktop-browser User-Agents so repeated fetches do not
    # all present the same identity.
    user_agent_list = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) \
        AppleWebKit/537.36 (KHTML, like Gecko) \
        Chrome/35.0.1916.47 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
        AppleWebKit/537.36 (KHTML, like Gecko) \
        Chrome/60.0.3112.113 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) \
        AppleWebKit/537.36 (KHTML, like Gecko) \
        Chrome/60.0.3112.90 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) \
        AppleWebKit/537.36 (KHTML, like Gecko) \
        Chrome/44.0.2403.157 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.3; Win64; x64) \
        AppleWebKit/537.36 (KHTML, like Gecko) \
        Chrome/60.0.3112.113 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
        AppleWebKit/537.36 (KHTML, like Gecko) \
        Chrome/57.0.2987.133 Safari/537.36',
    ]
    parsed = urlparse.urlparse(self.url)
    # Pick the UA deterministically from the URL so the same page always
    # sees the same agent.  sum(map(ord, ...)) replaces hash() because str
    # hashes are salted per-process on Python 3 (PYTHONHASHSEED), which
    # made the original selection non-deterministic across runs.
    ua_index = sum(map(ord, parsed.netloc + parsed.path)) % len(user_agent_list)
    headers = {
        "User-Agent": user_agent_list[ua_index],
        "X-Requested-With": "XMLHttpRequest",
        "Accept-Encoding": "gzip",
    }
    try:
        start_time = time.time()
        r = requests.get(
            self.url,
            headers=headers,
            timeout=self.timeout,
            stream=True,
            proxies=self.proxies
        )
        try:
            # stream=True keeps the underlying connection held until the
            # body is consumed; close explicitly so the connection is
            # released even if decoding raises.
            url_data = r.content.decode('utf-8', 'ignore')
        finally:
            r.close()
        soup = BeautifulSoup(url_data, self.parser)
        end_time = time.time()
        self.running_time = end_time - start_time
        self.soup = soup
        self.success = True
    except Exception as e:
        # Best-effort fetch: any failure is recorded on the instance
        # rather than propagated to the caller.
        logging.error(repr(e) + ", url: {0}".format(self.url))
        self.success = False
        self.message = "Modified URL error: " + str(e)