in modules/url_comparison.py [0:0]
def process_one_url(self, url):
    """
    Compare one URL against each of its modified variants.

    The original URL is fetched once. ``generate_modified_urls`` then
    supplies ``(key, modified_url)`` pairs; every modified URL is
    fetched and scored against the original via ``compare_two_soups``.

    NOTE(review): no explicit 'AA test' (full URL compared to itself)
    happens in this method -- presumably ``generate_modified_urls``
    yields the unmodified URL as one of its pairs; confirm there.

    :param url: STRING, the URL whose query params are analysed
    :return: pandas DataFrame with one row per (key, mod_url) pair,
        holding 'url', 'key' and 'mod_url' followed by whatever entries
        ``compare_two_soups`` returns (assumed to be a pandas Series --
        TODO confirm)
    """
    def fetch(target):
        # Settings are read from the instance on every fetch, so the
        # original page and all variants are requested the same way.
        return URLContentFetcher(
            target,
            timeout=self.timeout, parser=self.parser, proxies=self.proxies)

    def build_row(key, variant_url):
        # How close is the page to its original form once this
        # variant of the URL is requested instead?
        variant = fetch(variant_url)
        metrics = self.compare_two_soups(baseline, variant)
        identity = pd.Series(
            {'url': url, 'key': key, 'mod_url': variant_url})
        return pd.concat([identity, metrics])

    # Fetch the reference page before generating the variants.
    baseline = fetch(url)
    rows = [
        build_row(key, variant_url)
        for key, variant_url in self.generate_modified_urls(url)
    ]
    return pd.DataFrame(rows)