in jobs/extensions/extensions/main.py [0:0]
def pull_data_from_detail_page(url, timeout_limit, current_date):
    """Scrape a Chrome Web Store extension detail page into a one-row DataFrame.

    Args:
        url: Detail-page URL to scrape.
        timeout_limit: Timeout in seconds passed to the page fetch.
        current_date: Submission date stamped onto the output row.

    Returns:
        pandas.DataFrame with exactly one row containing the extracted
        extension fields. Any field that could not be found on the page
        remains None (``featured`` defaults to False).
    """
    # All fields default to None (featured to False) until found on the page.
    number_of_ratings = None
    chrome_extension_name = None
    star_rating = None
    number_of_users = None
    extension_version = None
    extension_size = None
    extension_languages = None
    developer_desc = None
    developer_email = None
    developer_website = None
    developer_phone = None
    extension_updated_date = None
    trader_status = None
    featured = False
    manifest_json = None
    # Fetch and parse the page once, then pull out the tag collections the
    # individual extractors below iterate over.
    current_link_soup = get_soup_from_webpage(
        webpage_url=url, timeout_seconds=timeout_limit
    )
    paragraphs_from_current_link_soup = get_paragraphs_from_soup(current_link_soup)
    headers_from_current_link_soup = get_h1_headers_from_soup(current_link_soup)
    h2_headers_from_current_link_soup = get_h2_headers_from_soup(current_link_soup)
    divs_from_current_link_soup = get_divs_from_soup(current_link_soup)
    spans_from_current_link_soup = get_spans_from_soup(current_link_soup)
    developer_website = get_website_url_from_soup(current_link_soup)
    # Number of ratings: a paragraph like "1,234 ratings" -- keep the count.
    for paragraph in paragraphs_from_current_link_soup:
        if paragraph.endswith("ratings"):
            number_of_ratings = paragraph.split(" ")[0]
    # Extension name: the page's (last) h1 header.
    for header in headers_from_current_link_soup:
        chrome_extension_name = header
    # Star rating: an h2 like "4.5 out of 5" -- keep the leading number.
    for h2_header in h2_headers_from_current_link_soup:
        if "out of 5" in h2_header:
            match = re.search(r"^.*?out of 5", h2_header)
            if match:
                star_rating = match.group(0).split(" ")[0]
    # User count and trader status live in plain divs.
    for div in divs_from_current_link_soup:
        if " users" in div:
            # e.g. "2,000,000 users" -> "2000000" (group 1 is the bare number).
            match = re.search(r"(\d{1,3}(?:,\d{3})*|\d+) (?=users)", div)
            if match:
                number_of_users = match.group(1).replace(",", "")
        # FIX: strip before BOTH comparisons -- the original only stripped
        # for "Trader", so a whitespace-padded "Non-trader" div was missed.
        stripped_div = div.strip()
        if stripped_div == "Non-trader":
            trader_status = "Non-trader"
        if stripped_div == "Trader":
            trader_status = "Trader"
    # The "Featured" badge appears as a standalone span.
    for span in spans_from_current_link_soup:
        if span.strip() == "Featured":
            featured = True
    # Label/value pairs: the div immediately after a label div holds its value.
    for index, div in enumerate(divs_from_current_link_soup):
        if index + 1 >= len(divs_from_current_link_soup):
            break  # last div has no following value div
        next_value = divs_from_current_link_soup[index + 1]
        if "Updated" in div:
            extension_updated_date = next_value
        if "Version" in div:
            extension_version = next_value
        if "Size" in div:
            extension_size = next_value
        if "Languages" in div:
            extension_languages = next_value
    # Developer block: the div after the literal "Developer" label nests the
    # description / Website / Email / Phone text.
    all_divs = current_link_soup.find_all("div")
    for idx, div in enumerate(all_divs):
        if div.text.strip() != "Developer" or idx + 1 >= len(all_divs):
            continue
        next_div = all_divs[idx + 1]
        first_nested_tag = next_div.find()
        if first_nested_tag is None:
            # FIX: the original fell through with developer_info = None and
            # crashed on `"Website" in None` (TypeError) when the value div
            # had no nested tag.
            continue
        developer_info = first_nested_tag.get_text(separator="\n", strip=True)
        if "Website" in developer_info:
            # Text before "Website" is the free-form developer description.
            developer_desc = developer_info.split("Website")[0].replace("\n", " ")
        if "Email" in developer_info:
            developer_email_and_phone = (
                developer_info.split("Email")[1].replace("\n", " ").strip()
            )
            if "Phone" in developer_email_and_phone:
                # Email precedes the "Phone" label; phone follows it.
                developer_email_and_phone_list = developer_email_and_phone.split(
                    "Phone"
                )
                developer_email = developer_email_and_phone_list[0]
                developer_phone = developer_email_and_phone_list[1]
            else:
                developer_email = developer_email_and_phone
    category = get_category_from_soup(current_link_soup)
    verified_domain = get_verified_domain(current_link_soup)
    # NOTE - Still need to add logic for manifest json
    # Assemble the single-row result frame.
    curr_link_results_df = pd.DataFrame(
        {
            "submission_date": [current_date],
            "url": [url],
            "chrome_extension_name": [chrome_extension_name],
            "star_rating": [star_rating],
            "number_of_ratings": [number_of_ratings],
            "number_of_users": [number_of_users],
            "extension_version": [extension_version],
            "extension_size": [extension_size],
            "extension_languages": [extension_languages],
            "developer_desc": [developer_desc],
            "developer_email": [developer_email],
            "developer_website": [developer_website],
            "developer_phone": [developer_phone],
            "extension_updated_date": [extension_updated_date],
            "category": [category],
            "trader_status": [trader_status],
            "featured": [featured],
            "verified_domain": [verified_domain],
            "manifest_json": [manifest_json],
        }
    )
    return curr_link_results_df