def pull_data_from_detail_page()

in jobs/extensions/extensions/main.py [0:0]


def pull_data_from_detail_page(url, timeout_limit, current_date):
    """Scrape one extension detail page into a single-row DataFrame.

    Args:
        url: Detail-page URL to fetch; also stored in the ``url`` column.
        timeout_limit: Timeout (seconds) passed to ``get_soup_from_webpage``.
        current_date: Value stored unchanged in the ``submission_date`` column.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``submission_date``,
        ``url``, ``chrome_extension_name``, ``star_rating``,
        ``number_of_ratings``, ``number_of_users``, ``extension_version``,
        ``extension_size``, ``extension_languages``, ``developer_desc``,
        ``developer_email``, ``developer_website``, ``developer_phone``,
        ``extension_updated_date``, ``category``, ``trader_status``,
        ``featured``, ``verified_domain`` and ``manifest_json``.

        Fields parsed in this function that are not found on the page stay
        ``None`` (``featured`` stays ``False``). Scraped values are kept as
        text; ``number_of_users`` only has its thousands separators removed.
        ``manifest_json`` is always ``None`` until its parsing is implemented.
    """
    # Defaults for the fields parsed below: None means "not found on the page".
    number_of_ratings = None
    chrome_extension_name = None
    star_rating = None
    number_of_users = None
    extension_version = None
    extension_size = None
    extension_languages = None
    developer_desc = None
    developer_email = None
    developer_phone = None
    extension_updated_date = None
    trader_status = None
    featured = False
    # NOTE - Still need to add logic for manifest json
    manifest_json = None

    # Patterns are loop-invariant, so define them once up front.
    star_rating_pattern = r"^.*?out of 5"
    user_count_pattern = r"(\d{1,3}(?:,\d{3})*|\d+) (?=users)"

    # Fetch and parse the page.
    current_link_soup = get_soup_from_webpage(
        webpage_url=url, timeout_seconds=timeout_limit
    )

    # Pull the text collections used below out of the soup.
    # NOTE(review): the helpers presumably return sequences of plain strings
    # (the code below uses str methods, len() and indexing) - confirm.
    paragraphs = get_paragraphs_from_soup(current_link_soup)
    h1_headers = get_h1_headers_from_soup(current_link_soup)
    h2_headers = get_h2_headers_from_soup(current_link_soup)
    divs = get_divs_from_soup(current_link_soup)
    spans = get_spans_from_soup(current_link_soup)

    developer_website = get_website_url_from_soup(current_link_soup)

    # Number of ratings: first token of a paragraph ending in "ratings".
    for paragraph in paragraphs:
        if paragraph.endswith("ratings"):
            number_of_ratings = paragraph.split(" ")[0]

    # Extension name: the last h1 header wins when there are several.
    for header in h1_headers:
        chrome_extension_name = header

    # Star rating: first token of an h2 header such as "4.5 out of 5".
    for h2_header in h2_headers:
        if "out of 5" in h2_header:
            match = re.search(star_rating_pattern, h2_header)
            if match:
                star_rating = match.group(0).split(" ")[0]

    # Single pass over the divs for every div-derived field.
    div_count = len(divs)
    for index, div in enumerate(divs):
        # Number of users, with thousands separators removed.
        if " users" in div:
            match = re.search(user_count_pattern, div)
            if match:
                number_of_users = match.group(1).replace(",", "")

        # Compare both labels on the stripped text so surrounding whitespace
        # cannot hide "Non-trader" while still matching "Trader".
        stripped_div = div.strip()
        if stripped_div == "Non-trader":
            trader_status = "Non-trader"
        elif stripped_div == "Trader":
            trader_status = "Trader"

        # Label/value pairs: the value is expected in the div that follows
        # the label, so the last div can never be a label.
        if index + 1 < div_count:
            following_div = divs[index + 1]
            if "Updated" in div:
                extension_updated_date = following_div
            if "Version" in div:
                extension_version = following_div
            if "Size" in div:
                extension_size = following_div
            if "Languages" in div:
                extension_languages = following_div

    # Featured badge.
    for span in spans:
        if span.strip() == "Featured":
            featured = True

    # Developer details live in the div right after the "Developer" label div.
    all_divs = current_link_soup.find_all("div")
    for idx, div in enumerate(all_divs):
        if div.text.strip() != "Developer" or idx + 1 >= len(all_divs):
            continue

        first_nested_tag = all_divs[idx + 1].find()
        if not first_nested_tag:
            continue

        # All text of the first nested tag, one text node per line.
        developer_info = first_nested_tag.get_text(separator="\n", strip=True)

        # Everything before the "Website" label is the developer description.
        if "Website" in developer_info:
            developer_desc = (
                developer_info.split("Website")[0].replace("\n", " ").strip()
            )

        # Everything after the "Email" label is the email, optionally
        # followed by a "Phone" label and the phone number.
        if "Email" in developer_info:
            contact_text = (
                developer_info.split("Email")[1].replace("\n", " ").strip()
            )
            if "Phone" in contact_text:
                email_part, _, phone_part = contact_text.partition("Phone")
                # Strip the whitespace that surrounded the "Phone" label.
                developer_email = email_part.strip()
                developer_phone = phone_part.strip()
            else:
                developer_email = contact_text

    category = get_category_from_soup(current_link_soup)
    verified_domain = get_verified_domain(current_link_soup)

    # Put the results into a one-row dataframe.
    curr_link_results_df = pd.DataFrame(
        {
            "submission_date": [current_date],
            "url": [url],
            "chrome_extension_name": [chrome_extension_name],
            "star_rating": [star_rating],
            "number_of_ratings": [number_of_ratings],
            "number_of_users": [number_of_users],
            "extension_version": [extension_version],
            "extension_size": [extension_size],
            "extension_languages": [extension_languages],
            "developer_desc": [developer_desc],
            "developer_email": [developer_email],
            "developer_website": [developer_website],
            "developer_phone": [developer_phone],
            "extension_updated_date": [extension_updated_date],
            "category": [category],
            "trader_status": [trader_status],
            "featured": [featured],
            "verified_domain": [verified_domain],
            "manifest_json": [manifest_json],
        }
    )

    return curr_link_results_df