def create_and_publish_record()

in source/lambda/capture_news_feed/util/newscatcher_helper.py [0:0]


import json
import re
from datetime import datetime

# logger and the helpers news_feed_timestamp, slice_text_into_arrays, filter_link_types
# and publish_record are assumed to be defined in (or imported by) this module.


def create_and_publish_record(news_feed, account_name, platform, last_published_timestamp=None, query_str=None):
    language = news_feed["language"]
    url = news_feed["url"]
    country = news_feed["country"]
    topic = news_feed["topic"]
    articles = news_feed["articles"]

    query_str_list = query_str.split(",") if query_str else []
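    # e.g. query_str "health,covid" is split into the keyword list ["health", "covid"]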

    # Strip HTML tags and HTML entities (starting with '&' or '#') from the regular text of RSS feeds. The feed
    # sources are controlled by a list of sites stored in SQLite. A ReDoS attack could halt ingestion of news feeds
    # by causing the Lambda to time out for a specific RSS provider site.
    cleanr = re.compile("<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")  # NOSONAR - Rule python:S4784.
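    # e.g. re.sub(cleanr, "", "<p>AT&amp;T earnings</p>") returns "ATT earnings"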

    for article in articles:
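        # skip any article whose published timestamp cannot be parsed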
        published_timestamp = None
        try:
            published_timestamp = news_feed_timestamp(article)
        except ValueError:
            logger.warning(f"Cannot parse published timestamp for {article}")
            continue

        if not last_published_timestamp or published_timestamp > datetime.fromisoformat(last_published_timestamp):
            # use the article summary, or fall back to the title, as the text to search and publish
            text = article.get("summary", article.get("title", None))
            if text:
                logger.debug(f"Article Detail: {article}")
                if len(query_str_list) > 0 and not any(keyword in text for keyword in query_str_list):
                    logger.debug(f"Did not find {query_str} in {article}")
                    # Moving to next article since it did not have any of the search key words
                    continue

                clean_text = re.sub(cleanr, "", text)
                text_array = slice_text_into_arrays(clean_text)

                # build a unique record id from the current epoch time in milliseconds and the feed url
                id_str = f"{str(int(datetime.now().timestamp() * 1000))}#{url}"

                # populate image urls
                image_urls = filter_link_types(article["links"], "image/jpeg")
                entities, extended_entities = dict(), dict()
                entities["media"], extended_entities["media"] = image_urls, image_urls

                # populate text urls
                text_urls = filter_link_types(article["links"], "text/html")
                text_urls = filter_link_types(article["links"], "audio/mpeg") if not text_urls else text_urls

                if text_urls:
                    entities["urls"], extended_entities["urls"] = text_urls, text_urls
                    publish_record(
                        {
                            "account_name": account_name,
                            "platform": platform,
                            "search_query": query_str,
                            "feed": {
                                "created_at": published_timestamp.isoformat(),
                                "entities": entities,
                                "extended_entities": extended_entities,
                                "lang": language,
                                "metadata": {"website": url, "country": country, "topic": topic},
                            },
                        },
                        id_str,
                        text_array,
                    )
                else:
                    logger.debug(f"Skipping news feed from {url} since could not get url from {json.dumps(article)}")
            else:
                logger.debug(f"Could not find article in newsfeed {article}")