in source/lambda/capture_news_feed/util/newscatcher_helper.py [0:0]
def create_and_publish_record(news_feed, account_name, platform, last_published_timestamp=None, query_str=None):
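    """Iterate over the articles in a newscatcher feed and publish each qualifying article as a record.

    Articles are skipped when their published timestamp cannot be parsed, when they are not newer than
    last_published_timestamp, when they contain none of the comma-separated keywords in query_str, or when
    neither a summary nor a title is available.
    """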
    language = news_feed["language"]
    url = news_feed["url"]
    country = news_feed["country"]
    topic = news_feed["topic"]
    articles = news_feed["articles"]
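    # query_str is an optional comma-separated list of keywords; articles matching none of them are skipped later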
    query_str_list = query_str.split(",") if query_str else []
    # Strip HTML tags and HTML entities (tokens starting with '&' or '#') embedded in the regular text of RSS feeds.
    # The feeds come from a controlled list of sites stored in SQLite, so the worst case of a ReDoS attack against this
    # regex is the Lambda timing out and stopping ingestion for that specific RSS provider site.
    cleanr = re.compile("<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")  # NOSONAR - Rule python:S4784.
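    # process each article in the feed, publishing only those that pass the timestamp and keyword filters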
    for article in articles:
        published_timestamp = None
        try:
            published_timestamp = news_feed_timestamp(article)
        except ValueError:
            logger.warning(f"Cannot parse published timestamp for {article}")
            continue
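        # only publish articles newer than the last recorded timestamp (or all articles when none was provided)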
        if not last_published_timestamp or published_timestamp > datetime.fromisoformat(last_published_timestamp):
            # check that at least one keyword from the query list appears in the article summary; otherwise skip the article
            text = article.get("summary", article.get("title", None))
            if text:
                logger.debug(f"Article Detail: {article}")
                if len(query_str_list) > 0 and not any(keyword in text for keyword in query_str_list):
                    logger.debug(f"Did not find {query_str} in {article}")
                    # moving to the next article since it did not contain any of the search keywords
                    continue
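                # remove markup from the text and slice it into an array of smaller chunks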
                clean_text = re.sub(cleanr, "", text)
                text_array = slice_text_into_arrays(clean_text)
                # build a unique record id from the current epoch time in milliseconds and the feed url
                id_str = f"{str(int(datetime.now().timestamp() * 1000))}#{url}"
                # populate image urls
                image_urls = filter_link_types(article["links"], "image/jpeg")
                entities, extended_entities = dict(), dict()
                entities["media"], extended_entities["media"] = image_urls, image_urls
                # populate text urls, falling back to audio links when no html links are present
                text_urls = filter_link_types(article["links"], "text/html")
                text_urls = filter_link_types(article["links"], "audio/mpeg") if not text_urls else text_urls
                if text_urls:
                    entities["urls"], extended_entities["urls"] = text_urls, text_urls
                publish_record(
                    {
                        "account_name": account_name,
                        "platform": platform,
                        "search_query": query_str,
                        "feed": {
                            "created_at": published_timestamp.isoformat(),
                            "entities": entities,
                            "extended_entities": extended_entities,
                            "lang": language,
                            "metadata": {"website": url, "country": country, "topic": topic},
                        },
                    },
                    id_str,
                    text_array,
                )
            else:
                logger.debug(f"Skipping news feed from {url} since neither summary nor title text was found in {json.dumps(article)}")
        else:
            logger.debug(f"Skipping article since it is not newer than the last published timestamp: {article}")