in src/upload_to_elasticsearch.py [0:0]
def index_keywords(es, event, keywordsS3Location):
    """Bulk-index every keyword phrase of an episode into KEYWORDS_INDEX.

    The keyword list is read as UTF-8 JSON from the S3 object identified by
    ``keywordsS3Location['bucket']`` / ``keywordsS3Location['key']``. Each
    entry must provide ``text``, ``tags``, ``startTime`` and ``speaker``.
    One document is produced per entry, carrying the episode metadata from
    ``event`` (``PodcastName``, ``Episode``, ``podcastUrl``) and a deep link
    of the form ``<podcastUrl>#t=<seconds>``.

    Args:
        es: Client handed straight to ``helpers.bulk``.
        event: Mapping with ``PodcastName``, ``Episode`` and ``podcastUrl``.
        keywordsS3Location: Mapping with the ``bucket`` and ``key`` of the
            keywords JSON file.

    Returns:
        Whatever ``helpers.bulk`` returns, unchanged.
        NOTE(review): presumably elasticsearch-py's
        ``(success_count, errors)`` tuple -- confirm against the import.

    Raises:
        KeyError: If ``AUDIO_OFFSET`` is not set in the environment, or a
            required key is missing from ``event``, ``keywordsS3Location``
            or a keyword entry.
        ValueError: If ``AUDIO_OFFSET`` or a ``startTime`` is not numeric.
    """
    # Number of seconds before the word's start time at which to place the
    # hyperlink. This gives the listener some context before the word is
    # spoken. Browsers are also not perfectly precise when seeking, and seek
    # accuracy varies between browsers. 10 seconds is usually good, but
    # occasionally you'll still land after the word was spoken.
    audioOffset = int(os.environ['AUDIO_OFFSET'])

    response = s3_client.get_object(
        Bucket=keywordsS3Location['bucket'],
        Key=keywordsS3Location['key'],
    )
    file_content = response['Body'].read().decode('utf-8')
    keywords = json.loads(file_content)

    # Create one index document per keyword phrase.
    actions = []
    for entry in keywords:
        keyword = entry["text"]
        tags = entry["tags"]
        # Shift the link earlier so the listener hears some context, but
        # never seek before the beginning of the audio.
        start_time = max(float(entry["startTime"]) - audioOffset, 0)
        actions.append({
            "_index": KEYWORDS_INDEX,
            "_type": "_doc",
            "_source": {
                "PodcastName": event["PodcastName"],
                "Episode": event["Episode"],
                "url": event["podcastUrl"] + "#t=" + str(start_time),
                "text": keyword,
                "tags": tags,
                "speaker": entry["speaker"],
                "startTime": float(start_time),
            },
        })

    # Bulk load the documents into the index.
    result = helpers.bulk(es, actions)
    logger.info("indexed keywords to ES")
    logger.info(json.dumps(result, indent=2))
    return result