in sources/lambda/async/document_analyzer.py [0:0]
def extract_entities(self, pages, entity_types=None, min_score=0.9):
    """Extract high-confidence named entities from pages via Amazon Comprehend.

    Each page's ``Content`` text is first normalized by
    ``__get_clean_text_in_supported_language`` and then sent to
    ``comprehend.detect_entities`` (LanguageCode "en"). Detected entities
    are kept only when their confidence exceeds ``min_score`` and their
    type is in ``entity_types``, then de-duplicated on (Text, Type)
    while preserving first-seen order.

    Args:
        pages: iterable of dicts, each holding the page text under the
            ``'Content'`` key.
        entity_types: entity types to keep; defaults to
            ``["ORGANIZATION", "PERSON", "LOCATION", "DATE"]``.
        min_score: exclusive lower bound on the Comprehend confidence
            score (default 0.9, matching the original behavior).

    Returns:
        List of ``{"Text": ..., "Type": ...}`` dicts, unique, in the
        order first encountered.
    """
    if entity_types is None:
        entity_types = ["ORGANIZATION", "PERSON", "LOCATION", "DATE"]
    # Set membership is O(1); also used for de-dup instead of the
    # original O(n) "not in list" scan per entity.
    allowed_types = set(entity_types)
    seen = set()  # (Text, Type) pairs already emitted
    final_entities = []
    for page in pages:
        text = self.__get_clean_text_in_supported_language(page['Content'])
        detected_entities = comprehend.detect_entities(
            Text=text,
            LanguageCode="en"
        )
        for entity in detected_entities['Entities']:
            if entity['Score'] > min_score and entity['Type'] in allowed_types:
                key = (entity['Text'], entity['Type'])
                if key not in seen:
                    seen.add(key)
                    final_entities.append({"Text": entity['Text'],
                                           "Type": entity['Type']})
    return final_entities