def extract_entities()

in sources/lambda/async/document_analyzer.py [0:0]


    def extract_entities(self, pages):
        """Extract unique, high-confidence entities from pages with Comprehend.

        Each page's ``'Content'`` is cleaned with
        ``__get_clean_text_in_supported_language`` and passed to
        ``comprehend.detect_entities`` with ``LanguageCode="en"``. An entity
        is kept only when its ``Score`` is above 0.9 and its ``Type`` is one
        of ORGANIZATION, PERSON, LOCATION or DATE.

        Args:
            pages: Iterable of items that each expose a ``'Content'`` key.

        Returns:
            A list of ``{"Text": ..., "Type": ...}`` dicts, de-duplicated
            across all pages and ordered by first appearance. Empty when
            ``pages`` is empty or nothing passes the filters.
        """
        selected_entity_types = {"ORGANIZATION", "PERSON", "LOCATION", "DATE"}

        final_entities = []
        # (Text, Type) pairs already emitted; a set keeps de-duplication O(1)
        # per entity instead of rescanning the result list each time.
        seen = set()

        for page in pages:
            text = self.__get_clean_text_in_supported_language(page['Content'])
            if not text:
                # DetectEntities requires a non-empty Text, and an empty page
                # has no entities anyway, so skip it rather than fail the
                # whole document.
                continue

            detected_entities = comprehend.detect_entities(
                Text=text,
                LanguageCode="en"
            )

            for entity in detected_entities['Entities']:
                if not (entity['Score'] > 0.9
                        and entity['Type'] in selected_entity_types):
                    continue

                key = (entity['Text'], entity['Type'])
                if key in seen:
                    continue
                seen.add(key)
                final_entities.append(
                    {"Text": entity['Text'], "Type": entity['Type']}
                )

        return final_entities