def __get_clean_text_in_supported_language()

in sources/lambda/async/document_analyzer.py [0:0]


    def __get_clean_text_in_supported_language(self, inputText):
        """Prepare text for Comprehend.

        Truncate the text to 5000 UTF-8 bytes and, when its dominant
        language is not one of the supported ones, translate it to English.

        Args:
            inputText: the raw text to prepare.

        Returns:
            The prepared text, at most 5000 bytes once UTF-8 encoded.
            An empty string is returned when ``inputText`` is empty or None.
        """
        # max size for Comprehend: 5000 bytes
        max_bytes = 5000
        supported_languages = ('en', 'es', 'fr', 'de', 'it', 'pt')

        def truncate(value):
            # Slice the encoded form so the limit is counted in bytes rather
            # than characters; errors='ignore' drops a multi-byte character
            # that the cut split in half.
            encoded = value.encode('utf-8')[:max_bytes]
            return encoded.decode('utf-8', errors='ignore')

        if not inputText:
            # Nothing to analyse: skip the service calls entirely.
            return ''

        text = truncate(inputText)

        languages = comprehend.detect_dominant_language(
            Text=text
        )
        candidates = languages.get('Languages') or []
        if not candidates:
            # No language detected: return the text untranslated.
            return text

        # The dominant language is the candidate with the highest confidence
        # score, not the alphabetically first language code.
        dominant_language = max(
            candidates,
            key=lambda k: k.get('Score', 0.0)
        )['LanguageCode']

        if dominant_language not in supported_languages:
            translation = translate.translate_text(
                Text=text,
                SourceLanguageCode=dominant_language,
                TargetLanguageCode="en"
            )
            text = translation['TranslatedText']

        # The translated text may be longer than the source, so apply the
        # byte limit again.
        return truncate(text)