in sources/lambda/async/document_analyzer.py [0:0]
def __get_clean_text_in_supported_language(self, inputText):
    """Prepare text for Comprehend.

    The text is cut down to at most 5000 UTF-8 bytes, its dominant language
    is detected with the `comprehend` client, and it is translated to
    English with the `translate` client when that language is not one of
    the supported ones ('en', 'es', 'fr', 'de', 'it', 'pt').

    Args:
        inputText: The raw text to prepare.

    Returns:
        The original text, or its English translation, truncated to at
        most 5000 UTF-8 bytes.
    """
    # max size for Comprehend: 5000 bytes
    max_bytes = 5000
    supported_languages = ('en', 'es', 'fr', 'de', 'it', 'pt')

    def truncate_utf8(value):
        # The limit is expressed in bytes, not characters, so slice the
        # encoded form. A multi-byte character split by the cut leaves an
        # incomplete trailing sequence, which errors='ignore' drops.
        encoded = value.encode('utf-8')[:max_bytes]
        return encoded.decode('utf-8', errors='ignore')

    text = truncate_utf8(inputText)
    languages = comprehend.detect_dominant_language(
        Text=text
    )
    candidates = languages['Languages']
    if not candidates:
        # No language was detected, so there is no source language to
        # translate from: hand the truncated text back untouched.
        return text

    # The dominant language is the candidate with the highest confidence
    # score, not the one whose code happens to sort first alphabetically.
    dominant_language = max(
        candidates, key=lambda k: k['Score']
    )['LanguageCode']

    if dominant_language not in supported_languages:
        translation = translate.translate_text(
            Text=text,
            SourceLanguageCode=dominant_language,
            TargetLanguageCode="en"
        )
        # The translation may be longer than the source; re-apply the limit.
        text = truncate_utf8(translation['TranslatedText'])
    return text