tools/update_translations.py (84 lines of code) (raw):

import csv from html.parser import HTMLParser import sys from google.cloud import firestore from auth import get_credentials class HTMLStripper(HTMLParser): def __init__(self): super().__init__() self.text = '' def feed(self, *args, **kwargs): self.text = '' return super().feed(*args, **kwargs) def handle_data(self, d): self.text += d def get_data(self): return self.text html_stripper = HTMLStripper() words = {} with open('./EndangeredLanguageTranslations.csv', encoding='utf-8') as translations_file: reader = csv.reader(translations_file) for row in reader: if len(row) == 0: continue language = row[0] if not language: continue english_word = row[1].lower().strip() if not english_word: continue html_stripper.feed(english_word) english_word = html_stripper.get_data() english_word = english_word.split('/')[0] translation = row[2] if len(row) > 2 else None transliteration = row[3] if len(row) > 3 else None if not translation: if not transliteration: continue translation = transliteration translation = translation.lower().strip() html_stripper.feed(translation) translation = html_stripper.get_data() if transliteration: transliteration = transliteration.lower().strip() html_stripper.feed(transliteration) transliteration = html_stripper.get_data() if english_word not in words: word = {'en': {'sound_link': '', 'translation': english_word, 'transliteration': english_word}} words[english_word] = word else: word = words[english_word] word[language] = { 'sound_link': '', 'translation': translation, 'transliteration': transliteration} # Get credemtials for the user or SA authorised in glcoud credentials, project = get_credentials() if credentials.token is None: # If auth worked and we have a proper Google Cloud Identity we should have a token print('No token credentials') sys.exit() else: print(f'Credentials obtained for project ${project}, exit if not correct!') db = firestore.Client(project='ggl-woolaroo-multilang-uat', credentials=credentials) collection = db.collection(u'translations') batch = firestore.WriteBatch(db) print('Deleting documents') count = 1 for doc in collection.list_documents(): batch.delete(doc) if count % 500 == 0: batch.commit() if count % 100 == 0: print(f'{count} deleted') count = count + 1 batch.commit() print('Creating documents') count = 1 for key, val in words.items(): batch.create(firestore.DocumentReference( 'translations', key, client=db), val) if count % 500 == 0: batch.commit() if count % 100 == 0: print(f'{count}/{len(words)} created') count = count + 1 batch.commit()