BackupFileParser.py

# Copyright 2022 Google, LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import tempfile

from google.cloud import storage
import jsonlines

# Top-level keys of a backup record that extract_tags() inspects.
columns = 'columns'
tags = 'tags'
create_time = 'createTime'
update_time = 'updateTime'
snapshot_time = 'snapshotTime'


class BackupFileParser:
    """Extract the records that reference a given tag template from a
    JSON Lines backup file stored in Google Cloud Storage.

    Matching is textual: an object "references" a template when the Python
    string form of the object contains ``'templateId': '<id>'`` (and, for
    the project-aware variant, ``'projectId': '<project>'``).
    """

    @staticmethod
    def match_template_id(json_obj, source_template_id):
        """Return True if ``str(json_obj)`` contains the template id entry.

        Args:
            json_obj: Any object; it is converted with ``str()`` before the
                substring search.
            source_template_id: Template id to look for.

        Returns:
            bool: whether ``'templateId': '<source_template_id>'`` occurs in
            the string form of ``json_obj``.
        """
        tag_template = "'templateId': '{}'".format(source_template_id)
        return tag_template in str(json_obj)

    @staticmethod
    def match_template_id_project(json_obj, source_template_id, source_template_project):
        """Return True if ``str(json_obj)`` contains both the template id
        entry and the project id entry.

        The two substrings are searched independently, so they do not have
        to belong to the same nested element of ``json_obj``.

        Args:
            json_obj: Any object; it is converted with ``str()`` before the
                substring search.
            source_template_id: Template id to look for.
            source_template_project: Project id to look for.

        Returns:
            bool: whether both entries occur in the string form of
            ``json_obj``.
        """
        tag_template = "'templateId': '{}'".format(source_template_id)
        project_id = "'projectId': '{}'".format(source_template_project)
        text = str(json_obj)
        return tag_template in text and project_id in text

    @staticmethod
    def _prune_record(obj, source_template_id, source_template_project):
        """Strip, in place, the parts of one backup record that are unrelated
        to the requested template.

        * Under the ``columns`` and ``tags`` keys, elements that do not
          mention the template id are removed; if what is left no longer
          matches template id + project id, the whole key is dropped.
        * The ``createTime``, ``updateTime`` and ``snapshotTime`` keys are
          always dropped.

        Args:
            obj: One decoded record (a dict); it is modified in place.
            source_template_id: Template id to keep.
            source_template_project: Project id to keep.
        """
        unwanted_keys = []

        for key, value in obj.items():
            if key in (columns, tags):
                if BackupFileParser.match_template_id_project(
                        value, source_template_id, source_template_project):
                    # Slice assignment keeps the same list object inside obj.
                    # NOTE(review): assumes the value is a list -- confirm
                    # against the export schema.
                    value[:] = [
                        element for element in value
                        if BackupFileParser.match_template_id(element, source_template_id)
                    ]

                # Re-check after filtering: the project id may only have been
                # present in an element that was just removed.
                if not BackupFileParser.match_template_id_project(
                        value, source_template_id, source_template_project):
                    unwanted_keys.append(key)

            elif key in (create_time, update_time, snapshot_time):
                unwanted_keys.append(key)

        # Deleting is deferred so the dict is not resized while iterating it.
        for key in unwanted_keys:
            del obj[key]

    @staticmethod
    def extract_tags(credentials, source_template_id, source_template_project, backup_file):
        """Download a backup file from GCS and return the records that
        reference the given tag template and project.

        Args:
            credentials: Passed through to ``storage.Client(credentials=...)``.
            source_template_id: Template id to extract.
            source_template_project: Project id the template belongs to.
            backup_file: ``(bucket_name, object_name)`` pair locating the
                JSON Lines backup file.

        Returns:
            list: the pruned records (see ``_prune_record``) that still match
            both the template id and the project id.

        Raises:
            FileNotFoundError: if the object does not exist in the bucket.
        """
        gcs_client = storage.Client(credentials=credentials)

        bucket_name, filename = backup_file
        bucket = gcs_client.get_bucket(bucket_name)
        blob = bucket.get_blob(filename)

        if blob is None:
            # get_blob() returns None for a missing object; fail with a
            # message that names the object instead of an AttributeError.
            raise FileNotFoundError(
                'backup file gs://{}/{} does not exist'.format(bucket_name, filename))

        extracted_tags = []  # stores the result set

        # A private temporary directory avoids collisions between concurrent
        # runs, works for object names that contain '/', and is removed on
        # exit so downloaded backups do not pile up on disk.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_file = os.path.join(tmp_dir, os.path.basename(filename) or 'backup.jsonl')
            blob.download_to_filename(filename=tmp_file)

            with jsonlines.open(tmp_file) as reader:
                for obj in reader:
                    # Cheap pre-filter: skip records that never mention the template.
                    if not BackupFileParser.match_template_id(obj, source_template_id):
                        continue

                    BackupFileParser._prune_record(
                        obj, source_template_id, source_template_project)

                    if BackupFileParser.match_template_id_project(
                            obj, source_template_id, source_template_project):
                        extracted_tags.append(obj)

        return extracted_tags


if __name__ == '__main__':

    # extract_tags() takes credentials as its first argument. None is passed
    # here so that storage.Client uses the credentials inferred from the
    # environment (per the google-cloud-storage Client documentation).
    bkp_file = ('catalog_metadata_exports', 'Exported_Metadata_Project_tag-engine-develop_2022-08-04T15-23-28Z_UTC.jsonl')
    extracted_tags = BackupFileParser.extract_tags(None, 'data_resource', 'data-mesh-344315', bkp_file)
    print('extracted_tags: ', extracted_tags)

    bkp_file = ('catalog_metadata_exports', 'Exported_Metadata_Project_tag-engine-develop_2022-08-02T22-09-14Z_UTC.jsonl')
    extracted_tags = BackupFileParser.extract_tags(None, 'data_attribute', 'data-mesh-344315', bkp_file)
    print('extracted_tags: ', extracted_tags)

BackupFileParser.py (68 lines of code) (raw):