BackupFileParser.py (68 lines of code) (raw):
# Copyright 2022 Google, LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import tempfile

from google.cloud import storage
import jsonlines
# Record keys whose list values BackupFileParser filters by tag template.
columns, tags = "columns", "tags"

# Timestamp keys that BackupFileParser removes from every extracted record.
create_time, update_time, snapshot_time = "createTime", "updateTime", "snapshotTime"
class BackupFileParser:
    """Extract the records that reference one tag template from a backup file.

    The backup file is a JSON Lines object stored in Google Cloud Storage.
    Matching is purely textual: a value references a template when its
    ``str()`` form contains ``'templateId': '<id>'`` and, where the project is
    checked as well, ``'projectId': '<project>'``.
    """

    @staticmethod
    def match_template_id(json_obj, source_template_id):
        """Return True if ``str(json_obj)`` contains the template id marker.

        Args:
            json_obj: A parsed JSON value (dict, list, ...) or its string form.
            source_template_id: Tag template id to look for.

        Returns:
            bool: Whether ``'templateId': '<source_template_id>'`` occurs in
            the string form of ``json_obj``.
        """
        tag_template = "'templateId': '{}'".format(source_template_id)
        return tag_template in str(json_obj)

    @staticmethod
    def match_template_id_project(json_obj, source_template_id, source_template_project):
        """Return True if both the template id and project id markers occur.

        The two markers are searched independently anywhere in
        ``str(json_obj)``; they are not required to sit in the same element.

        Args:
            json_obj: A parsed JSON value (dict, list, ...) or its string form.
            source_template_id: Tag template id to look for.
            source_template_project: Project id to look for.

        Returns:
            bool: Whether both markers occur in the string form of ``json_obj``.
        """
        tag_template = "'templateId': '{}'".format(source_template_id)
        project_id = "'projectId': '{}'".format(source_template_project)
        text = str(json_obj)  # stringify once instead of once per marker
        return tag_template in text and project_id in text

    @staticmethod
    def _prune_entries(entries, source_template_id, source_template_project):
        """Drop non-matching elements in place; report if the rest still matches.

        Elements are only filtered when the list as a whole mentions both the
        template and the project.  Filtering keeps the elements that mention
        the template id.  The return value is the template+project check run
        again on whatever is left.
        """
        if BackupFileParser.match_template_id_project(
                entries, source_template_id, source_template_project):
            # Slice assignment mutates the caller's list, so the enclosing
            # record sees the filtered result.
            entries[:] = [
                element for element in entries
                if BackupFileParser.match_template_id(element, source_template_id)
            ]
        return BackupFileParser.match_template_id_project(
            entries, source_template_id, source_template_project)

    @staticmethod
    def _strip_record(record, source_template_id, source_template_project):
        """Prune one record in place down to the content for the template.

        The ``columns`` and ``tags`` lists are filtered and removed entirely
        when nothing relevant is left; the timestamp keys are always removed.
        """
        unwanted_keys = []
        for key, value in record.items():
            if key in (columns, tags):
                if not BackupFileParser._prune_entries(
                        value, source_template_id, source_template_project):
                    unwanted_keys.append(key)
            elif key in (create_time, update_time, snapshot_time):
                unwanted_keys.append(key)

        # Deleting is deferred: a dict must not change size while iterated.
        for key in unwanted_keys:
            del record[key]

    @staticmethod
    def extract_tags(credentials, source_template_id, source_template_project, backup_file):
        """Download a backup file and return the records for one tag template.

        Args:
            credentials: Credentials handed to ``storage.Client``.
            source_template_id: Tag template id the records must reference.
            source_template_project: Project id the records must reference.
            backup_file: ``(bucket_name, object_name)`` pair locating the
                JSON Lines backup file in Cloud Storage.

        Returns:
            list: The pruned records, in file order, that still mention both
            the template id and the project id after pruning.

        Raises:
            FileNotFoundError: If the object does not exist in the bucket.
        """
        bucket_name, filename = backup_file

        gcs_client = storage.Client(credentials=credentials)
        bucket = gcs_client.get_bucket(bucket_name)
        blob = bucket.get_blob(filename)
        if blob is None:
            # get_blob signals a missing object with None rather than raising.
            raise FileNotFoundError(
                'backup file gs://{}/{} does not exist'.format(bucket_name, filename))

        extracted_tags = []  # stores the result set

        # A private temporary directory keeps object names containing '/' from
        # breaking the local path, avoids clashes between concurrent runs, and
        # is deleted again when the block exits.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_file = os.path.join(tmp_dir, 'backup.jsonl')
            blob.download_to_filename(filename=tmp_file)

            with jsonlines.open(tmp_file) as reader:
                for obj in reader:
                    # Cheap pre-filter: skip records that never mention the template.
                    if not BackupFileParser.match_template_id(obj, source_template_id):
                        continue

                    BackupFileParser._strip_record(
                        obj, source_template_id, source_template_project)

                    if BackupFileParser.match_template_id_project(
                            obj, source_template_id, source_template_project):
                        extracted_tags.append(obj)

        return extracted_tags
if __name__ == '__main__':
    # Ad-hoc run against two sample backup files, one per tag template.
    sample_runs = (
        ('data_resource', 'Exported_Metadata_Project_tag-engine-develop_2022-08-04T15-23-28Z_UTC.jsonl'),
        ('data_attribute', 'Exported_Metadata_Project_tag-engine-develop_2022-08-02T22-09-14Z_UTC.jsonl'),
    )
    for template_id, file_name in sample_runs:
        bkp_file = ('catalog_metadata_exports', file_name)
        # extract_tags takes credentials as its first argument; None makes
        # storage.Client fall back to the credentials inferred from the
        # environment.
        extracted_tags = BackupFileParser.extract_tags(None, template_id, 'data-mesh-344315', bkp_file)
        print('extracted_tags: ', extracted_tags)