google-datacatalog-apache-atlas-connector/tools/cleanup_datacatalog.py (67 lines of code) (raw):

#!/usr/bin/python
#
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Command line utility that removes Apache Atlas metadata from Data Catalog.

For the given projects it deletes every Entry returned by the
``system=apache_atlas`` search, the Entry Groups those Entries belonged to,
and every Tag Template returned by the ``type=TAG_TEMPLATE name:apache_atlas``
search.
"""

import argparse
import logging
import re
import sys

from google.api_core import exceptions
from google.cloud import datacatalog

__datacatalog = datacatalog.DataCatalogClient()

# Captures the part of an Entry resource name that precedes '/entries/',
# i.e. the name of the Entry Group that owns the Entry.
__ENTRY_NAME_PATTERN = re.compile(r'(?P<entry_group_name>.+?)/entries/(.+?)')


def __search_catalog(project_ids, query):
    """Run a Data Catalog search restricted to the given projects.

    Args:
        project_ids: Iterable of Google Cloud project IDs used as the
            search scope.
        query: Data Catalog search query string.

    Returns:
        The object returned by ``DataCatalogClient.search_catalog``; callers
        iterate over it and read ``relative_resource_name`` from each result.
    """
    scope = datacatalog.SearchCatalogRequest.Scope()
    scope.include_project_ids.extend(project_ids)

    request = datacatalog.SearchCatalogRequest()
    request.scope = scope
    request.query = query
    request.page_size = 1000

    return __datacatalog.search_catalog(request)


def __delete_entries_and_groups(project_ids):
    """Delete the Apache Atlas Entries and their Entry Groups.

    Failures reported by the API are logged as warnings and do not stop the
    clean up. An Entry Group is only scheduled for deletion when at least one
    of its Entries was deleted successfully by this run.

    Args:
        project_ids: Iterable of Google Cloud project IDs to clean up.
    """
    logging.info('\nStarting to clean up the catalog...')

    search_results = __search_catalog(project_ids, 'system=apache_atlas')

    entry_group_names = set()
    for result in search_results:
        entry_name = result.relative_resource_name
        try:
            __datacatalog.delete_entry(name=entry_name)
        except exceptions.GoogleAPICallError as e:
            logging.warning('Exception deleting entry: %s', str(e))
            continue

        logging.info('Entry deleted: %s', entry_name)

        # Guard against names that do not look like an Entry name, so that an
        # unexpected search result cannot abort the whole clean up.
        match = __ENTRY_NAME_PATTERN.match(entry_name)
        if match is None:
            logging.warning(
                'Could not determine the Entry Group of entry: %s',
                entry_name)
            continue
        entry_group_names.add(match.group('entry_group_name'))

    # Delete any pre-existing Entry Groups.
    # Sorted only to make the order of operations (and the log) deterministic.
    for entry_group_name in sorted(entry_group_names):
        try:
            __datacatalog.delete_entry_group(name=entry_group_name)
            logging.info('--> Entry Group deleted: %s', entry_group_name)
        except exceptions.GoogleAPICallError as e:
            logging.warning('Exception deleting entry group: %s', str(e))


def __delete_tag_templates(project_id):
    """Delete the Apache Atlas Tag Templates of a single project.

    Templates are deleted with ``force=True``. Failures reported by the API
    are logged as warnings and do not stop the clean up.

    Args:
        project_id: Google Cloud project ID to clean up.
    """
    search_results = __search_catalog([project_id],
                                      'type=TAG_TEMPLATE name:apache_atlas')

    for result in search_results:
        template_name = result.relative_resource_name
        try:
            __datacatalog.delete_tag_template(name=template_name, force=True)
            logging.info('--> Tag Template deleted: %s', template_name)
        except exceptions.GoogleAPICallError as e:
            logging.warning('Exception deleting Tag Template: %s', str(e))


def __split_project_ids(raw_project_ids):
    """Turn a comma separated string into a clean list of project IDs.

    Surrounding whitespace is stripped, empty items are dropped and
    duplicates are removed while keeping the first occurrence order.

    Args:
        raw_project_ids: String such as ``'project-a, project-b'``.

    Returns:
        List of non-empty, unique project ID strings; may be empty.
    """
    stripped_ids = (item.strip() for item in raw_project_ids.split(','))
    return list(dict.fromkeys(item for item in stripped_ids if item))


def __parse_args():
    """Parse and validate the command line arguments.

    Exits through ``ArgumentParser.error`` when ``--datacatalog-project-ids``
    does not contain at least one non-empty project ID.

    Returns:
        The parsed ``argparse.Namespace``; ``datacatalog_project_ids`` holds
        the raw comma separated string as typed by the user.
    """
    parser = argparse.ArgumentParser(
        description='Command line utility to remove all Apache-atlas related'
        ' metadata from Data Catalog')

    parser.add_argument(
        '--datacatalog-project-ids',
        help='List of Google Cloud project IDs split by comma.'
        ' At least one must be provided.',
        required=True)

    args = parser.parse_args()
    if not __split_project_ids(args.datacatalog_project_ids):
        parser.error(
            '--datacatalog-project-ids must contain at least one project ID')
    return args


if __name__ == "__main__":
    args = __parse_args()

    # Enable logging
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    # Split multiple values separated by comma, ignoring blanks and spaces.
    datacatalog_project_ids = __split_project_ids(
        args.datacatalog_project_ids)

    __delete_entries_and_groups(datacatalog_project_ids)
    for datacatalog_project_id in datacatalog_project_ids:
        __delete_tag_templates(datacatalog_project_id)

    logging.info('\nFinished to clean up the catalog')