def feature_databrew

def feature_databrew_profile()

in 07-module-feature-monitoring/feature_monitoring_utils.py [0:0]
45 lines of code
6 McCabe index (conditional complexity)

def feature_databrew_profile(fg_name, results_bucket, results_key, verbose=True):
    """Create the AWS Glue DataBrew profile job for a feature group, or reuse it.

    The job is named ``f'{fg_name}-profile-job'``. If a DataBrew job with that
    name already exists it is returned as-is; otherwise a new profile job is
    created over the dataset ``f'{fg_name}{databrew_dataset_suffix}'`` that
    samples the full dataset and writes its report to ``results_bucket`` under
    the key ``f'{results_key}{databrew_reports_suffix}'``.

    Relies on module-level ``feature_store_session``, ``account_id``,
    ``region``, ``role``, ``databrew_dataset_suffix`` and
    ``databrew_reports_suffix``.

    Args:
        fg_name: Name of the SageMaker Feature Store feature group.
        results_bucket: S3 bucket that receives the profile report.
        results_key: S3 key prefix for the report; the reports suffix is
            appended to it.
        verbose: When true, print the offline-store S3 locations derived from
            the feature group description.

    Returns:
        ``(job, job_name)`` when the job already exists, where ``job`` is the
        matching entry from ``list_jobs``; otherwise the ``create_profile_job``
        response dict.
        NOTE(review): the two return shapes differ; kept unchanged so existing
        callers keep working.
    """
    fg = FeatureGroup(name=fg_name, sagemaker_session=feature_store_session)

    # Describe once and reuse the response (each describe() is a service call).
    offline_store_config = fg.describe()['OfflineStoreConfig']
    s3_uri = offline_store_config['S3StorageConfig']['S3Uri']
    table_name = offline_store_config['DataCatalogConfig']['TableName']

    if verbose:
        # These locations are informational only; nothing below consumes them.
        fg_s3_url = f'{s3_uri}/{account_id}/sagemaker/{region}/offline-store/{table_name}'
        fg_file_name = f'sagemaker-feature-store/{account_id}/sagemaker/{region}/offline-store/{table_name}/data/'
        print(s3_uri)
        print(fg_s3_url)
        print(fg_file_name)

    databrew_dataset_name = f'{fg_name}{databrew_dataset_suffix}'
    databrew_reports_key = f'{results_key}{databrew_reports_suffix}'
    databrew_profilejob_name = f'{fg_name}-profile-job'

    # Instantiate an AWS Glue DataBrew client
    databrew = boto3.client(service_name='databrew', region_name=region)

    # Reuse the profile job if it already exists. All result pages are
    # searched so an existing job is not missed when the account has more
    # jobs than fit in a single list_jobs response.
    existing_job = _find_databrew_job(databrew, databrew_profilejob_name)
    if existing_job is not None:
        print(existing_job)
        return existing_job, databrew_profilejob_name

    databrew_output_location = {
        'Bucket': results_bucket,
        'Key': databrew_reports_key,
    }
    databrew_job_timeout = 120  # job timeout (minutes)

    # Create AWS Glue DataBrew Profile Job (no Configuration override is sent)
    create_response = databrew.create_profile_job(
        Name=databrew_profilejob_name,
        RoleArn=role,
        DatasetName=databrew_dataset_name,
        OutputLocation=databrew_output_location,
        Timeout=databrew_job_timeout,
        JobSample={
            'Mode': 'FULL_DATASET'
        },
    )

    print(f'AWS Glue DataBrew Profile Job Created: {create_response["Name"]}')

    return create_response


def _find_databrew_job(databrew, job_name):
    """Return the ``list_jobs`` entry whose name is *job_name*, or ``None``."""
    request = {}
    while True:
        page = databrew.list_jobs(**request)
        for job in page["Jobs"]:
            if job["Name"] == job_name:
                return job
        next_token = page.get("NextToken")
        if not next_token:
            return None
        # Ask for the following page on the next iteration.
        request = {"NextToken": next_token}