in 07-module-feature-monitoring/feature_monitoring_utils.py [0:0]
def feature_databrew_profile(fg_name, results_bucket, results_key, verbose=True):
    """Return the AWS Glue DataBrew profile job for a feature group, creating it if needed.

    The job is named ``f'{fg_name}-profile-job'``. If a DataBrew job with that
    name already exists it is returned as-is; otherwise a new profile job is
    created over the dataset named ``f'{fg_name}{databrew_dataset_suffix}'``
    with a full-dataset sample and a 120 minute timeout.

    Relies on names defined outside this function: ``FeatureGroup``,
    ``feature_store_session``, ``boto3``, ``account_id``, ``region``, ``role``,
    ``databrew_dataset_suffix`` and ``databrew_reports_suffix``.

    Args:
        fg_name: Name of the SageMaker Feature Store feature group.
        results_bucket: S3 bucket passed as the job's output ``Bucket``.
        results_key: S3 key prefix; ``databrew_reports_suffix`` is appended to
            it to form the job's output ``Key``.
        verbose: When true, print the offline-store S3 locations derived from
            the feature group description.

    Returns:
        NOTE: the return shape differs between the two paths (kept as-is so
        existing callers keep working):

        * job already exists -> tuple ``(job, job_name)`` where ``job`` is the
          matching entry from the ``list_jobs`` response;
        * job was created -> the ``create_profile_job`` response dict.
    """
    fg = FeatureGroup(name=fg_name, sagemaker_session=feature_store_session)

    # Describe the feature group once and reuse the response; each describe()
    # call is presumably a remote API request.
    offline_store_config = fg.describe()['OfflineStoreConfig']
    s3_uri = offline_store_config['S3StorageConfig']['S3Uri']
    table_name = offline_store_config['DataCatalogConfig']['TableName']

    # Offline-store locations; only used for the verbose output below.
    fg_s3_url = f'{s3_uri}/{account_id}/sagemaker/{region}/offline-store/{table_name}'
    fg_file_name = f'sagemaker-feature-store/{account_id}/sagemaker/{region}/offline-store/{table_name}/data/'

    databrew_dataset_name = f'{fg_name}{databrew_dataset_suffix}'
    databrew_reports_key = f'{results_key}{databrew_reports_suffix}'

    if verbose:
        print(s3_uri)
        print(fg_s3_url)
        print(fg_file_name)

    # Instantiate an AWS Glue DataBrew client
    databrew = boto3.client(service_name='databrew', region_name=region)

    databrew_profilejob_name = f'{fg_name}-profile-job'

    # Check whether the profile job already exists. list_jobs is paginated, so
    # walk every page: looking only at the first page would miss an existing
    # job in accounts with many jobs and then attempt to create a duplicate.
    paginator = databrew.get_paginator('list_jobs')
    for page in paginator.paginate():
        for job in page['Jobs']:
            if job['Name'] == databrew_profilejob_name:
                print(job)
                return job, databrew_profilejob_name

    databrew_output_location = {
        'Bucket': results_bucket,
        'Key': databrew_reports_key,
    }
    databrew_job_timeout = 120  # job timeout (minutes)

    # Create the AWS Glue DataBrew profile job. No Configuration is passed,
    # so no statistic overrides are applied to the profile.
    create_response = databrew.create_profile_job(
        Name=databrew_profilejob_name,
        RoleArn=role,
        DatasetName=databrew_dataset_name,
        OutputLocation=databrew_output_location,
        Timeout=databrew_job_timeout,
        JobSample={
            'Mode': 'FULL_DATASET',
        },
    )
    print(f'AWS Glue DataBrew Profile Job Created: {create_response["Name"]}')
    return create_response