in src/buildTrainTest.py [0:0]
def lambda_handler(event, context):
    """Prepare the training CSV, upload it to S3 and start a Comprehend classifier.

    Steps performed:
      1. Read ``data/training/aim317-cust-class-train-data.csv`` (relative to
         the working directory) and keep only the ``label`` and ``text``
         columns, with ``label`` cast to ``str``.
      2. Write those columns as a header-less CSV (label first) to a
         temporary file and upload it to
         ``s3://<bucket>/Comprehend-Custom-Classification/data/training/comprehend-train.csv``.
      3. Call ``create_document_classifier`` with that object as input and
         ``.../output/train_job`` as the output location, then query the new
         classifier once with ``describe_document_classifier`` and print its
         status.

    Environment variables:
        s3Bucket: name of the bucket that receives the training data and
            the training job output.
        ServiceRoleArn: IAM role ARN passed to Comprehend as
            ``DataAccessRoleArn``.

    Args:
        event: Lambda invocation payload (not used).
        context: Lambda runtime context (not used).

    Returns:
        None.

    Raises:
        KeyError: if ``s3Bucket`` or ``ServiceRoleArn`` is not set, or the
            input CSV lacks a ``label`` / ``text`` column.
    """
    # Function-scope import so the file's top-level import block is untouched.
    import tempfile

    # The bucket comes from the deployment configuration only; it must not be
    # overridden by a hard-coded workshop bucket name.
    bucket = os.environ['s3Bucket']
    prefix = 'Comprehend-Custom-Classification'
    # Key suffix (below ``prefix``) of the uploaded training file.
    train_key_suffix = 'data/training/comprehend-train.csv'

    s3 = boto3.client('s3')
    comprehend = boto3.client('comprehend')

    raw_data = pd.read_csv('data/training/aim317-cust-class-train-data.csv')
    raw_data['label'] = raw_data['label'].astype(str)
    selected_data = raw_data[['label', 'text']]

    # Stage the CSV in a temporary directory: the working directory of a
    # Lambda function is read-only, and the context manager removes the file
    # afterwards so reused execution environments do not accumulate data.
    with tempfile.TemporaryDirectory() as workdir:
        local_train_file = os.path.join(
            workdir, os.path.basename(train_key_suffix)
        )
        selected_data.to_csv(
            path_or_buf=local_train_file,
            header=False,
            index=False,
            escapechar='\\',
            doublequote=False,
            quotechar='"',
        )
        s3.upload_file(local_train_file, bucket, prefix + '/' + train_key_suffix)

    s3_train_data = 's3://{}/{}/{}'.format(bucket, prefix, train_key_suffix)
    s3_output_job = 's3://{}/{}/{}'.format(bucket, prefix, 'output/train_job')
    print('training data location: ',s3_train_data, "output location:", s3_output_job)

    # Epoch seconds make the classifier name unique per invocation.
    # ``timestamp()`` is portable, unlike the platform-specific
    # ``strftime("%s")`` extension.
    job_suffix = str(int(datetime.datetime.now().timestamp()))

    training_job = comprehend.create_document_classifier(
        DocumentClassifierName='BYOD-Custom-Classifier-' + job_suffix,
        DataAccessRoleArn=os.environ['ServiceRoleArn'],
        InputDataConfig={
            'S3Uri': s3_train_data,
        },
        OutputDataConfig={
            'S3Uri': s3_output_job,
        },
        LanguageCode='en',
        VersionName='v001',
    )

    # Look the classifier up once so the submission state appears in the logs.
    response = comprehend.describe_document_classifier(
        DocumentClassifierArn=training_job['DocumentClassifierArn']
    )
    properties = response.get('DocumentClassifierProperties', {})
    print('classifier:', training_job['DocumentClassifierArn'],
          'status:', properties.get('Status'))