in autogluon/tabular-prediction/AutoGluon-Tabular-with-SageMaker/container-training/train.py [0:0]
def train(args):
    """Train an AutoGluon Tabular predictor inside a SageMaker training container.

    Loads training data from ``args.train``, fits an AutoGluon ``task`` predictor,
    writes model artifacts to ``args.model_dir``, and exports reports
    (fit summary, ensemble graph, leaderboard, feature importance,
    classification report, confusion matrix, ROC-AUC) to ``args.output_dir/data``.

    Parameters
    ----------
    args : argparse.Namespace
        Expected attributes: ``output_dir``, ``model_dir``, ``hosts``,
        ``current_host``, ``train``, ``test``, ``fit_args`` (dict with at
        least a ``'label'`` key), and ``feature_importance`` (truthy flag).
    """
    model_output_dir = f'{args.output_dir}/data'
    # Reports below are written here; create it up front so copies/saves don't fail.
    os.makedirs(model_output_dir, exist_ok=True)

    is_distributed = len(args.hosts) > 1
    host_rank = args.hosts.index(args.current_host)
    # Copy before popping so args.hosts itself is not mutated.
    dist_ip_addrs = list(args.hosts)
    dist_ip_addrs.pop(host_rank)

    # Load training data
    print(f'Train files: {os.listdir(args.train)}')
    train_data = __load_input_data(args.train)

    # Persist column layout so inference can validate/reorder incoming features.
    target = args.fit_args['label']
    columns = train_data.columns.tolist()
    column_dict = {"columns": columns}
    with open('columns.pkl', 'wb') as f:
        pickle.dump(column_dict, f)

    # Train models
    predictor = task.fit(
        train_data=train_data,
        output_directory=args.model_dir,
        **args.fit_args,
    )

    # Results summary (verbosity=3 also writes SummaryOfModels.html)
    predictor.fit_summary(verbosity=3)
    model_summary_fname_src = os.path.join(predictor.output_directory, 'SummaryOfModels.html')
    model_summary_fname_tgt = os.path.join(model_output_dir, 'SummaryOfModels.html')
    if os.path.exists(model_summary_fname_src):
        shutil.copy(model_summary_fname_src, model_summary_fname_tgt)

    # Ensemble visualization: render the trainer's model graph via graphviz,
    # dropping isolated nodes (degree < 1) that would clutter the diagram.
    G = predictor._trainer.model_graph
    remove = [node for node, degree in dict(G.degree()).items() if degree < 1]
    G.remove_nodes_from(remove)
    A = nx.nx_agraph.to_agraph(G)
    A.graph_attr.update(rankdir='BT')
    A.node_attr.update(fontsize=10)
    for node in A.iternodes():
        # NOTE: was 'rectagle' (typo) — an invalid graphviz shape silently
        # falls back to the default; 'rectangle' is the intended shape.
        node.attr['shape'] = 'rectangle'
    A.draw(os.path.join(model_output_dir, 'ensemble-model.png'), format='png', prog='dot')

    # Optional test data
    if args.test:
        print(f'Test files: {os.listdir(args.test)}')
        test_data = __load_input_data(args.test)
        # Test data must be labeled for scoring
        if args.fit_args['label'] in test_data:
            # Leaderboard on test data
            print('Running model on test data and getting Leaderboard...')
            leaderboard = predictor.leaderboard(dataset=test_data, silent=True)
            print(format_for_print(leaderboard), end='\n\n')
            leaderboard.to_csv(f'{model_output_dir}/leaderboard.csv', index=False)

            # Feature importance on test data
            # Note: Feature importance must be calculated on held-out (test) data.
            # If calculated on training data it will be biased due to overfitting.
            if args.feature_importance:
                print('Feature importance:')
                # Increase rows to print feature importance
                pd.set_option('display.max_rows', 500)
                feature_importance = predictor.feature_importance(test_data)
                feature_importance_df = pd.DataFrame(feature_importance, columns=['Importance score']).rename_axis(index='Feature')
                print(feature_importance_df)
                feature_importance_df.to_csv(f'{model_output_dir}/feature_importance.csv', index=True)

            # Classification report and confusion matrix for classification model
            if predictor.problem_type in [BINARY, MULTICLASS]:
                from sklearn.metrics import classification_report, confusion_matrix
                X_test = test_data.drop(args.fit_args['label'], axis=1)
                y_test_true = test_data[args.fit_args['label']]
                y_test_pred = predictor.predict(X_test)
                y_test_pred_prob = predictor.predict_proba(X_test, as_multiclass=True)

                report_dict = classification_report(y_test_true, y_test_pred, output_dict=True, labels=predictor.class_labels)
                report_dict_df = pd.DataFrame(report_dict).T
                report_dict_df.to_csv(f'{model_output_dir}/classification_report.csv', index=True)

                cm = confusion_matrix(y_test_true, y_test_pred, labels=predictor.class_labels)
                cm_df = pd.DataFrame(cm, predictor.class_labels, predictor.class_labels)
                sns.set(font_scale=1)
                cmap = 'coolwarm'
                sns.heatmap(cm_df, annot=True, fmt='d', cmap=cmap)
                plt.title('Confusion Matrix')
                plt.ylabel('true label')
                plt.xlabel('predicted label')
                # Save BEFORE show(): show() flushes/clears the figure on many
                # backends, which previously produced a blank confusion_matrix.png.
                plt.savefig(f'{model_output_dir}/confusion_matrix.png')
                plt.show()

                get_roc_auc(y_test_true, y_test_pred_prob, predictor.class_labels, predictor.class_labels_internal, model_output_dir)
        else:
            warnings.warn('Skipping eval on test data since label column is not included.')

    # Files summary
    print('Model export summary:')
    print(f"/opt/ml/model/: {os.listdir('/opt/ml/model/')}")
    models_contents = os.listdir('/opt/ml/model/models')
    print(f"/opt/ml/model/models: {models_contents}")
    print(f"/opt/ml/model directory size: {du('/opt/ml/model/')}\n")