def train()

in autogluon/tabular-prediction/AutoGluon-Tabular-with-SageMaker/container-training/train.py [0:0]


def train(args):
    model_output_dir = f'{args.output_dir}/data'
    
    is_distributed = len(args.hosts) > 1
    host_rank = args.hosts.index(args.current_host)
    dist_ip_addrs = args.hosts
    dist_ip_addrs.pop(host_rank)

    # Load training and validation data
    print(f'Train files: {os.listdir(args.train)}')
    train_data = __load_input_data(args.train)
    
    # Extract column info
    target = args.fit_args['label']
    columns = train_data.columns.tolist()
    column_dict = {"columns":columns}
    with open('columns.pkl', 'wb') as f:
        pickle.dump(column_dict, f)
    
    # Train models
    predictor = task.fit(
        train_data=train_data,
        output_directory=args.model_dir,
        **args.fit_args,
    )
    
    # Results summary
    predictor.fit_summary(verbosity=3)
    model_summary_fname_src = os.path.join(predictor.output_directory, 'SummaryOfModels.html')
    model_summary_fname_tgt = os.path.join(model_output_dir, 'SummaryOfModels.html')
    
    if os.path.exists(model_summary_fname_src):
        shutil.copy(model_summary_fname_src, model_summary_fname_tgt)
    
    # ensemble visualization
    G = predictor._trainer.model_graph
    remove = [node for node,degree in dict(G.degree()).items() if degree < 1]
    G.remove_nodes_from(remove)
    A = nx.nx_agraph.to_agraph(G)
    A.graph_attr.update(rankdir='BT')
    A.node_attr.update(fontsize=10)
    for node in A.iternodes():
        node.attr['shape'] = 'rectagle'
    A.draw(os.path.join(model_output_dir, 'ensemble-model.png'), format='png', prog='dot')

    # Optional test data
    if args.test:
        print(f'Test files: {os.listdir(args.test)}')
        test_data = __load_input_data(args.test)
        # Test data must be labeled for scoring
        if args.fit_args['label'] in test_data:
            # Leaderboard on test data
            print('Running model on test data and getting Leaderboard...')
            leaderboard = predictor.leaderboard(dataset=test_data, silent=True)
            print(format_for_print(leaderboard), end='\n\n')
            leaderboard.to_csv(f'{model_output_dir}/leaderboard.csv', index=False)

            # Feature importance on test data
            # Note: Feature importance must be calculated on held-out (test) data.
            # If calculated on training data it will be biased due to overfitting.
            if args.feature_importance:      
                print('Feature importance:')
                # Increase rows to print feature importance                
                pd.set_option('display.max_rows', 500)
                feature_importance = predictor.feature_importance(test_data)
                feature_importance_df = pd.DataFrame(feature_importance, columns=['Importance score']).rename_axis(index='Feature')
                print(feature_importance_df)
                feature_importance_df.to_csv(f'{model_output_dir}/feature_importance.csv', index=True)
            
            # Classification report and confusion matrix for classification model
            if predictor.problem_type in [BINARY, MULTICLASS]:
                from sklearn.metrics import classification_report, confusion_matrix
                
                X_test = test_data.drop(args.fit_args['label'], axis=1)
                y_test_true = test_data[args.fit_args['label']]
                y_test_pred = predictor.predict(X_test)
                y_test_pred_prob = predictor.predict_proba(X_test, as_multiclass=True)
                
                report_dict = classification_report(y_test_true, y_test_pred, output_dict=True, labels=predictor.class_labels)
                report_dict_df = pd.DataFrame(report_dict).T
                report_dict_df.to_csv(f'{model_output_dir}/classification_report.csv', index=True)
                
                cm = confusion_matrix(y_test_true, y_test_pred, labels=predictor.class_labels)
                cm_df = pd.DataFrame(cm, predictor.class_labels, predictor.class_labels)
                sns.set(font_scale=1)
                cmap = 'coolwarm'
                sns.heatmap(cm_df, annot=True, fmt='d', cmap=cmap)
                plt.title('Confusion Matrix')
                plt.ylabel('true label')
                plt.xlabel('predicted label')
                plt.show()
                plt.savefig(f'{model_output_dir}/confusion_matrix.png')
                
                get_roc_auc(y_test_true, y_test_pred_prob, predictor.class_labels, predictor.class_labels_internal, model_output_dir)
        else:
            warnings.warn('Skipping eval on test data since label column is not included.')

    # Files summary
    print(f'Model export summary:')
    print(f"/opt/ml/model/: {os.listdir('/opt/ml/model/')}")
    models_contents = os.listdir('/opt/ml/model/models')
    print(f"/opt/ml/model/models: {models_contents}")
    print(f"/opt/ml/model directory size: {du('/opt/ml/model/')}\n")