def cluster_auditd_processes()

in msticpy/analysis/cluster_auditd.py [0:0]


def cluster_auditd_processes(audit_data: pd.DataFrame, app: str = None) -> pd.DataFrame:
    """
    Cluster auditd process creation events into groups of similar processes.

    Parameters
    ----------
    audit_data : pd.DataFrame
        The Audit data containing process creation events
    app : str, optional
        The name of a specific app you wish to cluster. When supplied, only
        rows whose "exe" value contains this string are kept. The value is
        passed to `Series.str.contains`, which interprets it as a regular
        expression by default; rows with a missing "exe" are excluded.

    Returns
    -------
    pd.DataFrame
        The clustered events returned by `dbcluster_events`, restricted to
        the columns TimeGenerated, NewProcessName, CommandLine, pid,
        SubjectUserSid, cwd, ClusterSize and ppid.

    """
    if app is not None:
        processes = audit_data[audit_data["exe"].str.contains(app, na=False)]
    else:
        processes = audit_data

    # Map auditd field names onto the process-event column names used by the
    # feature and clustering steps below. `rename` returns a new DataFrame,
    # so the columns added afterwards are not written to `audit_data`.
    # NOTE(review): "acct" and "user" both map to "SubjectUserName"; an input
    # carrying both would end up with two columns of that name - confirm that
    # callers only ever supply one of them.
    auditd_to_process_cols = {
        "acct": "SubjectUserName",
        "uid": "SubjectUserSid",
        "user": "SubjectUserName",
        "ses": "SubjectLogonId",
        "pid": "NewProcessId",
        "exe": "NewProcessName",
        "ppid": "ProcessId",
        "cmdline": "CommandLine",
    }
    processes = processes.rename(columns=auditd_to_process_cols)

    # Columns that must exist before feature extraction; any that the input
    # did not provide are created as empty strings.
    required_cols = (
        "cwd",
        "SubjectUserName",
        "SubjectUserSid",
        "SubjectLogonId",
        "NewProcessId",
        "NewProcessName",
        "ProcessId",
        "CommandLine",
    )
    for col in required_cols:
        if col not in processes:
            processes[col] = ""

    feature_procs_h1 = add_process_features(input_frame=processes)

    # Only the clustered events are needed; the other two return values of
    # dbcluster_events are ignored.
    clus_events, _, _ = dbcluster_events(
        data=feature_procs_h1,
        cluster_columns=["pathScore", "SubjectUserSid"],
        time_column="TimeGenerated",
        max_cluster_distance=0.0001,
    )

    output_cols = [
        "TimeGenerated",
        "NewProcessName",
        "CommandLine",
        "NewProcessId",
        "SubjectUserSid",
        "cwd",
        "ClusterSize",
        "ProcessId",
    ]
    # Hand the process identifiers back under their original auditd names.
    return clus_events[output_cols].rename(
        columns={"NewProcessId": "pid", "ProcessId": "ppid"}
    )