# Automation Gallery - Credential Scan on Azure Log Analytics

__Notebook Version:__ 1.0<br>
__Python Version:__ Python 3.8<br>
__Apache Spark Version:__ 3.1<br>
__Required Packages:__ azure-monitor-query, azure-mgmt-loganalytics<br>
__Platforms Supported:__  Azure Synapse Analytics
     
__Data Source Required:__ Log Analytics tables 
    
### Description
This notebook provides step-by-step instructions and sample code to detect credentials leaked into Azure Log Analytics using the Azure SDK for Python and KQL.<br>
*** Please run the cells sequentially to avoid errors.  Please do not use "run all cells". *** <br>
Need to know more about KQL? [Getting started with Kusto Query Language](https://docs.microsoft.com/azure/data-explorer/kusto/concepts/).

## Table of Contents
1. Warm-up
2. Azure Authentication
3. Azure Log Analytics Data Queries
4. Save result to Microsoft Sentinel Dynamic Summaries

## 1. Warm-up

In [None]:
# Load Python libraries that will be used in this notebook
from azure.mgmt.loganalytics import LogAnalyticsManagementClient
from azure.monitor.query import LogsQueryClient, MetricsQueryClient, LogsQueryStatus
from azure.identity import AzureCliCredential, DefaultAzureCredential, ClientSecretCredential
from azure.core.exceptions import  HttpResponseError 

from datetime import datetime, timezone, timedelta
import pandas as pd
import json
import re
import ipywidgets
from IPython.display import display, HTML, Markdown

In [None]:
# Functions that will be used in this notebook
def get_credscan_kql_where_clause(column_name, time_range="7d"):
    """Build the KQL filter that applies the credential-scan patterns to a column.

    Args:
        column_name: Name of the column to test. The special value ``"*"`` is
            emitted as-is; any other name is wrapped in ``tostring(...)``.
        time_range: Value placed inside ``ago(...)`` in the ``TimeGenerated``
            filter. Defaults to ``"7d"``, the value that used to be hard-coded.

    Returns:
        A query fragment that starts with ``" | where TimeGenerated > ago("``
        and continues with a second ``where`` holding one ``matches regex``
        predicate per pattern. It is meant to be appended to a table name.
    """
    where_clause = " | where TimeGenerated > ago({0}) | where {1} "

    # NOTE(review): every predicate is joined to the previous one with "or",
    # except the one at this index, which is joined with "and". This is kept
    # exactly as the notebook shipped it - confirm that it is intentional.
    and_joined_index = 27

    # Raw strings with doubled backslashes: the doubled form is what ends up in
    # the query text (presumably because the pattern sits inside a
    # double-quoted KQL string literal, where backslash is the escape
    # character - TODO confirm).
    regex_list = [
        r"(?i)(ida:password|IssuerSecret|(api|client|app(lication)?)[_\\- ]?(key|secret)[^,a-z]|\\.azuredatabricks\\.net).{0,10}(dapi)?[a-z0-9/+]{22}",
        r"(?i)(x-api-(key|token).{0,10}[a-z0-9/+]{40}|v1\\.[a-z0-9/+]{40}[^a-z0-9/+])",
        r"(?-i)\\WAIza(?i)[a-z0-9_\\\\\\-]{35}\\W",
        r"(?i)(\\Wsig\\W|Secret(Value)?|IssuerSecret|(\\Wsas|primary|secondary|management|Shared(Access(Policy)?)?).?Key|\\.azure\\-devices\\.net|\\.(core|servicebus|redis\\.cache|accesscontrol|mediaservices)\\.(windows\\.net|chinacloudapi\\.cn|cloudapi\\.de|usgovcloudapi\\.net)|New\\-AzureRedisCache).{0,100}([a-z0-9/+]{43}=)",
        r"(?i)visualstudio\\.com.{1,100}\\W(?-i)[a-z2-7]{52}\\W",
        r"(?i)se=2021.+sig=[a-z0-9%]{43,63}%3d",
        r"(?i)(x-functions-key|ApiKey|Code=|\\.azurewebsites\\.net/api/).{0,100}[a-z0-9/\\+]{54}={2}",
        r"(?i)code=[a-z0-9%]{54,74}(%3d){2}",
        r"(?i)(userpwd|publishingpassword).{0,100}[a-z0-9/\\+]{60}\\W",
        r"(?i)[^a-z0-9/\\+][a-z0-9/\\+]{86}==",
        r"(?-i)\\-{5}BEGIN( ([DR]SA|EC|OPENSSH|PGP))? PRIVATE KEY( BLOCK)?\\-{5}",
        r"(?i)(app(lication)?|client)[_\\- ]?(key(url)?|secret)([\\s=:>]{1,10}|[\\s\"':=|>\\]]{3,15}|[\"'=:\\(]{2})[^\\-]",
        r"(?i)refresh[_\\-]?token([\\s=:>]{1,10}|[\\s\"':=|>\\]]{3,15}|[\"'=:\\(]{2})(\"data:text/plain,.+\"|[a-z0-9/+=_.-]{20,200})",
        r"(?i)AccessToken(Secret)?([\\s\"':=|>\\]]{3,15}|[\"'=:\\(]{2}|[\\s=:>]{1,10})[a-z0-9/+=_.-]{20,200}",
        r"(?i)[a-z0-9]{3,5}://[^%:\\s\"'/][^:\\s\"'/\\$]+[^:\\s\"'/\\$%]:([^%\\s\"'/][^@\\s\"'/]{0,100}[^%\\s\"'/])@[\\$a-z0-9:\\.\\-_%\\?=/]+",
        r"(?i)snmp(\\-server)?\\.exe.{0,100}(priv|community)",
        r"(?i)(ConvertTo\\-?SecureString\\s*((\\(|\\Wstring)\\s*)?['\"]+)",
        r"(?i)(Consumer|api)[_\\- ]?(Secret|Key)([\\s=:>]{1,10}|[\\s\"':=|>,\\]]{3,15}|[\"'=:\\(]{2})[^\\s]{5,}",
        r"(?i)authorization[,\\[:= \"']+([dbaohmnsv])",
        r"(?i)-u\\s+.{2,100}-p\\s+[^\\-/]",
        r"(?i)(amqp|ssh|(ht|f)tps?)://[^%:\\s\"'/][^:\\s\"'/\\$]+[^:\\s\"'/\\$%]:([^%\\s\"'/][^@\\s\"'/]{0,100}[^%\\s\"'/])@[\\$a-z0-9:\\.\\-_%\\?=/]+",
        r"(?i)(\\Waws|amazon)?.{0,5}(secret|access.?key).{0,10}\\W[a-z0-9/\\+]{40}",
        r"(?-i)(eyJ0eXAiOiJKV1Qi|eyJhbGci)",
        r"(?i)@(\\.(on)?)?microsoft\\.com[ -~\\s]{1,100}?(\\w?pass\\w?)",
        r"(?i)net(\\.exe)?.{1,5}(user\\s+|share\\s+/user:|user-?secrets? set)\\s+[a-z0-9]",
        r"(?i)xox[pbar]\\-[a-z0-9]",
        r"(?i)[\":\\s=]((x?corp|extranet(test)?|ntdev)(\\.microsoft\\.com)?|corp|redmond|europe|middleeast|northamerica|southpacific|southamerica|fareast|africa|exchange|extranet(test)?|partners|parttest|ntdev|ntwksta)\\W.{0,100}(password|\\Wpwd|\\Wpass|\\Wpw\\W|userpass)",
        r"(?i)(sign_in|SharePointOnlineAuthenticatedContext|(User|Exchange)Credentials?|password)[ -~\\s]{0,100}?@([a-z0-9.]+\\.(on)?)?microsoft\\.com['\"]?",
        r"(?i)(\\.database\\.azure\\.com|\\.database(\\.secure)?\\.windows\\.net|\\.cloudapp\\.net|\\.database\\.usgovcloudapi\\.net|\\.database\\.chinacloudapi\\.cn|\\.database.cloudapi.de).{0,100}(DB_PASS|(sql|service)?password|\\Wpwd\\W)",
        r"(?i)(secret(.?key)?|password)[\"']?\\s*[:=]\\s*[\"'][^\\s]+?[\"']",
        r"(?i)[^a-z\\$](DB_USER|user id|uid|(sql)?user(name)?|service\\s?account)\\s*[^\\w\\s,]([ -~\\s]{2,120}?|[ -~]{2,30}?)([^a-z\\s\\$]|\\s)\\s*(DB_PASS|(sql|service)?password|pwd)",
        r"(?i)(password|secret(key)?)[ \\t]*[=:]+[ \\t]*([^:\\s\"';,<]{2,200})",
    ]

    # The left-hand side of every predicate is the same, so compute it once.
    if column_name == "*":
        target = " " + column_name
    else:
        target = " tostring(" + column_name + ")"

    # Collect the pieces and join once instead of growing a string with "+=".
    pieces = []
    for i, re_str in enumerate(regex_list):
        if i != 0:
            pieces.append(" and " if i == and_joined_index else " or ")
        pieces.append(target + " matches regex \"" + re_str + "\"")

    return where_clause.format(time_range, "".join(pieces))

def filter_column(comumn_name):
    """Tell whether a column should be skipped by the per-column scan.

    You may customize the rules below to meet your requirements.

    Args:
        comumn_name: Name of the column to check. The parameter keeps its
            historical spelling so existing keyword callers keep working.

    Returns:
        ``True`` when the column should be skipped, ``False`` when it should
        be scanned:

        * names containing ``Description`` are always scanned;
        * otherwise names containing ``Id`` or ``TimeGenerated`` are skipped;
        * otherwise names containing an underscore followed by a lowercase
          ASCII letter are skipped;
        * everything else is scanned.
    """
    # Work on the argument itself; the body previously read a same-named
    # notebook global by accident.
    column_name = comumn_name

    if 'Description' in column_name:
        return False
    if 'Id' in column_name or 'TimeGenerated' in column_name:
        return True
    # Always return a bool instead of falling through with None.
    return re.search('_[a-z]', column_name) is not None


In [None]:
import uuid
import requests

class DynamicSummary():
    """Object model for a Microsoft Sentinel Dynamic Summary.

    The same class represents both the summary and its items: a summary is
    populated with ``construct_summary`` and each item is a separate
    ``DynamicSummary`` instance produced by ``construct_summary_item``.
    Optional attributes are only set on an instance when a value other than
    ``None`` is supplied, and only attributes that are set are serialized.
    """

    # (attribute name, JSON property name) for the optional summary members,
    # in the order they are written by serialize().
    _SUMMARY_OPTIONAL_FIELDS = (
        ("relation_name", "relationName"),
        ("relation_id", "relationId"),
        ("search_key", "searchKey"),
        ("tactics", "tactics"),
        ("techniques", "techniques"),
        ("source_info", "sourceInfo"),
    )

    # Optional item members written before / after "eventTimeUTC".
    _ITEM_FIELDS_BEFORE_TIME = (
        ("relation_name", "relationName"),
        ("relation_id", "relationId"),
        ("search_key", "searchKey"),
        ("tactics", "tactics"),
        ("techniques", "techniques"),
    )
    _ITEM_FIELDS_AFTER_TIME = (
        ("observable_type", "observableType"),
        ("observable_value", "observableValue"),
    )

    @staticmethod
    def get_new_guid():
        """Return a new random ``uuid.UUID``."""
        return uuid.uuid4()

    def __init__(self, summary_id):
        """Store ``summary_id``; every other attribute is set later."""
        self.summary_id = summary_id

    @staticmethod
    def _json_member(name, value):
        """Return ``"name": <value as JSON>`` with the value properly escaped."""
        return '"' + name + '": ' + json.dumps(value)

    def serialize(self):
        """Serialize the summary as the members of a JSON object.

        ``construct_summary`` must have been called first, because the name,
        tenant id and description attributes are read unconditionally.

        Returns:
            Comma-separated ``"key": value`` pairs WITHOUT the surrounding
            braces; the caller wraps them. When ``summary_items`` is set, a
            ``rawContent`` member is appended whose value is a JSON string
            containing the JSON array of serialized items.
        """
        members = [
            DynamicSummary._json_member("summaryId", str(self.summary_id)),
            DynamicSummary._json_member("summaryName", self.summary_name),
            DynamicSummary._json_member("azureTenantId", self.azure_tenant_id),
            DynamicSummary._json_member("summaryDescription", self.summary_description),
        ]
        for attr_name, json_name in DynamicSummary._SUMMARY_OPTIONAL_FIELDS:
            value = getattr(self, attr_name, None)
            if value is not None:
                members.append(DynamicSummary._json_member(json_name, value))

        items = getattr(self, "summary_items", None)
        if items is not None:
            # serializeItems() already returns string-escaped content, so it is
            # placed between the quotes verbatim.
            members.append('"rawContent": "[' + DynamicSummary.serializeItems(items) + ']"')

        return ", ".join(members)

    @staticmethod
    def serializeItems(items):
        """Serialize items for embedding inside the ``rawContent`` JSON string.

        Returns:
            The serialized items joined with commas (no enclosing brackets),
            escaped so the text can be placed inside a double-quoted JSON
            string.
        """
        escaped_items = []
        for item in items:
            item_json = DynamicSummary.serializeItem(item)
            # json.dumps() adds the escaping; drop only the one pair of
            # surrounding quotes it puts around the result.
            escaped_items.append(json.dumps(item_json)[1:-1])
        return ",".join(escaped_items)

    @staticmethod
    def serializeItem(item):
        """Serialize one summary item as a JSON object string.

        ``item.summary_item_id`` is always written (as ``str(...)``); the
        remaining members are written only when set and not ``None``.
        ``packed_content`` must already be JSON text: it is inserted verbatim,
        not quoted.
        """
        members = [DynamicSummary._json_member("summaryItemId", str(item.summary_item_id))]

        for attr_name, json_name in DynamicSummary._ITEM_FIELDS_BEFORE_TIME:
            value = getattr(item, attr_name, None)
            if value is not None:
                members.append(DynamicSummary._json_member(json_name, value))

        event_time = getattr(item, "event_time_utc", None)
        if event_time is not None:
            # A literal "Z" is appended below, so an aware datetime is first
            # converted to UTC and stripped of its offset; otherwise the
            # result would carry both "+00:00" and "Z".
            if event_time.tzinfo is not None:
                event_time = event_time.astimezone(timezone.utc).replace(tzinfo=None)
            members.append('"eventTimeUTC" :"' + event_time.isoformat() + 'Z"')

        for attr_name, json_name in DynamicSummary._ITEM_FIELDS_AFTER_TIME:
            value = getattr(item, attr_name, None)
            if value is not None:
                members.append(DynamicSummary._json_member(json_name, value))

        packed_content = getattr(item, "packed_content", None)
        if packed_content is not None:
            members.append('"packedContent": ' + packed_content)

        return '{' + ", ".join(members) + '}'

    def construct_summary(self, tenant_id, summary_name, summary_description, items,
                          relation_name=None, relation_id=None, search_key=None,
                          tactics=None, techniques=None, source_info=None, **kwargs):
        """Populate the summary-level attributes of this instance.

        ``tenant_id``, ``summary_name`` and ``summary_description`` are always
        stored. ``items`` (stored as ``summary_items``) and the other optional
        arguments are stored only when they are not ``None``. Extra keyword
        arguments are accepted and ignored.
        """
        self.summary_name = summary_name
        self.azure_tenant_id = tenant_id
        self.summary_description = summary_description

        optional_values = {
            "relation_name": relation_name,
            "relation_id": relation_id,
            "search_key": search_key,
            "tactics": tactics,
            "techniques": techniques,
            "source_info": source_info,
            # Test the `items` argument, not an unrelated global name.
            "summary_items": items,
        }
        for attr_name, value in optional_values.items():
            if value is not None:
                setattr(self, attr_name, value)

    def construct_summary_item(self, summary_item_id,
                               relation_name=None, relation_id=None, search_key=None,
                               tactics=None, techniques=None, event_time_utc=None,
                               observable_type=None, observable_value=None,
                               packed_content=None, **kwargs):
        """Build and return a summary item.

        The item is a new ``DynamicSummary`` that shares this summary's
        ``summary_id`` and carries ``summary_item_id`` plus every optional
        argument that is not ``None``. Extra keyword arguments are accepted
        and ignored.
        """
        item = DynamicSummary(self.summary_id)
        item.summary_item_id = summary_item_id

        optional_values = {
            "relation_name": relation_name,
            "relation_id": relation_id,
            "search_key": search_key,
            "tactics": tactics,
            "techniques": techniques,
            "event_time_utc": event_time_utc,
            "observable_type": observable_type,
            "observable_value": observable_value,
            "packed_content": packed_content,
        }
        for attr_name, value in optional_values.items():
            if value is not None:
                setattr(item, attr_name, value)

        return item

    @staticmethod
    def construct_arm_rest_url(subscription_id, resource_group, workspace_name, summary_guid):
        """Build URL for Sentinel Dynamic Summaries REST API.

        The arguments are inserted into the URL as given (no URL-encoding).
        """
        api_version = "2023-03-01-preview"
        provider_name = "Microsoft.OperationalInsights"
        workspace_provider_name = "Microsoft.SecurityInsights/dynamicSummaries"
        root_url = "https://management.azure.com"
        arm_rest_url_template = "{0}/subscriptions/{1}/resourceGroups/{2}/providers/{3}/workspaces/{4}/providers/{5}/{6}?api-version={7}"
        return arm_rest_url_template.format(root_url, subscription_id, resource_group, provider_name, workspace_name, workspace_provider_name, summary_guid, api_version)

    @staticmethod
    def call_azure_rest_api_for_creating_dynamic_summary(token, arm_rest_url, summary):
        """Send ``summary`` to ``arm_rest_url`` with an HTTP PUT.

        Args:
            token: Bearer token placed in the ``Authorization`` header.
            arm_rest_url: Target URL, e.g. from ``construct_arm_rest_url``.
            summary: Request body; sent with content type ``application/json``.

        Returns:
            The ``requests`` response object, whatever its status code.
        """
        bearer_token = "Bearer " + token
        headers = {"Authorization": bearer_token, "content-type": "application/json"}
        response = requests.put(arm_rest_url, headers=headers, data=summary, verify=True)
        return response

    @staticmethod
    def display_result(response):
        """Display the ``"value"`` member of a JSON response as a pandas.DataFrame.

        Does nothing when ``response`` is ``None``.
        """
        if response is not None:
            df = pd.DataFrame(response.json()["value"])
            display(df)

## 2. Azure Authentication

In [None]:
# Notebook parameters - fill these in before running the cells below.

# Azure AD tenant: used for ClientSecretCredential and stored in the dynamic summary.
tenant_id = ''
# Subscription that appears in the Dynamic Summaries REST URL.
subscription_id = ''
# Key vault name and linked-service name passed to mssparkutils.credentials.getSecret().
akv_name = ''
akv_link_name = ''
# Log Analytics workspace id passed to LogsQueryClient.query_workspace().
workspace_id = ''
# Names of the key vault secrets that hold the client id and client secret.
client_id_name = ''
client_secret_name = ''
# Resource group and workspace name that appear in the Dynamic Summaries REST URL.
resource_group_name_for_dynamic_summaries = ''
sentinel_workspace_name_for_dynamic_summaries = ''
# Summary name; the save/upload cells are skipped while this is empty.
dynamic_summary_name = ''
# Summary id: used as the summaryId and as the last path segment of the REST URL.
dynamic_summary_guid = ''

In [None]:
# You may need to change resource_uri for various cloud environments.
resource_uri = "https://api.loganalytics.io"
# mssparkutils is not imported in this notebook; presumably it is the utility
# object provided by the Synapse notebook runtime - TODO confirm.
# Both secrets are read from the key vault configured in the parameters cell.
client_id = mssparkutils.credentials.getSecret(akv_name, client_id_name, akv_link_name)
client_secret = mssparkutils.credentials.getSecret(akv_name, client_secret_name, akv_link_name)

# Service-principal credential shared by the token request and the query client.
credential = ClientSecretCredential(
    tenant_id=tenant_id, 
    client_id=client_id, 
    client_secret=client_secret)
# Request a token for the "<resource_uri>/.default" scope.
access_token = credential.get_token(resource_uri + "/.default")
# First element of the returned token object; presumably the bearer token
# string. It is sent later as the Authorization header of the Dynamic
# Summaries REST call.
# NOTE(review): the token is requested for resource_uri (api.loganalytics.io)
# but is later sent to https://management.azure.com - confirm that this
# audience is accepted there.
token = access_token[0]
# Client used by every Log Analytics query in the next section.
la_data_client = LogsQueryClient(credential=credential)

## 3. Azure Log Analytics Data Queries

In [None]:
# Get all tables available using Kusto query language.  If you need to know more about KQL, please check out the link provided at the introductory section.
#
# Flow of this cell:
#   1. list every table name in the workspace;
#   2. per table, run the credential-scan filter against all columns ("*");
#   3. only if that matches, re-run the filter column by column (skipping the
#      columns rejected by filter_column) and collect the matching values.
tables_result = None
table_list = None
end_time = datetime.now(timezone.utc)
start_time = end_time - timedelta(days=1)

# Defined up front so that later cells can test df_total.empty even when the
# table listing below does not succeed.
df_total = pd.DataFrame()
# Per-column result frames; they are concatenated once after the loop.
df_list = []

all_tables_query = "union withsource = SentinelTableName * | distinct SentinelTableName | sort by SentinelTableName asc"
tables_result = la_data_client.query_workspace(
    workspace_id=workspace_id,
    query=all_tables_query,
    timespan=(start_time, end_time))

if tables_result.status == LogsQueryStatus.SUCCESS:
    df_table = pd.DataFrame(data=tables_result.tables[0].rows, columns=tables_result.tables[0].columns)
    table_list = list(df_table["SentinelTableName"])

    for table_name in table_list:
        print('Table name: ' + table_name)
        column_name = "*"
        kql_where_clause = get_credscan_kql_where_clause(column_name)
        table_query = "{0}  {1}".format(table_name, kql_where_clause)

        # Run query
        try:
            try_result = la_data_client.query_workspace(
                workspace_id=workspace_id,
                query=table_query,
                timespan=(start_time, end_time))

            df_try = pd.DataFrame(data=try_result.tables[0].rows, columns=try_result.tables[0].columns)
            if df_try.empty:
                print("Not leak found.")
                continue

            # The table-wide scan matched: fetch the column names to find out
            # which columns the matches come from.
            all_columns_query = "let ColumnList = " + table_name + " | getschema | project ColumnName; ColumnList "
            columns_result = la_data_client.query_workspace(
                workspace_id=workspace_id,
                query=all_columns_query,
                timespan=(start_time, end_time))
            df_column = pd.DataFrame(data=columns_result.tables[0].rows, columns=columns_result.tables[0].columns)
            column_list = list(df_column["ColumnName"])

            for column_name in column_list:
                # Now checking each column
                if filter_column(column_name):
                    continue

                kql_where_clause = get_credscan_kql_where_clause(column_name)
                col_query = "{0}  {1} | extend ColumnName='{2}', RegexResult={2} | project ColumnName, RegexResult".format(table_name, kql_where_clause, column_name)

                # Run query; a failure on one column must not stop the scan.
                try:
                    single_column_result = la_data_client.query_workspace(
                        workspace_id=workspace_id,
                        query=col_query,
                        timespan=(start_time, end_time))

                    # process result
                    df_single_col = pd.DataFrame(data=single_column_result.tables[0].rows, columns=single_column_result.tables[0].columns)
                    if not df_single_col.empty:
                        print('Column name: ' + column_name)
                        df_list.append(df_single_col)
                except Exception as ex:
                    print("=============Exception========")
                    print(ex)
                    print("==============================")
        except HttpResponseError as error:
            # str() keeps the report working when message/reason is None.
            print("==============================")
            print(" This table got http error:")
            print(" message:" + str(error.message))
            print(" reason:" + str(error.reason))
            print("==============================")

    if df_list:
        # One concat instead of repeated DataFrame.append (removed in pandas
        # 2.0); ignore_index gives every finding a unique row label.
        df_total = pd.concat(df_list, ignore_index=True)

    if not df_total.empty:
        print('results:')
        pd.options.display.max_columns = None
        display(df_total)
    else:
        print('--- No leak ---')


## 4. Save result to Microsoft Sentinel Dynamic Summaries

In [None]:
# Build the Dynamic Summary payload: one summary item per finding in df_total.
# Skipped when there are no findings or no summary name has been configured.
if not df_total.empty and dynamic_summary_name is not None and dynamic_summary_name != '':
    summary = DynamicSummary(dynamic_summary_guid)
    summary_description = "This summary is generated from notebook - AutomationGallery-CredentialScanOnAzureLogAnalytics."

    summary_items = []
    for _index, row in df_total.iterrows():
        # Serialize the row handed out by iterrows() itself. Looking it up
        # again with df_total.iloc[label] picks the wrong row whenever the
        # labels are not the positions 0..n-1 (e.g. repeated labels).
        packed_content = row.to_json()
        # Naive UTC timestamp (the item serializer appends a literal "Z");
        # datetime.utcnow() is deprecated.
        event_time_utc = datetime.now(timezone.utc).replace(tzinfo=None)
        summary_items.append(summary.construct_summary_item(
            DynamicSummary.get_new_guid(),
            event_time_utc=event_time_utc,
            packed_content=packed_content))

    summary.construct_summary(tenant_id, dynamic_summary_name, summary_description, summary_items)
    # serialize() returns only the members, so wrap them in the request envelope.
    summary_json = "{ \"properties\": {" + summary.serialize() + "}}"

    print(summary_json)

In [None]:
# Upload the payload built in the previous cell. The guard mirrors that cell:
# nothing is sent when there are no findings or no summary name is set.
if not (df_total.empty or dynamic_summary_name in (None, '')):
    dyn_sum_api_url = DynamicSummary.construct_arm_rest_url(
        subscription_id,
        resource_group_name_for_dynamic_summaries,
        sentinel_workspace_name_for_dynamic_summaries,
        dynamic_summary_guid,
    )
    response = DynamicSummary.call_azure_rest_api_for_creating_dynamic_summary(
        token,
        dyn_sum_api_url,
        summary_json,
    )

    # Only the HTTP status code of the PUT is reported.
    print(response.status_code)