def parse_logs()

in tools/scripts/gen-function-support-docs.py [0:0]


def parse_logs(log_file):
    """Classify the fallback reasons found in a validation log by function support level.

    The log is read line by line; only lines starting with ``' - '`` (except those
    containing ``'Native validation failed:'``) or with ``'   |- '`` are considered.
    For each such line the text after the last ``'due to:'`` or ``'reason:'`` marker
    is taken as the fallback reason (the raw line is used when neither marker is
    present). Reasons mentioning unsupported python/scala UDFs are dropped. Each
    remaining reason is matched against a fixed, ordered list of known messages and
    the function or class name it mentions is recorded in the result.

    Besides ``log_file``, this function reads the module-level names
    ``all_function_names``, ``functions``, ``FUNCTION_CATEGORIES``,
    ``GLUTEN_RESTRICTIONS``, ``function_to_classname``, ``classname_to_function``
    and ``jvm``. It also prints the filtered function-name collection to stdout.

    Args:
        log_file: Path of the log file to parse.

    Returns:
        A ``(support_list, unresolved)`` tuple.

        ``support_list`` maps each category (``'scalar'``, ``'aggregate'``,
        ``'generator'``, ``'window'``) to a dict with the keys ``'partial'``,
        ``'unsupported'``, ``'unsupported_expr'`` and ``'unknown'``, each holding a
        set. Entries are ``(function_name, class_name)`` tuples in which either
        element may be ``None``, except for ``'unsupported_expr'`` whose entries
        are bare class-name strings.

        ``unresolved`` is the list of fallback reasons that matched no known
        message, or whose expected name could not be extracted.
    """
    # "<>", "!=", "between", "case", and "||" are hard coded in spark and there's no corresponding functions.
    builtin_functions = ['<>', '!=', 'between', 'case', '||']
    function_names = all_function_names.copy()
    for f in builtin_functions:
        # Tolerate a builtin that is already absent instead of aborting the whole run.
        if f in function_names:
            function_names.remove(f)

    # NOTE(review): looks like leftover debug output; kept because it is observable on stdout.
    print(function_names)

    generator_functions = ['explode', 'explode_outer', 'inline', 'inline_outer', 'posexplode',
                           'posexplode_outer', 'stack']

    # unknown functions are not in the all_function_names list. Perhaps spark implemented this function but did not
    # expose it to the user for current version.
    support_list = {
        category: {'partial': set(), 'unsupported': set(), 'unsupported_expr': set(), 'unknown': set()}
        for category in ('scalar', 'aggregate', 'generator', 'window')
    }

    # Functions whose try_eval failures jointly decide the status of 'try_to_binary'; entries are removed as
    # they are seen, and 'try_to_binary' becomes unsupported once the set is empty.
    try_to_binary_funcs = {'unhex', 'encode', 'unbase64'}

    unresolved = []

    def filter_fallback_reasons():
        """Return the sorted, de-duplicated fallback reasons from the log, without UDF-related ones."""
        fallback_reasons = set()
        with open(log_file, 'r') as f:
            for line in f:
                # Keep only validation log lines.
                is_validation_log = ((line.startswith(' - ') and 'Native validation failed:' not in line)
                                     or line.startswith('   |- '))
                if not is_validation_log:
                    continue

                # Extract the fallback reason.
                if 'due to:' in line:
                    fallback_reasons.add(line.split('due to:')[-1].strip())
                elif 'reason:' in line:
                    fallback_reasons.add(line.split('reason:')[-1].strip())
                else:
                    fallback_reasons.add(line)

        # Remove udf.
        return [reason for reason in sorted(fallback_reasons)
                if 'Not supported python udf' not in reason and 'Not supported scala udf' not in reason]

    def function_name_tuple(function_name):
        """Pair a function name with its class name, or with None when no class name is known."""
        return (
            function_name, None if function_name not in function_to_classname else function_to_classname[function_name])

    def function_not_found(r):
        """Log a reason whose function/class name could not be extracted and mark it unresolved."""
        logging.log(logging.WARNING, f"No function name or class name found in: {r}")
        unresolved.append(r)

    def record_function(r, pattern, category, status):
        """Extract a function name from r and file it under category/status, or 'unknown' if it is not listed."""
        match = re.search(pattern, r)
        if not match:
            function_not_found(r)
            return
        function_name = match.group(1)
        bucket = status if function_name in function_names else 'unknown'
        support_list[category][bucket].add(function_name_tuple(function_name))

    def record_scalar_class(r, pattern):
        """Extract an expression class name from r and file it (or its function) as an unsupported scalar."""
        match = re.search(pattern, r)
        if not match:
            function_not_found(r)
            return
        class_name = match.group(1)
        if class_name not in classname_to_function:
            logging.log(logging.INFO,
                        f"No function name for class: {class_name}. Adding class name")
            support_list['scalar']['unsupported_expr'].add(class_name)
            return
        function_name = classname_to_function[class_name]
        bucket = 'unsupported' if function_name in function_names else 'unknown'
        support_list['scalar'][bucket].add((function_name, class_name))

    java_import(jvm, "org.apache.gluten.expression.ExpressionMappings")
    jexpression_mappings = jvm.org.apache.gluten.expression.ExpressionMappings.listExpressionMappings()
    # Built from the (first, second) elements of each mapping pair; below, class names are looked up among the
    # keys and function names among the values.
    gluten_expressions = {}
    for item in jexpression_mappings:
        gluten_expressions[item._1()] = item._2()

    for category in FUNCTION_CATEGORIES:
        if category == 'scalar':
            for f in functions[category]:
                # TODO: Remove this filter as it may exclude supported expressions, such as Builder.
                if f in builtin_functions or f in gluten_expressions.values():
                    continue
                # Go through function_name_tuple so that a function without a known class name yields None
                # instead of raising KeyError.
                entry = function_name_tuple(f)
                if entry[1] not in gluten_expressions:
                    logging.log(logging.WARNING, f"Function not found in gluten expressions: {f}")
                    support_list[category]['unsupported'].add(entry)

        for f in GLUTEN_RESTRICTIONS[category]:
            support_list[category]['partial'].add(function_name_tuple(f))

    # The order of the checks below matters: the first marker found in a reason decides how it is handled.
    for r in filter_fallback_reasons():
        ############## Scalar functions ##############
        # Not supported: Expression not in ExpressionMappings.
        if 'Not supported to map spark function name to substrait function name' in r:
            record_scalar_class(r, r"class name: ([\w0-9]+).")

        # Not supported: Function not registered in Velox.
        elif 'Scalar function name not registered:' in r:
            record_function(r, r"Scalar function name not registered:\s+([\w0-9]+)", 'scalar', 'unsupported')

        # Partially supported: Function registered in Velox but not registered with specific arguments.
        elif 'not registered with arguments:' in r:
            record_function(r, r"Scalar function ([\w0-9]+) not registered with arguments:", 'scalar', 'partial')

        # Not supported: Special case for unsupported expressions.
        elif 'Not support expression' in r:
            record_scalar_class(r, r"Not support expression ([\w0-9]+)")

        # Not supported: Special case for unsupported functions.
        elif 'Function is not supported:' in r:
            record_function(r, r"Function is not supported:\s+([\w0-9]+)", 'scalar', 'unsupported')

        ############## Aggregate functions ##############
        elif 'Could not find a valid substrait mapping' in r:
            record_function(r, r"Could not find a valid substrait mapping name for ([\w0-9]+)\(",
                            'aggregate', 'unsupported')

        elif 'Unsupported aggregate mode' in r:
            record_function(r, r"Unsupported aggregate mode: [\w]+ for ([\w0-9]+)", 'aggregate', 'partial')

        ############## Generator functions ##############
        elif 'Velox backend does not support this generator:' in r:
            match = re.search(r"Velox backend does not support this generator:\s+([\w0-9]+)", r)

            if match:
                # The message carries a class name; its lower-cased form is used as the function name.
                class_name = match.group(1)
                function_name = class_name.lower()
                if function_name not in generator_functions:
                    support_list['generator']['unknown'].add((None, class_name))
                elif 'outer: true' in r:
                    support_list['generator']['unsupported'].add((function_name + '_outer', None))
                else:
                    support_list['generator']['unsupported'].add(function_name_tuple(function_name))
            else:
                function_not_found(r)

        ############## Special judgements ##############
        elif 'try_eval' in r and ' is not supported' in r:
            match = re.search(r"try_eval\((\w+)\) is not supported", r)

            if match:
                function_name = match.group(1)
                if function_name in try_to_binary_funcs:
                    try_to_binary_funcs.remove(function_name)

                    # Only once every tracked function has failed is 'try_to_binary' moved to unsupported.
                    if not try_to_binary_funcs:
                        p = function_name_tuple('try_to_binary')
                        support_list['scalar']['partial'].discard(p)
                        support_list['scalar']['unsupported'].add(p)

                elif 'add' in function_name:
                    support_list['scalar']['partial'].add(function_name_tuple('try_add'))
            else:
                function_not_found(r)

        elif 'Pattern is not string literal for regexp_extract' == r:
            support_list['scalar']['partial'].add(function_name_tuple('regexp_extract'))

        elif 'Pattern is not string literal for regexp_extract_all' == r:
            support_list['scalar']['partial'].add(function_name_tuple('regexp_extract_all'))

        else:
            unresolved.append(r)

    return support_list, unresolved