in tools/scripts/gen-function-support-docs.py [0:0]
def parse_logs(log_file):
    """Classify the fallback reasons found in a validation log by Spark function.

    The function relies on names defined outside of it: ``all_function_names``, ``functions``,
    ``function_to_classname``, ``classname_to_function``, ``FUNCTION_CATEGORIES``, ``GLUTEN_RESTRICTIONS``,
    ``jvm`` and ``java_import``.

    Args:
        log_file: Path of the log file to read.

    Returns:
        A ``(support_list, unresolved)`` tuple. ``support_list`` maps each of the categories ``'scalar'``,
        ``'aggregate'``, ``'generator'`` and ``'window'`` to a dict with the keys ``'partial'``,
        ``'unsupported'``, ``'unsupported_expr'`` and ``'unknown'``. The ``'unsupported_expr'`` sets hold bare
        class names; the other sets hold ``(function_name, class_name)`` tuples in which either element may be
        ``None``. ``unresolved`` is the list of fallback reasons that could not be attributed to a function
        or class.

    Raises:
        OSError: If ``log_file`` cannot be opened.
    """
    # "<>", "!=", "between", "case", and "||" are hard coded in Spark and have no corresponding functions.
    builtin_functions = ['<>', '!=', 'between', 'case', '||']
    function_names = all_function_names.copy()
    for f in builtin_functions:
        # A builtin that is not listed needs no removal; do not abort the whole run over it.
        if f in function_names:
            function_names.remove(f)
    logging.log(logging.DEBUG, f"Function names used for classification: {function_names}")

    generator_functions = ['explode', 'explode_outer', 'inline', 'inline_outer', 'posexplode',
                           'posexplode_outer', 'stack']

    # Unknown functions are not in the all_function_names list. Perhaps Spark implemented the function but did
    # not expose it to the user in the current version.
    support_list = {
        category: {'partial': set(), 'unsupported': set(), 'unsupported_expr': set(), 'unknown': set()}
        for category in ('scalar', 'aggregate', 'generator', 'window')
    }

    # Functions whose try_eval failures are attributed to try_to_binary. Entries are removed as their failures
    # are seen; try_to_binary is reported as unsupported only once the set is empty.
    try_to_binary_funcs = {'unhex', 'encode', 'unbase64'}

    unresolved = []

    def filter_fallback_reasons():
        """Return the sorted, de-duplicated fallback reasons from the log, without UDF-related ones."""
        with open(log_file, 'r') as f:
            lines = f.readlines()

        # Filter validation logs.
        validation_logs = [
            line for line in lines
            if (line.startswith(' - ') and 'Native validation failed:' not in line) or line.startswith(' |- ')
        ]

        # Extract fallback reasons: the text after the last 'due to:' or 'reason:', else the whole line.
        fallback_reasons = set()
        for line in validation_logs:
            if 'due to:' in line:
                fallback_reasons.add(line.split('due to:')[-1].strip())
            elif 'reason:' in line:
                fallback_reasons.add(line.split('reason:')[-1].strip())
            else:
                fallback_reasons.add(line)

        # Remove udf.
        return [reason for reason in sorted(fallback_reasons)
                if 'Not supported python udf' not in reason and 'Not supported scala udf' not in reason]

    def function_name_tuple(function_name):
        """Return ``(function_name, class_name)``; ``class_name`` is None when no class is known."""
        return (
            function_name, None if function_name not in function_to_classname else function_to_classname[function_name])

    def function_not_found(r):
        """Log and remember a reason from which no function or class name could be extracted."""
        logging.log(logging.WARNING, f"No function name or class name found in: {r}")
        unresolved.append(r)

    def record_function(category, status, function_name):
        """Add a function to ``status``, or to 'unknown' when it is not a listed function name."""
        bucket = status if function_name in function_names else 'unknown'
        support_list[category][bucket].add(function_name_tuple(function_name))

    def record_class(category, status, class_name):
        """Add an expression class, resolved to its function name when one is known."""
        if class_name in classname_to_function:
            function_name = classname_to_function[class_name]
            bucket = status if function_name in function_names else 'unknown'
            support_list[category][bucket].add((function_name, class_name))
        else:
            logging.log(logging.INFO,
                        f"No function name for class: {class_name}. Adding class name")
            support_list[category]['unsupported_expr'].add(class_name)

    java_import(jvm, "org.apache.gluten.expression.ExpressionMappings")
    jexpression_mappings = jvm.org.apache.gluten.expression.ExpressionMappings.listExpressionMappings()
    gluten_expressions = {}
    for item in jexpression_mappings:
        gluten_expressions[item._1()] = item._2()

    for category in FUNCTION_CATEGORIES:
        if category == 'scalar':
            for f in functions[category]:
                # Reuse the guarded lookup so that a function without a known class yields None instead of
                # raising KeyError.
                class_name = function_name_tuple(f)[1]
                # TODO: Remove this filter as it may exclude supported expressions, such as Builder.
                if (f not in builtin_functions and f not in gluten_expressions.values()
                        and class_name not in gluten_expressions.keys()):
                    logging.log(logging.WARNING, f"Function not found in gluten expressions: {f}")
                    support_list[category]['unsupported'].add(function_name_tuple(f))

        for f in GLUTEN_RESTRICTIONS[category]:
            support_list[category]['partial'].add(function_name_tuple(f))

    # Ordered extraction rules; the first rule whose marker occurs in a reason decides how it is parsed.
    # Fields: marker substring, regex capturing the name, category, bucket used for a listed function, and
    # whether the captured name is an expression class name (True) or a function name (False).
    extraction_rules = [
        ############## Scalar functions ##############
        # Not supported: Expression not in ExpressionMappings.
        ('Not supported to map spark function name to substrait function name',
         re.compile(r"class name: ([\w0-9]+)."),
         'scalar', 'unsupported', True),
        # Not supported: Function not registered in Velox.
        ('Scalar function name not registered:',
         re.compile(r"Scalar function name not registered:\s+([\w0-9]+)"),
         'scalar', 'unsupported', False),
        # Partially supported: Function registered in Velox but not registered with specific arguments.
        ('not registered with arguments:',
         re.compile(r"Scalar function ([\w0-9]+) not registered with arguments:"),
         'scalar', 'partial', False),
        # Not supported: Special case for unsupported expressions.
        ('Not support expression',
         re.compile(r"Not support expression ([\w0-9]+)"),
         'scalar', 'unsupported', True),
        # Not supported: Special case for unsupported functions.
        ('Function is not supported:',
         re.compile(r"Function is not supported:\s+([\w0-9]+)"),
         'scalar', 'unsupported', False),
        ############## Aggregate functions ##############
        ('Could not find a valid substrait mapping',
         re.compile(r"Could not find a valid substrait mapping name for ([\w0-9]+)\("),
         'aggregate', 'unsupported', False),
        ('Unsupported aggregate mode',
         re.compile(r"Unsupported aggregate mode: [\w]+ for ([\w0-9]+)"),
         'aggregate', 'partial', False),
    ]
    generator_pattern = re.compile(r"Velox backend does not support this generator:\s+([\w0-9]+)")
    try_eval_pattern = re.compile(r"try_eval\((\w+)\) is not supported")

    for r in filter_fallback_reasons():
        rule = next((candidate for candidate in extraction_rules if candidate[0] in r), None)
        if rule is not None:
            _, pattern, category, status, is_class_name = rule
            match = pattern.search(r)
            if not match:
                function_not_found(r)
            elif is_class_name:
                record_class(category, status, match.group(1))
            else:
                record_function(category, status, match.group(1))
            continue

        ############## Generator functions ##############
        if 'Velox backend does not support this generator:' in r:
            match = generator_pattern.search(r)
            if match:
                class_name = match.group(1)
                function_name = class_name.lower()
                if function_name not in generator_functions:
                    support_list['generator']['unknown'].add((None, class_name))
                elif 'outer: true' in r:
                    support_list['generator']['unsupported'].add((function_name + '_outer', None))
                else:
                    support_list['generator']['unsupported'].add(function_name_tuple(function_name))
            else:
                function_not_found(r)
        ############## Special judgements ##############
        elif 'try_eval' in r and ' is not supported' in r:
            match = try_eval_pattern.search(r)
            if match:
                function_name = match.group(1)
                if function_name in try_to_binary_funcs:
                    try_to_binary_funcs.remove(function_name)
                    p = function_name_tuple('try_to_binary')
                    # Only once every tracked function has failed is try_to_binary moved to 'unsupported'.
                    if not try_to_binary_funcs:
                        support_list['scalar']['partial'].discard(p)
                        support_list['scalar']['unsupported'].add(p)
                elif 'add' in function_name:
                    support_list['scalar']['partial'].add(function_name_tuple('try_add'))
                else:
                    # No known attribution for this try_eval target; keep the reason visible to the caller
                    # instead of dropping it silently.
                    unresolved.append(r)
            else:
                function_not_found(r)
        elif 'Pattern is not string literal for regexp_extract' == r:
            support_list['scalar']['partial'].add(function_name_tuple('regexp_extract'))
        elif 'Pattern is not string literal for regexp_extract_all' == r:
            support_list['scalar']['partial'].add(function_name_tuple('regexp_extract_all'))
        else:
            unresolved.append(r)

    return support_list, unresolved