tools/ml-auto-eda/ml_eda/reporting/content_generator.py (261 lines of code) (raw):
# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Generate EDA report content based on the performed analysis"""
from typing import Union, List, Tuple, Text
from ml_eda.proto import analysis_entity_pb2
from ml_eda.orchestration.analysis_tracker import AnalysisTracker
from ml_eda.constants import c
from ml_eda.reporting import template
from ml_eda.reporting import utils
from ml_eda.reporting import recommendation
Analysis = analysis_entity_pb2.Analysis
"""
Each function in this module is responsible to generate the report content for
one of the sections.
The function signature here should be
func(analysis_tracker, figure_base_path) -> (content, additional_info)
analysis_tracker: AnalysisTracker
figure_base_path: Text
content: Text
additional_info: List[Text]
content: the markdown content for the section
additional_info: the possible warnings and recommendations from the analysis
Both content and additional_info can be None, depending on whether the
corresponding results can be obtained.
Even though some functions may not need figure_base_path and/or may not
generate additional info, the signature must be preserved for
downstream function calls.
"""
# pylint: disable-msg=unused-argument
def create_dataset_info_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text = ''
) -> Tuple[Text, None]:
  """Render the dataset overview placed at the top of the report.

  The overview carries no section title and never yields additional info.

  Args:
    analysis_tracker: (AnalysisTracker), holder for all the analysis
    figure_base_path: (string), not used, kept for signature consistency

  Returns:
    Tuple[Text, None], (section_content, None)
  """
  target_name = analysis_tracker.get_target_attribute().name
  problem_type = analysis_tracker.get_job_config().ml_type
  numerical_names = analysis_tracker.get_num_attribute_names()
  categorical_names = analysis_tracker.get_cat_attribute_names()
  data_location = analysis_tracker.get_job_config().datasource.location

  # Only the number of attributes of each kind is reported, not their names.
  template_fields = {
      'location': data_location,
      'numerical_attributes': len(numerical_names),
      'categorical_attributes': len(categorical_names),
      'target_name': target_name,
      'ml_problem_type': problem_type,
  }
  overview = template.DATASET_INFO_TEMPLATE.format(**template_fields)
  return overview, None
def create_descriptive_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text
) -> Tuple[Text, List[Text]]:
  """Create descriptive section of the report.

  One table row is produced per attribute: numerical attributes first, then
  categorical attributes. Every attribute is checked for missing values;
  categorical attributes are additionally checked for cardinality.

  Args:
    analysis_tracker: (AnalysisTracker), holder for all the analysis
    figure_base_path: (string), the folder for holding figures

  Returns:
    Tuple[Text, List[Text]], (section_content, List[warnings]). When warnings
    exist they are also appended to section_content as warning notes.
  """
  descriptive_name = Analysis.Name.Name(Analysis.DESCRIPTIVE)

  # Each group is (attribute names, name of the companion analysis rendered
  # next to the basic statistics, whether the cardinality check applies).
  # Numerical attributes are paired with their histogram, categorical
  # attributes with their value counts.
  attribute_groups = [
      (analysis_tracker.get_num_attribute_names(),
       Analysis.Name.Name(Analysis.HISTOGRAM),
       False),
      (analysis_tracker.get_cat_attribute_names(),
       Analysis.Name.Name(Analysis.VALUE_COUNTS),
       True),
  ]

  # holders for section content and warnings based on descriptive analysis
  contents = []
  warnings = []

  for attribute_names, additional_name, with_cardinality in attribute_groups:
    for att in attribute_names:
      # base analysis is the one holding basic descriptive statistics
      base_analysis = analysis_tracker.get_analysis_by_attribute_and_name(
          att, descriptive_name)[0]
      additional_analysis = analysis_tracker.get_analysis_by_attribute_and_name(
          att, additional_name)[0]

      contents.append(utils.create_table_descriptive_row_from_analysis(
          attribute_name=att,
          base_analysis=base_analysis,
          additional_analysis=additional_analysis,
          figure_base_path=figure_base_path
      ))

      # check missing value condition
      missing_check = recommendation.check_missing(att, base_analysis)
      if missing_check:
        warnings.append(missing_check)

      # check cardinality condition (categorical attributes only)
      if with_cardinality:
        cardinality_check = recommendation.check_cardinality(
            att, base_analysis)
        if cardinality_check:
          warnings.append(cardinality_check)

  # finally all the descriptive analysis result will be organised in a table
  table_content = template.TABLE_DESCRIPTIVE_TEMPLATE.format(
      row_content=''.join(contents))
  if warnings:
    table_content = table_content + utils.create_warning_notes(warnings)
  return table_content, warnings
def create_pearson_correlation_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text
) -> Union[Tuple[Text, List[Text]], Tuple[None, None]]:
  """Build the correlation section for numerical attributes.

  Args:
    analysis_tracker: (AnalysisTracker), holder for all the analysis
    figure_base_path: (string), the folder for holding figures

  Returns:
    Union[Tuple[Text, List[Text]], Tuple[None, None]],
    (section_content, List[warning]); (None, None) when the tracker holds no
    pearson correlation analysis.
  """
  # each pair of numerical attributes has one corresponding analysis
  pearson_results = analysis_tracker.get_analysis_by_name(
      Analysis.Name.Name(Analysis.PEARSON_CORRELATION))
  if not pearson_results:
    return None, None

  section = utils.create_no_order_pair_metric_section(
      analysis_list=pearson_results,
      same_match_value=1.0,
      table_name="Correlation",
      figure_base_path=figure_base_path)

  # keep only the checks that actually produced a warning
  check_results = (recommendation.check_pearson_correlation(item)
                   for item in pearson_results)
  notes = [note for note in check_results if note]

  if notes:
    section = section + utils.create_warning_notes(notes)
  return section, notes
def create_information_gain_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text
) -> Union[Tuple[Text, None], Tuple[None, None]]:
  """Build the information gain section for categorical attributes.

  This section never produces additional info.

  Args:
    analysis_tracker: (AnalysisTracker), holder for all the analysis
    figure_base_path: (string), the folder for holding figures

  Returns:
    Union[Tuple[Text, None], Tuple[None, None]], (section_content, None);
    (None, None) when the tracker holds no information gain analysis.
  """
  # each pair of categorical attributes has one corresponding analysis
  ig_results = analysis_tracker.get_analysis_by_name(
      Analysis.Name.Name(Analysis.INFORMATION_GAIN))
  if not ig_results:
    return None, None

  section = utils.create_no_order_pair_metric_section(
      analysis_list=ig_results,
      same_match_value=0.0,
      table_name="Information-Gain",
      figure_base_path=figure_base_path)
  return section, None
# pylint: disable-msg=unused-argument
def create_anova_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text = ''
) -> Union[Tuple[Text, List[Text]], Tuple[None, None]]:
  """Build the anova section.

  Args:
    analysis_tracker: (AnalysisTracker), holder for all the analysis
    figure_base_path: (string), not used, kept for signature consistency

  Returns:
    Union[Tuple[Text, List[Text]], Tuple[None, None]],
    (section_content, List[warning]); (None, None) when the tracker holds no
    anova analysis.
  """
  # each pair of numerical and categorical attributes has
  # one corresponding analysis
  anova_results = analysis_tracker.get_analysis_by_name(
      Analysis.Name.Name(Analysis.ANOVA))
  if not anova_results:
    return None, None

  section = utils.create_order_pair_metric_section(
      analysis_list=anova_results,
      same_match_value='NA')

  # keep only the p-value checks that actually produced a warning
  check_results = (recommendation.check_p_value(item)
                   for item in anova_results)
  notes = [note for note in check_results if note]

  if notes:
    section = section + utils.create_warning_notes(notes)
  return section, notes
# pylint: disable-msg=unused-argument
def create_chi_square_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text = ''
) -> Union[Tuple[Text, List[Text]], Tuple[None, None]]:
  """Build the chi-square section.

  Args:
    analysis_tracker: (AnalysisTracker), holder for all the analysis
    figure_base_path: (string), not used, kept for signature consistency

  Returns:
    Union[Tuple[Text, List[Text]], Tuple[None, None]],
    (section_content, List[warning]); (None, None) when the tracker holds no
    chi-square analysis.
  """
  # extract the chi-square analysis result
  # each pair of categorical attributes has one corresponding analysis
  chi_square_results = analysis_tracker.get_analysis_by_name(
      Analysis.Name.Name(Analysis.CHI_SQUARE))
  if not chi_square_results:
    return None, None

  # the literal 'NA' is passed as figure path; the caller-supplied
  # figure_base_path is deliberately ignored here
  section = utils.create_no_order_pair_metric_section(
      analysis_list=chi_square_results,
      same_match_value='NA',
      figure_base_path='NA')

  # keep only the p-value checks that actually produced a warning
  check_results = (recommendation.check_p_value(item)
                   for item in chi_square_results)
  notes = [note for note in check_results if note]

  if notes:
    section = section + utils.create_warning_notes(notes)
  return section, notes
# pylint: disable-msg=unused-argument
def create_contingency_table_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text = ''
) -> Union[Tuple[Text, None], Tuple[None, None]]:
  """Build the contingency table section.

  Every contingency table analysis contributes a title of the form
  "<first feature> / <second feature>", the rendered table of its first
  table metric, and a line-break separator.

  Args:
    analysis_tracker: (AnalysisTracker), holder for all the analysis
    figure_base_path: (string), not used, kept for signature consistency

  Returns:
    Union[Tuple[Text, None], Tuple[None, None]], (section_content, None);
    (None, None) when the tracker holds no contingency table analysis.
  """
  table_results = analysis_tracker.get_analysis_by_name(
      Analysis.Name.Name(Analysis.CONTINGENCY_TABLE))
  if not table_results:
    return None, None

  pieces = []
  for table_analysis in table_results:
    names = [feature.name for feature in table_analysis.features]
    pair_label = "{} / {}".format(names[0], names[1])
    pieces.append(
        template.SUB_SUB_SUB_SECTION_TITLE.format(content=pair_label))
    pieces.append(
        utils.create_table_from_table_metric(table_analysis.tmetrics[0]))
    pieces.append("\n<br/>\n")
  return ''.join(pieces), None
# pylint: disable-msg=unused-argument
def create_table_descriptive_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text = ''
) -> Union[Tuple[Text, None], Tuple[None, None]]:
  """Build the descriptive table section.

  Every table descriptive analysis contributes a title, the rendered table of
  its first table metric, and a line-break separator. The title lists the
  analysis features in reversed order: "<last feature> / <second to last>".

  Args:
    analysis_tracker: (AnalysisTracker), holder for all the analysis
    figure_base_path: (string), not used, kept for signature consistency

  Returns:
    Union[Tuple[Text, None], Tuple[None, None]], (section_content, None);
    (None, None) when the tracker holds no table descriptive analysis.
  """
  table_results = analysis_tracker.get_analysis_by_name(
      Analysis.Name.Name(Analysis.TABLE_DESCRIPTIVE))
  if not table_results:
    return None, None

  pieces = []
  for table_analysis in table_results:
    # features are displayed back to front in the title
    names = list(reversed(
        [feature.name for feature in table_analysis.features]))
    pair_label = "{} / {}".format(names[0], names[1])
    pieces.append(
        template.SUB_SUB_SUB_SECTION_TITLE.format(content=pair_label))
    pieces.append(
        utils.create_table_from_table_metric(table_analysis.tmetrics[0]))
    pieces.append("\n<br/>\n")
  return ''.join(pieces), None
# pylint: disable-msg=unused-argument
def create_target_highlight_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text = ''
) -> Union[Tuple[Text, List[Text]], Tuple[None, None]]:
  """Build the section highlighting the analysis between target and features.

  Which metrics are looked up depends on the configured ML problem type:
    * regression: pearson correlation for numerical features, anova for
      categorical features;
    * classification: anova for numerical features, information gain and
      chi-square for categorical features.
  Only metrics for which the tracker holds an analysis involving the target
  are rendered.

  Args:
    analysis_tracker: (AnalysisTracker), holder for all the analysis
    figure_base_path: (string), not used, kept for signature consistency

  Returns:
    Union[Tuple[Text, List[Text]], Tuple[None, None]],
    (section_content, List[feature_name]), where the list holds the name of
    the non-target attribute of every analysis whose check returned a truthy
    value. (None, None) when the ML problem type is NULL or when no relevant
    analysis is available.

  Raises:
    ValueError: if the ML problem type is neither NULL, REGRESSION nor
      CLASSIFICATION.
  """
  # pylint: disable-msg=too-many-locals

  # checks available per analysis type; analysis types without an entry
  # (e.g. information gain) never contribute to the recommended features
  checkers = {
      Analysis.ANOVA: recommendation.check_p_value,
      Analysis.PEARSON_CORRELATION: recommendation.check_pearson_correlation,
      Analysis.CHI_SQUARE: recommendation.check_p_value
  }

  def _partner_of_target(analysis: analysis_entity_pb2.Analysis) -> str:
    """Return the name of the first feature that is not the target."""
    partners = [feature.name for feature in analysis.features
                if feature.name != target]
    return partners[0]

  def _flagged_features(analysis_groups):
    """Yield the non-target feature of every analysis flagged by its check."""
    for group in analysis_groups:
      for analysis in group:
        checker = checkers.get(analysis.name)
        if checker is not None and checker(analysis):
          yield _partner_of_target(analysis)

  def _available(metric_names):
    """Keep the metrics for which an analysis on the target exists."""
    looked_up = [
        (name,
         analysis_tracker.get_analysis_by_attribute_and_name(target, name))
        for name in metric_names]
    present = [(name, found) for name, found in looked_up if found]
    return ([name for name, _ in present],
            [found for _, found in present])

  target = analysis_tracker.get_target_attribute().name
  ml_problem = analysis_tracker.get_job_config().ml_type

  if ml_problem == c.ml_type.NULL:
    return None, None

  if ml_problem == c.ml_type.REGRESSION:
    target_type = c.datasource.TYPE_NUMERICAL
    # Correlation for numerical attributes
    # ANOVA for categorical attributes
    numerical_metric_names = [
        Analysis.Name.Name(Analysis.PEARSON_CORRELATION)]
    categorical_metric_names = [
        Analysis.Name.Name(Analysis.ANOVA)]
  elif ml_problem == c.ml_type.CLASSIFICATION:
    target_type = c.datasource.TYPE_CATEGORICAL
    # ANOVA for numerical attributes
    # IG and Chi-square for categorical attributes
    numerical_metric_names = [Analysis.Name.Name(Analysis.ANOVA)]
    categorical_metric_names = [
        Analysis.Name.Name(Analysis.INFORMATION_GAIN),
        Analysis.Name.Name(Analysis.CHI_SQUARE)]
  else:
    raise ValueError('The ML problem type is not supported')

  numerical_metrics, numerical_analysis = _available(numerical_metric_names)
  categorical_metrics, categorical_analysis = _available(
      categorical_metric_names)

  highlight_groups = (
      ("Numerical features and target",
       numerical_metrics, numerical_analysis),
      ("Categorical features and target",
       categorical_metrics, categorical_analysis),
  )

  recommend_features = []
  section_content = []
  for title, metric_names, metric_analysis in highlight_groups:
    if not metric_names:
      continue
    section_content.append(
        template.SUB_SUB_SECTION_TITLE.format(content=title))
    # recommendation based on checking results
    recommend_features.extend(_flagged_features(metric_analysis))
    section_content.append(utils.create_target_metrics_highlight(
        target_name=target,
        metric_name_list=metric_names,
        metric_analysis_list=metric_analysis
    ))

  if not section_content:
    return None, None

  # the headline naming the target and its type always leads the section
  headline = template.TARGET_HEADLINE_TEMPLATE.format(
      target=target, target_type=target_type)
  return ''.join([headline] + section_content), recommend_features