tools/ml-auto-eda/ml_eda/reporting/content_generator.py (261 lines of code) (raw):

# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Generate EDA report content based on the performed analysis"""

from typing import Callable, Iterator, List, Optional, Text, Tuple, Union

from ml_eda.proto import analysis_entity_pb2
from ml_eda.orchestration.analysis_tracker import AnalysisTracker
from ml_eda.constants import c
from ml_eda.reporting import template
from ml_eda.reporting import utils
from ml_eda.reporting import recommendation

Analysis = analysis_entity_pb2.Analysis

# Each public ``create_*_section`` function in this module is responsible for
# generating the report content of one section. The function signature should
# be
#
#   func(analysis_tracker, figure_base_path) -> (content, additional_info)
#
#   analysis_tracker: AnalysisTracker
#   figure_base_path: Text
#   content: Text
#   additional_info: List[Text]
#
# ``content`` is the markdown content for the section.
# ``additional_info`` holds the possible warnings and recommendations derived
# from the analysis.
#
# Both ``content`` and ``additional_info`` can be None, depending on whether
# the corresponding results can be obtained.
#
# Even though some functions do not need ``figure_base_path`` and/or do not
# generate additional info, the signature must be preserved for the downstream
# function calling.
#
# Functions prefixed with an underscore are private helpers shared by the
# section generators; they do not follow the signature above.


def _collect_warnings(
    analysis_list: List[analysis_entity_pb2.Analysis],
    check: Callable[[analysis_entity_pb2.Analysis], Optional[Text]]
) -> List[Text]:
    """Run one check over every analysis and keep the truthy results.

    Args:
      analysis_list: analyses to be checked, in order
      check: callable invoked once per analysis; a falsy result means
        there is nothing to report for that analysis

    Returns:
      List[Text], the truthy check results in the order of analysis_list
    """
    warnings = []
    for analysis in analysis_list:
        warning = check(analysis)
        if warning:
            warnings.append(warning)
    return warnings


def _append_warning_notes(content: Text, warnings: List[Text]) -> Text:
    """Append the rendered warning notes to content when warnings exist.

    Args:
      content: section content generated so far
      warnings: warnings collected for the section, possibly empty

    Returns:
      Text, content unchanged if warnings is empty, otherwise content
      followed by the output of utils.create_warning_notes
    """
    if warnings:
        return content + utils.create_warning_notes(warnings)
    return content


def _describe_attributes(
    analysis_tracker: AnalysisTracker,
    attribute_names: List[Text],
    additional_analysis_name: Text,
    checks: Tuple[Callable, ...],
    figure_base_path: Text
) -> Tuple[List[Text], List[Text]]:
    """Build the descriptive table rows and warnings for some attributes.

    Args:
      analysis_tracker: (AnalysisTracker), holder for all the analysis
      attribute_names: names of the attributes to describe
      additional_analysis_name: name of the analysis passed to the row
        builder next to the DESCRIPTIVE one (HISTOGRAM or VALUE_COUNTS)
      checks: callables invoked as check(attribute_name, base_analysis),
        in the given order; truthy results are kept as warnings
      figure_base_path: (string), the folder for holding figures

    Returns:
      Tuple[List[Text], List[Text]], (table rows, warnings)
    """
    descriptive_name = Analysis.Name.Name(Analysis.DESCRIPTIVE)

    rows = []
    warnings = []
    for att in attribute_names:
        # base analysis is the one holding basic descriptive statistics;
        # the first matching analysis is used
        base_analysis = analysis_tracker.get_analysis_by_attribute_and_name(
            att, descriptive_name)[0]
        additional_analysis = \
            analysis_tracker.get_analysis_by_attribute_and_name(
                att, additional_analysis_name)[0]

        rows.append(utils.create_table_descriptive_row_from_analysis(
            attribute_name=att,
            base_analysis=base_analysis,
            additional_analysis=additional_analysis,
            figure_base_path=figure_base_path
        ))

        # all checks are evaluated against the base descriptive analysis
        for check in checks:
            warning = check(att, base_analysis)
            if warning:
                warnings.append(warning)

    return rows, warnings


def _create_table_metric_section(
    analysis_tracker: AnalysisTracker,
    analysis_name: Text,
    reverse_attributes: bool = False
) -> Union[Tuple[Text, None], Tuple[None, None]]:
    """Render one titled table per analysis of the given name.

    Args:
      analysis_tracker: (AnalysisTracker), holder for all the analysis
      analysis_name: name of the analysis to be rendered
      reverse_attributes: whether the attribute names in the title are
        listed in the reverse order of analysis.features

    Returns:
      Union[Tuple[Text, None], Tuple[None, None]], (section_content, None),
      or (None, None) if no analysis with the given name is found
    """
    analysis_results = analysis_tracker.get_analysis_by_name(analysis_name)
    if not analysis_results:
        return None, None

    content = []
    for analysis in analysis_results:
        attributes = [item.name for item in analysis.features]
        if reverse_attributes:
            attributes.reverse()
        section_title = template.SUB_SUB_SUB_SECTION_TITLE.format(
            content="{} / {}".format(attributes[0], attributes[1]))
        # only the first table metric of the analysis is rendered
        analysis_content_str = utils.create_table_from_table_metric(
            analysis.tmetrics[0])
        content.extend([section_title, analysis_content_str, "\n<br/>\n"])

    return ''.join(content), None


# pylint: disable=unused-argument
def create_dataset_info_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text = ''
) -> Tuple[Text, None]:
    """Create the top dataset info section without section title.

    No additional info will be generated.

    Args:
      analysis_tracker: (AnalysisTracker), holder for all the analysis
      figure_base_path: (string), not used, for signature consistence

    Returns:
      Tuple[Text, None]
    """
    target = analysis_tracker.get_target_attribute().name
    job_config = analysis_tracker.get_job_config()
    numerical_attributes = analysis_tracker.get_num_attribute_names()
    categorical_attributes = analysis_tracker.get_cat_attribute_names()

    content = template.DATASET_INFO_TEMPLATE.format(
        location=job_config.datasource.location,
        numerical_attributes=len(numerical_attributes),
        categorical_attributes=len(categorical_attributes),
        target_name=target,
        ml_problem_type=job_config.ml_type
    )
    return content, None


def create_descriptive_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text
) -> Tuple[Text, List[Text]]:
    """Create descriptive section of the report.

    Checking based on the descriptive results will be performed, e.g.,
    missing values and high cardinality.

    Args:
      analysis_tracker: (AnalysisTracker), holder for all the analysis
      figure_base_path: (string), the folder for holding figures

    Returns:
      Tuple[Text, List[Text]], (section_content, List[warnings])
    """
    numerical_attributes = analysis_tracker.get_num_attribute_names()
    categorical_attributes = analysis_tracker.get_cat_attribute_names()

    # numerical attributes: histogram as additional analysis,
    # only the missing value condition is checked
    numerical_rows, numerical_warnings = _describe_attributes(
        analysis_tracker=analysis_tracker,
        attribute_names=numerical_attributes,
        additional_analysis_name=Analysis.Name.Name(Analysis.HISTOGRAM),
        checks=(recommendation.check_missing,),
        figure_base_path=figure_base_path
    )

    # categorical attributes: value counts as additional analysis,
    # missing value condition is checked before cardinality condition
    categorical_rows, categorical_warnings = _describe_attributes(
        analysis_tracker=analysis_tracker,
        attribute_names=categorical_attributes,
        additional_analysis_name=Analysis.Name.Name(Analysis.VALUE_COUNTS),
        checks=(recommendation.check_missing,
                recommendation.check_cardinality),
        figure_base_path=figure_base_path
    )

    # numerical results always come before the categorical ones
    contents = numerical_rows + categorical_rows
    warnings = numerical_warnings + categorical_warnings

    # finally all the descriptive analysis result will be organised in a table
    table_content = template.TABLE_DESCRIPTIVE_TEMPLATE.format(
        row_content=''.join(contents))
    return _append_warning_notes(table_content, warnings), warnings


def create_pearson_correlation_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text
) -> Union[Tuple[Text, List[Text]], Tuple[None, None]]:
    """Construct correlation section content for numerical attributes.

    If pearson correlation is not performed, None will be returned.

    Args:
      analysis_tracker: (AnalysisTracker), holder for all the analysis
      figure_base_path: (string), the folder for holding figures

    Returns:
      Union[Tuple[Text, List[Text]], Tuple[None, None]],
      (section_content, List[warning])
    """
    # extract the correlation analysis result
    # each pair of numerical attributes will have one corresponding analysis
    corr_analysis = analysis_tracker.get_analysis_by_name(
        Analysis.Name.Name(Analysis.PEARSON_CORRELATION))
    if not corr_analysis:
        return None, None

    table_content = utils.create_no_order_pair_metric_section(
        analysis_list=corr_analysis,
        same_match_value=1.0,
        table_name="Correlation",
        figure_base_path=figure_base_path)

    # correlation condition check
    warnings = _collect_warnings(
        corr_analysis, recommendation.check_pearson_correlation)
    return _append_warning_notes(table_content, warnings), warnings


def create_information_gain_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text
) -> Union[Tuple[Text, None], Tuple[None, None]]:
    """Construct information gain section content for categorical attributes.

    No additional info will be generated. If information gain is not
    performed, None will be returned.

    Args:
      analysis_tracker: (AnalysisTracker), holder for all the analysis
      figure_base_path: (string), the folder for holding figures

    Returns:
      Union[Tuple[Text, None], Tuple[None, None]]
    """
    # extract the information gain analysis result
    # each pair of categorical attributes will have one corresponding analysis
    info_analysis = analysis_tracker.get_analysis_by_name(
        Analysis.Name.Name(Analysis.INFORMATION_GAIN))
    if not info_analysis:
        return None, None

    content = utils.create_no_order_pair_metric_section(
        analysis_list=info_analysis,
        same_match_value=0.0,
        table_name="Information-Gain",
        figure_base_path=figure_base_path)
    return content, None


# pylint: disable=unused-argument
def create_anova_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text = ''
) -> Union[Tuple[Text, List[Text]], Tuple[None, None]]:
    """Construct anova section content.

    If anova test is not performed, None will be returned.

    Args:
      analysis_tracker: (AnalysisTracker), holder for all the analysis
      figure_base_path: (string), not used, for signature consistence

    Returns:
      Union[Tuple[Text, List[Text]], Tuple[None, None]],
      (section_content, List[warning])
    """
    # extract the anova analysis result
    # each pair of numerical and categorical attributes will have
    # one corresponding analysis
    anova_analysis = analysis_tracker.get_analysis_by_name(
        Analysis.Name.Name(Analysis.ANOVA))
    if not anova_analysis:
        return None, None

    table_content = utils.create_order_pair_metric_section(
        analysis_list=anova_analysis,
        same_match_value='NA')

    # p-value condition check
    warnings = _collect_warnings(anova_analysis, recommendation.check_p_value)
    return _append_warning_notes(table_content, warnings), warnings


# pylint: disable=unused-argument
def create_chi_square_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text = ''
) -> Union[Tuple[Text, List[Text]], Tuple[None, None]]:
    """Construct chi-square section content.

    If chi-square test is not performed, None will be returned.

    Args:
      analysis_tracker: (AnalysisTracker), holder for all the analysis
      figure_base_path: (string), not used, for signature consistence

    Returns:
      Union[Tuple[Text, List[Text]], Tuple[None, None]],
      (section_content, List[warning])
    """
    # extract the chi-square analysis result
    # each pair of categorical attributes will have
    # one corresponding analysis
    chi_square_analysis = analysis_tracker.get_analysis_by_name(
        Analysis.Name.Name(Analysis.CHI_SQUARE))
    if not chi_square_analysis:
        return None, None

    # the literal 'NA' (not the figure_base_path argument) is forwarded on
    # purpose, the argument of this function is unused
    table_content = utils.create_no_order_pair_metric_section(
        analysis_list=chi_square_analysis,
        same_match_value='NA',
        figure_base_path='NA')

    # p-value condition check
    warnings = _collect_warnings(
        chi_square_analysis, recommendation.check_p_value)
    return _append_warning_notes(table_content, warnings), warnings


# pylint: disable=unused-argument
def create_contingency_table_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text = ''
) -> Union[Tuple[Text, None], Tuple[None, None]]:
    """Construct contingency table section content for categorical attributes.

    No additional info will be generated. If no contingency table analysis
    is found, None will be returned.

    Args:
      analysis_tracker: (AnalysisTracker), holder for all the analysis
      figure_base_path: (string), not used, for signature consistence

    Returns:
      Union[Tuple[Text, None], Tuple[None, None]]
    """
    # extract the contingency table analysis result
    # each pair of categorical attributes will have one corresponding analysis
    return _create_table_metric_section(
        analysis_tracker=analysis_tracker,
        analysis_name=Analysis.Name.Name(Analysis.CONTINGENCY_TABLE))


# pylint: disable=unused-argument
def create_table_descriptive_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text = ''
) -> Union[Tuple[Text, None], Tuple[None, None]]:
    """Construct descriptive table section content.

    One titled table is rendered per TABLE_DESCRIPTIVE analysis. The
    attribute names in each title are listed in the reverse order of
    analysis.features. No additional info will be generated. If no such
    analysis is found, None will be returned.

    Args:
      analysis_tracker: (AnalysisTracker), holder for all the analysis
      figure_base_path: (string), not used, for signature consistence

    Returns:
      Union[Tuple[Text, None], Tuple[None, None]]
    """
    # extract the descriptive table analysis result
    return _create_table_metric_section(
        analysis_tracker=analysis_tracker,
        analysis_name=Analysis.Name.Name(Analysis.TABLE_DESCRIPTIVE),
        reverse_attributes=True)


# pylint: disable=unused-argument
def create_target_highlight_section(
    analysis_tracker: AnalysisTracker,
    figure_base_path: Text = ''
) -> Union[Tuple[Text, List[Text]], Tuple[None, None]]:
    """Create the section highlight the correlation analysis performed between
    target and other attributes.

    Args:
      analysis_tracker: (AnalysisTracker), holder for all the analysis
      figure_base_path: (string), not used, for signature consistence

    Returns:
      Union[Tuple[Text, List[Text]], Tuple[None, None]],
      (section_content, List[recommended feature name]); (None, None) if
      the ML problem type is NULL or no target related analysis is found

    Raises:
      ValueError: if the ML problem type is neither NULL, REGRESSION nor
        CLASSIFICATION
    """

    # pylint: disable=too-many-locals

    def _other_attribute_name(target_name: str,
                              analysis: analysis_entity_pb2.Analysis) -> str:
        # name of the first feature of the analysis which is not the target
        attribute_name = [att.name for att in analysis.features
                          if att.name != target_name][0]
        return attribute_name

    def _check_analysis(
        analysis_list: List[List[analysis_entity_pb2.Analysis]]
    ) -> Iterator[str]:
        # yield the non-target attribute of every analysis whose check,
        # if one is registered for its type, gives a truthy result
        for item in analysis_list:
            for analysis in item:
                check = checking_map.get(analysis.name)
                if check is not None and check(analysis):
                    yield _other_attribute_name(target, analysis)

    def _consolidate_analysis(metric_names):
        # keep only the metrics for which the target has analysis results;
        # the two returned lists are index-aligned
        revised_names = []
        analysis_list = []
        for name in metric_names:
            analysis = analysis_tracker.get_analysis_by_attribute_and_name(
                target, name)
            if analysis:
                revised_names.append(name)
                analysis_list.append(analysis)
        return revised_names, analysis_list

    # analysis types without an entry here (e.g. information gain) never
    # contribute to the recommended features
    checking_map = {
        Analysis.ANOVA: recommendation.check_p_value,
        Analysis.PEARSON_CORRELATION: recommendation.check_pearson_correlation,
        Analysis.CHI_SQUARE: recommendation.check_p_value
    }

    target = analysis_tracker.get_target_attribute().name
    ml_problem = analysis_tracker.get_job_config().ml_type

    if ml_problem == c.ml_type.NULL:
        return None, None

    if ml_problem == c.ml_type.REGRESSION:
        target_type = c.datasource.TYPE_NUMERICAL
        # Correlation for numerical attributes
        # ANOVA for categorical attributes
        numerical_metric_names = [
            Analysis.Name.Name(Analysis.PEARSON_CORRELATION)]
        categorical_metric_names = [
            Analysis.Name.Name(Analysis.ANOVA)]
    elif ml_problem == c.ml_type.CLASSIFICATION:
        target_type = c.datasource.TYPE_CATEGORICAL
        # ANOVA for numerical attributes
        # IG and Chi-square for categorical attributes
        numerical_metric_names = [Analysis.Name.Name(Analysis.ANOVA)]
        categorical_metric_names = [
            Analysis.Name.Name(Analysis.INFORMATION_GAIN),
            Analysis.Name.Name(Analysis.CHI_SQUARE)]
    else:
        raise ValueError('The ML problem type is not supported')

    # both groups are consolidated before any content is generated
    r_numerical_metrics, r_numerical_analysis = \
        _consolidate_analysis(numerical_metric_names)
    r_categorical_metrics, r_categorical_analysis = \
        _consolidate_analysis(categorical_metric_names)

    recommend_features = []
    section_content = []

    # numerical group is always rendered before the categorical group
    groups = (
        ("Numerical features and target",
         r_numerical_metrics, r_numerical_analysis),
        ("Categorical features and target",
         r_categorical_metrics, r_categorical_analysis),
    )
    for title, metric_names, metric_analysis in groups:
        if not metric_names:
            continue
        section_content.append(template.SUB_SUB_SECTION_TITLE.format(
            content=title
        ))
        # recommendation based on checking results
        recommend_features.extend(_check_analysis(metric_analysis))
        section_content.append(utils.create_target_metrics_highlight(
            target_name=target,
            metric_name_list=metric_names,
            metric_analysis_list=metric_analysis
        ))

    if not section_content:
        return None, None

    # the target headline goes in front of the group highlights
    target_str = template.TARGET_HEADLINE_TEMPLATE.format(
        target=target, target_type=target_type)
    section_content.insert(0, target_str)
    return ''.join(section_content), recommend_features