stats/statistical

# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. # import logging import numpy as np import pandas as pd from sklearn.linear_model import LinearRegression from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import PolynomialFeatures from sklearn.pipeline import Pipeline from statsmodels.multivariate.manova import MANOVA from rest_framework.exceptions import APIException log = logging.getLogger(__name__) def linear_regression(input_data, data): y = data['risk'] x = data.drop(columns=['risk']) reg = LinearRegression().fit(x, y) predictions = reg.predict(input_data) probability = predictions[0] color = 'green' if probability > 0.5 else 'red' return {"color": color, "probability": probability} def polynomial_regression(input_data, data): y = data['risk'] x = data.drop(columns=['risk']) model = Pipeline([('poly', PolynomialFeatures(degree=2)), ('linear', LinearRegression(fit_intercept=False))]) reg = model.fit(x, y) predictions = reg.predict(input_data) probability = predictions[0] color = 'green' if probability > 0.5 else 'red' return {"color": color, "probability": probability} def manova(test_row, data, categorical): data = data.dropna() data.loc[len(data)] = test_row le = LabelEncoder() for val in categorical: data[val] = le.fit_transform(data[val]) for col in data.columns: if (col not in categorical): data[col] = (data[col] - np.mean(data[col])) / np.std(data[col]) test_row = data.iloc[len(data) - 1] data.drop([len(data) - 1]) data_good = data[data[10] == 0] data_bad = data[data[10] == 1] x_good = data_good.drop([10, 9], axis=1) y_good = data_good[[9]] x_bad = data_bad.drop([10, 9], axis=1) y_bad = data_bad[[9]] man_good = MANOVA(endog=x_good, exog=y_good) man_bad = MANOVA(endog=x_bad, exog=y_bad) output_good = man_good.mv_test() output_bad = man_bad.mv_test() out_good = np.array(output_good['x0']['stat']) out_bad = np.array(output_bad['x0']['stat']) # Wilki's Lambda WL_good = out_good[0][0] # Pillai's Trace PT_good = out_good[1][0] # Hotelling-Lawley Trace HT_good = out_good[2][0] # Roy's Greatest Roots RGR_good = out_good[3][0] WL_bad = out_bad[0][0] PT_bad = out_bad[1][0] HT_bad = out_bad[2][0] RGR_bad = out_bad[3][0] x = test_row.drop([10, 9]) y = test_row[[9]] data_test_x = x_good.append(x) data_test_y = y_good.append(y) man_test = MANOVA(endog=data_test_x, exog=data_test_y) output_test = man_test.mv_test() out_test = np.array(output_test['x0']['stat']) # Wilki's Lambda WL_test_good = out_test[0][0] # Pillai's Trace PT_test_good = out_test[1][0] # Hotelling-Lawley Trace HT_test_good = out_test[2][0] # Roy's Greatest Roots RGR_test_good = out_test[3][0] data_test_x = x_bad.append(x) data_test_y = y_bad.append(y) man_test = MANOVA(endog=data_test_x, exog=data_test_y) output_test = man_test.mv_test() out_test = np.array(output_test['x0']['stat']) WL_test_bad = out_test[0][0] PT_test_bad = out_test[1][0] HT_test_bad = out_test[2][0] RGR_test_bad = out_test[3][0] scorecard = { "method": "MANOVA", "WL_good": WL_good, "WL_test_good": WL_test_good, "WL_bad": WL_bad, "WL_test_bad": WL_test_bad } ret = "WL good : " + str(WL_good) + " WL test good : " + str(WL_test_good) + \ "\nWL bad : " + \ str(WL_bad) + " WL test bad : " + \ str(WL_test_bad) return scorecard def rename_df_columns(df): dat_dict = df.to_dict() new_dat_dict = {} for key, value in dat_dict.items(): newKey = key if type(key) == str: newKey = newKey.lower().replace(' ', '_') new_dat_dict[newKey] = dat_dict[key] del dat_dict df = pd.DataFrame.from_dict(new_dat_dict) del new_dat_dict return df def prepare_data(data): data['job'] = data['job'].astype('int') cols = data.columns num_cols = data._get_numeric_data().columns categorical = list(set(cols) - set(num_cols)) le = LabelEncoder() for val in categorical: data[val] = le.fit_transform(data[val]) for col in data.columns: if col not in categorical: data[col] = (data[col] - np.mean(data[col])) / np.std(data[col]) input_data = data.iloc[len(data) - 1] input_data = input_data.to_dict() input_data = pd.DataFrame(input_data, index=[0]).drop(columns=['risk']) return data, input_data def stat_score(input_data, model_type): df = pd.read_csv(f'zoo/data/german.csv', index_col=0) dataset = df.drop(columns=['Saving accounts', 'Checking account']) dataset = dataset.dropna() # rename columns(Make them lowercase and snakecase) dataset = rename_df_columns(dataset) # Assume input risk is bad input_data['risk'] = 'bad' dataset.loc[len(dataset)] = input_data # Prepare and normalize data dataset, input_data = prepare_data(dataset) try: if model_type == 'manova': raise APIException( "Statistical Method Manova is not implemented yet") # output = manova(input_data, dataset) elif model_type == 'linearRegression': output = linear_regression(input_data, dataset) elif model_type == 'polynomialRegression': output = polynomial_regression(input_data, dataset) output['method'] = model_type return output except Exception as e: log.debug(f"An Exception Occurred; {str(e)}")

stats/statistical_scoring.py (138 lines of code) (raw):