Source code for mlbugdetection.critical_values

import pickle
import numpy as np
from matplotlib import pyplot as plt
from .analysis_report import AnalysisReport

def highest_and_lowest_indexes(predictions: list, keep_n: int = 3):
    '''Return the indexes of the highest changes (positive or negative) in the predictions.

    Parameters
    ----------
    predictions : list
        Array that contains the predictions to be analysed.
    keep_n : int
        Number of values to keep in each list.

    Returns
    -------
    highest_indexes : list
        Pairs of indexes [i, i + 1] around the largest positive changes.
    lowest_indexes : list
        Pairs of indexes [i, i + 1] around the largest negative changes.
    '''
    dy = list(np.diff(predictions))
    negatives = list(filter(lambda x: (x < 0), dy))
    positives = list(filter(lambda x: (x > 0), dy))
    highest_positives = sorted(positives, reverse=True)[:keep_n]
    lowest_negatives = sorted(negatives)[:keep_n]
    highest_indexes = [[dy.index(x), dy.index(x) + 1] for x in highest_positives]
    lowest_indexes = [[dy.index(x), dy.index(x) + 1] for x in lowest_negatives]
    return highest_indexes, lowest_indexes
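
# Illustrative usage of highest_and_lowest_indexes (not part of the original
# module): the probability values below are made-up toy numbers. With keep_n=1
# the helper returns the index pair around the single largest rise and the
# single largest drop.
# >>> preds = [0.10, 0.15, 0.60, 0.55, 0.20, 0.25]
# >>> highest_and_lowest_indexes(preds, keep_n=1)
# ([[1, 2]], [[3, 4]])
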
def find_critical_values(model, sample, feature: str, start: int, stop: int, step: float = 1, keep_n: int = 3):
    '''Critical Values Finder

    Finds the highest changes (positive or negative) in predict_proba over a
    specified interval [`start`, `stop`].

    Parameters
    ----------
    model : sklearn model or str
        Model already trained and tested from scikit-learn. Can be a model
        object or a path to a pickled model file.
    sample : pandas DataFrame
        A single row of the dataframe that will be used for the analysis.
    feature : str
        Feature of the dataframe that will be analysed.
    start : int
        The starting value of the feature's interval.
    stop : int
        The end value of the feature's interval.
    step : float, default=1
        Size of the step between "start" and "stop". Ex: step=0.1 between 0 and 1
        results in [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9].
    keep_n : int, default=3
        Number of values to keep in each list.

    Returns
    -------
    AnalysisReport object with the following attributes.

        For more information:

        >>> from mlbugdetection.analysis_report import AnalysisReport
        >>> help(AnalysisReport)

        model_name : str
            Name of the model being analysed.
        analysed_feature : str
            Name of the feature being analysed.
        feature_range : tuple
            Range of values of the feature being analysed: (start, stop).
        metrics : dictionary
            Dictionary with all the calculated metrics, such as:

            'positive_changes_ranges' : list
                List of feature ranges that resulted in the biggest positive
                changes in the model's prediction probability.
            'positive_changes_proba' : list
                List of the biggest positive variations in the model's
                prediction probability.
            'negative_changes_ranges' : list
                List of feature ranges that resulted in the biggest negative
                changes in the model's prediction probability.
            'negative_changes_proba' : list
                List of the biggest negative variations in the model's
                prediction probability.
            'classification_change_ranges' : list
                List of feature ranges that resulted in a change of the
                model's classification.
            'classification_change_proba' : list
                List of prediction probability values before and after the
                classification change.
        graphs : list
            List of all the figures created.
    '''
    if len(sample) > 1:
        raise Exception("Sample must have only one example, please use 'find_several_critical_values' for multiple samples")
    if type(model) == str:
        with open(model, 'rb') as f:
            model = pickle.load(f)

    sample = sample.copy()  # work on a copy so the caller's dataframe is not modified
    report = AnalysisReport()
    column_values = []
    predictions = []
    range_ = np.arange(start, stop, step)

    report.model_name = type(model).__name__
    report.analysed_feature = feature
    report.feature_range = (start, stop)

    fig = plt.figure(figsize=(6, 3), dpi=150)
    report.graphs.append(fig)

    # Sweep the feature over [start, stop) and record the positive-class probability.
    for val in range_:
        column_values.append(val)
        sample[feature] = val
        predictions.append(model.predict_proba(sample)[0][1])

    highest_positives, lowest_negatives = highest_and_lowest_indexes(predictions, keep_n=keep_n)

    # Classification changes are collected in a single dictionary so the
    # negative pass does not overwrite the entries found in the positive pass.
    report.metrics["classification_change"] = {}
    class_change_count = 0

    if len(highest_positives) > 0:
        report.metrics["positive_changes"] = {}
        change_count = 0
        for indexes in highest_positives:
            range0 = round(column_values[indexes[0]], 3)
            range1 = round(column_values[indexes[1]], 3)
            pred0 = round(predictions[indexes[0]], 3)
            pred1 = round(predictions[indexes[1]], 3)
            report.metrics["positive_changes"][change_count] = {}
            report.metrics["positive_changes"][change_count]["ranges"] = (range0, range1)
            report.metrics["positive_changes"][change_count]["proba"] = (pred0, pred1)
            if max(pred0, pred1) >= 0.5 and min(pred0, pred1) < 0.5:
                report.metrics["classification_change"][class_change_count] = {}
                report.metrics["classification_change"][class_change_count]["ranges"] = (range0, range1)
                report.metrics["classification_change"][class_change_count]["proba"] = (pred0, pred1)
                class_change_count += 1
            plt.axvline(x=range0, color='g', linestyle='--', alpha=0.5)
            plt.axvline(x=range1, color='g', linestyle='--', alpha=0.5)
            change_count += 1

    if len(lowest_negatives) > 0:
        report.metrics["negative_changes"] = {}
        change_count = 0
        for indexes in lowest_negatives:
            range0 = round(column_values[indexes[0]], 3)
            range1 = round(column_values[indexes[1]], 3)
            pred0 = round(predictions[indexes[0]], 3)
            pred1 = round(predictions[indexes[1]], 3)
            report.metrics["negative_changes"][change_count] = {}
            report.metrics["negative_changes"][change_count]["ranges"] = (range0, range1)
            report.metrics["negative_changes"][change_count]["proba"] = (pred0, pred1)
            if max(pred0, pred1) >= 0.5 and min(pred0, pred1) < 0.5:
                report.metrics["classification_change"][class_change_count] = {}
                report.metrics["classification_change"][class_change_count]["ranges"] = (range0, range1)
                report.metrics["classification_change"][class_change_count]["proba"] = (pred0, pred1)
                class_change_count += 1
            plt.axvline(x=range0, color='r', linestyle='--', alpha=0.2)
            plt.axvline(x=range1, color='r', linestyle='--', alpha=0.2)
            change_count += 1

    if (len(lowest_negatives) > 0) or (len(highest_positives) > 0):
        plt.plot(column_values, predictions)
        plt.title(type(model).__name__)
        plt.xlabel(f'Feature {feature} value')
        plt.ylabel('Predict proba')

    return report
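
# Illustrative usage of find_critical_values (a sketch, not part of the
# original module): 'model.pkl', 'data.csv', the 'target' column and the
# 'age' feature are hypothetical placeholders.
# >>> import pandas as pd
# >>> df = pd.read_csv("data.csv")
# >>> sample = df.drop(columns=["target"]).iloc[[0]]   # a single-row DataFrame
# >>> report = find_critical_values("model.pkl", sample, feature="age",
# ...                               start=20, stop=80, step=0.5, keep_n=3)
# >>> report.metrics.get("classification_change", {})  # ranges where the predicted class flips
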
def find_several_critical_values(model, samples, feature: str, start: int, stop: int, step: float = 1, bins: int = 15, keep_n: int = 5, log: bool = False):
    '''Critical Values Finder in Several Samples

    Finds the mean, median, standard deviation and variance of the critical
    values found in the samples over a specified interval [`start`, `stop`].

    Parameters
    ----------
    model : sklearn model or str
        Model already trained and tested from scikit-learn. Can be a model
        object or a path to a pickled model file.
    samples : pandas DataFrame
        Two or more rows of the dataframe that will be used for the analysis.
    feature : str
        Feature of the dataframe that will be analysed.
    start : int
        The starting value of the feature's interval.
    stop : int
        The end value of the feature's interval.
    step : float, default=1
        Size of the step between "start" and "stop". Ex: step=0.1 between 0 and 1
        results in [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9].
    bins : int, default=15
        Number of equal-width bins in the histogram range.
    keep_n : int, default=5
        Number of the highest values used for the mean, median, std and var
        calculations.
    log : bool, default=False
        If True, the histogram axis will be set to a log scale.

    Returns
    -------
    AnalysisReport object with the following attributes.

        For more information:

        >>> from mlbugdetection.analysis_report import AnalysisReport
        >>> help(AnalysisReport)

        model_name : str
            Name of the model being analysed.
        analysed_feature : str
            Name of the feature being analysed.
        feature_range : tuple
            Range of values of the feature being analysed: (start, stop).
        metrics : dictionary
            Dictionary with all the calculated metrics, such as:

            'positive_means' : dictionary
                Contains the following:
                'mean' : float
                    Mean of all the positive change means.
                'median' : float
                    Median of all the positive change means.
                'std' : float
                    Standard deviation of all the positive change means.
                'var' : float
                    Variance of all the positive change means.
            'negative_means' : dictionary
                Contains the following:
                'mean' : float
                    Mean of all the negative change means.
                'median' : float
                    Median of all the negative change means.
                'std' : float
                    Standard deviation of all the negative change means.
                'var' : float
                    Variance of all the negative change means.
        graphs : list
            List of all the figures created.
    '''
    if len(samples) < 2:
        raise Exception("Samples must have multiple examples, please use 'find_critical_values' for a single example")
    if type(model) == str:
        with open(model, 'rb') as f:
            model = pickle.load(f)

    samples = samples.copy()
    report = AnalysisReport()
    column_values = []
    range_ = np.arange(start, stop, step)

    report.model_name = type(model).__name__
    report.analysed_feature = feature
    report.feature_range = (start, stop)
    report.metrics["critical_indexes"] = []

    predictions_dict = {}
    for i in range(samples.shape[0]):
        predictions_dict[i] = {"preds": []}

    # Sweep the feature over [start, stop) and record the positive-class
    # probability of every sample at every value.
    for val in range_:
        column_values.append(val)
        samples.loc[:, feature] = val
        samples_predictions = model.predict_proba(samples)
        for i in range(len(samples_predictions)):
            predictions_dict[i]["preds"].append(samples_predictions[i][1])

    positive_means = []
    negative_means = []
    for key in predictions_dict.keys():
        predictions_dict[key]["diff"] = list(np.diff(predictions_dict[key]["preds"]))
        predictions_dict[key]["positive_diffs"] = list(filter(lambda x: (x > 0), predictions_dict[key]["diff"]))
        predictions_dict[key]["negative_diffs"] = list(filter(lambda x: (x <= 0), predictions_dict[key]["diff"]))

        # Record the samples whose predicted class flips across the 0.5 threshold.
        previous_prediction = predictions_dict[key]["preds"][0]
        for pred in predictions_dict[key]["preds"]:
            if max(previous_prediction, pred) >= 0.5 and min(previous_prediction, pred) < 0.5:
                report.metrics["critical_indexes"].append(key)
            previous_prediction = pred

        highest_positive_diffs = sorted(predictions_dict[key]["positive_diffs"], reverse=True)[:keep_n]
        highest_negative_diffs = sorted(predictions_dict[key]["negative_diffs"])[:keep_n]
        positive_means.append(np.mean(highest_positive_diffs) if len(highest_positive_diffs) > 0 else 0)
        negative_means.append(np.mean(highest_negative_diffs) if len(highest_negative_diffs) > 0 else 0)

    report.metrics["positive_means"] = {}
    report.metrics["negative_means"] = {}
    report.metrics['positive_means']['mean'] = np.nanmean(positive_means)
    report.metrics['positive_means']['median'] = np.nanmedian(positive_means)
    report.metrics['positive_means']['std'] = np.nanstd(positive_means)
    report.metrics['positive_means']['var'] = np.nanvar(positive_means)
    report.metrics['negative_means']['mean'] = np.nanmean(negative_means)
    report.metrics['negative_means']['median'] = np.nanmedian(negative_means)
    report.metrics['negative_means']['std'] = np.nanstd(negative_means)
    report.metrics['negative_means']['var'] = np.nanvar(negative_means)

    # print("Positive means:")
    # print(f"\tMean: {report.metrics['positive_means']['mean']}")
    # print(f"\tMedian: {report.metrics['positive_means']['median']}")
    # print(f"\tStandard Deviation: {report.metrics['positive_means']['std']}")
    # print(f"\tVariance: {report.metrics['positive_means']['var']}")
    # print("Negative means:")
    # print(f"\tMean: {report.metrics['negative_means']['mean']}")
    # print(f"\tMedian: {report.metrics['negative_means']['median']}")
    # print(f"\tStandard Deviation: {report.metrics['negative_means']['std']}")
    # print(f"\tVariance: {report.metrics['negative_means']['var']}")

    fig, ax = plt.subplots(1, 2, figsize=(16, 4))
    ax[0].set(xlabel="Mean", ylabel="Frequency")
    ax[0].hist(positive_means, bins=bins, log=log)
    ax[0].set_title("Histogram of positive means")
    ax[1].set(xlabel="Mean", ylabel="Frequency")
    ax[1].hist(negative_means, bins=bins, log=log)
    ax[1].set_title("Histogram of negative means")
    report.graphs.append(fig)
    # report.save_graphs()

    return report
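
# A minimal, self-contained demo (an illustrative sketch, not part of the
# original package): it trains a toy scikit-learn classifier on synthetic data
# and runs both finders on one feature. The column names and the swept range
# below are assumptions made for the example only.
if __name__ == "__main__":
    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    X_arr, y = make_classification(n_samples=200, n_features=4, random_state=0)
    X = pd.DataFrame(X_arr, columns=["f0", "f1", "f2", "f3"])
    clf = LogisticRegression().fit(X, y)

    # Single-sample analysis: sweep feature "f0" for the first row.
    single_report = find_critical_values(clf, X.iloc[[0]], feature="f0",
                                         start=-3, stop=3, step=0.1)
    print(single_report.metrics)

    # Multi-sample analysis: sweep feature "f0" for the first 20 rows.
    several_report = find_several_critical_values(clf, X.head(20), feature="f0",
                                                  start=-3, stop=3, step=0.1)
    print(several_report.metrics["positive_means"])
    print(several_report.metrics["negative_means"])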