Source code for mlbugdetection.critical_values

import pickle
import numpy as np
from matplotlib import pyplot as plt
from .analysis_report import AnalysisReport

def highest_and_lowest_indexes(predictions: list, keep_n: int = 3):
    '''Return the indexes of the highest changes (positive or negative) in the predictions.

    Parameters
    ----------
    predictions : list
        Array that contains the predictions to be analysed.
    keep_n : int
        Number of values to keep in each list.

    Returns
    -------
    highest_indexes : list
        Pairs of indexes [i, i + 1] around the largest positive changes.
    lowest_indexes : list
        Pairs of indexes [i, i + 1] around the largest negative changes.
    '''
    dy = list(np.diff(predictions))
    negatives = list(filter(lambda x: (x < 0), dy))
    positives = list(filter(lambda x: (x > 0), dy))
    highest_positives = sorted(positives, reverse=True)[:keep_n]
    lowest_negatives = sorted(negatives)[:keep_n]
    highest_indexes = [[dy.index(x), dy.index(x) + 1] for x in highest_positives]
    lowest_indexes = [[dy.index(x), dy.index(x) + 1] for x in lowest_negatives]
    return highest_indexes, lowest_indexes
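
# Illustrative usage of highest_and_lowest_indexes (not part of the original
# module): the probability values below are made-up toy numbers. With keep_n=1
# the helper returns the index pair around the single largest rise and the
# single largest drop.
# >>> preds = [0.10, 0.15, 0.60, 0.55, 0.20, 0.25]
# >>> highest_and_lowest_indexes(preds, keep_n=1)
# ([[1, 2]], [[3, 4]])
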
def find_critical_values(model, sample, feature: str, start: int, stop: int, step: float = 1, keep_n: int = 3):
    '''Critical Values Finder

    Finds the highest changes (positive or negative) in predict_proba over a
    specified interval [`start`, `stop`].

    Parameters
    ----------
    model : sklearn model or str
        Model already trained and tested from scikit-learn. Can be a model
        object or a path to a pickled model file.
    sample : pandas DataFrame
        A single row of the dataframe that will be used for the analysis.
    feature : str
        Feature of the dataframe that will be analysed.
    start : int
        The starting value of the feature's interval.
    stop : int
        The end value of the feature's interval.
    step : float, default=1
        Size of the step between "start" and "stop". Ex: step=0.1 between 0 and 1
        results in [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9].
    keep_n : int, default=3
        Number of values to keep in each list.

    Returns
    -------
    AnalysisReport object with the following attributes.

        For more information:

        >>> from mlbugdetection.analysis_report import AnalysisReport
        >>> help(AnalysisReport)

        model_name : str
            Name of the model being analysed.
        analysed_feature : str
            Name of the feature being analysed.
        feature_range : tuple
            Range of values of the feature being analysed: (start, stop).
        metrics : dictionary
            Dictionary with all the calculated metrics, such as:

            'positive_changes_ranges' : list
                List of feature ranges that resulted in the biggest positive
                changes in the model's prediction probability.
            'positive_changes_proba' : list
                List of the biggest positive variations in the model's
                prediction probability.
            'negative_changes_ranges' : list
                List of feature ranges that resulted in the biggest negative
                changes in the model's prediction probability.
            'negative_changes_proba' : list
                List of the biggest negative variations in the model's
                prediction probability.
            'classification_change_ranges' : list
                List of feature ranges that resulted in a change of the
                model's classification.
            'classification_change_proba' : list
                List of prediction probability values before and after the
                classification change.
        graphs : list
            List of all the figures created.
    '''
    if len(sample) > 1:
        raise Exception("Sample must have only one example, please use 'find_several_critical_values' for multiple samples")
    if type(model) == str:
        with open(model, 'rb') as f:
            model = pickle.load(f)

    sample = sample.copy()  # work on a copy so the caller's dataframe is not modified
    report = AnalysisReport()
    column_values = []
    predictions = []
    range_ = np.arange(start, stop, step)

    report.model_name = type(model).__name__
    report.analysed_feature = feature
    report.feature_range = (start, stop)

    fig = plt.figure(figsize=(6, 3), dpi=150)
    report.graphs.append(fig)

    # Sweep the feature over [start, stop) and record the positive-class probability.
    for val in range_:
        column_values.append(val)
        sample[feature] = val
        predictions.append(model.predict_proba(sample)[0][1])

    highest_positives, lowest_negatives = highest_and_lowest_indexes(predictions, keep_n=keep_n)

    # Classification changes are collected in a single dictionary so the
    # negative pass does not overwrite the entries found in the positive pass.
    report.metrics["classification_change"] = {}
    class_change_count = 0

    if len(highest_positives) > 0:
        report.metrics["positive_changes"] = {}
        change_count = 0
        for indexes in highest_positives:
            range0 = round(column_values[indexes[0]], 3)
            range1 = round(column_values[indexes[1]], 3)
            pred0 = round(predictions[indexes[0]], 3)
            pred1 = round(predictions[indexes[1]], 3)
            report.metrics["positive_changes"][change_count] = {}
            report.metrics["positive_changes"][change_count]["ranges"] = (range0, range1)
            report.metrics["positive_changes"][change_count]["proba"] = (pred0, pred1)
            if max(pred0, pred1) >= 0.5 and min(pred0, pred1) < 0.5:
                report.metrics["classification_change"][class_change_count] = {}
                report.metrics["classification_change"][class_change_count]["ranges"] = (range0, range1)
                report.metrics["classification_change"][class_change_count]["proba"] = (pred0, pred1)
                class_change_count += 1
            plt.axvline(x=range0, color='g', linestyle='--', alpha=0.5)
            plt.axvline(x=range1, color='g', linestyle='--', alpha=0.5)
            change_count += 1

    if len(lowest_negatives) > 0:
        report.metrics["negative_changes"] = {}
        change_count = 0
        for indexes in lowest_negatives:
            range0 = round(column_values[indexes[0]], 3)
            range1 = round(column_values[indexes[1]], 3)
            pred0 = round(predictions[indexes[0]], 3)
            pred1 = round(predictions[indexes[1]], 3)
            report.metrics["negative_changes"][change_count] = {}
            report.metrics["negative_changes"][change_count]["ranges"] = (range0, range1)
            report.metrics["negative_changes"][change_count]["proba"] = (pred0, pred1)
            if max(pred0, pred1) >= 0.5 and min(pred0, pred1) < 0.5:
                report.metrics["classification_change"][class_change_count] = {}
                report.metrics["classification_change"][class_change_count]["ranges"] = (range0, range1)
                report.metrics["classification_change"][class_change_count]["proba"] = (pred0, pred1)
                class_change_count += 1
            plt.axvline(x=range0, color='r', linestyle='--', alpha=0.2)
            plt.axvline(x=range1, color='r', linestyle='--', alpha=0.2)
            change_count += 1

    if (len(lowest_negatives) > 0) or (len(highest_positives) > 0):
        plt.plot(column_values, predictions)
        plt.title(type(model).__name__)
        plt.xlabel(f'Feature {feature} value')
        plt.ylabel('Predict proba')

    return report
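
# Illustrative usage of find_critical_values (a sketch, not part of the
# original module): 'model.pkl', 'data.csv', the 'target' column and the
# 'age' feature are hypothetical placeholders.
# >>> import pandas as pd
# >>> df = pd.read_csv("data.csv")
# >>> sample = df.drop(columns=["target"]).iloc[[0]]   # a single-row DataFrame
# >>> report = find_critical_values("model.pkl", sample, feature="age",
# ...                               start=20, stop=80, step=0.5, keep_n=3)
# >>> report.metrics.get("classification_change", {})  # ranges where the predicted class flips
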
def find_several_critical_values(model, samples, feature: str, start: int, stop: int, step: float = 1, bins: int = 15, keep_n: int = 5, log: bool = False):
    '''Critical Values Finder in Several Samples

    Finds the mean, median, standard deviation and variance of the critical
    values found in the samples over a specified interval [`start`, `stop`].

    Parameters
    ----------
    model : sklearn model or str
        Model already trained and tested from scikit-learn. Can be a model
        object or a path to a pickled model file.
    samples : pandas DataFrame
        Two or more rows of the dataframe that will be used for the analysis.
    feature : str
        Feature of the dataframe that will be analysed.
    start : int
        The starting value of the feature's interval.
    stop : int
        The end value of the feature's interval.
    step : float, default=1
        Size of the step between "start" and "stop". Ex: step=0.1 between 0 and 1
        results in [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9].
    bins : int, default=15
        Number of equal-width bins in the histogram range.
    keep_n : int, default=5
        Number of the highest values used for the mean, median, std and var
        calculations.
    log : bool, default=False
        If True, the histogram axis will be set to a log scale.

    Returns
    -------
    AnalysisReport object with the following attributes.

        For more information:

        >>> from mlbugdetection.analysis_report import AnalysisReport
        >>> help(AnalysisReport)

        model_name : str
            Name of the model being analysed.
        analysed_feature : str
            Name of the feature being analysed.
        feature_range : tuple
            Range of values of the feature being analysed: (start, stop).
        metrics : dictionary
            Dictionary with all the calculated metrics, such as:

            'positive_means' : dictionary
                Contains the following:
                'mean' : float
                    Mean of all the positive change means.
                'median' : float
                    Median of all the positive change means.
                'std' : float
                    Standard deviation of all the positive change means.
                'var' : float
                    Variance of all the positive change means.
            'negative_means' : dictionary
                Contains the following:
                'mean' : float
                    Mean of all the negative change means.
                'median' : float
                    Median of all the negative change means.
                'std' : float
                    Standard deviation of all the negative change means.
                'var' : float
                    Variance of all the negative change means.
        graphs : list
            List of all the figures created.
    '''
    if len(samples) < 2:
        raise Exception("Samples must have multiple examples, please use 'find_critical_values' for a single example")
    if type(model) == str:
        with open(model, 'rb') as f:
            model = pickle.load(f)

    samples = samples.copy()
    report = AnalysisReport()
    column_values = []
    range_ = np.arange(start, stop, step)

    report.model_name = type(model).__name__
    report.analysed_feature = feature
    report.feature_range = (start, stop)
    report.metrics["critical_indexes"] = []

    predictions_dict = {}
    for i in range(samples.shape[0]):
        predictions_dict[i] = {"preds": []}

    # Sweep the feature over [start, stop) and record the positive-class
    # probability of every sample at every value.
    for val in range_:
        column_values.append(val)
        samples.loc[:, feature] = val
        samples_predictions = model.predict_proba(samples)
        for i in range(len(samples_predictions)):
            predictions_dict[i]["preds"].append(samples_predictions[i][1])

    positive_means = []
    negative_means = []
    for key in predictions_dict.keys():
        predictions_dict[key]["diff"] = list(np.diff(predictions_dict[key]["preds"]))
        predictions_dict[key]["positive_diffs"] = list(filter(lambda x: (x > 0), predictions_dict[key]["diff"]))
        predictions_dict[key]["negative_diffs"] = list(filter(lambda x: (x <= 0), predictions_dict[key]["diff"]))

        # Record the samples whose predicted class flips across the 0.5 threshold.
        previous_prediction = predictions_dict[key]["preds"][0]
        for pred in predictions_dict[key]["preds"]:
            if max(previous_prediction, pred) >= 0.5 and min(previous_prediction, pred) < 0.5:
                report.metrics["critical_indexes"].append(key)
            previous_prediction = pred

        highest_positive_diffs = sorted(predictions_dict[key]["positive_diffs"], reverse=True)[:keep_n]
        highest_negative_diffs = sorted(predictions_dict[key]["negative_diffs"])[:keep_n]
        positive_means.append(np.mean(highest_positive_diffs) if len(highest_positive_diffs) > 0 else 0)
        negative_means.append(np.mean(highest_negative_diffs) if len(highest_negative_diffs) > 0 else 0)

    report.metrics["positive_means"] = {}
    report.metrics["negative_means"] = {}
    report.metrics['positive_means']['mean'] = np.nanmean(positive_means)
    report.metrics['positive_means']['median'] = np.nanmedian(positive_means)
    report.metrics['positive_means']['std'] = np.nanstd(positive_means)
    report.metrics['positive_means']['var'] = np.nanvar(positive_means)
    report.metrics['negative_means']['mean'] = np.nanmean(negative_means)
    report.metrics['negative_means']['median'] = np.nanmedian(negative_means)
    report.metrics['negative_means']['std'] = np.nanstd(negative_means)
    report.metrics['negative_means']['var'] = np.nanvar(negative_means)

    # print("Positive means:")
    # print(f"\tMean: {report.metrics['positive_means']['mean']}")
    # print(f"\tMedian: {report.metrics['positive_means']['median']}")
    # print(f"\tStandard Deviation: {report.metrics['positive_means']['std']}")
    # print(f"\tVariance: {report.metrics['positive_means']['var']}")
    # print("Negative means:")
    # print(f"\tMean: {report.metrics['negative_means']['mean']}")
    # print(f"\tMedian: {report.metrics['negative_means']['median']}")
    # print(f"\tStandard Deviation: {report.metrics['negative_means']['std']}")
    # print(f"\tVariance: {report.metrics['negative_means']['var']}")

    fig, ax = plt.subplots(1, 2, figsize=(16, 4))
    ax[0].set(xlabel="Mean", ylabel="Frequency")
    ax[0].hist(positive_means, bins=bins, log=log)
    ax[0].set_title("Histogram of positive means")
    ax[1].set(xlabel="Mean", ylabel="Frequency")
    ax[1].hist(negative_means, bins=bins, log=log)
    ax[1].set_title("Histogram of negative means")
    report.graphs.append(fig)
    # report.save_graphs()

    return report
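
# A minimal, self-contained demo (an illustrative sketch, not part of the
# original package): it trains a toy scikit-learn classifier on synthetic data
# and runs both finders on one feature. The column names and the swept range
# below are assumptions made for the example only.
if __name__ == "__main__":
    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    X_arr, y = make_classification(n_samples=200, n_features=4, random_state=0)
    X = pd.DataFrame(X_arr, columns=["f0", "f1", "f2", "f3"])
    clf = LogisticRegression().fit(X, y)

    # Single-sample analysis: sweep feature "f0" for the first row.
    single_report = find_critical_values(clf, X.iloc[[0]], feature="f0",
                                         start=-3, stop=3, step=0.1)
    print(single_report.metrics)

    # Multi-sample analysis: sweep feature "f0" for the first 20 rows.
    several_report = find_several_critical_values(clf, X.head(20), feature="f0",
                                                  start=-3, stop=3, step=0.1)
    print(several_report.metrics["positive_means"])
    print(several_report.metrics["negative_means"])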