# Source code for mlbugdetection.monotonic

import pickle
import numpy as np
from matplotlib import pyplot as plt
from .analysis_report import AnalysisReport

def monotonicity_mse(predictions):
    """Monotonicity Mean Square Error

    Builds two monotonic envelopes of ``predictions`` -- the running minimum
    (non-increasing) and the running maximum (non-decreasing) -- and returns
    whichever envelope has the smaller mean squared error against the
    original values, together with that error.

    Parameters
    ----------
    predictions : array-like
        One-dimensional sequence of prediction probabilities (a list or a
        NumPy array).

    Returns
    -------
    monotonic_curve : numpy.ndarray
        The running-minimum or running-maximum envelope of ``predictions``,
        whichever is closer to ``predictions`` in terms of MSE.

    mse : float
        MSE between ``predictions`` and ``monotonic_curve``.
    """
    # Convert once so the arithmetic below works for plain lists as well as
    # arrays, instead of relying on list/ndarray operator fallbacks.
    values = np.asarray(predictions)

    desc = np.minimum.accumulate(values)
    asc = np.maximum.accumulate(values)
    mse_desc = np.square(values - desc).mean(axis=0)
    mse_asc = np.square(values - asc).mean(axis=0)

    # On a tie the descending envelope is returned.
    if mse_desc <= mse_asc:
        return desc, mse_desc
    return asc, mse_asc

def check_monotonicity_single_sample(model, sample, feature, start, stop, step=1):
    '''Monotonicity Analysis for a single example

    Sweeps ``feature`` over ``np.arange(start, stop, step)`` on a copy of
    ``sample``, records the model's predicted probability at each value and
    checks whether the resulting curve is monotonic (entirely non-decreasing
    or entirely non-increasing).

    Parameters
    ----------
    model : sklearn model or str
        Model that will be used to make predictions. Could be a model object
        exposing ``predict_proba`` or a path to a pickled model file.

    sample : pandas.DataFrame
        Pandas DataFrame containing exactly one row that will be used as base
        point. It is not modified; the sweep runs on a copy.

    feature : str
        Name of the feature being analysed.

    start : int or float
        The starting value of the feature's interval (inclusive).

    stop : int or float
        The end value of the feature's interval (exclusive, as in
        ``numpy.arange``).

    step : float, default=1
        Size of the step between ranges "start" and "stop".
        Ex: step = 0.1 between ranges 0 and 1 will result in
        [0  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9]

    Returns
    -------
    AnalysisReport object with following attributes:
        For more information:
        >>> from mlbugdetection.analysis_report import AnalysisReport
        >>> help(AnalysisReport)

    model_name : str
        Name of the model being analysed.

    analysed_feature : str
        Name of the feature being analysed.

    feature_range : tuple
        Range of values of the feature being analysed: (start, stop).

    metrics : dictionary
        Dictionary with all the calculated metrics, such as:

        'monotonic' : bool
             If the list of values is monotonic.

        'monotonic_mse': float
            MSE between the list of values and its monotonic approximation
            (0 when the curve is already monotonic).

    graphs : List
            List of all the figures created.

    Raises
    ------
    Exception
        If ``sample`` does not contain exactly one row.
    '''
    if len(sample) != 1:
        raise Exception("Sample must have only one example, please use 'check_monotonicity_multiple_samples' for multiple samples")
    if isinstance(model, str):
        # NOTE: unpickling executes arbitrary code; only load model files
        # that come from a trusted source.
        with open(model, 'rb') as f:
            model = pickle.load(f)

    # Work on a copy so the caller's DataFrame keeps its original feature value.
    sample = sample.copy()

    report = AnalysisReport()
    col_values = []
    predictions = []

    for value in np.arange(start, stop, step):
        col_values.append(value)
        sample[feature] = value
        prediction = model.predict_proba(sample)
        # Probability in the second column of predict_proba for the only row
        # (presumably the positive class of a binary classifier).
        predictions.append(prediction[0][1])

    pairs = list(zip(predictions, predictions[1:]))
    monotonic = (all(a <= b for a, b in pairs)
                 or all(a >= b for a, b in pairs))

    report.model_name = type(model).__name__
    report.analysed_feature = feature
    report.feature_range = (start, stop)

    fig = plt.figure(figsize=(6, 3), dpi=150)
    report.graphs.append(fig)
    # Draw on this figure's own axes rather than on pyplot's implicit
    # "current" axes.
    ax = fig.gca()

    report.metrics["monotonic"] = monotonic
    if monotonic:
        report.metrics["monotonic_mse"] = 0
    else:
        monotonic_curve, m_mse_score = monotonicity_mse(predictions)
        report.metrics["monotonic_mse"] = m_mse_score
        ax.plot(col_values, monotonic_curve, linestyle='dashed', color='red', alpha=0.7, label="Monotonic Approximation")

    ax.plot(col_values, predictions, color='blue', alpha=0.7, label="Predictions Curve")
    ax.set_title(f"Model: {type(model).__name__}")
    ax.set_xlabel(f'Feature {feature} value')
    ax.set_ylabel('Predict proba')
    ax.legend(loc="lower right")
    return report

def check_monotonicity_multiple_samples(model, samples, feature, start, stop, step=1):
    '''Monotonicity Analysis for multiple examples

    Sweeps ``feature`` over ``np.arange(start, stop, step)`` on a copy of
    ``samples``. At each value the model's predicted probabilities are
    averaged over all rows, and the resulting curve of means is checked for
    monotonicity (entirely non-decreasing or entirely non-increasing).

    Parameters
    ----------
    model : sklearn model or str
        Model that will be used to make predictions. Could be a model object
        exposing ``predict_proba`` or a path to a pickled model file.

    samples : pandas.DataFrame
        Pandas DataFrame containing two or more rows that will be used as
        base points. It is not modified; the sweep runs on a copy.

    feature : str
        Name of the feature being analysed.

    start : int or float
        The starting value of the feature's interval (inclusive).

    stop : int or float
        The end value of the feature's interval (exclusive, as in
        ``numpy.arange``).

    step : float, default=1
        Size of the step between ranges "start" and "stop".
        Ex: step = 0.1 between ranges 0 and 1 will result in
        [0  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9]

    Returns
    -------
    AnalysisReport object with following attributes:
        For more information:
        >>> from mlbugdetection.analysis_report import AnalysisReport
        >>> help(AnalysisReport)

    model_name : str
        Name of the model being analysed.

    analysed_feature : str
        Name of the feature being analysed.

    feature_range : tuple
        Range of values of the feature being analysed: (start, stop).

    metrics : dictionary
        Dictionary with all the calculated metrics, such as:

        'monotonic' : bool
             If the list of mean values is monotonic.

        'monotonic_mse': float
            MSE between the list of mean values and its monotonic
            approximation (0 when the curve is already monotonic).

        'monotonic_means_std': float
            Standard deviation of the means of the predictions probabilities.

    graphs : List
            List of all the figures created.

    Raises
    ------
    Exception
        If ``samples`` contains fewer than two rows.
    '''
    if len(samples) < 2:
        raise Exception("Samples must have multiple examples, please use 'check_monotonicity_single_sample' for single example")
    if isinstance(model, str):
        # NOTE: unpickling executes arbitrary code; only load model files
        # that come from a trusted source.
        with open(model, 'rb') as f:
            model = pickle.load(f)

    # Work on a copy so the caller's DataFrame keeps its original feature values.
    samples = samples.copy()

    report = AnalysisReport()
    col_values = []
    predictions = []

    for value in np.arange(start, stop, step):
        col_values.append(value)
        samples[feature] = value
        prediction = np.asarray(model.predict_proba(samples))
        # Average the second-column probability over ALL rows (presumably the
        # positive class of a binary classifier). Indexing [0][1] would only
        # look at the first sample and ignore the rest.
        predictions.append(np.mean(prediction[:, 1]))

    pairs = list(zip(predictions, predictions[1:]))
    monotonic = (all(a <= b for a, b in pairs)
                 or all(a >= b for a, b in pairs))

    report.model_name = type(model).__name__
    report.analysed_feature = feature
    report.feature_range = (start, stop)

    fig = plt.figure(figsize=(6, 3), dpi=150)
    report.graphs.append(fig)
    # Draw on this figure's own axes rather than on pyplot's implicit
    # "current" axes.
    ax = fig.gca()

    report.metrics["monotonic"] = monotonic
    if monotonic:
        report.metrics["monotonic_mse"] = 0
    else:
        monotonic_curve, m_mse_score = monotonicity_mse(predictions)
        report.metrics["monotonic_mse"] = m_mse_score
        ax.plot(col_values, monotonic_curve, linestyle='dashed', color='red', alpha=0.7, label="Monotonic Approximation")
    report.metrics["monotonic_means_std"] = np.nanstd(predictions)

    ax.plot(col_values, predictions, color='blue', alpha=0.7, label="Predictions Curve")
    ax.set_title(f"Model: {type(model).__name__}")
    ax.set_xlabel(f'Feature {feature} value')
    ax.set_ylabel('Predict proba')
    ax.legend(loc="lower right")
    return report