Source code for immuneML.reports.ml_reports.ConfounderAnalysis

from pathlib import Path
from typing import List

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from immuneML.data_model.dataset.Dataset import Dataset
from immuneML.hyperparameter_optimization.HPSetting import HPSetting
from immuneML.ml_methods.MLMethod import MLMethod
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.ml_reports.MLReport import MLReport
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder


[docs]class ConfounderAnalysis(MLReport):
    """
    A report that plots the numbers of false positives and false negatives with respect to each value of
    the metadata features specified by the user. This allows checking whether a given machine learning model makes more
    misclassifications for some values of a metadata feature than for the others.

    Arguments:

        metadata_labels (list): A list of the metadata features to use as a basis for the calculations

    YAML specification:

    .. indent with spaces
    .. code-block:: yaml

        my_confounder_report:
            ConfounderAnalysis:
                metadata_labels:
                  - age
                  - sex

    """

[docs]    @classmethod
    def build_object(cls, **kwargs):

        ParameterValidator.assert_keys(kwargs.keys(), ['metadata_labels', 'name'], ConfounderAnalysis.__name__, ConfounderAnalysis.__name__)
        ParameterValidator.assert_type_and_value(kwargs['metadata_labels'], list, ConfounderAnalysis.__name__, 'metadata_labels')
        ParameterValidator.assert_all_type_and_value(kwargs['metadata_labels'], str, ConfounderAnalysis.__name__, 'metadata_labels')
        ParameterValidator.assert_type_and_value(kwargs['name'], str, ConfounderAnalysis.__name__, 'name')

        return ConfounderAnalysis(metadata_labels=kwargs['metadata_labels'], name=kwargs['name'])

    def __init__(self, metadata_labels: List[str], train_dataset: Dataset = None, test_dataset: Dataset = None,
                 method: MLMethod = None, result_path: Path = None, name: str = None, hp_setting: HPSetting = None,
                 label=None, number_of_processes: int=1):
        super().__init__(train_dataset=train_dataset, test_dataset=test_dataset, method=method, result_path=result_path,
                         name=name, hp_setting=hp_setting, label=label, number_of_processes=number_of_processes)
        self.metadata_labels = metadata_labels

    def _generate(self) -> ReportResult:
        PathBuilder.build(self.result_path)
        paths = []

        # make predictions
        predictions = self.method.predict(self.test_dataset.encoded_data, self.label)[self.label.name]

        true_labels = self.test_dataset.get_metadata(self.metadata_labels + [self.label.name])
        metrics = ["FP", "FN"]

        plot = make_subplots(rows=len(self.metadata_labels), cols=2)
        listOfPlot = []

        for label_index, meta_label in enumerate(self.metadata_labels):
            csv_data = {}
            for metric_index, metric in enumerate(metrics):
                plotting_data = self._metrics(metric=metric, label_name=self.label.name, meta_label=meta_label,
                                              predictions=predictions, true_labels=true_labels)

                csv_data[f"{metric}"] = plotting_data[f"{metric}"]

                plot.add_trace(go.Bar(x=plotting_data[meta_label], y=plotting_data[metric]), row=label_index + 1, col=metric_index + 1)
                plot.update_xaxes(title_text=f"{meta_label}", row=label_index + 1, col=metric_index + 1, type='category')
                plot.update_yaxes(title_text=f"{metric}", row=label_index + 1, col=metric_index + 1, rangemode="nonnegative", tick0=0, dtick=1)

            csv_data[f"{meta_label}"] = plotting_data[f"{meta_label}"]

            csv_data = pd.DataFrame(csv_data)

            listOfPlot.append(csv_data)

        plot.update_traces(marker_color=px.colors.sequential.Teal[3], showlegend=False)
        filename = self.result_path / "plots.html"
        plot.write_html(str(filename))
        report_output_fig = ReportOutput(filename)
        paths.append(report_output_fig)

        result_table_path = self._write_results_table(listOfPlot, self.metadata_labels)
        return ReportResult(name=self.name,
                            info="Plots the numbers of false positives and false negatives with respect to each value of the metadata features specified by the user.",
                            output_figures=paths, output_tables=[ReportOutput(result_table_path[0])])

    def _write_results_table(self, plotting_data, labels):
        filepaths = []
        for label_index, label in enumerate(labels):
            filepath = self.result_path / f"{label}.csv"
            plotting_data[label_index].to_csv(filepath, index=False)
            filepaths.append(filepath)
        return filepaths

    @staticmethod
    def _metrics(metric, label_name, meta_label, predictions, true_labels):
        # indices of samples at which misclassification occurred
        if metric == "FP":
            metric_inds = np.nonzero(np.greater(predictions, true_labels[label_name]))[0].tolist()
        else:
            metric_inds = np.nonzero(np.less(predictions, true_labels[label_name]))[0].tolist()

        metadata_values = true_labels[meta_label]
        # indices of misclassification with respect to the metadata label
        label_inds = np.array(metadata_values)[metric_inds]

        metric_vals = []
        unique_levels = np.unique(metadata_values)

        # number of metric occurrences at each metadata level
        for val in unique_levels:
            metric_vals.append(np.count_nonzero(label_inds == val))

        plotting_data = pd.DataFrame(
            {f"{metric}": metric_vals, f"{meta_label}": unique_levels})

        return plotting_data