Source code for immuneML.reports.ml_reports.Coefficients

import logging
from numbers import Number
from pathlib import Path

import pandas as pd
import plotly.express as px
import yaml

from immuneML.data_model.dataset.Dataset import Dataset
from immuneML.hyperparameter_optimization.HPSetting import HPSetting
from immuneML.ml_methods.LogisticRegression import LogisticRegression
from immuneML.ml_methods.MLMethod import MLMethod
from immuneML.ml_methods.RandomForestClassifier import RandomForestClassifier
from immuneML.ml_methods.SVC import SVC
from immuneML.ml_methods.SVM import SVM
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.ml_reports.CoefficientPlottingSetting import CoefficientPlottingSetting
from immuneML.reports.ml_reports.CoefficientPlottingSettingList import CoefficientPlottingSettingList
from immuneML.reports.ml_reports.MLReport import MLReport
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder
from scripts.specification_util import update_docs_per_mapping



[docs]
class Coefficients(MLReport):
    """
    A report that plots the coefficients for a given ML method in a barplot. Can be used for :ref:`LogisticRegression`,
    :ref:`SVM`, :ref:`SVC`, and :ref:`RandomForestClassifier`. In the case of RandomForest, the feature importances will be plotted.

    When used in :ref:`TrainMLModel` instruction, the report can be specified under 'models', both on
    the selection and assessment levels.

    Which coefficients should be plotted (for example: only nonzero, above a certain threshold, ...) can be specified.
    Multiple options can be specified simultaneously. By default the 25 largest coefficients are plotted.
    The full set of coefficients will also be exported as a csv file.


    Arguments:

        coefs_to_plot (list): A list specifying which coefficients should be plotted. For options see :py:obj:`~immuneML.reports.ml_reports.CoefficientPlottingSetting.CoefficientPlottingSetting`.

        cutoff (list): If 'cutoff' is specified under 'coefs_to_plot', the cutoff values can be specified here. The coefficients which have an absolute value equal to or greater than the cutoff will be plotted.

        n_largest (list): If 'n_largest' is specified under 'coefs_to_plot', the values for n can be specified here. These should be integer values. The n largest coefficients are determined based on their absolute values.

    YAML specification:

    .. indent with spaces
    .. code-block:: yaml

        my_coef_report:
            Coefficients:
                coefs_to_plot:
                    - all
                    - nonzero
                    - cutoff
                    - n_largest
                cutoff:
                    - 0.1
                    - 0.01
                n_largest:
                    - 5
                    - 10

    """


[docs]
    @classmethod
    def build_object(cls, **kwargs):
        """
        Validates the user-supplied parameters and builds a Coefficients report from them.

        Reads 'coefs_to_plot' (required) and 'name' (optional) from kwargs. 'cutoff' and 'n_largest' are only read and
        validated when the corresponding option is listed under 'coefs_to_plot'; otherwise an empty list is used.
        """
        location = "Coefficients"
        # option names are compared in upper case, so the specification is case-insensitive
        requested_options = [option.upper() for option in kwargs["coefs_to_plot"]]
        name = kwargs.get("name")

        valid_options = [setting.name.upper() for setting in CoefficientPlottingSetting]
        ParameterValidator.assert_all_in_valid_list(requested_options, valid_options, location, "coefs_to_plot")

        cutoff = []
        if CoefficientPlottingSetting.CUTOFF.name in requested_options:
            cutoff = kwargs["cutoff"]
            ParameterValidator.assert_type_and_value(cutoff, list, location, "cutoff")
            ParameterValidator.assert_all_type_and_value(cutoff, Number, location, "cutoff", min_inclusive=1e-15)

        n_largest = []
        if CoefficientPlottingSetting.N_LARGEST.name in requested_options:
            n_largest = kwargs["n_largest"]
            ParameterValidator.assert_type_and_value(n_largest, list, location, "n_largest")
            ParameterValidator.assert_all_type_and_value(n_largest, int, location, "n_largest", min_inclusive=1)

        # translate the validated option names into the corresponding enum members
        plotting_settings = CoefficientPlottingSettingList()
        for option_name in requested_options:
            plotting_settings.append(CoefficientPlottingSetting[option_name.upper()])

        return Coefficients(coefs_to_plot=plotting_settings, cutoff=cutoff, n_largest=n_largest, name=name)


    def __init__(self,
                 coefs_to_plot: CoefficientPlottingSettingList,
                 cutoff: list,
                 n_largest: list,
                 train_dataset: Dataset = None,
                 test_dataset: Dataset = None,
                 method: MLMethod = None,
                 result_path: Path = None,
                 name: str = None,
                 hp_setting: HPSetting = None,
                 label=None,
                 number_of_processes: int = 1):
        """
        Stores the plotting configuration on the report; all remaining arguments are passed on unchanged to the
        MLReport constructor.

        Arguments:

            coefs_to_plot (CoefficientPlottingSettingList): which subsets of the coefficients should be plotted.

            cutoff (list): cutoff values, used when CUTOFF is among coefs_to_plot.

            n_largest (list): values of n, used when N_LARGEST is among coefs_to_plot.
        """
        super().__init__(train_dataset=train_dataset,
                         test_dataset=test_dataset,
                         method=method,
                         result_path=result_path,
                         name=name,
                         hp_setting=hp_setting,
                         label=label,
                         number_of_processes=number_of_processes)

        self._coefs_to_plot = coefs_to_plot
        self._cutoff = cutoff
        self._n_largest = n_largest

    def _generate(self):
        """
        Exports the coefficients of the ML method as a csv table, writes the settings file and creates one bar plot
        for every requested subset of coefficients (all, nonzero, per cutoff value, per n largest value).

        Returns a ReportResult with the csv table and the figures that were created; subsets which turned out to be
        empty produce no figure and are left out.
        """
        PathBuilder.build(self.result_path)
        figure_outputs = []

        self._set_plotting_parameters()

        coef_table = self._retrieve_plot_data()
        # order by magnitude so that the largest coefficients (positive or negative) come first
        coef_table["abs_coefficients"] = coef_table["coefficients"].abs()
        coef_table.sort_values(by="abs_coefficients", inplace=True, ascending=False)

        result_table_path = self._write_results_table(coef_table[["features", "coefficients"]])
        self._write_settings()

        if CoefficientPlottingSetting.ALL in self._coefs_to_plot:
            figure_outputs.append(self._plot(plotting_data=coef_table, output_name="all_coefficients"))

        if CoefficientPlottingSetting.NONZERO in self._coefs_to_plot:
            nonzero_rows = coef_table[coef_table["coefficients"] != 0]
            figure_outputs.append(self._plot(plotting_data=nonzero_rows, output_name="nonzero_coefficients"))

        if CoefficientPlottingSetting.CUTOFF in self._coefs_to_plot:
            for threshold in self._cutoff:
                rows_above_threshold = coef_table[coef_table["abs_coefficients"] >= threshold]
                figure_outputs.append(self._plot(plotting_data=rows_above_threshold,
                                                 output_name=f"cutoff_{threshold}_coefficients"))

        if CoefficientPlottingSetting.N_LARGEST in self._coefs_to_plot:
            for top_n in self._n_largest:
                top_rows = coef_table.nlargest(n=top_n, columns=["abs_coefficients"])
                figure_outputs.append(self._plot(plotting_data=top_rows,
                                                 output_name=f"largest_{top_n}_coefficients"))

        report_info = f"{self._y_axis_title}s of the trained {self.method.__class__.__name__} model"
        tables = [ReportOutput(result_table_path, "features and coefficients csv")]
        # _plot returns None for skipped (empty) subsets
        figures = [output for output in figure_outputs if output is not None]

        return ReportResult(self.name, info=report_info, output_tables=tables, output_figures=figures)

    def _set_plotting_parameters(self):
        """
        Sets which entry of the method's parameters is plotted and the matching axis title: feature importances
        for RandomForestClassifier, coefficients for every other method (SVM, logistic regression, ...).
        """
        is_random_forest = isinstance(self.method, RandomForestClassifier)

        self._param_field = "feature_importances" if is_random_forest else "coefficients"
        self._y_axis_title = "Feature importance" if is_random_forest else "Coefficient value"

    def _write_settings(self):
        """
        Writes the preprocessing sequence name, encoder name and ML method name of the hyperparameter setting to
        'settings.yaml' in the result path. Does nothing when no hyperparameter setting is available.
        """
        if self.hp_setting is None:
            return

        with (self.result_path / "settings.yaml").open("w") as settings_file:
            settings = {"preprocessing": self.hp_setting.preproc_sequence_name,
                        "encoder": self.hp_setting.encoder_name,
                        "ml_method": self.hp_setting.ml_method_name}
            yaml.dump(settings, settings_file)

    def _write_results_table(self, plotting_data):
        filepath = self.result_path / "coefficients.csv"
        plotting_data.to_csv(filepath, index=False)
        return filepath

    def _retrieve_plot_data(self):
        coefficients = self.method.get_params()[self._param_field]

        feature_names = self._retrieve_feature_names()

        return pd.DataFrame({"coefficients": coefficients, "features": feature_names})

    def _retrieve_feature_names(self):
        if self.train_dataset and self.train_dataset.encoded_data:
            return self.train_dataset.encoded_data.feature_names

    def _plot(self, plotting_data, output_name):
        if plotting_data.empty:
            logging.warning(f"Coefficients: empty data subset specified, skipping {output_name} plot...")
        else:

            filename = self.result_path / f"{output_name}.html"

            figure = px.bar(plotting_data, x='features', y='coefficients', template='plotly_white',
                            title=f"{type(self.method).__name__}{' (' + self.method.name + ') - ' if self.method.name is not None else ' - '}"
                                  f"{' '.join(output_name.split('_'))}")
            figure.update_traces(marker_color=px.colors.sequential.Teal[3])

            with filename.open("w") as file:
                figure.write_html(file)

            return ReportOutput(filename)


[docs]
    def check_prerequisites(self):
        """
        Checks whether the report can be created for the ML method at hand: returns True if the method is a
        RandomForestClassifier, LogisticRegression, SVM or SVC; otherwise logs a warning and returns False.
        """
        supported_methods = (RandomForestClassifier, LogisticRegression, SVM, SVC)

        if isinstance(self.method, supported_methods):
            return True

        logging.warning(f"Coefficients report can only be created for RandomForestClassifier, LogisticRegression, SVC, or SVM, but got "
                        f"{type(self.method).__name__} instead. Coefficients report will not be created.")
        return False



[docs]
    @staticmethod
    def get_documentation():
        """
        Returns the class docstring in which the reference to CoefficientPlottingSetting is replaced by the
        explicit list of valid option names.
        """
        option_names = [option.name for option in CoefficientPlottingSetting]
        # render the list without its brackets and with backticks instead of quotes around each name
        valid_values = str(option_names)[1:-1].replace("'", "`")

        reference_text = "For options see :py:obj:`~immuneML.reports.ml_reports.CoefficientPlottingSetting.CoefficientPlottingSetting`."
        replacements = {reference_text: f"Valid values are: {valid_values}."}

        return update_docs_per_mapping(str(Coefficients.__doc__), replacements)