Source code for immuneML.ml_methods.classifiers.SklearnMethod

import abc
import logging
import os
import warnings
import inspect
from pathlib import Path

import dill
import numpy as np
import pandas as pd
import yaml
from sklearn.metrics import get_scorer_names
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.validation import check_is_fitted

from immuneML.data_model.EncodedData import EncodedData
from immuneML.environment.Label import Label
from immuneML.ml_methods.classifiers.MLMethod import MLMethod
from immuneML.ml_methods.util.Util import Util
from immuneML.ml_metrics.ClassificationMetric import ClassificationMetric
from immuneML.util.FilenameHandler import FilenameHandler
from immuneML.util.PathBuilder import PathBuilder


class SklearnMethod(MLMethod):
    """
    Base class for ML methods imported from scikit-learn. The classes inheriting SklearnMethod, acting as wrappers
    around ML methods imported from scikit-learn, have to implement:

    - the __init__() method,
    - get_params(label) and
    - _get_ml_model()

    Other methods can also be overwritten if needed. The arguments and specification described below apply to all
    classes inheriting SklearnMethod.

    **Specification arguments:**

    - parameters: a dictionary of parameters that will be directly passed to scikit-learn's class upon calling its
      __init__() method; for a detailed list, see scikit-learn's documentation of the specific class inheriting
      SklearnMethod

    - parameter_grid: a dictionary of parameters which all have to be valid arguments for scikit-learn's corresponding
      class' __init__() method (same as parameters), but which, unlike the parameters argument, can contain lists of
      values instead of a single value; if this is specified and "model_selection_cv" is True (in the specification),
      or if fit_by_cross_validation() is called, a grid search will be performed over these parameters and the optimal
      model will be kept

    **YAML specification:**

        definitions:
            ml_methods:
                log_reg:
                    LogisticRegression: # name of the class inheriting SklearnMethod
                        # sklearn parameters (same names as in original sklearn class)
                        max_iter: 1000 # specific parameter value
                        penalty: l1
                        # Additional parameter that determines whether to print convergence warnings
                        show_warnings: True
                    # if any of the parameters under LogisticRegression is a list and model_selection_cv is True,
                    # a grid search will be done over the given parameters, using the number of folds specified in
                    # model_selection_n_folds, and the optimal model will be selected
                    model_selection_cv: True
                    model_selection_n_folds: 5
                svm_with_cv:
                    SVM: # name of another class inheriting SklearnMethod
                        # sklearn parameters (same names as in original sklearn class)
                        alpha: 10
                        # Additional parameter that determines whether to print convergence warnings
                        show_warnings: True
                    # no grid search will be done
                    model_selection_cv: False
    """

    FIT_CV = "fit_CV"
    FIT = "fit"

    def __init__(self, parameter_grid: dict = None, parameters: dict = None):
        super(SklearnMethod, self).__init__()
        self.model = None
        self.random_cv_obj = None

        if parameter_grid is not None and "show_warnings" in parameter_grid:
            self.show_warnings = parameter_grid.pop("show_warnings")[0]
        elif parameters is not None and "show_warnings" in parameters:
            self.show_warnings = parameters.pop("show_warnings")
        else:
            self.show_warnings = True

        self._parameter_grid = parameter_grid
        self._parameters = parameters

    def _fit(self, encoded_data: EncodedData, cores_for_training: int = 2):
        mapped_y = Util.map_to_new_class_values(encoded_data.labels[self.label.name], self.class_mapping)
        self.model = self._fit_model(encoded_data.examples, mapped_y, encoded_data.example_weights, cores_for_training)

    def _predict(self, encoded_data: EncodedData):
        self.check_is_fitted(self.label.name)
        predictions = self.apply_with_weights(self.model.predict, encoded_data.example_weights, X=encoded_data.examples)
        return {self.label.name: Util.map_to_old_class_values(np.array(predictions), self.class_mapping)}

    def _predict_proba(self, encoded_data: EncodedData):
        if self.can_predict_proba():
            probabilities = self.apply_with_weights(self.model.predict_proba, encoded_data.example_weights,
                                                    X=encoded_data.examples)
            class_names = Util.map_to_old_class_values(self.model.classes_, self.class_mapping)
            return {self.label.name: {class_name: probabilities[:, i] for i, class_name in enumerate(class_names)}}
        else:
            logging.warning(f"{self.__class__.__name__}: cannot predict probabilities.")
            return None

    def _fit_model(self, X, y, w=None, cores_for_training: int = 1):
        self.model = self._get_ml_model(cores_for_training, X)

        if w is not None and not self._check_method_supports_example_weight(self.model.fit) \
                and not self._check_method_supports_example_weight(self.model.predict):
            logging.warning(f"{self.__class__.__name__}: cannot fit this classifier with example weights, "
                            f"fitting without example weights instead... Example weights will still be applied "
                            f"when computing evaluation metrics after fitting.")

        if not self.show_warnings:
            warnings.simplefilter("ignore")
            os.environ["PYTHONWARNINGS"] = "ignore"

        self.model = self._get_ml_model(cores_for_training, X)
        self.apply_with_weights(self.model.fit, w, X=X, y=y)

        if not self.show_warnings:
            del os.environ["PYTHONWARNINGS"]
            warnings.simplefilter("always")

        return self.model
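    # Editor's note, not part of the original source: a minimal sketch of a hypothetical subclass,
    # showing the methods a SklearnMethod wrapper has to implement, using scikit-learn's
    # LogisticRegression; the class name MyLogisticRegression is invented for illustration:
    #
    #     from sklearn.linear_model import LogisticRegression as SklearnLogisticRegression
    #
    #     class MyLogisticRegression(SklearnMethod):
    #         def __init__(self, parameter_grid: dict = None, parameters: dict = None):
    #             parameters = parameters if parameters is not None else {}
    #             super().__init__(parameter_grid=parameter_grid, parameters=parameters)
    #
    #         def _get_ml_model(self, cores_for_training: int = 2, X=None):
    #             # pass the user-specified parameters straight to the sklearn class
    #             return SklearnLogisticRegression(**self._parameters, n_jobs=cores_for_training)
    #
    #         def get_params(self, for_refitting=False):
    #             return {**self.model.get_params()}
    #
    #         def can_predict_proba(self) -> bool:
    #             return True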
    def apply_with_weights(self, method, weights, **kwargs):
        """
        Can be used to run self.model.fit, self.model.predict or self.model.predict_proba with sample weights,
        if supported.

        :param method: self.model.fit, self.model.predict or self.model.predict_proba
        :return: the result of the supplied method
        """
        if weights is not None and self._check_method_supports_example_weight(method):
            return method(**kwargs, sample_weight=weights)
        else:
            return method(**kwargs)
    def _check_method_supports_example_weight(self, method):
        return "sample_weight" in inspect.signature(method).parameters
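    # Editor's note, not part of the original source: the check above relies on scikit-learn
    # estimators exposing sample_weight as an explicit keyword parameter of fit()/predict().
    # A standalone illustration of the same introspection, assuming scikit-learn is installed:
    #
    #     import inspect
    #     from sklearn.linear_model import LogisticRegression
    #
    #     "sample_weight" in inspect.signature(LogisticRegression().fit).parameters   # -> True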
    def can_predict_proba(self) -> bool:
        return False
    def check_is_fitted(self, label_name: str):
        if self.label.name == label_name or label_name is None:
            return check_is_fitted(self.model,
                                   ["estimators_", "coef_", "estimator", "_fit_X", "dual_coef_", "classes_"],
                                   all_or_any=any)
    def _fit_by_cross_validation(self, encoded_data: EncodedData, number_of_splits: int, cores_for_training: int):
        mapped_y = Util.map_to_new_class_values(encoded_data.labels[self.label.name], self.class_mapping)
        self._fit_model_by_cross_validation(X=encoded_data.examples, y=mapped_y, w=encoded_data.example_weights,
                                            number_of_splits=number_of_splits, cores_for_training=cores_for_training)

    def _fit_model_by_cross_validation(self, X, y, w, number_of_splits: int, cores_for_training: int):
        model = self._get_ml_model()
        scoring = ClassificationMetric.get_sklearn_score_name(ClassificationMetric.get_metric(self.optimization_metric))

        if scoring not in get_scorer_names():
            scoring = "balanced_accuracy"
            logging.warning(f"{self.__class__.__name__}: specified optimization metric ({self.optimization_metric}) "
                            f"is not defined as a sklearn scoring function, using {scoring} instead...")

        if not self.show_warnings:
            warnings.simplefilter("ignore")
            os.environ["PYTHONWARNINGS"] = "ignore"

        self.random_cv_obj = RandomizedSearchCV(model, param_distributions=self._parameter_grid, cv=number_of_splits,
                                                n_jobs=cores_for_training, scoring=scoring, refit=True)
        self.random_cv_obj = self.apply_with_weights(self.random_cv_obj.fit, w, X=X, y=y)

        if not self.show_warnings:
            del os.environ["PYTHONWARNINGS"]
            warnings.simplefilter("always")

        self.model = self.random_cv_obj.best_estimator_
        return self.model
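    # Editor's note, not part of the original source: stripped of the immuneML-specific pieces
    # (metric mapping, warning handling, example weights), the search above reduces to plain
    # scikit-learn code like the following, where X, y and the parameter grid are assumed inputs:
    #
    #     from sklearn.linear_model import LogisticRegression
    #     from sklearn.model_selection import RandomizedSearchCV
    #
    #     search = RandomizedSearchCV(LogisticRegression(), param_distributions={"C": [0.1, 1, 10]},
    #                                 cv=5, scoring="balanced_accuracy", refit=True)
    #     search.fit(X, y)
    #     best_model = search.best_estimator_   # kept as self.model after refitting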
    def store(self, path: Path):
        PathBuilder.build(path)
        file_path = path / f"{self._get_model_filename()}.pickle"
        with file_path.open("wb") as file:
            dill.dump(self.model, file)

        params_path = path / f"{self._get_model_filename()}.yaml"

        try:
            if self.random_cv_obj is not None:
                pd.DataFrame(self.random_cv_obj.cv_results_).to_csv(
                    path / f"{self._get_model_filename()}_cv_results.csv", index=False)
        except Exception as e:
            logging.warning(f"SklearnMethod: could not save cross-validation results in {self._get_model_filename()}: "
                            f"{e}")

        with params_path.open("w") as file:
            desc = {
                **(self.get_params()),
                "feature_names": self.get_feature_names(),
                "classes": self.model.classes_.tolist(),
                "class_mapping": self.class_mapping,
            }
            if self.label is not None:
                desc["label"] = self.label.get_desc_for_storage()
            yaml.dump(desc, file)
    def _get_model_filename(self):
        return FilenameHandler.get_filename(self.__class__.__name__, "")
    def load(self, path: Path):
        name = f"{self._get_model_filename()}.pickle"
        file_path = path / name
        if file_path.is_file():
            with file_path.open("rb") as file:
                self.model = dill.load(file)
        else:
            raise FileNotFoundError(f"{self.__class__.__name__} model could not be loaded from {file_path}. "
                                    f"Check if the path to the {name} file is properly set.")

        params_path = path / f"{self._get_model_filename()}.yaml"
        if params_path.is_file():
            with params_path.open("r") as file:
                desc = yaml.safe_load(file)
                if "label" in desc:
                    setattr(self, "label", Label(**desc["label"]))
                for param in ["feature_names", "classes", "class_mapping"]:
                    if param in desc:
                        setattr(self, param, desc[param])
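    # Editor's note, not part of the original source: store() and load() form a round trip, writing
    # <ClassName>.pickle (the dill-serialized sklearn model) and <ClassName>.yaml (parameters,
    # feature names, classes, class mapping, label) into the given directory. A sketch, assuming a
    # fitted subclass instance `method` whose class can be constructed without arguments:
    #
    #     from pathlib import Path
    #
    #     method.store(Path("trained_model/"))    # writes the .pickle and .yaml files
    #     restored = type(method)()
    #     restored.load(Path("trained_model/"))   # restores model, label, classes, class_mapping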
    @abc.abstractmethod
    def _get_ml_model(self, cores_for_training: int = 2, X=None):
        pass
    @abc.abstractmethod
    def get_params(self, for_refitting=False):
        """Returns the model parameters in a readable yaml-friendly way (consisting of lists, dictionaries and strings).

        Args:
            for_refitting:
        """
        pass
    def get_package_info(self) -> str:
        sklearn_version = ""
        try:
            from sklearn import __version__ as version
            sklearn_version = f"; scikit-learn: {version}"
        except Exception as e:
            logging.warning("Could not get scikit-learn version: " + str(e))
        return Util.get_immuneML_version() + sklearn_version
    def get_compatible_encoders(self):
        from immuneML.encodings.diversity_encoding.EvennessProfileEncoder import EvennessProfileEncoder
        from immuneML.encodings.kmer_frequency.KmerFrequencyEncoder import KmerFrequencyEncoder
        from immuneML.encodings.onehot.OneHotEncoder import OneHotEncoder
        from immuneML.encodings.word2vec.Word2VecEncoder import Word2VecEncoder
        from immuneML.encodings.reference_encoding.MatchedSequencesEncoder import MatchedSequencesEncoder
        from immuneML.encodings.reference_encoding.MatchedReceptorsEncoder import MatchedReceptorsEncoder
        from immuneML.encodings.reference_encoding.MatchedRegexEncoder import MatchedRegexEncoder
        from immuneML.encodings.motif_encoding.MotifEncoder import MotifEncoder
        from immuneML.encodings.protein_embedding.ESMCEncoder import ESMCEncoder
        from immuneML.encodings.protein_embedding.ProtT5Encoder import ProtT5Encoder
        from immuneML.encodings.protein_embedding.TCRBertEncoder import TCRBertEncoder
        from immuneML.encodings.diversity_encoding.ShannonDiversityEncoder import ShannonDiversityEncoder
        from immuneML.encodings.composite_encoding.CompositeEncoder import CompositeEncoder
        from immuneML.encodings.baseline_encoding.GeneFrequencyEncoder import GeneFrequencyEncoder
        from immuneML.encodings.baseline_encoding.MetadataEncoder import MetadataEncoder
        from immuneML.encodings.abundance_encoding.CompAIRRSequenceAbundanceEncoder import \
            CompAIRRSequenceAbundanceEncoder

        return [KmerFrequencyEncoder, OneHotEncoder, Word2VecEncoder, EvennessProfileEncoder, MatchedSequencesEncoder,
                MatchedReceptorsEncoder, MatchedRegexEncoder, MotifEncoder, ESMCEncoder, ProtT5Encoder, TCRBertEncoder,
                ShannonDiversityEncoder, CompAIRRSequenceAbundanceEncoder, CompositeEncoder, GeneFrequencyEncoder,
                MetadataEncoder]
    @staticmethod
    def get_usage_documentation(model_name):
        return f"""
    Scikit-learn models can be trained in two modes:

    1. Creating a model using a given set of hyperparameters, and relying on the selection and assessment loop in the
       TrainMLModel instruction to select the optimal model.

    2. Passing a range of different hyperparameters to {model_name}, and using a third layer of nested
       cross-validation to find the optimal hyperparameters through grid search. In this case, only the {model_name}
       model with the optimal hyperparameter settings is further used in the inner selection loop of the TrainMLModel
       instruction.

    By default, mode 1 is used. In order to use mode 2, model_selection_cv and model_selection_n_folds must be set.

    **Specification arguments:**

    - {model_name} (dict): Under this key, hyperparameters can be specified that will be passed to the scikit-learn
      class. Any scikit-learn hyperparameters can be specified here. In mode 1, a single value must be specified for
      each of the scikit-learn hyperparameters. In mode 2, it is possible to specify a range of different
      hyperparameter values in a list. It is also allowed to mix lists and single values in mode 2, in which case the
      grid search will only be done over the lists, while the single-value hyperparameters will be fixed. In addition
      to the scikit-learn hyperparameters, the parameter show_warnings (True/False) can be specified here. This
      determines whether scikit-learn warnings, such as convergence warnings, should be printed. By default,
      show_warnings is True.

    - model_selection_cv (bool): If any of the hyperparameters under {model_name} is a list and model_selection_cv is
      True, a grid search will be done over the given hyperparameters, using the number of folds specified in
      model_selection_n_folds. By default, model_selection_cv is False.

    - model_selection_n_folds (int): The number of folds that should be used for the cross-validation grid search if
      model_selection_cv is True.
    """
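
# Editor's note, not part of the original source: a hypothetical YAML sketch of mode 2 described
# above, for a concrete model name; "C" and "penalty" stand in for any valid hyperparameters of
# the chosen scikit-learn class:
#
#     log_reg_cv:
#         LogisticRegression:
#             C: [0.1, 1, 10]       # list -> included in the grid search
#             penalty: l2           # single value -> fixed during the search
#             show_warnings: False
#         model_selection_cv: True
#         model_selection_n_folds: 5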