# Source code for immuneML.ml_methods.MLMethod

import abc
from pathlib import Path

from sklearn.exceptions import NotFittedError

from immuneML.data_model.encoded_data.EncodedData import EncodedData
from immuneML.environment import Label


class MLMethod(metaclass=abc.ABCMeta):
    """
    Base class for different machine learning methods, defining which functions should be implemented. These public functions are the only ones that
    will be used outside the method, during training, assessment or while making predictions. Most often the methods will be classifiers (binary or
    multi-class) that should learn some label on either immune repertoires (sets of receptor sequences), receptors (paired sequences) or receptor
    sequences (lists of amino acids).

    Here we refer to machine learning methods (algorithms) as a method that, given a set of examples and corresponding labels, constructs a model
    (such as logistic regression), whereas we define the model to be already fit to data using the learning method (algorithm), such as logistic
    regression with specific coefficients.

    The functions of this class provide a standard set of ML functions: fitting the model (with or without cross-validation) and making predictions
    (either class predictions or class probabilities if possible). Other functions provide for various utilities, such as storing and loading the
    model, checking if it was fit already, retrieving coefficients for user-friendly output etc.

    Note that when providing class probabilities the classes should have a specific (constant) order, and in case of binary classification, they
    should be ordered so that the negative class comes first and the positive one comes second. To handle this class ordering, see the
    :py:obj:`immuneML.ml_methods.util.Util.Util.make_class_mapping` method that will automatically create class mapping for classification.

    """

    def __init__(self):
        # Both attributes start unset; this base class only initializes them to None.
        self.name = None
        self.label = None

    @abc.abstractmethod
    def fit(self, encoded_data: EncodedData, label: Label, cores_for_training: int = 2):
        """
        The fit function fits the parameters of the machine learning model.

        Arguments:

            encoded_data (EncodedData): an instance of EncodedData class which includes encoded examples (repertoires, receptors or sequences), their
                labels, names of the features and other additional information. Most often, only examples and labels will be used. Examples are either a
                dense numpy matrix or a sparse matrix, where columns correspond to features and rows correspond to examples. There are a few encodings
                which make multidimensional outputs that do not follow this pattern, but they are tailored to specific ML methods which require such input
                (for instance, one hot encoding and ReceptorCNN method).

            label (Label): the label for which the classifier will be created. immuneML also supports multi-label classification, but it is
                handled outside MLMethod class by creating an MLMethod instance for each label. This means that each MLMethod should handle only one label.

            cores_for_training (int): if parallelization is available in the MLMethod (and the availability depends on the specific classifier), this
                is the number of processes that will be created when fitting the model to speed up the computation.

        Returns:

            it doesn't return anything, but fits the model parameters instead

        """
        pass

    @abc.abstractmethod
    def predict(self, encoded_data: EncodedData, label: Label):
        """
        The predict function predicts the class for the given label across examples provided in encoded data.

        Arguments:

            encoded_data (EncodedData): an instance of EncodedData class which includes encoded examples (repertoires, receptors or sequences), their
                labels, names of the features and other additional information. Most often, only examples and labels will be used. Examples are either a
                dense numpy matrix or a sparse matrix, where columns correspond to features and rows correspond to examples. There are a few encodings
                which make multidimensional outputs that do not follow this pattern, but they are tailored to specific ML methods which require such input
                (for instance, one hot encoding and ReceptorCNN method).

            label (Label): the label for which the prediction should be made. immuneML also supports multi-label classification, but it is
                handled outside MLMethod class by creating an MLMethod instance for each label. This means that each MLMethod should handle only one label.

        Returns:

            a dictionary where the key is the label_name and the value is a list of class predictions (one prediction per example):
            e.g., {label_name: [class1, class2, class2, class1]}

        """
        pass

    @abc.abstractmethod
    def fit_by_cross_validation(self, encoded_data: EncodedData, number_of_splits: int = 5, label: Label = None, cores_for_training: int = -1,
                                optimization_metric=None):
        """
        The fit_by_cross_validation function should implement finding the best model hyperparameters through cross-validation. In immuneML,
        preprocessing, encoding and ML hyperparameters can be optimized by using nested cross-validation (see TrainMLModelInstruction for more
        details). This function is in that setting the third level of nested cross-validation as it can optimize only over the model hyperparameters.
        It represents an alternative to optimizing the model hyperparameters in the TrainMLModelInstruction. Which one should be used depends on the
        use-case and specific models: models based on scikit-learn implementations come with this option by default (see SklearnMethod class), while
        custom classifiers typically do not implement this and just call fit() function and throw a warning instead.

        Arguments:

            encoded_data (EncodedData): an instance of EncodedData class which includes encoded examples (repertoires, receptors or sequences), their
                labels, names of the features and other additional information. Most often, only examples and labels will be used. Examples are either a
                dense numpy matrix or a sparse matrix, where columns correspond to features and rows correspond to examples. There are a few encodings
                which make multidimensional outputs that do not follow this pattern, but they are tailored to specific ML methods which require such input
                (for instance, one hot encoding and ReceptorCNN method).

            number_of_splits (int): number of splits for the cross-validation to be performed for selecting the best hyperparameters of the ML model;
                note that if this is used in combination with nested cross-validation in TrainMLModel instruction, it can result in very few examples in
                each split depending on the original dataset size and the nested cross-validation setup.

            label (Label): the label for which the classifier will be created. immuneML also supports multi-label classification, but it is
                handled outside MLMethod class by creating an MLMethod instance for each label. This means that each MLMethod should handle only one label.

            cores_for_training (int): number of processes to be used during the cross-validation for model selection

            optimization_metric (str): the name of the optimization metric to be used to select the best model during cross-validation; when used with
                TrainMLModel instruction which is almost exclusively the case when the immuneML is run from the specification, this maps to the
                optimization metric in the instruction.

        Returns:

            it doesn't return anything, but fits the model parameters instead

        """
        pass

    @abc.abstractmethod
    def store(self, path: Path, feature_names: list = None, details_path: Path = None):
        """
        The store function stores the object on which it is called so that it can be imported later using load function. It typically uses pickle,
        yaml or similar modules to store the information. It can store one or multiple files.

        Arguments:

            path (Path): path to folder where to store the model

            feature_names (list): list of feature names in the encoded data; this can be stored as well to make it easier to map linear models to
                specific features as provided by the encoder (e.g., in case of logistic regression, this feature list defines what coefficients refer to)

            details_path (Path): path to folder where to store the details of the model. The details can be there to better understand the model but
                are not mandatory and are typically not loaded with the model afterwards. This is a user-friendly file that can be examined manually by
                the user. It does not have to be created or can be created at the same folder as the path parameter points to. In practice, when used
                with TrainMLModel instruction, this parameter will either be None or have the same value as path parameter.

        Returns:

            it does not have a return value

        """
        pass

    @abc.abstractmethod
    def load(self, path: Path):
        """
        The load function can load the model given the folder where the same class of the model was previously stored using the store function.
        It reads in the parameters of the model and sets the values to the object attributes so that the model can be reused. For instance, this is
        used in MLApplication instruction when the previously trained model is applied on a new dataset.

        Arguments:

            path (Path): path to the folder where the model was stored using store() function

        Returns:

            it does not have a return value, but sets the attribute values of the object instead

        """
        pass

    @abc.abstractmethod
    def check_if_exists(self, path: Path) -> bool:
        """
        The check_if_exists function checks if there is a stored model on the given path. Might be useful in the future for implementing checkpoints.
        See SklearnMethod for example usage.

        Arguments:

            path (Path): path to folder where it should be checked if the model was stored previously

        Returns:

            True/False: whether the model was stored previously on the given path or not

        """
        pass

    @abc.abstractmethod
    def get_params(self):
        """Returns the model parameters in a readable yaml-friendly way (consisting of lists, dictionaries and strings)."""
        pass

    @abc.abstractmethod
    def predict_proba(self, encoded_data: EncodedData, label: Label):
        """
        The predict_proba function predicts class probabilities for the given label if the model supports probabilistic output. If not, it should
        raise a warning and return predicted classes without probabilities.

        Note that when providing class probabilities the classes should have a specific (constant) order, and in case of binary classification, they
        should be ordered so that the negative class comes first and the positive one comes second. To handle this class ordering, see the
        :py:obj:`immuneML.ml_methods.util.Util.Util.make_binary_class_mapping` method that will automatically create class mapping for binary
        classification.

        Arguments:

            encoded_data (EncodedData): an object of EncodedData class where the examples attribute should be used to make predictions. examples
                attribute includes encoded examples in matrix format (numpy 2D array or a sparse matrix depending on the encoding). EncodedData object
                provided here can include labels (in model assessment scenario) or not (when predicting the class probabilities on new data which has
                not been labeled), so the labels attribute of the EncodedData object should NOT be used in this function, even if it is set.

            label (Label): the label for which the prediction should be made. It can be used to check if it matches the label that the
                model has been trained for and if not, an exception should be thrown. It is often an AssertionError as this can be checked before any
                prediction is made, but could also be a RuntimeError. In both cases, it should include a user-friendly message.

        Returns:

            a dictionary where the key is the label name and the value a 2D numpy array with class probabilities of dimension
            [number_of_examples x number_of_classes_for_label], for instance for label CMV where the class can be either True or False and there are
            3 examples to predict the class probabilities for:
            {CMV: [[0.2, 0.8], [0.55, 0.45], [0.98, 0.02]]}

        """
        pass

    @abc.abstractmethod
    def get_label_name(self) -> str:
        """Returns the name of the label for which the model was fitted."""
        pass

    @abc.abstractmethod
    def get_package_info(self) -> str:
        """
        Returns the package and version used for implementing the ML method if an external package was used or immuneML version if it is custom
        implementation. See :py:obj:`immuneML.ml_methods.SklearnMethod.SklearnMethod` and
        :py:obj:`immuneML.ml_methods.ProbabilisticBinaryClassifier.ProbabilisticBinaryClassifier` for examples of both.
        """
        pass

    @abc.abstractmethod
    def get_feature_names(self) -> list:
        """
        Returns the list of feature names (a list of strings) if available where the feature names were provided by the encoder in the
        EncodedData object.
        """
        pass

    @abc.abstractmethod
    def can_predict_proba(self) -> bool:
        """
        Returns whether the ML model can be used to predict class probabilities or class assignment only.
        """
        return False

    @abc.abstractmethod
    def get_class_mapping(self) -> dict:
        """Returns a dictionary containing the mapping between label values and values internally used in the classifier"""
        pass

    @abc.abstractmethod
    def get_compatible_encoders(self):
        """
        Returns the encoder classes that can be used together with this ML method. The result is iterated over by check_encoder_compatibility(),
        which tests the class of a given encoder against each returned class and lists their names in its error message.
        """
        pass

    def check_encoder_compatibility(self, encoder):
        """
        Checks whether the given encoder is compatible with this ML method, and throws an error if it is not.

        Arguments:

            encoder: the encoder object to check; it is compatible if it is an instance of (a subclass of) any class returned by
                get_compatible_encoders()

        Returns:

            it does not have a return value

        Raises:

            ValueError: if the encoder is not an instance of any compatible encoder class; the message lists the names of the compatible classes

        """
        # Query the compatible classes once so that the check and the error message are based on the same list.
        compatible_encoders = list(self.get_compatible_encoders())

        if any(isinstance(encoder, encoder_class) for encoder_class in compatible_encoders):
            return

        compatible_names = ', '.join(enc_class.__name__ for enc_class in compatible_encoders)
        raise ValueError(f"{encoder.__class__.__name__} is not compatible with ML Method {self.__class__.__name__}. "
                         f"Please use one of the following encoders instead: {compatible_names}")

    def get_classes(self) -> list:
        """
        The get_classes function returns the classes for which the method was trained, read from the values attribute of the label stored in
        self.label. Since self.label is None until it is set, calling this beforehand raises an AttributeError.

        This is a concrete (non-abstract) method: subclasses inherit it and do not have to override it.
        """
        return self.label.values