# Source code for immuneML.encodings.preprocessing.FeatureScaler

import pickle
from pathlib import Path

from sklearn.preprocessing import StandardScaler, normalize, binarize

from immuneML.analysis.data_manipulation.NormalizationType import NormalizationType
from immuneML.util.PathBuilder import PathBuilder


class FeatureScaler:
    """
    Static helpers for rescaling design matrices (rows -> examples, columns -> features):
    feature-level standardization and example-level normalization.
    """

    # norm names that are handed to sklearn.preprocessing.normalize unchanged
    SKLEARN_NORMALIZATION_TYPES = ["l1", "l2", "max"]

    @staticmethod
    def standard_scale(scaler_file: Path, design_matrix, with_mean: bool = True):
        """
        Scale to zero mean and unit variance on feature level using sklearn's StandardScaler.

        If scaler_file already exists, the pickled scaler stored there is loaded and only applied
        (transform) to the design matrix; otherwise a new scaler is fitted on the design matrix,
        applied to it and pickled to scaler_file.

        Args:
            scaler_file: path to scaler file fitted on train set or where the resulting scaler file will be stored
            design_matrix: rows -> examples, columns -> features
            with_mean: whether to scale to zero mean or not (could lose sparsity if scaled)

        Returns:
            scaled design matrix
        """
        if with_mean and hasattr(design_matrix, "todense"):
            # Centering needs a dense matrix. Prefer toarray(), which gives a plain ndarray:
            # todense() returns numpy.matrix, which recent scikit-learn versions reject.
            if hasattr(design_matrix, "toarray"):
                matrix = design_matrix.toarray()
            else:
                matrix = design_matrix.todense()
        else:
            matrix = design_matrix

        if scaler_file.is_file():
            # NOTE(security): unpickling can execute arbitrary code; scaler_file must come from a trusted location.
            with scaler_file.open('rb') as file:
                scaler = pickle.load(file)
            return scaler.transform(matrix)

        scaler = StandardScaler(with_mean=with_mean)
        scaled_design_matrix = scaler.fit_transform(matrix)

        # presumably makes sure the target directory exists before writing -- see PathBuilder
        PathBuilder.build_from_objects(scaler_file.parent)

        with scaler_file.open('wb') as file:
            pickle.dump(scaler, file)

        return scaled_design_matrix

    @staticmethod
    def normalize(design_matrix, normalization_type: NormalizationType):
        """
        Normalize on example level so that the norm type applies

        Args:
            design_matrix: rows -> examples, columns -> features
            normalization_type: l1, l2, max, binary, none

        Returns:
             normalized design matrix (the input object itself when the type is NONE)

        Raises:
            NotImplementedError: if the normalization type is none of the above
        """
        if normalization_type.name == "NONE":
            return design_matrix

        if normalization_type.name == "BINARY":
            return binarize(design_matrix)

        if normalization_type.value in FeatureScaler.SKLEARN_NORMALIZATION_TYPES:
            # axis=1 -> each row (example) is normalized independently
            return normalize(design_matrix, norm=normalization_type.value, axis=1)

        raise NotImplementedError("Normalization type {} ({}) is not implemented.".format(normalization_type.name, normalization_type.value))