Source code for immuneML.encodings.baseline_encoding.GeneFrequencyEncoder

from collections import Counter
from typing import List

import pandas as pd
from sklearn.preprocessing import StandardScaler

from immuneML.analysis.data_manipulation.NormalizationType import NormalizationType
from immuneML.data_model.EncodedData import EncodedData
from immuneML.data_model.datasets.Dataset import Dataset
from immuneML.data_model.datasets.RepertoireDataset import RepertoireDataset
from immuneML.encodings.DatasetEncoder import DatasetEncoder
from immuneML.encodings.EncoderParams import EncoderParams
from immuneML.encodings.preprocessing.FeatureScaler import FeatureScaler
from immuneML.util.NumpyHelper import NumpyHelper
from immuneML.util.ParameterValidator import ParameterValidator


class GeneFrequencyEncoder(DatasetEncoder):
    """
    GeneFrequencyEncoder represents a repertoire by the frequency of V and/or J genes used.

    **Dataset type:**

    - RepertoireDatasets

    **Specification arguments:**

    - genes (list): List of genes to use for the encoding. Possible values are 'V', and 'J'. At least one
      gene must be specified.

    - normalization_type (str): Type of normalization to apply to the gene frequencies. Possible values are
      'none', 'binary', 'relative_frequency', 'max', 'l2'. Defaults to 'relative_frequency'.

    - scale_to_zero_mean (bool): Whether to scale the features to zero mean. Defaults to True.

    - scale_to_unit_variance (bool): Whether to scale the features to unit variance. Defaults to True.

    **YAML specification:**

    .. code-block:: yaml

        encodings:
            gene_frequency_encoding:
                GeneFrequency:
                    genes: [V, J]
                    normalization_type: relative_frequency
                    scale_to_unit_variance: true
                    scale_to_zero_mean: true
    """

    def __init__(self, genes: List[str], normalization_type: NormalizationType, scale_to_zero_mean: bool,
                 scale_to_unit_variance: bool, name: str = None):
        super().__init__(name=name)
        self.genes = genes
        self.normalization_type = normalization_type
        self.scale_to_zero_mean = scale_to_zero_mean
        self.scale_to_unit_variance = scale_to_unit_variance
        # both are populated during encode() when params.learn_model is True and reused afterwards
        self.scaler = None
        self.feature_names = None

    @property
    def all_feature_names(self) -> List[str]:
        """Flat list of all feature names, ordered by gene as listed in self.genes."""
        names = []
        for gene in self.genes:
            names.extend(self.feature_names[gene])
        return names

    @staticmethod
    def build_object(dataset: Dataset, **params):
        """Validate the user-supplied parameters and construct a GeneFrequencyEncoder from them."""
        valid_keys = ['genes', 'normalization_type', 'scale_to_zero_mean', 'scale_to_unit_variance', 'name']
        ParameterValidator.assert_keys(params.keys(), valid_keys, "GeneFrequencyEncoder", "parameters",
                                       exclusive=False)
        ParameterValidator.assert_all_in_valid_list(params['genes'], ['V', 'J'], "GeneFrequencyEncoder", "genes")
        ParameterValidator.assert_type_and_value(params['scale_to_zero_mean'], bool, "GeneFrequencyEncoder",
                                                 "scale_to_zero_mean")
        ParameterValidator.assert_type_and_value(params['scale_to_unit_variance'], bool, "GeneFrequencyEncoder",
                                                 "scale_to_unit_variance")
        normalization_type = NormalizationType[params['normalization_type'].upper()]

        return GeneFrequencyEncoder(genes=params['genes'], normalization_type=normalization_type,
                                    scale_to_zero_mean=params['scale_to_zero_mean'],
                                    scale_to_unit_variance=params['scale_to_unit_variance'],
                                    name=params.get('name', 'gene_frequency'))

    def encode(self, dataset, params: EncoderParams) -> Dataset:
        """Encode a RepertoireDataset; any other dataset type raises a RuntimeError."""
        if not isinstance(dataset, RepertoireDataset):
            raise RuntimeError(f"{self.__class__.__name__}: {self.name}: invalid dataset type: {type(dataset)}.")
        return self._encode_repertoire_dataset(dataset, params)

    def _encode_repertoire_dataset(self, dataset: RepertoireDataset, params: EncoderParams) -> RepertoireDataset:
        """Build the gene-frequency matrix (one row per repertoire) and return an encoded clone of the dataset."""
        # one Counter of gene usage per repertoire, kept separately per gene type
        counts_per_gene = {gene: [] for gene in self.genes}
        for repertoire in dataset.repertoires:
            for gene in self.genes:
                # strip the allele part of each gene call (text after '*')
                gene_names = [call.split("*")[0]
                              for call in getattr(repertoire.data, f'{gene.lower()}_call').tolist()]
                counts_per_gene[gene].append(Counter(gene_names))

        frequency_tables = {gene: pd.DataFrame(counts_per_gene[gene]).fillna(0) for gene in self.genes}

        if params.learn_model:
            # remember the learned feature space so application-time data can be aligned to it
            self.feature_names = {gene: table.columns.tolist() for gene, table in frequency_tables.items()}
        else:
            # align to the learned feature space: unseen genes are dropped, missing ones filled with 0
            frequency_tables = {gene: frequency_tables[gene].reindex(columns=self.feature_names[gene],
                                                                     fill_value=0)
                                for gene in self.genes}

        normalized = {gene: FeatureScaler.normalize(table.values, self.normalization_type)
                      for gene, table in frequency_tables.items()}
        examples = NumpyHelper.concat_arrays_rowwise([normalized[gene] for gene in self.genes])

        if params.encode_labels:
            labels = dataset.get_metadata(params.label_config.get_labels_by_name(), return_df=False)
        else:
            labels = None

        return self._make_encoded_dataset(dataset, examples, labels, params)

    def _make_encoded_dataset(self, dataset, examples, labels, params: EncoderParams):
        """Scale the example matrix and wrap it in an EncodedData object attached to a clone of the dataset."""
        scaled_examples = self._scale_examples(examples, params)
        feature_names = self.all_feature_names
        encoded_data = EncodedData(examples=scaled_examples, labels=labels,
                                   example_ids=dataset.get_example_ids(),
                                   feature_names=feature_names,
                                   encoding=GeneFrequencyEncoder.__name__,
                                   info={'genes': self.genes},
                                   feature_annotations=pd.DataFrame({"feature": feature_names}))
        encoded_dataset = dataset.clone()
        encoded_dataset.encoded_data = encoded_data
        return encoded_dataset

    def _scale_examples(self, examples, params):
        """Fit the standard scaler when learning the model; otherwise apply the already-fitted one."""
        if params.learn_model:
            self.scaler = StandardScaler(with_mean=self.scale_to_zero_mean, with_std=self.scale_to_unit_variance)
            return FeatureScaler.standard_scale_fit(self.scaler, examples, with_mean=self.scale_to_zero_mean)
        return FeatureScaler.standard_scale(self.scaler, examples, with_mean=self.scale_to_zero_mean)