Source code for immuneML.encodings.distance_encoding.TCRdistEncoder

from pathlib import Path

import pandas as pd

from immuneML.data_model.dataset.ReceptorDataset import ReceptorDataset
from immuneML.data_model.encoded_data.EncodedData import EncodedData
from immuneML.encodings.DatasetEncoder import DatasetEncoder
from immuneML.encodings.EncoderParams import EncoderParams
from immuneML.util.EncoderHelper import EncoderHelper


class TCRdistEncoder(DatasetEncoder):
    """
    Encodes the given ReceptorDataset as a distance matrix between all receptors, where the distance is computed using TCRdist from the paper:
    Dash P, Fiore-Gartland AJ, Hertz T, et al. Quantifiable predictive features define epitope-specific T cell receptor repertoires.
    Nature. 2017; 547(7661):89-93. `doi:10.1038/nature22383 <https://www.nature.com/articles/nature22383>`_.

    For the implementation, `TCRdist3 <https://tcrdist3.readthedocs.io/en/latest/>`_ library was used (source code available
    `here <https://github.com/kmayerb/tcrdist3>`_).

    Arguments:

        cores (int): number of processes to use for the computation

    YAML specification:

    .. indent with spaces
    .. code-block:: yaml

        my_tcr_dist_enc: # user-defined name
            TCRdist:
                cores: 4

    """

    def __init__(self, cores: int, name: str = None):
        """Store the configuration; the distance matrix and context start out unset."""
        self.cores = cores
        self.name = name
        # Filled in by _build_tcr_dist_matrix() when encode() runs with params.learn_model set.
        self.distance_matrix = None
        # Optional dict supplied through set_context(); only its "dataset" entry is read here.
        self.context = None

    @staticmethod
    def build_object(dataset, **params):
        """
        Create a TCRdistEncoder for the given dataset.

        Arguments:

            dataset: the dataset to be encoded; must be a ReceptorDataset

            **params: keyword arguments forwarded unchanged to the TCRdistEncoder constructor

        Returns:

            a new TCRdistEncoder instance

        Raises:

            ValueError: if dataset is not a ReceptorDataset
        """
        if not isinstance(dataset, ReceptorDataset):
            raise ValueError("TCRdistEncoder is not defined for dataset types which are not ReceptorDataset.")

        return TCRdistEncoder(**params)

    def set_context(self, context: dict):
        """Remember the given context dict and return this encoder so that calls can be chained."""
        self.context = context
        return self

    def encode(self, dataset, params: EncoderParams):
        """
        Encode the dataset as a slice of the pairwise distance matrix.

        When params.learn_model is set, the distance matrix is (re)computed first. The encoded examples are the rows
        of the stored matrix selected by dataset.get_example_ids() and the columns selected by the ids returned from
        EncoderHelper.prepare_training_ids(dataset, params).

        Arguments:

            dataset: the dataset to encode

            params (EncoderParams): encoding parameters; learn_model, encode_labels and label_config are read here

        Returns:

            a clone of the dataset whose encoded_data attribute holds the resulting EncodedData object
        """
        column_ids = EncoderHelper.prepare_training_ids(dataset, params)

        if params.learn_model:
            self._build_tcr_dist_matrix(dataset, params.label_config.get_labels_by_name())

        # Rows follow the examples being encoded, columns follow the prepared training ids.
        selected_distances = self.distance_matrix.loc[dataset.get_example_ids(), column_ids]

        if params.encode_labels:
            label_values = self._build_labels(dataset, params)
        else:
            label_values = None

        result = dataset.clone()
        result.encoded_data = EncodedData(examples=selected_distances,
                                          labels=label_values,
                                          example_ids=selected_distances.index.values,
                                          encoding=TCRdistEncoder.__name__)

        return result

    def _build_tcr_dist_matrix(self, dataset: ReceptorDataset, label_names):
        """
        Compute the distance matrix and store it in self.distance_matrix.

        The dataset stored under the "dataset" key of the context is used when such an entry exists; otherwise the
        dataset passed as argument is used. The stored matrix is the sum of the pw_alpha and pw_beta matrices of the
        object returned by TCRdistHelper.compute_tcr_dist, with its clone ids as both row and column labels.
        """
        # Imported inside the method, as in the original implementation.
        from immuneML.util.TCRdistHelper import TCRdistHelper

        if self.context is not None and "dataset" in self.context:
            source_dataset = self.context["dataset"]
        else:
            source_dataset = dataset

        tcr_rep = TCRdistHelper.compute_tcr_dist(source_dataset, label_names, self.cores)

        summed_distances = tcr_rep.pw_alpha + tcr_rep.pw_beta
        self.distance_matrix = pd.DataFrame(summed_distances,
                                            index=tcr_rep.clone_df.clone_id.values,
                                            columns=tcr_rep.clone_df.clone_id.values)

    def _build_labels(self, dataset: ReceptorDataset, params: EncoderParams) -> dict:
        """
        Collect label values for every receptor in the dataset.

        Returns:

            a dict mapping each label name from params.label_config.get_labels_by_name() to the list of values read
            from receptor.metadata, in the order in which dataset.get_data() yields the receptors
        """
        collected = {name: [] for name in params.label_config.get_labels_by_name()}

        for receptor in dataset.get_data():
            for name, values in collected.items():
                values.append(receptor.metadata[name])

        return collected

    @staticmethod
    def export_encoder(path: Path, encoder) -> str:
        """
        Store the encoder under path / "encoder.pickle" via DatasetEncoder.store_encoder.

        Returns:

            whatever DatasetEncoder.store_encoder returns for the stored file
        """
        return DatasetEncoder.store_encoder(encoder, path / "encoder.pickle")