# Source code for immuneML.encodings.reference_encoding.MatchedReceptorsEncoder

import abc
from typing import List

from immuneML.caching.CacheHandler import CacheHandler
from immuneML.data_model.receptor.BCReceptor import BCReceptor
from immuneML.data_model.receptor.Receptor import Receptor
from immuneML.data_model.receptor.TCABReceptor import TCABReceptor
from immuneML.data_model.receptor.TCGDReceptor import TCGDReceptor
from immuneML.encodings.DatasetEncoder import DatasetEncoder
from immuneML.encodings.EncoderParams import EncoderParams
from immuneML.encodings.reference_encoding.MatchedReferenceUtil import MatchedReferenceUtil
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.ReflectionHandler import ReflectionHandler


class MatchedReceptorsEncoder(DatasetEncoder):
    """
    Encodes the dataset based on the matches between a dataset containing unpaired (single chain) data,
    and a paired reference receptor dataset.
    For each paired reference receptor, the frequency of either chain in the dataset is counted.

    This encoding should be used in combination with the :ref:`Matches` report.


    Arguments:

        reference (dict): A dictionary describing the reference dataset file, specified the same as regular data import.
        See the :py:mod:`~immuneML.IO.sequence_import` for specification details. Must contain paired receptor sequences.

        max_edit_distances (dict or int): A dictionary specifying the maximum edit distance between a target sequence
        (from the repertoire) and the reference sequence. A maximum distance can be specified per chain, for example
        to allow for less strict matching of TCR alpha and BCR light chains. When only an integer is specified,
        this distance is applied to all possible chains.


    YAML Specification:

    .. indent with spaces
    .. code-block:: yaml

        my_mr_encoding:
            MatchedReceptors:
                reference:
                    format: IRIS
                    params:
                        path: path/to/file.txt
                        paired: True
                        all_dual_chains: True
                        all_genes: True
                max_edit_distances:
                    alpha: 1
                    beta: 0

    """

    # Maps the class name of the dataset being encoded to the name of the concrete encoder class
    # that build_object() resolves through ReflectionHandler.
    dataset_mapping = {
        "RepertoireDataset": "MatchedReceptorsRepertoireEncoder"
    }

    def __init__(self, reference_receptors: List[Receptor], max_edit_distances: dict, name: str = None):
        """
        Store the already prepared encoder parameters.

        Args:
            reference_receptors: paired reference receptors to match against.
            max_edit_distances: mapping of chain name to the maximum allowed edit distance.
            name: optional name of this encoder instance.
        """
        self.reference_receptors = reference_receptors
        self.max_edit_distances = max_edit_distances
        self.name = name

    @staticmethod
    def _prepare_parameters(reference: dict, max_edit_distances: dict, name: str = None):
        """
        Validate the user-supplied parameters and convert them to constructor arguments.

        Args:
            reference: import specification of the paired reference dataset; it is passed to
                MatchedReferenceUtil.prepare_reference with paired=True.
            max_edit_distances: either a single int (applied to every legal chain) or a dict
                of per-chain distances whose keys are checked against the legal chain names.
            name: optional encoder name, passed through unchanged.

        Returns:
            dict with the keys "reference_receptors", "max_edit_distances" and "name".
        """
        location = "MatchedReceptorsEncoder"

        # Chain names are collected from every receptor type this encoder imports.
        legal_chains = [chain
                        for receptor in (TCABReceptor(), TCGDReceptor(), BCReceptor())
                        for chain in receptor.get_chains()]

        # bool is a subclass of int; it is excluded so that True/False is not read as a distance.
        if isinstance(max_edit_distances, int) and not isinstance(max_edit_distances, bool):
            max_edit_distances = {chain: max_edit_distances for chain in legal_chains}
        elif isinstance(max_edit_distances, dict):
            ParameterValidator.assert_keys(max_edit_distances.keys(), legal_chains, location,
                                           "max_edit_distances", exclusive=False)
        else:
            # Neither an int nor a dict: delegate to the validator to report the wrong type.
            ParameterValidator.assert_type_and_value(max_edit_distances, dict, location, 'max_edit_distances')

        reference_receptors = MatchedReferenceUtil.prepare_reference(reference, location=location, paired=True)

        return {
            "reference_receptors": reference_receptors,
            "max_edit_distances": max_edit_distances,
            "name": name
        }

    @staticmethod
    def build_object(dataset=None, **params):
        """
        Build the concrete encoder that corresponds to the type of the given dataset.

        Args:
            dataset: the dataset to be encoded; its class name selects the encoder class
                through dataset_mapping.
            **params: raw encoder parameters, forwarded to _prepare_parameters.

        Returns:
            An instance of the encoder class registered for the dataset type.

        Raises:
            ValueError: if no encoder is registered for the dataset type, or if a ValueError
                occurs while preparing the parameters or resolving the encoder class.
        """
        dataset_type = dataset.__class__.__name__
        error_message = "{} is not defined for dataset of type {}.".format(MatchedReceptorsEncoder.__name__,
                                                                           dataset_type)

        # A plain dict lookup would raise KeyError here, which the ValueError handler below
        # does not catch; check explicitly so that callers get the documented ValueError.
        if dataset_type not in MatchedReceptorsEncoder.dataset_mapping:
            raise ValueError(error_message)

        try:
            prepared_params = MatchedReceptorsEncoder._prepare_parameters(**params)
            encoder_class = ReflectionHandler.get_class_by_name(
                MatchedReceptorsEncoder.dataset_mapping[dataset_type], "reference_encoding/")
            encoder = encoder_class(**prepared_params)
        except ValueError as e:
            # Chain the original error so that its message is not lost.
            raise ValueError(error_message) from e

        return encoder

    def encode(self, dataset, params: EncoderParams):
        """
        Encode the dataset, going through CacheHandler.memo with a key derived from the dataset
        and the encoding parameters.

        Args:
            dataset: the dataset to encode.
            params: encoder parameters (label configuration, learn_model flag, ...).

        Returns:
            The value returned by CacheHandler.memo for the computed cache key.
        """
        cache_key = CacheHandler.generate_cache_key(self._prepare_caching_params(dataset, params))
        encoded_dataset = CacheHandler.memo(cache_key,
                                            lambda: self._encode_new_dataset(dataset, params))

        return encoded_dataset

    def _prepare_caching_params(self, dataset, params: EncoderParams):
        """
        Collect everything that identifies this encoding run, for use as cache key material.

        The reference receptors are described by the sequence, V gene and J gene of their first
        two chains; both the receptor descriptions and the edit distances are sorted.

        Returns:
            A tuple of (name, value) pairs describing the dataset, labels and encoding settings.
        """
        # Each reference receptor is represented by the pair of chains named by get_chains().
        chains = [(receptor.get_chain(receptor.get_chains()[0]), receptor.get_chain(receptor.get_chains()[1]))
                  for receptor in self.reference_receptors]

        encoding_params_desc = {"max_edit_distance": sorted(self.max_edit_distances.items()),
                                "reference_receptors": sorted([chain_a.get_sequence() + chain_a.metadata.v_gene + chain_a.metadata.j_gene + "|" +
                                                               chain_b.get_sequence() + chain_b.metadata.v_gene + chain_b.metadata.j_gene
                                                               for chain_a, chain_b in chains])}

        return (("dataset_identifiers", tuple(dataset.get_example_ids())),
                ("dataset_metadata", dataset.metadata_file),
                ("dataset_type", dataset.__class__.__name__),
                ("labels", tuple(params.label_config.get_labels_by_name())),
                ("encoding", MatchedReceptorsEncoder.__name__),
                ("learn_model", params.learn_model),
                ("encoding_params", encoding_params_desc), )

    @abc.abstractmethod
    def _encode_new_dataset(self, dataset, params: EncoderParams):
        """Compute the encoding for the dataset; implemented by dataset-specific subclasses."""
        pass