Source code for immuneML.encodings.reference_encoding.MatchedSequencesEncoder

import abc

from immuneML.caching.CacheHandler import CacheHandler
from immuneML.data_model.receptor.receptor_sequence.ReceptorSequenceList import ReceptorSequenceList
from immuneML.encodings.DatasetEncoder import DatasetEncoder
from immuneML.encodings.EncoderParams import EncoderParams
from immuneML.encodings.reference_encoding.MatchedReferenceUtil import MatchedReferenceUtil
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.ReflectionHandler import ReflectionHandler


class MatchedSequencesEncoder(DatasetEncoder):
    """
    Encodes the dataset based on the matches between a RepertoireDataset and a reference sequence dataset.

    This encoding should be used in combination with the :ref:`Matches` report.


    Arguments:

        reference (dict): A dictionary describing the reference dataset file. See the :py:mod:`~immuneML.IO.sequence_import` for specification details.

        max_edit_distance (int): The maximum edit distance between a target sequence (from the repertoire) and the reference sequence.
        It is validated as an integer with an inclusive minimum of 0.


    YAML Specification:

    .. indent with spaces
    .. code-block:: yaml

        my_ms_encoding:
            MatchedSequences:
                reference:
                    path: path/to/file.txt
                    format: VDJDB
                max_edit_distance: 1

    """

    # Maps the class name of a supported dataset to the name of the concrete encoder class
    # that build_object resolves through ReflectionHandler.
    dataset_mapping = {
        "RepertoireDataset": "MatchedSequencesRepertoireEncoder"
    }

    def __init__(self, max_edit_distance: int, reference_sequences: ReceptorSequenceList, name: str = None):
        """
        Store the already-prepared encoder parameters.

        Arguments:

            max_edit_distance (int): maximum edit distance used for matching.

            reference_sequences (ReceptorSequenceList): reference sequences to match against.

            name (str): optional name of this encoder instance.
        """
        self.max_edit_distance = max_edit_distance
        self.reference_sequences = reference_sequences
        self.name = name

    @staticmethod
    def _prepare_parameters(max_edit_distance: int, reference: dict, name: str = None):
        """
        Validate the user-supplied parameters and convert them to constructor keyword arguments.

        Arguments:

            max_edit_distance (int): checked to be an int with an inclusive minimum of 0.

            reference (dict): reference specification handed to MatchedReferenceUtil.prepare_reference (with paired=False).

            name (str): optional encoder name, passed through unchanged.

        Returns:

            dict with the keys "max_edit_distance", "reference_sequences" and "name".
        """
        location = "MatchedSequencesEncoder"

        ParameterValidator.assert_type_and_value(max_edit_distance, int, location, "max_edit_distance", min_inclusive=0)

        reference_sequences = MatchedReferenceUtil.prepare_reference(reference_params=reference, location=location, paired=False)

        return {
            "max_edit_distance": max_edit_distance,
            "reference_sequences": reference_sequences,
            "name": name
        }

    @staticmethod
    def build_object(dataset=None, **params):
        """
        Create the concrete encoder matching the type of the given dataset.

        Arguments:

            dataset: the dataset to be encoded; its class name must be a key of dataset_mapping.

            **params: raw encoder parameters, forwarded to _prepare_parameters.

        Returns:

            an instance of the encoder class registered for the dataset type.

        Raises:

            ValueError: if no encoder is registered for the dataset type (including dataset=None),
            or if the registered encoder class cannot be resolved.
        """
        dataset_type = dataset.__class__.__name__
        undefined_message = "{} is not defined for dataset of type {}.".format(MatchedSequencesEncoder.__name__, dataset_type)

        # Check membership explicitly: a plain dict lookup would raise KeyError, which the
        # ValueError handler below does not catch, so callers would never see the intended message.
        if dataset_type not in MatchedSequencesEncoder.dataset_mapping:
            raise ValueError(undefined_message)

        # Parameter preparation is kept outside the try block so that its own errors are not
        # re-labelled as a dataset-type problem.
        prepared_parameters = MatchedSequencesEncoder._prepare_parameters(**params)

        try:
            encoder_class = ReflectionHandler.get_class_by_name(MatchedSequencesEncoder.dataset_mapping[dataset_type],
                                                                "reference_encoding/")
        except ValueError as err:
            raise ValueError(undefined_message) from err

        return encoder_class(**prepared_parameters)

    def encode(self, dataset, params: EncoderParams):
        """
        Encode the dataset, going through CacheHandler.memo with a key built from _prepare_caching_params.

        Arguments:

            dataset: the dataset to encode.

            params (EncoderParams): encoding parameters.

        Returns:

            the value returned by CacheHandler.memo for the generated cache key; the callable
            passed to it invokes _encode_new_dataset(dataset, params).
        """
        cache_key = CacheHandler.generate_cache_key(self._prepare_caching_params(dataset, params))
        encoded_dataset = CacheHandler.memo(cache_key,
                                            lambda: self._encode_new_dataset(dataset, params))

        return encoded_dataset

    def _prepare_caching_params(self, dataset, params: EncoderParams):
        """
        Collect everything that identifies one encoding run, for use as cache-key input.

        Returns:

            a tuple of (name, value) pairs covering the dataset identifiers, metadata file and type,
            the label names, the encoder name, the learn_model flag and the encoder parameters.
        """
        # Each reference sequence is represented by its sequence, V gene and J gene concatenated;
        # sorting makes the description independent of the order of the reference sequences.
        encoding_params_desc = {"max_edit_distance": self.max_edit_distance,
                                "reference_sequences": sorted([seq.get_sequence() + seq.metadata.v_gene + seq.metadata.j_gene
                                                               for seq in self.reference_sequences])}

        return (("dataset_identifiers", tuple(dataset.get_example_ids())),
                ("dataset_metadata", dataset.metadata_file),
                ("dataset_type", dataset.__class__.__name__),
                ("labels", tuple(params.label_config.get_labels_by_name())),
                ("encoding", MatchedSequencesEncoder.__name__),
                ("learn_model", params.learn_model),
                ("encoding_params", encoding_params_desc), )

    @abc.abstractmethod
    def _encode_new_dataset(self, dataset, params: EncoderParams):
        """Perform the actual encoding; to be implemented by the dataset-specific subclass."""
        pass