# Source code for immuneML.simulation.implants.Motif

# quality: gold
from dataclasses import dataclass

from immuneML.data_model.receptor.receptor_sequence.Chain import Chain
from immuneML.simulation.motif_instantiation_strategy.MotifInstantiationStrategy import MotifInstantiationStrategy
from immuneML.util.ReflectionHandler import ReflectionHandler
from scripts.specification_util import update_docs_per_mapping


@dataclass
class Motif:
    """
    Class describing motifs where each motif is defined by a seed and
    a way of creating specific instances of the motif (instantiation_strategy);

    When instantiation_strategy is set, specific motif instances will be
    produced by calling instantiate_motif(seed) method of instantiation_strategy


    Arguments:

        seed (str): An amino acid sequence that represents the basic motif seed. All implanted motifs correspond to the seed, or a modified
        version thereof, as specified in its instantiation strategy. If this argument is set, seed_chain1 and seed_chain2 arguments are not used.

        instantiation (:py:obj:`~immuneML.simulation.motif_instantiation_strategy.MotifInstantiationStrategy.MotifInstantiationStrategy`): Which strategy to use for implanting the seed.
        It should be one of the classes inheriting MotifInstantiationStrategy.
        In the YAML specification this can either be one of these values as a string in which case the default parameters will be used.
        Alternatively, instantiation can be specified with parameters as in the example YAML specification below. For the detailed list of
        parameters, see the specific instantiation strategies below.

        seed_chain1 (str): in case when representing motifs for paired chain data, it is possible to define a motif seed per chain; if this
        parameter is set, the generated motif instances will include a motif instance for both chains; for more details on how it works see
        `seed` argument above. Used only if the seed argument is not set.

        seed_chain2 (str): used for paired chain data, for the other receptor chain; for more details on how it works see `seed` argument.
        This argument is used only if the seed argument is not set.

        name_chain1: name of the first chain if paired receptor data are simulated.
        The value should be an instance of :py:obj:`~immuneML.data_model.receptor.receptor_sequence.Chain.Chain`.
        This argument is used only if the seed argument is not set.

        name_chain2: name of the second chain if paired receptor data are simulated.
        The value should be an instance of :py:obj:`~immuneML.data_model.receptor.receptor_sequence.Chain.Chain`.
        This argument is used only if the seed argument is not set.


    YAML specification:

    .. indent with spaces
    .. code-block:: yaml

        motifs:
            # examples for single chain receptor data
            my_simple_motif: # this will be the identifier of the motif
                seed: AAA
                instantiation: GappedKmer
            my_gapped_motif:
                seed: AA/A
                instantiation:
                    GappedKmer:
                        min_gap: 1
                        max_gap: 2
            # examples for paired chain receptor data
            my_paired_motif:
                seed_chain1: AAA # seed for chain1 or chain2 can optionally include gap, same as for single chain receptor data
                name_chain1: ALPHA # alpha chain of TCR
                seed_chain2: CCC
                name_chain2: BETA # beta chain of TCR
                instantiation: GappedKmer # same as for single chain receptor data

    """

    # Key under which the motif is listed in the specification (see the YAML example above).
    identifier: str
    instantiation: MotifInstantiationStrategy
    # Single chain seed; when set, the per-chain fields below are ignored.
    seed: str = None
    # Paired chain seeds and the chains they belong to; used only when `seed` is None.
    seed_chain1: str = None
    name_chain1: Chain = None
    seed_chain2: str = None
    name_chain2: Chain = None

    def instantiate_motif(self, chain_name: Chain = None):
        """
        Creates a motif instance by passing a seed to the instantiation strategy.

        If the `seed` attribute is set, it is always used and `chain_name` is ignored. Otherwise the motif is treated as
        a paired chain motif and the seed of the requested chain is used: `seed_chain1` if `chain_name` equals
        `name_chain1`, and `seed_chain2` otherwise.

        Arguments:

            chain_name (Chain): the chain for which the motif is instantiated; required only when `seed` is not set.

        Returns:

            whatever `instantiation.instantiate_motif(seed)` returns for the selected seed; a single motif instance is
            produced per call, so paired chain data need one call per chain.

        Raises:

            AssertionError: if no instantiation strategy is set or, for paired chain motifs, if the chain names are
            missing, `chain_name` is not given, or `chain_name` is not one of the two chain names of this motif.

        """
        # The checks are raised explicitly instead of using `assert` statements so that they are still
        # enforced when Python runs with -O; exception type and messages are unchanged.
        if self.instantiation is None:
            raise AssertionError("Motif: set instantiation strategy before instantiating a motif.")

        # TODO: handle PWMs also, here it always uses seed
        if self.seed is not None:
            return self.instantiation.instantiate_motif(self.seed)

        if self.name_chain1 is None or self.name_chain2 is None:
            raise AssertionError(f"Motif: chain names have to be set when working with paired chain data, "
                                 f"here these are: {self.name_chain1} and {self.name_chain2}.")

        if chain_name is None:
            raise AssertionError("Motif: when working with paired chain data, please specify the chain for which the motif is instantiated.")

        if chain_name not in [self.name_chain1, self.name_chain2]:
            raise AssertionError(f"Motif: specified chain name {chain_name.name.lower()} is not in valid list of chain names "
                                 f"specified for motif {self.identifier}: "
                                 f"{[self.name_chain1.name.lower(), self.name_chain2.name.lower()]}.")

        chain_seed = self.seed_chain1 if chain_name == self.name_chain1 else self.seed_chain2
        return self.instantiation.instantiate_motif(chain_seed)

    def get_max_length(self) -> int:
        """
        Returns the largest seed length plus the value of `instantiation.get_max_gap()`.

        The "/" character is removed from a seed before its length is measured. With `seed` set only that seed is
        considered; otherwise the longer of `seed_chain1` and `seed_chain2` is used.
        """
        seeds = [self.seed] if self.seed is not None else [self.seed_chain1, self.seed_chain2]
        longest_seed = max(len(seed.replace("/", "")) for seed in seeds)
        return longest_seed + self.instantiation.get_max_gap()

    def __str__(self) -> str:
        """Returns '<identifier> - <seed>', or '<identifier> - <chain1>_<seed1>__<chain2>_<seed2>' when `seed` is not set."""
        if self.seed is not None:
            seed_description = self.seed
        else:
            seed_description = f"{self.name_chain1}_{self.seed_chain1}__{self.name_chain2}_{self.seed_chain2}"
        return self.identifier + " - " + seed_description

    @staticmethod
    def get_documentation() -> str:
        """
        Returns the class docstring with the generic sentences about instantiation strategies and chains replaced by
        the list of valid values.

        The keys of the mapping below have to match the corresponding sentences in the class docstring exactly.
        """
        doc = str(Motif.__doc__)

        valid_strategy_values = ReflectionHandler.all_nonabstract_subclass_basic_names(MotifInstantiationStrategy, "Instantiation",
                                                                                       "motif_instantiation_strategy/")
        valid_strategy_values = str(valid_strategy_values)[1:-1].replace("'", "`")

        # List chains by member name (the form used in the YAML example, e.g. ALPHA) instead of by the repr of
        # the member. NOTE(review): assumes Chain is an Enum; confirm against the Chain definition.
        chain_values = ", ".join(f"`{chain.name}`" for chain in Chain)

        mapping = {
            "It should be one of the classes inheriting MotifInstantiationStrategy.": f"Valid values are: {valid_strategy_values}.",
            "The value should be an instance of :py:obj:`~immuneML.data_model.receptor.receptor_sequence.Chain.Chain`.":
                f"Valid values are: {chain_values}."
        }
        doc = update_docs_per_mapping(doc, mapping)
        return doc