# Source code for immuneML.simulation.implants.Motif

# quality: gold
from dataclasses import dataclass

from immuneML.data_model.receptor.receptor_sequence.Chain import Chain
from immuneML.simulation.motif_instantiation_strategy.MotifInstantiationStrategy import MotifInstantiationStrategy
from immuneML.util.ReflectionHandler import ReflectionHandler
from scripts.specification_util import update_docs_per_mapping


@dataclass
class Motif:
    """
    Class describing motifs where each motif is defined by a seed and
    a way of creating specific instances of the motif (instantiation_strategy);

    When instantiation_strategy is set, specific motif instances will be
    produced by calling instantiate_motif(seed) method of instantiation_strategy


    Arguments:

        seed (str): An amino acid sequence that represents the basic motif seed. All implanted motifs correspond to the seed, or a modified
        version thereof, as specified in its instantiation strategy. If this argument is set, seed_chain1 and seed_chain2 arguments are not used.

        instantiation (:py:obj:`~immuneML.simulation.motif_instantiation_strategy.MotifInstantiationStrategy.MotifInstantiationStrategy`):
        Which strategy to use for implanting the seed. It should be one of the classes inheriting MotifInstantiationStrategy.
        In the YAML specification this can either be one of these values as a string in which case the default parameters will be used.
        Alternatively, instantiation can be specified with parameters as in the example YAML specification below. For the detailed list of
        parameters, see the specific instantiation strategies below.

        seed_chain1 (str): in case when representing motifs for paired chain data, it is possible to define a motif seed per chain; if this parameter
        is set, the generated motif instances will include a motif instance for both chains; for more details on how it works see `seed` argument
        above. Used only if the seed argument is not set.

        seed_chain2 (str): used for paired chain data, for the other receptor chain; for more details on how it works see `seed` argument. This
        argument is used only if the seed argument is not set.

        name_chain1: name of the first chain if paired receptor data are simulated. The value should be an instance of
        :py:obj:`~immuneML.data_model.receptor.receptor_sequence.Chain.Chain`. This argument is used only if the seed argument is not set.

        name_chain2: name of the second chain if paired receptor data are simulated. The value should be an instance of
        :py:obj:`~immuneML.data_model.receptor.receptor_sequence.Chain.Chain`. This argument is used only if the seed argument is not set.


    YAML specification:

    .. indent with spaces
    .. code-block:: yaml

        motifs:
            # examples for single chain receptor data
            my_simple_motif: # this will be the identifier of the motif
                seed: AAA
                instantiation: GappedKmer
            my_gapped_motif:
                seed: AA/A
                instantiation:
                    GappedKmer:
                        min_gap: 1
                        max_gap: 2
            # examples for paired chain receptor data
            my_paired_motif:
                seed_chain1: AAA # seed for chain1 or chain2 can optionally include gap, same as for single chain receptor data
                name_chain1: ALPHA # alpha chain of TCR
                seed_chain2: CCC
                name_chain2: BETA # beta chain of TCR
                instantiation: GappedKmer # same as for single chain receptor data

    """

    # NOTE: the class docstring above is also runtime data: get_documentation() rewrites parts of it, so the sentences used
    # as mapping keys there must stay in sync with this text.

    identifier: str
    instantiation: MotifInstantiationStrategy
    seed: str = None
    seed_chain1: str = None
    name_chain1: Chain = None
    seed_chain2: str = None
    name_chain2: Chain = None

    @staticmethod
    def _chain_label(chain) -> str:
        """Returns a lower-case label for a chain for use in error messages; falls back to str() for values without a `name` attribute."""
        return str(getattr(chain, "name", chain)).lower()

    def instantiate_motif(self, chain_name: Chain = None):
        """
        Creates a motif instance based on the seed; if seed parameter is defined for the motif, it is assumed that single chain data are used for
        the analysis and chain_name is ignored. If seed is None, it is assumed that paired chain receptor data are used, in which case the seed
        of the chain given by chain_name is instantiated.

        Arguments:

            chain_name: the chain for which the motif should be instantiated; required (and has to be equal to name_chain1 or name_chain2)
            only when seed is not set.

        Returns:
             the value returned by the instantiation strategy's instantiate_motif() for the selected seed: `seed` for single chain data,
             `seed_chain1` if chain_name equals name_chain1 and `seed_chain2` otherwise

        Raises:
            AssertionError: if the instantiation strategy is not set, or, for paired chain data, if the chain names of the motif are not set,
            chain_name is not given, or chain_name is neither name_chain1 nor name_chain2
        """
        assert self.instantiation is not None, "Motif: set instantiation strategy before instantiating a motif."
        # TODO: handle PWMs also, here it always uses seed
        if self.seed is not None:
            return self.instantiation.instantiate_motif(self.seed)

        # paired chain data: both chain names have to be known to decide which seed to use
        assert self.name_chain1 is not None and self.name_chain2 is not None, \
            f"Motif: chain names have to be set when working with paired chain data, here these are: {self.name_chain1} and {self.name_chain2}."
        assert chain_name is not None, "Motif: when working with paired chain data, please specify the chain for which the motif is instantiated."
        # labels are built through _chain_label so that an unexpected value (e.g. a plain string) still results in this
        # AssertionError instead of an AttributeError raised while formatting the message
        assert chain_name in [self.name_chain1, self.name_chain2], \
            f"Motif: specified chain name {self._chain_label(chain_name)} is not in valid list of chain names specified for motif " \
            f"{self.identifier}: {[self._chain_label(self.name_chain1), self._chain_label(self.name_chain2)]}."

        selected_seed = self.seed_chain1 if chain_name == self.name_chain1 else self.seed_chain2
        return self.instantiation.instantiate_motif(selected_seed)

    def get_max_length(self):
        """
        Returns the upper bound on the length of an instance of this motif: the length of the seed without the gap marker "/" plus the
        value of get_max_gap() of the instantiation strategy. For paired chain data (seed not set), the longer of the two chain seeds is used.
        """
        if self.seed is not None:
            seed_length = len(self.seed.replace("/", ""))
        else:
            seed_length = max(len(self.seed_chain1.replace("/", "")), len(self.seed_chain2.replace("/", "")))

        return seed_length + self.instantiation.get_max_gap()

    def __str__(self):
        """Returns the identifier followed by the seed, or by the chain names and chain seeds if seed is not set."""
        if self.seed is not None:
            description = self.seed
        else:
            description = f"{self.name_chain1}_{self.seed_chain1}__{self.name_chain2}_{self.seed_chain2}"

        return self.identifier + " - " + description

    @staticmethod
    def get_documentation():
        """
        Returns the class docstring in which the generic descriptions of valid values are replaced by the lists of concrete valid
        values (instantiation strategies and chains), using update_docs_per_mapping.
        """
        doc = str(Motif.__doc__)

        valid_strategy_values = ReflectionHandler.all_nonabstract_subclass_basic_names(MotifInstantiationStrategy, "Instantiation",
                                                                                       "motif_instantiation_strategy/")
        # [1:-1] drops the first and the last character of the string representation (presumably the enclosing brackets of a
        # list - TODO confirm against ReflectionHandler); quotes are turned into backticks for the rendered documentation
        valid_strategy_values = str(valid_strategy_values)[1:-1].replace("'", "`")
        chain_values = str(list(Chain))[1:-1].replace("'", "`")

        # keys are sentences of the class docstring, values are the texts they are replaced with
        mapping = {
            "It should be one of the classes inheriting MotifInstantiationStrategy.": f"Valid values are: {valid_strategy_values}.",
            "The value should be an instance of :py:obj:`~immuneML.data_model.receptor.receptor_sequence.Chain.Chain`.":
                f"Valid values are: {chain_values}."
        }
        return update_docs_per_mapping(doc, mapping)