Source code for immuneML.simulation.signal_implanting_strategy.HealthySequenceImplanting

import copy
import logging
import random
from pathlib import Path

from immuneML.data_model.repertoire.Repertoire import Repertoire
from immuneML.simulation.sequence_implanting.SequenceImplantingStrategy import SequenceImplantingStrategy
from immuneML.simulation.signal_implanting_strategy.ImplantingComputation import ImplantingComputation, get_implanting_function
from immuneML.simulation.signal_implanting_strategy.SignalImplantingStrategy import SignalImplantingStrategy


[docs]class HealthySequenceImplanting(SignalImplantingStrategy):
    """
    This class represents a :py:obj:`~immuneML.simulation.signal_implanting_strategy.SignalImplantingStrategy.SignalImplantingStrategy`
    where signals will be implanted in 'healthy sequences', meaning sequences in which no signal has been implanted
    previously. This ensures that there is only one signal per receptor sequence.

    If for the given number of sequences in the repertoire and repertoire implanting rate, the total number of sequences for implanting turns out to
    be less than 1 (e.g. for 12000 sequences and repertoire implanting rate 0.00005, it should implant the signal in 0.6 sequences), the signal will
    not be implanted in that repertoire and a warning with repertoire identifier along with the repertoire implanting rate and number of sequences
    in the repertoire will be raised.

    Arguments:

        implanting: name of the implanting strategy, here HealthySequence

        sequence_position_weights (dict): A dictionary describing the relative weights for implanting a signal at each given IMGT position in the
        receptor sequence. If sequence_position_weights are not set, then SequenceImplantingStrategy will make all of the positions equally likely
        for each receptor sequence.

        implanting_computation (str): defines how to determine the number of sequences to implant the signal in a repertoire; it relies on
        repertoire_implanting_rate, but in case where the number of sequences for implanting is not an integer, this option can be useful.
        If implanting rate is set to 'round', then the number of sequences for implanting in a repertoire will be rounded to the nearest integer value of the
        product of repertoire implanting rate and the number of sequences in a repertoire (e.g., if the product value is 1.2, the signal will be
        implanted in one sequence only). If implanting rate is set to 'Poisson', the number of sequences for implanting will be sampled
        from the Poisson distribution with the value of the lambda parameter being repertoire implanting rate multiplied by the number of sequences
        in the repertoire.


    YAML specification:

    .. indent with spaces
    .. code-block:: yaml

        motifs:
            my_motif:
                ...

        signals:
            my_signal:
                motifs:
                    - my_motif
                    - ...
                implanting: HealthySequence
                implanting_computation: Poisson
                sequence_position_weights:
                    109: 1
                    110: 2
                    111: 5
                    112: 1

    """
    def __init__(self, implanting: SequenceImplantingStrategy = None, sequence_position_weights: dict = None,
                 implanting_computation: ImplantingComputation = None):
        super().__init__(implanting, sequence_position_weights)
        self.compute_implanting = get_implanting_function(implanting_computation)

[docs]    def implant_in_repertoire(self, repertoire: Repertoire, repertoire_implanting_rate: float, signal, path: Path) -> Repertoire:
        max_motif_length = self._calculate_max_motif_length(signal)
        sequences_to_be_processed, other_sequences = self._choose_sequences_for_implanting(repertoire,
                                                                                           repertoire_implanting_rate,
                                                                                           max_motif_length)
        processed_sequences = self._implant_in_sequences(sequences_to_be_processed, signal)
        sequences = other_sequences + processed_sequences
        metadata = self._build_new_metadata(repertoire.metadata, signal)
        new_repertoire = self._build_new_repertoire(sequences, metadata, signal, path)

        return new_repertoire

    def _build_new_metadata(self, metadata: dict, signal) -> dict:
        new_metadata = copy.deepcopy(metadata) if metadata is not None else {}
        new_metadata[signal.id] = True
        return new_metadata

    def _calculate_max_motif_length(self, signal):
        max_motif_length = max([motif.get_max_length() for motif in signal.motifs])
        return max_motif_length

    def _build_new_repertoire(self, sequences, repertoire_metadata, signal, path: Path) -> Repertoire:
        if repertoire_metadata is not None:
            metadata = copy.deepcopy(repertoire_metadata)
        else:
            metadata = {}

        # when adding implant to a repertoire, only signal id is stored:
        # more detailed information is available in each receptor_sequence
        # (specific motif and motif instance)
        metadata[signal.id] = True
        repertoire = Repertoire.build_from_sequence_objects(sequences, path, metadata)

        return repertoire

    def _implant_in_sequences(self, sequences_to_be_processed: list, signal):
        assert self.sequence_implanting_strategy is not None, \
            "HealthySequenceImplanting: add receptor_sequence implanting strategy when creating a HealthySequenceImplanting object."

        sequences = []
        for sequence in sequences_to_be_processed:
            processed_sequence = self.implant_in_sequence(sequence, signal)
            sequences.append(processed_sequence)

        return sequences

    def _choose_sequences_for_implanting(self, repertoire: Repertoire, repertoire_implanting_rate: float, max_motif_length: int):
        number_of_sequences_to_implant = self.compute_implanting(repertoire_implanting_rate * len(repertoire.sequences))
        if number_of_sequences_to_implant == 0:
            logging.warning(f"HealthySequenceImplanting: there are {len(repertoire.sequences)} sequences in repertoire {repertoire.identifier} "
                            f"for the given repertoire implanting rate of {repertoire_implanting_rate}; no motif will be implanted. To implant "
                            f"motifs, increase 'repertoire_implanting_rate' in the specification.")
        unusable_sequences = []
        unprocessed_sequences = []

        for sequence in repertoire.sequences:
            if sequence.annotation is not None and sequence.annotation.implants is not None and len(sequence.annotation.implants) > 0:
                unusable_sequences.append(sequence)
            elif len(sequence.get_sequence()) < max_motif_length:
                unusable_sequences.append(sequence)
            else:
                unprocessed_sequences.append(sequence)

        assert number_of_sequences_to_implant <= len(unprocessed_sequences), \
            "HealthySequenceImplanting: there are not enough sequences in the repertoire to provide given repertoire infection rate. " \
            f"Reduce repertoire infection rate to proceed. Total unprocessed sequences: {len(unprocessed_sequences)}, " \
            f"number of sequences to implant: {number_of_sequences_to_implant}."

        random.shuffle(unprocessed_sequences)
        sequences_to_be_infected = unprocessed_sequences[:number_of_sequences_to_implant]
        other_sequences = unusable_sequences + unprocessed_sequences[number_of_sequences_to_implant:]

        return sequences_to_be_infected, other_sequences

[docs]    def implant_in_receptor(self, receptor, signal, is_noise: bool):
        raise RuntimeError("HealthySequenceImplanting was called on a receptor object. Check the simulation parameters.")