Source code for immuneML.simulation.signal_implanting_strategy.HealthySequenceImplanting

import copy
import logging
import random
from pathlib import Path

from immuneML.data_model.repertoire.Repertoire import Repertoire
from immuneML.simulation.sequence_implanting.SequenceImplantingStrategy import SequenceImplantingStrategy
from immuneML.simulation.signal_implanting_strategy.ImplantingComputation import ImplantingComputation, get_implanting_function
from immuneML.simulation.signal_implanting_strategy.SignalImplantingStrategy import SignalImplantingStrategy


[docs] class HealthySequenceImplanting(SignalImplantingStrategy): """ This class represents a :py:obj:`~immuneML.simulation.signal_implanting_strategy.SignalImplantingStrategy.SignalImplantingStrategy` where signals will be implanted in 'healthy sequences', meaning sequences in which no signal has been implanted previously. This ensures that there is only one signal per receptor sequence. If for the given number of sequences in the repertoire and repertoire implanting rate, the total number of sequences for implanting turns out to be less than 1 (e.g. for 12000 sequences and repertoire implanting rate 0.00005, it should implant the signal in 0.6 sequences), the signal will not be implanted in that repertoire and a warning with repertoire identifier along with the repertoire implanting rate and number of sequences in the repertoire will be raised. Arguments: implanting: name of the implanting strategy, here HealthySequence sequence_position_weights (dict): A dictionary describing the relative weights for implanting a signal at each given IMGT position in the receptor sequence. If sequence_position_weights are not set, then SequenceImplantingStrategy will make all of the positions equally likely for each receptor sequence. implanting_computation (str): defines how to determine the number of sequences to implant the signal in a repertoire; it relies on repertoire_implanting_rate, but in case where the number of sequences for implanting is not an integer, this option can be useful. If implanting rate is set to 'round', then the number of sequences for implanting in a repertoire will be rounded to the nearest integer value of the product of repertoire implanting rate and the number of sequences in a repertoire (e.g., if the product value is 1.2, the signal will be implanted in one sequence only). If implanting rate is set to 'Poisson', the number of sequences for implanting will be sampled from the Poisson distribution with the value of the lambda parameter being repertoire implanting rate multiplied by the number of sequences in the repertoire. YAML specification: .. indent with spaces .. code-block:: yaml motifs: my_motif: ... signals: my_signal: motifs: - my_motif - ... implanting: HealthySequence implanting_computation: Poisson sequence_position_weights: 109: 1 110: 2 111: 5 112: 1 """ def __init__(self, implanting: SequenceImplantingStrategy = None, sequence_position_weights: dict = None, implanting_computation: ImplantingComputation = None): super().__init__(implanting, sequence_position_weights) self.compute_implanting = get_implanting_function(implanting_computation)
[docs] def implant_in_repertoire(self, repertoire: Repertoire, repertoire_implanting_rate: float, signal, path: Path) -> Repertoire: max_motif_length = self._calculate_max_motif_length(signal) sequences_to_be_processed, other_sequences = self._choose_sequences_for_implanting(repertoire, repertoire_implanting_rate, max_motif_length) processed_sequences = self._implant_in_sequences(sequences_to_be_processed, signal) sequences = other_sequences + processed_sequences metadata = self._build_new_metadata(repertoire.metadata, signal) new_repertoire = self._build_new_repertoire(sequences, metadata, signal, path) return new_repertoire
def _build_new_metadata(self, metadata: dict, signal) -> dict: new_metadata = copy.deepcopy(metadata) if metadata is not None else {} new_metadata[signal.id] = True return new_metadata def _calculate_max_motif_length(self, signal): max_motif_length = max([motif.get_max_length() for motif in signal.motifs]) return max_motif_length def _build_new_repertoire(self, sequences, repertoire_metadata, signal, path: Path) -> Repertoire: if repertoire_metadata is not None: metadata = copy.deepcopy(repertoire_metadata) else: metadata = {} # when adding implant to a repertoire, only signal id is stored: # more detailed information is available in each receptor_sequence # (specific motif and motif instance) metadata[signal.id] = True repertoire = Repertoire.build_from_sequence_objects(sequences, path, metadata) return repertoire def _implant_in_sequences(self, sequences_to_be_processed: list, signal): assert self.sequence_implanting_strategy is not None, \ "HealthySequenceImplanting: add receptor_sequence implanting strategy when creating a HealthySequenceImplanting object." sequences = [] for sequence in sequences_to_be_processed: processed_sequence = self.implant_in_sequence(sequence, signal) sequences.append(processed_sequence) return sequences def _choose_sequences_for_implanting(self, repertoire: Repertoire, repertoire_implanting_rate: float, max_motif_length: int): number_of_sequences_to_implant = self.compute_implanting(repertoire_implanting_rate * len(repertoire.sequences)) if number_of_sequences_to_implant == 0: logging.warning(f"HealthySequenceImplanting: there are {len(repertoire.sequences)} sequences in repertoire {repertoire.identifier} " f"for the given repertoire implanting rate of {repertoire_implanting_rate}; no motif will be implanted. To implant " f"motifs, increase 'repertoire_implanting_rate' in the specification.") unusable_sequences = [] unprocessed_sequences = [] for sequence in repertoire.sequences: if sequence.annotation is not None and sequence.annotation.implants is not None and len(sequence.annotation.implants) > 0: unusable_sequences.append(sequence) elif len(sequence.get_sequence()) < max_motif_length: unusable_sequences.append(sequence) else: unprocessed_sequences.append(sequence) assert number_of_sequences_to_implant <= len(unprocessed_sequences), \ "HealthySequenceImplanting: there are not enough sequences in the repertoire to provide given repertoire infection rate. " \ f"Reduce repertoire infection rate to proceed. Total unprocessed sequences: {len(unprocessed_sequences)}, " \ f"number of sequences to implant: {number_of_sequences_to_implant}." random.shuffle(unprocessed_sequences) sequences_to_be_infected = unprocessed_sequences[:number_of_sequences_to_implant] other_sequences = unusable_sequences + unprocessed_sequences[number_of_sequences_to_implant:] return sequences_to_be_infected, other_sequences
[docs] def implant_in_receptor(self, receptor, signal, is_noise: bool): raise RuntimeError("HealthySequenceImplanting was called on a receptor object. Check the simulation parameters.")