Source code for immuneML.simulation.motif_instantiation_strategy.GappedKmerInstantiation

import random

import numpy as np

from immuneML.environment.EnvironmentSettings import EnvironmentSettings
from immuneML.simulation.implants.MotifInstance import MotifInstance
from immuneML.simulation.motif_instantiation_strategy.MotifInstantiationStrategy import MotifInstantiationStrategy


[docs]class GappedKmerInstantiation(MotifInstantiationStrategy):
    """
    Creates a motif instance from a given seed and additional optional parameters.
    Currently, at most a single gap can be specified in the sequence.

    Arguments:

        min_gap (int): The minimum gap length, in case the original seed contains a gap.

        max_gap (int): The maximum gap length, in case the original seed contains a gap.

        hamming_distance_probabilities (dict): The probability of modifying the given seed with each
        number of modifications. The keys represent the number of modifications (hamming distance)
        between the original seed and the implanted motif, and the values represent the probabilities for
        the respective number of modifications. For example {0: 0.7, 1: 0.3} means that 30% of the time one position
        will be modified, and the remaining 70% of the time the motif will remain unmodified with respect
        to the seed. The values of hamming_distance_probabilities must sum to 1.

        position_weights (dict): A dictionary containing the relative probabilities of choosing
        each position for hamming distance modification. The keys represent the position in the seed, where
        counting starts at 0. If the index of a gap is specified in position_weights, it will be removed. The values
        represent the relative probabilities for modifying each position when it gets selected for modification.
        For example {0: 0.6, 1: 0, 2: 0.4} means that when a sequence is selected for a modification (as
        specified in hamming_distance_probabilities), then 60% of the time the amino acid at index 0 is modified,
        and the remaining 40% of the time the amino acid at index 2. If the values of position_weights do not sum
        to 1, the remainder will be redistributed over all positions, including those not specified.

        alphabet_weights (dict): A dictionary describing the relative probabilities of choosing each amino acid
        for hamming distance modification. The keys represent the amino acids and the values the relative
        probabilities for choosing this amino acid. If the values of alphabet_weights do not sum to 1, the remainder
        will be redistributed over all possible amino acids, including those not specified.


    YAML specification:

    .. indent with spaces
    .. code-block:: yaml

        GappedKmer:
            min_gap: 1
            max_gap: 2
            hamming_distance_probabilities:
                0: 0.7
                1: 0.3
            position_weights: # note that index 2, the position of the gap, is excluded from position_weights
                0: 1
                1: 0
                3: 0
            alphabet_weights:
                A: 0.2
                C: 0.2
                D: 0.4
                E: 0.2

    """

    def __init__(self, hamming_distance_probabilities: dict = None, min_gap: int = 0, max_gap: int = 0,
                 alphabet_weights: dict = None, position_weights: dict = None):
        if hamming_distance_probabilities is not None:
            hamming_distance_probabilities = {key: float(value) for key, value in hamming_distance_probabilities.items()}
            assert all(isinstance(key, int) for key in hamming_distance_probabilities.keys()) \
                   and all(isinstance(val, float) for val in hamming_distance_probabilities.values()) \
                   and 0.99 <= sum(hamming_distance_probabilities.values()) <= 1, \
                "GappedKmerInstantiation: for each possible Hamming distance a probability between 0 and 1 has to be assigned " \
                "so that the probabilities for all distance possibilities sum to 1."

        self._hamming_distance_probabilities = hamming_distance_probabilities
        self.position_weights = position_weights
        # if weights are not given for each letter of the alphabet, distribute the remaining probability
        # equally among letters
        self.alphabet_weights = self.set_default_weights(alphabet_weights, EnvironmentSettings.get_sequence_alphabet())
        self._min_gap = min_gap
        self._max_gap = max_gap

[docs]    def get_max_gap(self) -> int:
        return self._max_gap

[docs]    def instantiate_motif(self, base) -> MotifInstance:
        allowed_positions = list(range(len(base)))
        instance = list(base)

        if "/" in base:
            gap_index = base.index("/")
            allowed_positions.remove(gap_index)

        self.position_weights = self.set_default_weights(self.position_weights, allowed_positions)

        gap_size = np.random.choice(range(self._min_gap, self._max_gap + 1))
        instance = self._substitute_letters(self.position_weights,
                                            self.alphabet_weights,
                                            allowed_positions, instance)
        instance = "".join(instance)

        return MotifInstance(instance, gap_size)

    def _substitute_letters(self, position_weights, alphabet_weights, allowed_positions: list, instance: list):

        if self._hamming_distance_probabilities:
            substitution_count = random.choices(list(self._hamming_distance_probabilities.keys()),
                                                list(self._hamming_distance_probabilities.values()), k=1)[0]
            allowed_position_weights = {key: value for key, value in position_weights.items() if key in allowed_positions}
            position_probabilities = self._prepare_probabilities(allowed_position_weights)
            positions = list(np.random.choice(allowed_positions, size=substitution_count, p=position_probabilities))

            while substitution_count > 0:
                if position_weights[positions[substitution_count - 1]] > 0:  # if the position is allowed to be changed
                    position = positions[substitution_count - 1]
                    alphabet_probabilities = self._prepare_probabilities(alphabet_weights)
                    instance[position] = np.random.choice(EnvironmentSettings.get_sequence_alphabet(), size=1,
                                                          p=alphabet_probabilities)[0]
                substitution_count -= 1

        return instance

    def _prepare_keys(self, weights):
        keys = list(weights.keys())
        keys.sort()
        return keys

    def _prepare_probabilities(self, weights: dict):
        keys = self._prepare_keys(weights)
        s = sum([weights[key] for key in keys])
        return [weights[key] / s for key in keys]

[docs]    def set_default_weights(self, weights, keys):
        if weights is not None and len(weights.keys()) < len(keys):
            remaining_probability = (1 - sum(weights.values())) / (len(keys) - len(weights.keys()))
            additional_keys = set(keys) - set(weights.keys())

            for key in additional_keys:
                weights[key] = remaining_probability

        elif weights is None:
            remaining_probability = 1 / len(keys)
            weights = {key: remaining_probability for key in keys}

        return weights