Source code for immuneML.dsl.definition_parsers.SimulationParser

import copy

from immuneML.dsl.symbol_table.SymbolTable import SymbolTable
from immuneML.dsl.symbol_table.SymbolType import SymbolType
from immuneML.simulation.Implanting import Implanting
from immuneML.simulation.Simulation import Simulation
from immuneML.util.Logger import log
from immuneML.util.ParameterValidator import ParameterValidator


class SimulationParser:
    """
    Parses the simulations given in the specification into Simulation objects (each built from a list of
    Implanting objects) and registers them in the symbol table under the SIMULATION symbol type.

    YAML specification:

    .. highlight:: yaml
    .. code-block:: yaml

        definitions:
            dataset:
                my_dataset:
                    ...

            motifs:
                m1:
                    seed: AAC # "/" character denotes the gap in the seed if present (e.g. AA/C)
                    instantiation:
                        GappedKmer:
                            # probability that when hamming distance is allowed a letter in the seed will be replaced by
                            # other alphabet letters - alphabet_weights
                            alphabet_weights:
                                A: 0.2
                                C: 0.2
                                D: 0.4
                                E: 0.2
                            # Relative probabilities of choosing each position in the seed for hamming distance modification.
                            # The probabilities will be scaled to sum to one - position_weights
                            position_weights:
                                0: 1
                                1: 0
                                2: 0
                            hamming_distance_probabilities:
                                0: 0.5 # Hamming distance of 0 (no change) with probability 0.5
                                1: 0.5 # Hamming distance of 1 (one letter change) with probability 0.5
                            min_gap: 0
                            max_gap: 1
            signals:
                s1:
                    motifs: # list of all motifs for signal which will be uniformly sampled to get a motif instance for implanting
                        - m1
                    sequence_position_weights: # likelihood of implanting at IMGT position of receptor sequence
                        107: 0.5
                    implanting: HealthySequence # choose only sequences with no other signals to implant one of the motifs
            simulations:
                sim1: # one Simulation object consists of a dict of Implanting objects
                    i1:
                        dataset_implanting_rate: 0.5 # percentage of repertoire where the signals will be implanted
                        repertoire_implanting_rate: 0.01 # percentage of sequences within repertoire where the signals will be implanted
                        signals:
                            - s1

        instructions:
            my_simulation_instruction:
                type: Simulation
                dataset: my_dataset
                simulation: sim1
                batch_size: 5 # number of repertoires that can be loaded at the same time
                              # (only affects the speed)
                export_formats: [AIRR, ImmuneML]

    """

    @staticmethod
    def parse_simulations(simulations: dict, symbol_table: SymbolTable):
        """
        Parse every simulation in the given dict and register it in the symbol table.

        Args:
            simulations: mapping from simulation name to its specification (itself a mapping
                from implanting name to the implanting settings)
            symbol_table: symbol table that the parsed simulations are added to; signals are
                looked up in it by name

        Returns:
            a tuple ``(symbol_table, simulations)`` where ``simulations`` is the input dict as passed in
        """
        for key, simulation in simulations.items():
            symbol_table = SimulationParser._parse_simulation(key, simulation, symbol_table)

        return symbol_table, simulations

    @staticmethod
    @log
    def _parse_simulation(key: str, simulation: dict, symbol_table: SymbolTable) -> SymbolTable:
        """
        Build one Simulation from its specification and add it to the symbol table under ``key``.

        Args:
            key: name under which the simulation is registered in the symbol table
            simulation: mapping from implanting name to a dict of implanting settings; each settings
                dict is indexed for ``signals`` and ``dataset_implanting_rate`` and may additionally
                contain ``repertoire_implanting_rate`` and ``is_noise``
            symbol_table: symbol table used to resolve signal names and to store the result

        Returns:
            the symbol table with the new simulation added

        Raises:
            AssertionError: if the dataset implanting rates of all implantings sum to more than 1
        """
        location = "SimulationParser"
        valid_implanting_keys = ["dataset_implanting_rate", "repertoire_implanting_rate", "signals", "is_noise"]
        implantings = []

        for impl_key, implanting in simulation.items():
            # only known setting names are allowed (not all of them have to be given)
            ParameterValidator.assert_keys(implanting.keys(), valid_implanting_keys, location, impl_key,
                                           exclusive=False)
            # every referenced signal has to be registered in the symbol table already
            ParameterValidator.assert_keys(implanting["signals"], symbol_table.get_keys_by_type(SymbolType.SIGNAL),
                                           location, impl_key, exclusive=False)

            # work on a copy so that the specification passed in by the caller is left untouched
            implanting_params = copy.deepcopy(implanting)
            implanting_params["signals"] = [symbol_table.get(signal) for signal in implanting["signals"]]
            implanting_params["name"] = impl_key

            implantings.append(Implanting(**implanting_params))

        total_dataset_implanting_rate = sum(settings["dataset_implanting_rate"] for settings in simulation.values())

        # Raised explicitly instead of using an `assert` statement: assert statements are removed when
        # Python runs with -O, which would let an invalid total rate through unnoticed. The negated
        # `<=` comparison (rather than `>`) also rejects a NaN total.
        if not (total_dataset_implanting_rate <= 1):
            raise AssertionError("The total dataset implanting rate can not exceed 1.")

        symbol_table.add(key, SymbolType.SIMULATION, Simulation(implantings))

        return symbol_table