Source code for immuneML.api.galaxy.GalaxySimulationTool

import logging
import shutil
from pathlib import Path

import yaml

from immuneML.api.galaxy.GalaxyTool import GalaxyTool
from immuneML.api.galaxy.Util import Util
from immuneML.workflows.instructions.SimulationInstruction import SimulationInstruction


class GalaxySimulationTool(GalaxyTool):

    """
    GalaxySimulationTool is an alternative to running immuneML with the simulation instruction directly. It accepts a YAML specification file and a
    path to the output directory. It implants the signals in the dataset that was provided either as an existing dataset with a set of files or in
    the random dataset as described in the specification file.

    This tool is meant to be used as an endpoint for a Galaxy tool that will create a Galaxy collection out of a dataset in immuneML format that can
    be readily used by other immuneML-based Galaxy tools.

    The specification supplied for this tool is identical to immuneML specification, except that it can include only one instruction which has to
    be of type 'Simulation':

    .. code-block:: yaml

        definitions:
            datasets:
                my_synthetic_dataset:
                    format: RandomRepertoireDataset
                    params:
                        repertoire_count: 100
                        labels: {}
            motifs:
                my_simple_motif: # a simple motif without gaps or hamming distance
                    seed: AAA
                    instantiation: GappedKmer

                my_complex_motif: # complex motif containing a gap + hamming distance
                    seed: AA/A  # '/' denotes gap position if present, if not, there's no gap
                    instantiation:
                        GappedKmer:
                            min_gap: 1
                            max_gap: 2
                            hamming_distance_probabilities: # probabilities for each number of
                                0: 0.7                    # modifications to the seed
                                1: 0.3
                            position_weights: # probabilities for modification per position
                                0: 1
                                1: 0 # note that index 2, the position of the gap,
                                3: 0 # is excluded from position_weights
                            alphabet_weights: # probabilities for using each amino acid in
                                A: 0.2      # a hamming distance modification
                                C: 0.2
                                D: 0.4
                                E: 0.2

            signals:
                my_signal:
                    motifs:
                    - my_simple_motif
                    - my_complex_motif
                    implanting: HealthySequence
                    sequence_position_weights:
                        109: 1
                        110: 2
                        111: 5
                        112: 1
            simulations:
                my_simulation:
                    my_implanting:
                        signals:
                        - my_signal
                        dataset_implanting_rate: 0.5
                        repertoire_implanting_rate: 0.25
        instructions:
            my_simulation_instruction: # user-defined name of the instruction
                type: Simulation # which instruction to execute
                dataset: my_synthetic_dataset # which dataset to use for implanting the signals
                simulation: my_simulation # how to implant the signals - definition of the simulation
                number_of_processes: 4 # how many parallel processes to use during execution
                export_formats: [AIRR] # in which formats to export the dataset, ImmuneML format will be added automatically
        output: # the output format
            format: HTML

    """

    def __init__(self, specification_path: Path, result_path: Path, **kwargs):
        """
        Checks the arguments through Util.check_parameters and then delegates the initialization to GalaxyTool.

        Arguments:
            specification_path (Path): path to the YAML specification file
            result_path (Path): path to the output directory
            **kwargs: additional arguments, passed on unchanged to Util.check_parameters and to GalaxyTool
        """
        Util.check_parameters(specification_path, result_path, kwargs, GalaxySimulationTool.__name__)
        super().__init__(specification_path, result_path, **kwargs)

    def _run(self):
        """
        Prepares the specification, runs the tool through Util.run_tool and copies the first exported dataset directory found under
        `<result_path>/*/exported_dataset/*/` to `<result_path>/result/`.

        Raises:
            FileNotFoundError: if no exported dataset directory exists under the result path after the run
        """
        self.prepare_specs()

        Util.run_tool(self.yaml_path, self.result_path)

        # before Python 3.11 a trailing separator in the pattern does not restrict Path.glob to directories, so the filter is explicit
        exported_datasets = [path for path in self.result_path.glob("*/exported_dataset/*/") if path.is_dir()]
        if not exported_datasets:
            raise FileNotFoundError(f"{GalaxySimulationTool.__name__}: no exported dataset directory was found under {self.result_path} "
                                    f"(expected a match for '*/exported_dataset/*/').")

        # shutil.copytree requires that the destination does not exist yet
        shutil.copytree(exported_datasets[0], self.result_path / 'result/')

        logging.info(f"{GalaxySimulationTool.__name__}: immuneML has finished and the signals were implanted in the dataset.")

    def prepare_specs(self):
        """
        Loads the YAML specification from self.yaml_path and passes it through the Util helpers that check the instruction type, the export
        format and the paths, and that update the dataset key and the result paths.

        NOTE(review): the loaded specification is not written back here; presumably Util.update_result_paths, which also receives
        self.yaml_path, takes care of that - confirm against Util.
        """
        with self.yaml_path.open("r") as file:
            specs = yaml.safe_load(file)

        tool_name = GalaxySimulationTool.__name__
        # instruction type as used in the specification: the class name without the "Instruction" suffix, i.e. "Simulation"
        instruction_type = SimulationInstruction.__name__[:-len("Instruction")]

        instruction_name = Util.check_instruction_type(specs, tool_name, instruction_type)
        Util.check_export_format(specs, tool_name, instruction_name)

        Util.update_dataset_key(specs, tool_name)
        Util.check_paths(specs, tool_name)
        Util.update_result_paths(specs, self.result_path, self.yaml_path)