# Source code for immuneML.api.galaxy.DatasetGenerationOverviewTool

from pathlib import Path

import yaml

from immuneML.api.galaxy.GalaxyTool import GalaxyTool
from immuneML.api.galaxy.Util import Util
from immuneML.app.ImmuneMLApp import ImmuneMLApp
from immuneML.util.Logger import print_log
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder
from immuneML.workflows.instructions.exploratory_analysis.ExploratoryAnalysisInstruction import \
    ExploratoryAnalysisInstruction



# [docs]
class DatasetGenerationOverviewTool(GalaxyTool):
    """
    DatasetGenerationOverviewTool is an alternative to running ImmuneMLApp directly.
    This tool is meant to be used as an endpoint for a Galaxy tool that will create a Galaxy collection out of a dataset in immuneML format.

    This tool accepts a path to a YAML specification which uses a single dataset, and runs the ExploratoryAnalysisInstruction with optional reports.
    The created dataset will be located in the supplied output directory, under the 'galaxy_dataset' folder.
    The main dataset file will have the name of the dataset given in the specification and has an extension .yaml.
    """

    def __init__(self, specification_path: Path, result_path: Path, **kwargs):
        """
        Validate the supplied paths and extra arguments, then initialise the base tool.

        Args:
            specification_path: path to the YAML specification to run.
            result_path: directory where the results will be stored.
            **kwargs: additional arguments forwarded to the parameter check and to GalaxyTool.
        """
        Util.check_parameters(specification_path, result_path, kwargs, "Dataset generation tool")
        super().__init__(specification_path, result_path, **kwargs)

    def _run(self):
        """Validate and update the specification, run it with ImmuneMLApp and export the resulting dataset."""
        PathBuilder.build(self.result_path)
        self._update_specs()
        state = ImmuneMLApp(self.yaml_path, self.result_path).run()[0]

        # _check_instruction enforces that every analysis refers to the same dataset,
        # so the dataset of the first analysis unit is the one to export.
        dataset = list(state.exploratory_analysis_units.values())[0].dataset

        Util.export_galaxy_dataset(dataset, self.result_path)

        print_log("Exported dataset.")

    def _update_specs(self):
        """
        Load the YAML specification, validate its structure and hand it to the Util helpers
        that update the dataset key and the result paths.

        Raises:
            AssertionError: if the specification does not satisfy the constraints of this tool.
        """
        tool_name = DatasetGenerationOverviewTool.__name__

        with self.yaml_path.open('r') as file:
            specs = yaml.safe_load(file)

        # 'definitions' and 'instructions' are mandatory; 'output' is the only other key allowed.
        ParameterValidator.assert_keys_present(specs.keys(), ["definitions", "instructions"], tool_name, "YAML specification")
        ParameterValidator.assert_all_in_valid_list(specs.keys(), ["definitions", "instructions", "output"], tool_name,
                                                    "YAML specification")

        self._check_dataset(specs)
        self._check_instruction(specs)

        Util.update_dataset_key(specs, tool_name)
        Util.check_paths(specs, tool_name)
        # NOTE(review): presumably this helper also writes the updated specs back to yaml_path,
        # since nothing is written here and _run re-reads the file afterwards - confirm in Util.
        Util.update_result_paths(specs, self.result_path, self.yaml_path)

    def _check_dataset(self, specs):
        """
        Ensure that the specification defines exactly one dataset.

        Raises:
            AssertionError: if 'datasets' is missing from the definitions or does not contain exactly one dataset.
        """
        tool_name = DatasetGenerationOverviewTool.__name__

        ParameterValidator.assert_keys_present(specs["definitions"].keys(), ['datasets'], tool_name, 'definitions')

        # Raised explicitly (instead of using an assert statement) so that the check is not
        # stripped when Python runs with -O; the exception type is kept for existing callers.
        dataset_names = list(specs['definitions']['datasets'].keys())
        if len(dataset_names) != 1:
            raise AssertionError(f"{tool_name}: only one dataset can be defined with this Galaxy tool, got these "
                                 f"instead: {dataset_names}.")

    def _check_instruction(self, specs):
        """
        Ensure that the specification contains exactly one ExploratoryAnalysis instruction
        and that all of its analyses refer to the same dataset.

        Every analysis must define both the 'dataset' and the 'report' key.

        Raises:
            AssertionError: if any of the constraints above is violated.
        """
        tool_name = DatasetGenerationOverviewTool.__name__

        instruction_names = list(specs['instructions'].keys())
        if len(instruction_names) != 1:
            raise AssertionError(f"{tool_name}: only one instruction of type ExploratoryAnalysis can be defined with this Galaxy tool, got these "
                                 f"instructions instead: {instruction_names}.")

        # The instruction type used in the specification is the class name without the 'Instruction' suffix.
        expected_type = ExploratoryAnalysisInstruction.__name__[:-len("Instruction")]
        instruction_name = Util.check_instruction_type(specs, tool_name, expected_type)

        # Fail with a descriptive message rather than a bare KeyError when 'analyses' is missing.
        instruction_specs = specs['instructions'][instruction_name]
        ParameterValidator.assert_keys_present(instruction_specs.keys(), ["analyses"], tool_name,
                                               f"instructions/{instruction_name}")

        dataset_name = None
        for analysis_key, analysis_specs in instruction_specs["analyses"].items():
            ParameterValidator.assert_keys_present(analysis_specs.keys(), ["dataset", "report"],
                                                   tool_name, f"{instruction_name}/analyses/{analysis_key}")

            if dataset_name is None:
                # The first analysis fixes the dataset all the other analyses must use.
                dataset_name = analysis_specs["dataset"]
            elif analysis_specs["dataset"] != dataset_name:
                raise AssertionError(f"{tool_name}: expected only one dataset name. Found: {dataset_name} and {analysis_specs['dataset']}")