Source code for immuneML.api.galaxy.Util

import glob
import logging
import os
from pathlib import Path

import pandas as pd
import yaml

from immuneML.IO.dataset_export.AIRRExporter import AIRRExporter
from immuneML.app.ImmuneMLApp import ImmuneMLApp
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder


class Util:

    @staticmethod
    def check_parameters(yaml_path: Path, output_dir: Path, kwargs, location):
        assert isinstance(yaml_path, Path), f"{location}: yaml_path is {yaml_path}, expected Path object."
        assert isinstance(output_dir, Path), \
            f"{location}: output_dir is {output_dir}, expected Path object pointing to a folder to store the results."
        assert yaml_path.is_file(), \
            f"{location}: path to the specification is not correct, got {yaml_path}, expecting path to a YAML file."
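
    # A minimal usage sketch with hypothetical file names: check_parameters
    # only validates argument types and that the specification file exists.
    #
    #     Util.check_parameters(Path("specs.yaml"), Path("result/"), {}, "MyGalaxyTool")
    #     # raises AssertionError if specs.yaml is missing or either path
    #     # argument is a plain string instead of a Path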

    @staticmethod
    def check_paths(specs: dict, tool_name: str):
        for key in specs.keys():
            if isinstance(specs[key], str):
                if "/" in specs[key] and specs[key] != "./" and any(name_part in key for name_part in ('path', 'file')):
                    logging.warning(f"{tool_name}: the paths in the specification for Galaxy have to consist only of the filenames "
                                    f"as uploaded to the Galaxy history beforehand. The problem occurs for the parameter {key}.")
            elif isinstance(specs[key], dict):
                Util.check_paths(specs[key], tool_name)
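
    # Illustration on a hypothetical specs dict: string values under keys that
    # contain 'path' or 'file' must be bare filenames; nested dicts are
    # checked recursively.
    #
    #     Util.check_paths({"import": {"path": "/data/repertoires.tsv"}}, "MyGalaxyTool")
    #     # -> logs a warning for the parameter 'path'
    #     Util.check_paths({"import": {"path": "repertoires.tsv"}}, "MyGalaxyTool")
    #     # -> no warning: a bare filename, as expected in the Galaxy history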

    @staticmethod
    def update_dataset_key(specs: dict, location, new_key="dataset"):
        dataset_keys = list(specs["definitions"]["datasets"].keys())
        assert len(dataset_keys) == 1, \
            f"{location}: exactly one dataset has to be defined under definitions/datasets, got {dataset_keys} instead."

        orig_key = dataset_keys[0]
        if orig_key != new_key:
            specs["definitions"]["datasets"][new_key] = specs["definitions"]["datasets"][orig_key]
            specs["definitions"]["datasets"].pop(orig_key)

            for instruction_key in specs["instructions"].keys():
                if "dataset" in specs["instructions"][instruction_key]:
                    specs["instructions"][instruction_key]["dataset"] = new_key
                if "datasets" in specs["instructions"][instruction_key]:
                    specs["instructions"][instruction_key]["datasets"] = [new_key]
                if "analyses" in specs["instructions"][instruction_key]:
                    for analysis_key in specs["instructions"][instruction_key]["analyses"].keys():
                        specs["instructions"][instruction_key]["analyses"][analysis_key]["dataset"] = new_key

            logging.info(f"{location}: renamed dataset '{orig_key}' to '{new_key}'.")
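
    # Before/after sketch on a hypothetical specification:
    #
    #     specs = {"definitions": {"datasets": {"d1": {"format": "AIRR", "params": {}}}},
    #              "instructions": {"inst1": {"type": "DatasetExport", "datasets": ["d1"]}}}
    #     Util.update_dataset_key(specs, "MyGalaxyTool")
    #     # -> definitions/datasets now has the single key 'dataset' and
    #     #    instructions/inst1/datasets == ['dataset']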

    @staticmethod
    def update_result_paths(specs: dict, result_path: Path, yaml_path: Path):
        if 'datasets' in specs['definitions']:
            for key, item in specs["definitions"]["datasets"].items():
                if isinstance(item, dict) and 'params' in item.keys() and isinstance(item["params"], dict):
                    item['params']["result_path"] = str(result_path / key)
                    if item['format'] not in ['RandomRepertoireDataset', 'RandomReceptorDataset', 'RandomSequenceDataset']:
                        item['params']['path'] = str(yaml_path.parent)

        with yaml_path.open("w") as file:
            yaml.dump(specs, file)
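
    # Effect sketch (hypothetical dataset 'd1'): every dataset's params gain a
    # result_path under the result folder, imported (non-random) datasets get
    # their input path pointed at the folder holding the specification file,
    # and the updated specs are written back to yaml_path.
    #
    #     Util.update_result_paths(specs, Path("result"), Path("specs.yaml"))
    #     # -> specs['definitions']['datasets']['d1']['params']['result_path'] == 'result/d1'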

    @staticmethod
    def check_instruction_type(specs: dict, tool_name, expected_instruction) -> str:
        ParameterValidator.assert_keys_present(list(specs.keys()), ['definitions', 'instructions'], tool_name, "YAML specification")
        assert len(list(specs['instructions'].keys())) == 1, \
            f"{tool_name}: multiple instructions were given ({str(list(specs['instructions'].keys()))[1:-1]}), " \
            f"but only one instruction of type {expected_instruction} should be specified."

        instruction_name = list(specs['instructions'].keys())[0]
        instruction_type = specs['instructions'][instruction_name]['type']
        assert instruction_type == expected_instruction, \
            f"{tool_name}: instruction type has to be '{expected_instruction}', got {instruction_type} instead."

        return instruction_name
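
    # Usage sketch with a hypothetical instruction name: the name of the
    # single instruction is returned once its type matches.
    #
    #     specs = {"definitions": {}, "instructions": {"my_analysis": {"type": "ExploratoryAnalysis"}}}
    #     Util.check_instruction_type(specs, "MyGalaxyTool", "ExploratoryAnalysis")
    #     # -> 'my_analysis'; raises AssertionError for another type or when
    #     #    more than one instruction is listed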

    @staticmethod
    def check_export_format(specs: dict, tool_name: str, instruction_name: str):
        ParameterValidator.assert_keys_present(list(specs['instructions'][instruction_name].keys()), ["export_formats"],
                                               tool_name, f"{instruction_name}/export_formats")
        ParameterValidator.assert_type_and_value(specs['instructions'][instruction_name]["export_formats"], list,
                                                 tool_name, f"{instruction_name}/export_formats")
        assert len(specs['instructions'][instruction_name]["export_formats"]) == 1, \
            f"{tool_name}: only one format can be specified under {instruction_name}/export_formats, " \
            f"got {specs['instructions'][instruction_name]['export_formats']} instead."

        return specs['instructions'][instruction_name]["export_formats"][0]
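
    # Usage sketch: export_formats must be a list with exactly one entry,
    # which is returned.
    #
    #     specs["instructions"]["my_analysis"]["export_formats"] = ["AIRR"]
    #     Util.check_export_format(specs, "MyGalaxyTool", "my_analysis")
    #     # -> 'AIRR'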

    @staticmethod
    def run_tool(yaml_path, result_path):
        PathBuilder.build(result_path)
        app = ImmuneMLApp(yaml_path, result_path)
        app.run()

    @staticmethod
    def discover_dataset_path(dataset_name="dataset"):
        if os.path.exists(f"{dataset_name}.yaml"):
            dataset_path = f"{dataset_name}.yaml"
        else:
            discovered = glob.glob(f"*{dataset_name}*.yaml")
            if len(discovered) == 1:
                dataset_path = discovered[0]
            else:
                raise FileNotFoundError(f"Unable to locate '{dataset_name}.yaml'")

        return dataset_path
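
    # Lookup order sketch: an exact '<dataset_name>.yaml' in the working
    # directory wins; otherwise a unique glob match on '*<dataset_name>*.yaml'
    # is accepted, and zero or multiple matches raise FileNotFoundError.
    #
    #     Util.discover_dataset_path()           # finds 'dataset.yaml'
    #     Util.discover_dataset_path("my_data")  # or e.g. 'immuneml_my_data.yaml'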

    @staticmethod
    def remove_path_from_filename(file_path):
        return str(Path(file_path).name)

    @staticmethod
    def reformat_galaxy_dataset(galaxy_dataset_path):
        dataset_yaml_file = galaxy_dataset_path / "dataset.yaml"
        assert dataset_yaml_file.is_file(), "Error: generated dataset.yaml not found"

        metadata_file = None

        with dataset_yaml_file.open("r") as file:
            dataset_params = yaml.load(file, Loader=yaml.SafeLoader)

        if "metadata_file" in dataset_params:
            dataset_params["metadata_file"] = Util.remove_path_from_filename(dataset_params["metadata_file"])
            metadata_file = galaxy_dataset_path / dataset_params["metadata_file"]
        if "filename" in dataset_params:
            dataset_params["filename"] = str(Path(dataset_params["filename"]).name)

        with dataset_yaml_file.open("w") as file:
            yaml.dump(dataset_params, file)

        if metadata_file is not None:
            metadata_content = pd.read_csv(metadata_file, sep=",")
            metadata_content["filename"] = [Util.remove_path_from_filename(filename) for filename in metadata_content["filename"]]
            metadata_content.to_csv(metadata_file, index=False)
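
    # Effect sketch: after this call, dataset.yaml and the metadata CSV in the
    # exported folder reference files by bare filename only, keeping the
    # dataset portable inside a Galaxy history.
    #
    #     Util.reformat_galaxy_dataset(Path("result/galaxy_dataset/"))
    #     # e.g. metadata_file: '/abs/path/metadata.csv' -> 'metadata.csv'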

    @staticmethod
    def export_galaxy_dataset(dataset, result_path):
        try:
            PathBuilder.build(result_path / 'galaxy_dataset')
            AIRRExporter.export(dataset, result_path / "galaxy_dataset/")

            dataset_file = list(glob.glob(str(result_path / "galaxy_dataset/*.yaml")))[0]
            os.rename(dataset_file, result_path / "galaxy_dataset/dataset.yaml")

            Util.reformat_galaxy_dataset(result_path / "galaxy_dataset/")
        except Exception as e:
            raise RuntimeError(f"Error when exporting Galaxy dataset: {e}.") from e
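

# A minimal end-to-end usage sketch, not part of the original module: the file
# names below are hypothetical and assume the specification (and any dataset
# files it references) sit in the working directory, as after a Galaxy upload.
if __name__ == "__main__":
    specs_path = Path("specs.yaml")   # hypothetical Galaxy-uploaded specification
    out_dir = Path("result")          # hypothetical output folder

    Util.check_parameters(specs_path, out_dir, {}, "GalaxyTool")

    with specs_path.open("r") as f:
        specs = yaml.load(f, Loader=yaml.SafeLoader)
    Util.check_paths(specs, "GalaxyTool")
    Util.update_result_paths(specs, out_dir, specs_path)

    Util.run_tool(specs_path, out_dir)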