Source code for immuneML.api.aggregated_runs.MultiDatasetBenchmarkTool
import copy
from pathlib import Path
import yaml
from immuneML.app.ImmuneMLApp import ImmuneMLApp
from immuneML.dsl.definition_parsers.ReportParser import ReportParser
from immuneML.dsl.symbol_table.SymbolTable import SymbolTable
from immuneML.dsl.symbol_table.SymbolType import SymbolType
from immuneML.presentation.html.MultiDatasetBenchmarkHTMLBuilder import MultiDatasetBenchmarkHTMLBuilder
from immuneML.util.Logger import print_log
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder
[docs]
class MultiDatasetBenchmarkTool:
    """
    MultiDatasetBenchmarkTool trains the models using nested cross-validation (CV) to determine optimal model on multiple datasets. Internally, it uses
    TrainMLModel instruction for each of the listed datasets and performs nested CV on each, accumulates the results of these runs and then
    generates reports on the cumulative results.

    **YAML specification:**

    .. highlight:: yaml
    .. code-block:: yaml

        definitions: # everything under definitions can be defined in a standard way
            datasets:
                d1: ...
                d2: ...
                d3: ...
            ml_methods:
                ml1: ...
                ml2: ...
                ml3: ...
            encodings:
                enc1: ...
                enc2: ...
            reports:
                r1: ...
                r2: ...
        instructions: # there can be only one instruction
            benchmark_instruction:
                type: TrainMLModel
                benchmark_reports: [r1, r2] # list of reports that will be executed on the results for all datasets
                datasets: [d1, d2, d3] # the same optimization will be performed separately for each dataset
                settings: # a list of combinations of preprocessing, encoding and ml_method to optimize over
                - encoding: enc1 # mandatory field
                  ml_method: ml1 # mandatory field
                - encoding: enc2
                  ml_method: ml2
                - encoding: enc2
                  ml_method: ml3
                assessment: # outer loop of nested CV
                    split_strategy: random # perform Monte Carlo CV (randomly split the data into train and test)
                    split_count: 1 # how many train/test datasets to generate
                    training_percentage: 0.7 # what percentage of the original data should be used for the training set
                selection: # inner loop of nested CV
                    split_strategy: k_fold # perform k-fold CV
                    split_count: 5 # how many folds to create: here these two parameters mean: do 5-fold CV
                labels: # list of labels to optimize the classifier for, as given in the metadata for the dataset
                - celiac
                strategy: GridSearch # how to choose the combinations which to test from settings (GridSearch means test all)
                metrics: # list of metrics to compute for all settings, but these do not influence the choice of optimal model
                - accuracy
                - auc
                reports: # reports to execute on the dataset (before CV, splitting, encoding etc.)
                - rep1
                number_of_processes: 4 # number of parallel processes to create (could speed up the computation)
                optimization_metric: balanced_accuracy # the metric to use for choosing the optimal model and during training

    """

    def __init__(self, specification_path: Path, result_path: Path, **kwargs):
        """
        Store the input/output locations; nothing is read or written until run() is called.

        Args:
            specification_path: path to the YAML specification listing multiple datasets
            result_path: directory under which the per-dataset specs, runs and reports are written
            **kwargs: accepted for call compatibility and ignored
        """
        self.specification_path = specification_path
        self.result_path = result_path
        self.reports = None  # filled by _extract_reports() with the parsed benchmark report objects

    def run(self):
        """
        Split the specification per dataset, run ImmuneMLApp on each of the resulting files, then execute the
        benchmark reports on the collected instruction states and build the HTML output.
        """
        print_log("Starting MultiDatasetBenchmarkTool...")
        PathBuilder.build(self.result_path)
        specs = self._split_specs_file()
        self._extract_reports()

        dataset_count = len(specs)
        instruction_states = {}
        for index, (specs_name, specs_file) in enumerate(specs.items(), start=1):
            print_log(f"Running nested cross-validation on dataset {specs_name} ({index}/{dataset_count})..")
            app = ImmuneMLApp(specification_path=specs_file, result_path=self.result_path / specs_name)
            # only the first element of what the app returns is kept as the state for this dataset
            instruction_states[specs_name] = app.run()[0]
            print_log(f"Finished nested cross-validation on dataset {specs_name} ({index}/{dataset_count})..")

        print_log("Running reports on the results of nested cross-validation on all datasets...")
        report_results = self._run_reports(instruction_states)

        print_log("Finished reports, now generating HTML output...")
        MultiDatasetBenchmarkHTMLBuilder.build(report_results, self.result_path,
                                               {specs_name: self.result_path / specs_name for specs_name in specs})
        print_log("MultiDatasetBenchmarkTool finished.")

    def _extract_reports(self):
        """
        Parse the report definitions named under the instruction's 'benchmark_reports' key and store the resulting
        report objects in self.reports.
        """
        with self.specification_path.open("r") as file:
            workflow_specification = yaml.safe_load(file)

        instruction = next(iter(workflow_specification['instructions'].values()))
        report_keys = instruction['benchmark_reports']

        # a missing (or empty) 'reports' section is treated as "no reports defined", so that the validator below
        # reports the problem instead of a bare KeyError
        defined_reports = workflow_specification['definitions'].get('reports') or {}

        ParameterValidator.assert_all_in_valid_list(report_keys, list(defined_reports.keys()),
                                                    MultiDatasetBenchmarkTool.__name__, "benchmark_reports")

        reports = {key: value for key, value in defined_reports.items() if key in report_keys}
        symbol_table, _ = ReportParser.parse(reports, SymbolTable())
        self.reports = [entry.item for entry in symbol_table.get_by_type(SymbolType.REPORT)]

    def _split_specs_file(self) -> dict:
        """
        Validate the specification and write one single-dataset specification file per dataset listed in the instruction.

        Returns:
            dict mapping dataset name to the path of the specification file written for it (specs_<dataset_name>.yaml
            under result_path)
        """
        with self.specification_path.open("r") as file:
            workflow_specification = yaml.safe_load(file)

        self._check_specs(workflow_specification)

        specs_files = {}
        instruction_name = list(workflow_specification['instructions'].keys())[0]
        instruction = workflow_specification['instructions'][instruction_name]

        for dataset_name in instruction['datasets']:
            new_specs = self._build_single_dataset_specs(workflow_specification, instruction_name, dataset_name)
            new_specs_file = self.result_path / f"specs_{dataset_name}.yaml"
            with new_specs_file.open('w') as file:
                yaml.dump(new_specs, file)
            specs_files[dataset_name] = new_specs_file

        return specs_files

    @staticmethod
    def _build_single_dataset_specs(workflow_specification: dict, instruction_name: str, dataset_name: str) -> dict:
        """
        Return a deep copy of the specification restricted to one dataset: only that dataset stays under
        definitions/datasets, the tool-specific keys 'datasets' and 'benchmark_reports' are removed from the instruction,
        and the instruction's 'dataset' key is set to the dataset name. The input specification is not modified.
        """
        new_specs = copy.deepcopy(workflow_specification)
        new_specs['definitions']['datasets'] = {dataset_name: new_specs['definitions']['datasets'][dataset_name]}

        instruction = new_specs['instructions'][instruction_name]
        # pop with a default so that an already-absent key cannot raise here; presence is enforced during validation
        instruction.pop('datasets', None)
        instruction.pop('benchmark_reports', None)
        instruction['dataset'] = dataset_name

        return new_specs

    def _check_specs(self, workflow_specification):
        """Validate the top-level keys, then the dataset definitions and the single instruction."""
        location = 'MultiDatasetBenchmarkTool'
        ParameterValidator.assert_keys(workflow_specification.keys(), ['definitions', 'instructions', 'output'], location, 'YAML specification')

        self._check_dataset_specs(workflow_specification, location)
        self._check_instruction_specs(workflow_specification, location)

    @staticmethod
    def _too_few_datasets_message(dataset_names: list) -> str:
        """Build the error message for a specification defining fewer than two datasets (safe for an empty list)."""
        if not dataset_names:
            return "MultiDatasetBenchmarkTool: no datasets are specified under definitions, while this tool operates on multiple datasets."
        return f"MultiDatasetBenchmarkTool: there is only one dataset specified ({dataset_names[0]}), while this tool operates on multiple datasets. " \
               f"If only one dataset is needed, consider using the training instruction directly."

    def _check_dataset_specs(self, workflow_specification, location):
        """Check that 'definitions' is a dict with a 'datasets' dict that defines more than one dataset."""
        ParameterValidator.assert_type_and_value(workflow_specification['definitions'], dict, location, 'definitions')
        ParameterValidator.assert_keys_present(workflow_specification['definitions'].keys(), ['datasets'], location, 'definitions')
        ParameterValidator.assert_type_and_value(workflow_specification['definitions']['datasets'], dict, location, 'datasets')

        dataset_names = list(workflow_specification['definitions']['datasets'].keys())
        # the message is built by a helper because indexing dataset_names[0] directly would raise IndexError
        # (hiding the real problem) when no datasets are defined at all
        assert len(dataset_names) > 1, self._too_few_datasets_message(dataset_names)

    def _check_instruction_specs(self, workflow_specification, location):
        """
        Check that exactly one instruction of type 'TrainMLModel' is given, that it has the 'type', 'datasets' and
        'benchmark_reports' keys, and that 'datasets' is a list of more than one dataset name defined under definitions.
        """
        ParameterValidator.assert_type_and_value(workflow_specification['instructions'], dict, location, 'instructions')

        instruction_names = list(workflow_specification['instructions'].keys())
        assert len(instruction_names) == 1, f"MultiDatasetBenchmarkTool: there can be only one instruction specified for this tool. " \
                                            f"Currently the following instructions are specified: {instruction_names}."

        instruction_name = instruction_names[0]
        instruction = workflow_specification['instructions'][instruction_name]
        ParameterValidator.assert_type_and_value(instruction, dict, location, instruction_name)

        # 'benchmark_reports' is read unconditionally later on, so its absence is reported here
        ParameterValidator.assert_keys_present(instruction.keys(), ['type', 'datasets', 'benchmark_reports'], location,
                                               instruction_name)

        instruction_type = instruction['type']
        assert instruction_type == 'TrainMLModel', \
            f"MultiDatasetBenchmarkTool: this tool works only with instruction of type 'TrainMLModel', got {instruction_type} instead."

        datasets_in_instruction = instruction['datasets']
        # a plain string would otherwise pass the length check below and be iterated character by character
        ParameterValidator.assert_type_and_value(datasets_in_instruction, list, location, 'datasets')
        assert len(datasets_in_instruction) > 1, \
            f'{location}: this tool takes a multiple dataset names as input, but only {len(datasets_in_instruction)} were provided: ' \
            f'{datasets_in_instruction}.'

        # every dataset referenced by the instruction has to be defined, since its definition is looked up when splitting
        ParameterValidator.assert_all_in_valid_list(datasets_in_instruction, list(workflow_specification['definitions']['datasets'].keys()),
                                                    location, 'datasets')

    def _run_reports(self, instruction_states: dict):
        """
        Run every parsed benchmark report on the states collected from all datasets.

        Args:
            instruction_states: dict mapping dataset name to the state obtained from running the app on that dataset

        Returns:
            dict mapping report name to the value returned by that report's generate_report()
        """
        report_results = {}
        report_count = len(self.reports)
        states = list(instruction_states.values())

        for index, report in enumerate(self.reports, start=1):
            print_log(f"Running report {report.name} ({index}/{report_count})...")
            report.instruction_states = states
            # built inside the loop so that the directory is only created when there is at least one report
            report.result_path = PathBuilder.build(self.result_path / 'benchmarking_reports/')
            report_results[report.name] = report.generate_report()

        return report_results