Source code for immuneML.workflows.instructions.MLProcess

import copy
from pathlib import Path
from typing import List

from immuneML.data_model.SequenceParams import RegionType
from immuneML.data_model.datasets.Dataset import Dataset
from immuneML.environment.Label import Label
from immuneML.environment.LabelConfiguration import LabelConfiguration
from immuneML.environment.SequenceType import SequenceType
from immuneML.example_weighting.ExampleWeightingStrategy import ExampleWeightingStrategy
from immuneML.hyperparameter_optimization.HPSetting import HPSetting
from immuneML.hyperparameter_optimization.core.HPUtil import HPUtil
from immuneML.hyperparameter_optimization.states.HPItem import HPItem
from immuneML.ml_methods.classifiers.MLMethod import MLMethod
from immuneML.ml_metrics.ClassificationMetric import ClassificationMetric
from immuneML.reports.ReportUtil import ReportUtil
from immuneML.reports.ml_reports.MLReport import MLReport
from immuneML.util.Logger import print_log
from immuneML.util.PathBuilder import PathBuilder
from immuneML.workflows.steps.MLMethodTrainer import MLMethodTrainer
from immuneML.workflows.steps.MLMethodTrainerParams import MLMethodTrainerParams


[docs] class MLProcess: """ Class that implements the machine learning process: 1. encodes the training dataset 2. encodes the test dataset (using parameters learnt in step 1 if there are any such parameters) 3. trains the ML method on encoded training dataset 4. assesses the method's performance on encoded test dataset It performs the task for a given label configuration, and given list of metrics (used only in the assessment step). """ def __init__(self, train_dataset: Dataset, test_dataset: Dataset, label: Label, metrics: set, optimization_metric: ClassificationMetric, path: Path, ml_reports: List[MLReport] = None, encoding_reports: list = None, data_reports: list = None, number_of_processes: int = 2, label_config: LabelConfiguration = None, report_context: dict = None, hp_setting: HPSetting = None, example_weighting: ExampleWeightingStrategy = None, sequence_type: SequenceType = SequenceType.AMINO_ACID, region_type: RegionType = RegionType.IMGT_CDR3): self.train_dataset = train_dataset self.test_dataset = test_dataset self.label = label self.label_config = label_config self.method = copy.deepcopy(hp_setting.ml_method) self.path = PathBuilder.build(path) if path is not None else None self.ml_settings_export_path = path / "ml_settings_config" if path is not None else None self.ml_score_path = path / "ml_score.csv" if path is not None else None self.train_predictions_path = path / "train_predictions.csv" if path is not None else None self.test_predictions_path = path / "test_predictions.csv" if path is not None else None self.report_path = PathBuilder.build(path / "reports") if path is not None else None self.number_of_processes = number_of_processes assert all([isinstance(metric, ClassificationMetric) for metric in metrics]), \ "MLProcess: metrics are not set to be an instance of Metric." self.metrics = metrics self.optimization_metric = optimization_metric self.ml_reports = ml_reports if ml_reports is not None else [] self.encoding_reports = encoding_reports if encoding_reports is not None else [] self.data_reports = data_reports if data_reports is not None else [] self.report_context = report_context self.hp_setting = copy.deepcopy(hp_setting) self.example_weighting = example_weighting self.sequence_type = sequence_type self.region_type = region_type def _set_paths(self): if self.path is None: raise RuntimeError("MLProcess: path is not set, stopping execution...") self.ml_settings_export_path = self.path / "ml_settings_config" self.ml_score_path = self.path / "ml_score.csv" self.train_predictions_path = self.path / "train_predictions.csv" self.test_predictions_path = self.path / "test_predictions.csv" self.report_path = PathBuilder.build(self.path / "reports")
[docs] def run(self, split_index: int) -> HPItem: print_log(f"Evaluating hyperparameter setting: {self.hp_setting}...", include_datetime=True) PathBuilder.build(self.path) self._set_paths() processed_dataset = HPUtil.preprocess_dataset(self.train_dataset, self.hp_setting.preproc_sequence, self.path / "preprocessed_train_dataset", self.report_context, number_of_processes=self.number_of_processes) weighted_dataset = HPUtil.weight_examples(dataset=processed_dataset, weighting_strategy=self.example_weighting, path=self.path / "weighted_train_datasets", learn_model=True, number_of_processes=self.number_of_processes) encoded_train_dataset = HPUtil.encode_dataset(weighted_dataset, self.hp_setting, self.path / "encoded_datasets", learn_model=True, context=self.report_context, number_of_processes=self.number_of_processes, label_configuration=self.label_config, sequence_type=self.sequence_type, region_type=self.region_type) method = self._train_method(encoded_train_dataset) encoding_train_results = ReportUtil.run_encoding_reports(encoded_train_dataset, self.encoding_reports, self.report_path / "encoding_train", self.number_of_processes) hp_item = self._assess_on_test_dataset(encoded_train_dataset, encoding_train_results, method, split_index) print_log(f"Completed hyperparameter setting {self.hp_setting}.\n", include_datetime=True) return hp_item
def _train_method(self, dataset) -> MLMethod: method = MLMethodTrainer.run(MLMethodTrainerParams( method=copy.deepcopy(self.hp_setting.ml_method), result_path=self.path / "ml_method", dataset=dataset, label=self.label, train_predictions_path=self.train_predictions_path, model_selection_cv=self.hp_setting.ml_params["model_selection_cv"], model_selection_n_folds=self.hp_setting.ml_params["model_selection_n_folds"], cores_for_training=self.number_of_processes, optimization_metric=self.optimization_metric.name.lower() )) return method def _assess_on_test_dataset(self, encoded_train_dataset, encoding_train_results, method, split_index) -> HPItem: if self.test_dataset is not None and self.test_dataset.get_example_count() > 0: processed_test_dataset = HPUtil.preprocess_dataset(self.test_dataset, self.hp_setting.preproc_sequence, self.path / "preprocessed_test_dataset") weighted_test_dataset = HPUtil.weight_examples(dataset=processed_test_dataset, weighting_strategy=self.example_weighting, path=self.path / "weighted_test_datasets", learn_model=False, number_of_processes=self.number_of_processes) encoded_test_dataset = HPUtil.encode_dataset(weighted_test_dataset, self.hp_setting, self.path / "encoded_datasets", learn_model=False, context=self.report_context, number_of_processes=self.number_of_processes, label_configuration=self.label_config, sequence_type=self.sequence_type, region_type=self.region_type) performance = HPUtil.assess_performance(method, self.metrics, self.optimization_metric, encoded_test_dataset, split_index, self.path, self.test_predictions_path, self.label, self.ml_score_path) encoding_test_results = ReportUtil.run_encoding_reports(encoded_test_dataset, self.encoding_reports, self.report_path / "encoding_test", self.number_of_processes) model_report_results = ReportUtil.run_ML_reports(encoded_train_dataset, encoded_test_dataset, method, self.ml_reports, self.report_path / "ml_method", self.hp_setting, self.label, self.number_of_processes, self.report_context) hp_item = HPItem(method=method, hp_setting=self.hp_setting, train_predictions_path=self.train_predictions_path, test_predictions_path=self.test_predictions_path, train_dataset=self.train_dataset, test_dataset=self.test_dataset, split_index=split_index, model_report_results=model_report_results, encoding_train_results=encoding_train_results, encoding_test_results=encoding_test_results, performance=performance, encoder=self.hp_setting.encoder, ml_settings_export_path=self.ml_settings_export_path) else: hp_item = HPItem(method=method, hp_setting=self.hp_setting, train_predictions_path=self.train_predictions_path, test_predictions_path=None, train_dataset=self.train_dataset, split_index=split_index, encoding_train_results=encoding_train_results, encoder=self.hp_setting.encoder, ml_settings_export_path=self.ml_settings_export_path) return hp_item