Source code for immuneML.workflows.steps.MLMethodAssessment

import os
from pathlib import Path

import numpy as np
import pandas as pd

from immuneML.environment.Label import Label
from immuneML.ml_methods.classifiers.MLMethod import MLMethod
from immuneML.ml_metrics.ClassificationMetric import ClassificationMetric
from immuneML.ml_metrics.MetricUtil import MetricUtil
from immuneML.util.PathBuilder import PathBuilder
from immuneML.workflows.steps.MLMethodAssessmentParams import MLMethodAssessmentParams
from immuneML.workflows.steps.Step import Step


class MLMethodAssessment(Step):
    """
    Assesses a trained ML method on an encoded dataset: stores per-example predictions to a CSV file
    and computes the requested performance metrics for the given label.
    """

    fieldnames = ["run", "optimal_method_params", "method", "encoding_params", "encoding", "evaluated_on"]
    @staticmethod
    def run(input_params: MLMethodAssessmentParams = None):
        X = input_params.dataset.encoded_data
        predicted_y = input_params.method.predict(X, input_params.label)
        predicted_proba_y_per_class = input_params.method.predict_proba(X, input_params.label)
        true_y = input_params.dataset.encoded_data.labels
        example_ids = input_params.dataset.get_example_ids()

        # persist per-example predictions first, then compute and persist the metric scores
        MLMethodAssessment._store_predictions(method=input_params.method, true_y=true_y, predicted_y=predicted_y,
                                              predicted_proba_y_per_class=predicted_proba_y_per_class,
                                              label=input_params.label,
                                              predictions_path=input_params.predictions_path,
                                              example_ids=example_ids, split_index=input_params.split_index)

        scores = MLMethodAssessment._score(metrics_list=input_params.metrics,
                                           optimization_metric=input_params.optimization_metric,
                                           label=input_params.label, split_index=input_params.split_index,
                                           predicted_y=predicted_y,
                                           predicted_proba_y_per_class=predicted_proba_y_per_class,
                                           true_y=true_y, method=input_params.method,
                                           ml_score_path=input_params.ml_score_path)

        return scores
    @staticmethod
    def _score(metrics_list: set, optimization_metric: ClassificationMetric, label: Label, predicted_y,
               predicted_proba_y_per_class, true_y, ml_score_path: Path, split_index: int, method: MLMethod):
        results = {}
        scores = {}

        # convert the per-class probability mapping into an (n_examples, n_classes) array,
        # with column order following label.values
        predicted_proba_class = predicted_proba_y_per_class[label.name] \
            if predicted_proba_y_per_class is not None else None
        predicted_proba_y = np.vstack([predicted_proba_class[cls] for cls in label.values]).T \
            if predicted_proba_class is not None else None

        # always include the optimization metric; sort by name for a deterministic column order
        metrics_with_optim_metric = set(metrics_list)
        metrics_with_optim_metric.add(optimization_metric)
        metrics_with_optim_metric = sorted(list(metrics_with_optim_metric), key=lambda metric: metric.name)

        for metric in metrics_with_optim_metric:
            score = MetricUtil.score_for_metric(metric=metric, predicted_y=predicted_y[label.name],
                                                true_y=true_y[label.name], classes=label.values,
                                                predicted_proba_y=predicted_proba_y)
            results[f"{label.name}_{metric.name.lower()}"] = score
            scores[metric.name.lower()] = score

        results["split_index"] = split_index

        # append to the scores file if it already has content, otherwise write it with a header
        df = pd.DataFrame([results])
        if ml_score_path.is_file() and os.path.getsize(ml_score_path) > 0:
            df.to_csv(ml_score_path, mode='a', header=False, index=False)
        else:
            df.to_csv(ml_score_path, index=False)

        return scores

    @staticmethod
    def _store_predictions(method: MLMethod, true_y, predicted_y, predicted_proba_y_per_class, label: Label,
                           predictions_path, summary_path=None, example_ids: list = None, split_index: int = None):

        df = pd.DataFrame()
        df["example_id"] = example_ids
        df["split_index"] = [split_index for _ in range(len(example_ids))]

        df[f"{label.name}_true_class"] = true_y[label.name]
        df[f"{label.name}_predicted_class"] = predicted_y[label.name]

        # one probability column per class; left empty if the method does not expose probabilities
        for cls in method.get_classes():
            tmp = predicted_proba_y_per_class[label.name][cls] \
                if predicted_proba_y_per_class is not None and predicted_proba_y_per_class[label.name] is not None \
                else None
            df[f'{label.name}_{cls}_proba'] = tmp

        if predictions_path is not None:
            df.to_csv(predictions_path, index=False)

        if summary_path is not None:
            PathBuilder.build(os.path.dirname(os.path.abspath(summary_path)))
            if os.path.isfile(summary_path) and os.path.getsize(summary_path) > 0:
                df.to_csv(summary_path, mode='a', header=False, index=False)
            else:
                df.to_csv(summary_path, index=False)
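
In _score, the per-class probabilities arrive as a nested mapping (label name, then class name, each pointing to a 1D array of per-example probabilities), while metric functions expect a single (n_examples, n_classes) array. The np.vstack(...).T idiom performs that conversion, with column order fixed by label.values. The following minimal standalone sketch illustrates the reshaping; the class names "diseased" and "healthy" and the example values are hypothetical, not taken from immuneML.

import numpy as np

# hypothetical per-class probabilities, as _score would see them for one label
predicted_proba_class = {
    "diseased": np.array([0.9, 0.2, 0.6]),
    "healthy": np.array([0.1, 0.8, 0.4]),
}
label_values = ["diseased", "healthy"]  # stands in for label.values

# vstack yields shape (n_classes, n_examples); .T flips it to (n_examples, n_classes),
# the layout probability-based metrics such as ROC AUC expect
predicted_proba_y = np.vstack([predicted_proba_class[cls] for cls in label_values]).T
print(predicted_proba_y.shape)  # (3, 2)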
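
Both _score and _store_predictions use the same append-or-create CSV pattern: write the header only when the file is missing or empty, otherwise append rows, so one file accumulates results across cross-validation splits. Below is a minimal sketch of that pattern in isolation; the helper name append_scores and the column names are hypothetical.

from pathlib import Path

import pandas as pd

def append_scores(df: pd.DataFrame, path: Path):
    # header only on the first write; later calls append rows
    if path.is_file() and path.stat().st_size > 0:
        df.to_csv(path, mode='a', header=False, index=False)
    else:
        df.to_csv(path, index=False)

# hypothetical usage: one row of metric scores per split, accumulated in one file
append_scores(pd.DataFrame([{"split_index": 1, "disease_balanced_accuracy": 0.82}]), Path("ml_score.csv"))
append_scores(pd.DataFrame([{"split_index": 2, "disease_balanced_accuracy": 0.79}]), Path("ml_score.csv"))

Note that appending with header=False assumes every call supplies the same columns in the same order; sorting the metrics by name in _score is what keeps that column order stable across splits.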