import os
from pathlib import Path

import numpy as np
import pandas as pd

from immuneML.environment.Label import Label
from immuneML.ml_methods.MLMethod import MLMethod
from immuneML.ml_metrics.Metric import Metric
from immuneML.ml_metrics.MetricUtil import MetricUtil
from immuneML.util.PathBuilder import PathBuilder
from immuneML.workflows.steps.MLMethodAssessmentParams import MLMethodAssessmentParams
from immuneML.workflows.steps.Step import Step

class MLMethodAssessment(Step):
    """Assesses a trained ML method on an encoded dataset: stores per-example predictions
    and appends the metric scores computed for the given label to a CSV file."""

    fieldnames = ["run", "optimal_method_params", "method", "encoding_params", "encoding", "evaluated_on"]

    @staticmethod
    def run(input_params: MLMethodAssessmentParams = None):
        # predict classes and class probabilities on the encoded dataset
        X = input_params.dataset.encoded_data
        predicted_y = input_params.method.predict(X, input_params.label)
        predicted_proba_y_per_class = input_params.method.predict_proba(X, input_params.label)
        true_y = input_params.dataset.encoded_data.labels
        example_ids = input_params.dataset.get_example_ids()

        # store per-example predictions, then compute metric scores for the label
        MLMethodAssessment._store_predictions(method=input_params.method, true_y=true_y, predicted_y=predicted_y,
                                              predicted_proba_y_per_class=predicted_proba_y_per_class,
                                              label=input_params.label, predictions_path=input_params.predictions_path,
                                              example_ids=example_ids, split_index=input_params.split_index)

        scores = MLMethodAssessment._score(metrics_list=input_params.metrics, optimization_metric=input_params.optimization_metric,
                                           label=input_params.label, split_index=input_params.split_index, predicted_y=predicted_y,
                                           predicted_proba_y_per_class=predicted_proba_y_per_class, true_y=true_y,
                                           method=input_params.method, ml_score_path=input_params.ml_score_path)

        return scores

    @staticmethod
    def _score(metrics_list: set, optimization_metric: Metric, label: Label, predicted_y, predicted_proba_y_per_class, true_y,
               ml_score_path: Path, split_index: int, method: MLMethod):
        results = {}
        scores = {}

        # assemble an examples x classes probability matrix for the label, if probabilities are available
        predicted_proba_class = predicted_proba_y_per_class[label.name] if predicted_proba_y_per_class is not None else None
        predicted_proba_y = np.vstack([predicted_proba_class[cls] for cls in label.values]).T if predicted_proba_class is not None else None

        # always include the optimization metric, then score all metrics in a deterministic order
        metrics_with_optim_metric = set(metrics_list)
        metrics_with_optim_metric.add(optimization_metric)
        metrics_with_optim_metric = sorted(metrics_with_optim_metric, key=lambda metric: metric.name)

        for metric in metrics_with_optim_metric:
            score = MetricUtil.score_for_metric(metric=metric, predicted_y=predicted_y[label.name], true_y=true_y[label.name],
                                                classes=label.values, predicted_proba_y=predicted_proba_y)
            results[f"{label.name}_{metric.name.lower()}"] = score
            scores[metric.name.lower()] = score

        results["split_index"] = split_index

        # append one row to the ml_score CSV, writing the header only when the file is new or empty
        df = pd.DataFrame([results])
        if ml_score_path.is_file() and os.path.getsize(ml_score_path) > 0:
            df.to_csv(ml_score_path, mode='a', header=False, index=False)
        else:
            df.to_csv(ml_score_path, index=False)

        return scores
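
    # For example, for a hypothetical label named "disease" scored with accuracy and balanced_accuracy,
    # the appended row would contain the columns "disease_accuracy", "disease_balanced_accuracy"
    # and "split_index".
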
    @staticmethod
    def _store_predictions(method: MLMethod, true_y, predicted_y, predicted_proba_y_per_class, label: Label, predictions_path,
                           summary_path=None, example_ids: list = None, split_index: int = None):
        # one row per example: true class, predicted class and one probability column per class
        df = pd.DataFrame()
        df["example_id"] = example_ids
        df["split_index"] = [split_index for _ in range(len(example_ids))]
        df[f"{label.name}_true_class"] = true_y[label.name]
        df[f"{label.name}_predicted_class"] = predicted_y[label.name]

        for cls in method.get_classes():
            tmp = predicted_proba_y_per_class[label.name][cls] if predicted_proba_y_per_class is not None and predicted_proba_y_per_class[label.name] is not None else None
            df[f"{label.name}_{cls}_proba"] = tmp

        if predictions_path is not None:
            df.to_csv(predictions_path, index=False)

        # the summary file accumulates predictions across splits, so append without repeating the header
        if summary_path is not None:
            PathBuilder.build(os.path.dirname(os.path.abspath(summary_path)))
            if os.path.isfile(summary_path) and os.path.getsize(summary_path) > 0:
                df.to_csv(summary_path, mode='a', header=False, index=False)
            else:
                df.to_csv(summary_path, index=False)
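
# Usage sketch: assuming MLMethodAssessmentParams accepts the fields accessed above (dataset, method,
# label, metrics, optimization_metric, predictions_path, ml_score_path, split_index) as constructor
# arguments, that Label can be built from a name and its class values, and that Metric exposes members
# such as ACCURACY and BALANCED_ACCURACY, an assessment could be triggered roughly as follows; the
# encoded dataset and the trained method ordinarily come from the surrounding workflow.
#
#   params = MLMethodAssessmentParams(dataset=encoded_dataset, method=trained_method,
#                                     label=Label("disease", ["healthy", "sick"]),
#                                     metrics={Metric.ACCURACY}, optimization_metric=Metric.BALANCED_ACCURACY,
#                                     predictions_path=Path("predictions.csv"),
#                                     ml_score_path=Path("ml_score.csv"), split_index=0)
#   scores = MLMethodAssessment.run(params)  # returns a dict of {metric_name_lowercase: score}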