Source code for immuneML.reports.ml_reports.MotifSeedRecovery

import logging
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

from immuneML.data_model.datasets.Dataset import Dataset
from immuneML.hyperparameter_optimization.HPSetting import HPSetting
from immuneML.ml_methods.classifiers.LogisticRegression import LogisticRegression
from immuneML.ml_methods.classifiers.MLMethod import MLMethod
from immuneML.ml_methods.classifiers.RandomForestClassifier import RandomForestClassifier
from immuneML.ml_methods.classifiers.SVC import SVC
from immuneML.ml_methods.classifiers.SVM import SVM
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.ml_reports.MLReport import MLReport
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder


class MotifSeedRecovery(MLReport):
    """
    This report can be used to show how well implanted motifs (for example, through the Simulation instruction) can
    be recovered by various machine learning methods using the k-mer encoding.
    This report creates a boxplot, where the x axis (box grouping) represents the maximum possible overlap between
    an implanted motif seed and a kmer feature (measured in number of positions), and the y axis shows the coefficient
    size of the respective kmer feature. If the machine learning method has learned the implanted motif seeds, the
    coefficient size is expected to be largest for the kmer features with high overlap to the motif seeds.

    Note that to use this report, the following criteria must be met:

    - KmerFrequencyEncoder must be used.

    - One of the following classifiers must be used: RandomForestClassifier, LogisticRegression, SVM, SVC

    - For each label, the implanted motif seeds relevant to that label must be specified

    To find the overlap score between kmer features and implanted motif seeds, the two sequences are compared in a sliding
    window approach, and the maximum overlap is calculated.

    Overlap scores between kmer features and implanted motifs are calculated differently based on the
    Hamming distance that was allowed during implanting.

    .. indent with spaces
    .. code-block:: text

        Without hamming distance:
        Seed:     AAA  -> score = 3
        Feature: xAAAx
                  ^^^

        Seed:     AAA  -> score = 0
        Feature: xAAxx

        With hamming distance:
        Seed:     AAA  -> score = 3
        Feature: xAAAx
                  ^^^

        Seed:     AAA  -> score = 2
        Feature: xAAxx
                  ^^

        Furthermore, gap positions in the motif seed are ignored:
        Seed:     A/AA  -> score = 3
        Feature: xAxAAx
                  ^/^^

    See :ref:`Recovering simulated immune signals` for more details.

    Example output:

    .. image:: _static/images/reports/motif_seed_recovery.png
       :alt: Motif seed recovery report
       :width: 650

    **Specification arguments:**

    - implanted_motifs_per_label (dict): a nested dictionary that specifies the motif seeds that were implanted in the given dataset.
      The first level of keys in this dictionary represents the different labels. In the inner dictionary there should be
      three keys: "seeds", "hamming_distance" and "gap_sizes":

      - seeds: a list of motif seeds. The seeds may contain gaps, specified by a '/' symbol.

      - hamming_distance: A boolean value that specifies whether hamming distance was allowed when implanting the motif
        seeds for a given label. Note that this applies to all seeds for this label.

      - gap_sizes: a list of all the possible gap sizes that were used when implanting a gapped motif seed.
        When no gapped seeds are used, this value has no effect.

    **YAML specification:**

    .. indent with spaces
    .. code-block:: yaml

        definitions:
            reports:
                my_motif_report:
                    MotifSeedRecovery:
                        implanted_motifs_per_label:
                            CD:
                                seeds:
                                - AA/A
                                - AAA
                                hamming_distance: False
                                gap_sizes:
                                - 0
                                - 1
                                - 2
                            T1D:
                                seeds:
                                - CC/C
                                - CCC
                                hamming_distance: True
                                gap_sizes:
                                - 2

    """

    @classmethod
    def build_object(cls, **kwargs):
        """
        Validate the YAML-derived keyword arguments and build the report.

        Args:
            **kwargs: must contain "implanted_motifs_per_label" (see the class
                docstring for its structure); an optional "name" is forwarded
                to the constructor.

        Returns:
            A MotifSeedRecovery instance holding the validated motif specification.
        """
        location = "MotifSeedRecovery"

        ParameterValidator.assert_keys_present(list(kwargs.keys()), ["implanted_motifs_per_label"], location,
                                               "MotifSeedRecovery report")

        implanted_motifs_per_label = kwargs["implanted_motifs_per_label"]

        ParameterValidator.assert_type_and_value(implanted_motifs_per_label, dict, location,
                                                 "implanted_motifs_per_label")

        for label_name, label_params in implanted_motifs_per_label.items():
            param_path = f"implanted_motifs_per_label/{label_name}"

            # The type check must come first: the key lookups below assume a dict.
            ParameterValidator.assert_type_and_value(label_params, dict, location, param_path)
            ParameterValidator.assert_keys_present(label_params.keys(), ["hamming_distance", "seeds", "gap_sizes"],
                                                   location, param_path)

            ParameterValidator.assert_type_and_value(label_params["hamming_distance"], bool, location,
                                                     f"{param_path}/hamming_distance")
            ParameterValidator.assert_type_and_value(label_params["gap_sizes"], list, location,
                                                     f"{param_path}/gap_sizes")
            ParameterValidator.assert_type_and_value(label_params["seeds"], list, location,
                                                     f"{param_path}/seeds")

            for gap_size in label_params["gap_sizes"]:
                ParameterValidator.assert_type_and_value(gap_size, int, location, f"{param_path}/gap_sizes",
                                                         min_inclusive=0)

            for seed in label_params["seeds"]:
                ParameterValidator.assert_type_and_value(seed, str, location, f"{param_path}/seeds")

        # Forward the report name when the caller supplied one instead of silently dropping it.
        return MotifSeedRecovery(implanted_motifs_per_label, name=kwargs.get("name"))

    def __init__(self, implanted_motifs_per_label, train_dataset: Dataset = None, test_dataset: Dataset = None,
                 method: MLMethod = None, result_path: Path = None, name: str = None, hp_setting: HPSetting = None,
                 label=None, number_of_processes: int = 1):
        """
        Store the motif specification; all remaining arguments are passed to MLReport unchanged.

        Args:
            implanted_motifs_per_label: dict mapping a label name to a dict with
                the keys "seeds", "hamming_distance" and "gap_sizes".
        """
        super().__init__(train_dataset=train_dataset, test_dataset=test_dataset, method=method,
                         result_path=result_path, name=name, hp_setting=hp_setting, label=label,
                         number_of_processes=number_of_processes)
        self.implanted_motifs_per_label = implanted_motifs_per_label

        # Plot settings; filled in by _set_plotting_parameters() when the report is generated.
        self._param_field = None
        self._y_axis_title = None
        self._x_axis_title = None

    def _generate(self):
        """
        Compute the overlap table, write it as CSV, draw the box plot and wrap both in a ReportResult.

        When the table is empty no figure is produced, and the result then
        carries an empty figure list rather than a list containing None.
        """
        PathBuilder.build(self.result_path)
        self._set_plotting_parameters()

        plot_df = self._retrieve_plot_data()
        report_output_table = self._write_results_table(plot_df)
        report_output_fig = self._plot(plot_df, "motif_seed_recovery")

        # _plot returns None for an empty table; never hand a None entry to ReportResult.
        output_figures = [] if report_output_fig is None else [report_output_fig]

        return ReportResult(self.name,
                            info="This report shows how well implanted ('ground truth') motifs are recovered by ML models using the k-mer encoding. "
                                 "The x axis (box grouping) represents the maximum number of overlapping positions between a 'ground truth' motif seed and a k-mer feature. "
                                 "The y axis values represent the learned coefficients. ",
                            output_tables=[report_output_table],
                            output_figures=output_figures)

    def _write_results_table(self, plotting_data):
        """Write the plotting table to 'motif_seed_recovery.csv' under result_path and return its ReportOutput."""
        filepath = self.result_path / "motif_seed_recovery.csv"
        plotting_data.to_csv(filepath, index=False)
        return ReportOutput(path=filepath, name="motif seed recovery csv")

    def _set_plotting_parameters(self):
        """Choose the fitted-parameter field and the axis titles based on the ML method and the label's hamming setting."""
        if isinstance(self.method, RandomForestClassifier):
            self._param_field = "feature_importances"
            self._y_axis_title = "Feature importance"
        else:
            # SVM, logistic regression, ...
            self._param_field = "coefficients"
            self._y_axis_title = "Coefficient value"

        if self.implanted_motifs_per_label[self.label.name]["hamming_distance"]:
            self._x_axis_title = "Positions overlap between feature and motif seeds<br>(hamming distance allowed)"
        else:
            self._x_axis_title = "Positions overlap between feature and motif seeds"

    def _retrieve_plot_data(self):
        """
        Build the table that is plotted and exported.

        Returns:
            A DataFrame with the columns "features", "max_seed_overlap" and
            "coefficients". The last column is always named "coefficients",
            also when it holds random forest feature importances.
        """
        seeds = self._get_implanted_seeds()
        overlap_fn = self._get_overlap_fn()
        features = self._retrieve_feature_names()

        plot_df = self.calculate_seed_overlap(seeds, features, overlap_fn)
        # NOTE(review): presumably get_params() yields one value per feature, in feature_names order - confirm.
        plot_df["coefficients"] = self.method.get_params()[self._param_field]

        return plot_df

    def _get_implanted_seeds(self):
        """Return the list of motif seeds specified for the current label."""
        return self.implanted_motifs_per_label[self.label.name]["seeds"]

    def _get_overlap_fn(self):
        """Return hamming_overlap when hamming distance was allowed for the current label, otherwise identical_overlap."""
        is_hamming_distance = self.implanted_motifs_per_label[self.label.name]["hamming_distance"]
        return self.hamming_overlap if is_hamming_distance else self.identical_overlap

    def _retrieve_feature_names(self):
        """Return the feature names of the encoded training dataset, or None when no encoded training data is available."""
        if self.train_dataset and self.train_dataset.encoded_data:
            return self.train_dataset.encoded_data.feature_names
        return None

    def _plot(self, plotting_data, output_name):
        """
        Draw the box plot of overlap versus coefficient and store it as '<output_name>.html'.

        Returns:
            The ReportOutput of the written figure, or None when plotting_data
            is empty (a warning is logged and nothing is written).
        """
        if plotting_data.empty:
            logging.warning(f"MotifSeedRecovery: empty data subset specified, skipping {output_name} plot...")
            return None

        filename = self.result_path / f"{output_name}.html"

        # Imported here so that plotly is only loaded when a figure is actually drawn.
        import plotly.express as px

        figure = px.box(plotting_data, x="max_seed_overlap", y="coefficients",
                        labels={
                            "max_seed_overlap": self._x_axis_title,
                            "coefficients": self._y_axis_title
                        },
                        template='plotly_white',
                        color_discrete_sequence=px.colors.diverging.Tealrose)

        figure.write_html(str(filename))

        return ReportOutput(filename,
                            f"Overlap between implanted motif seeds and features versus {self._y_axis_title.lower()}")

    def hamming_overlap(self, seed: str, feature: str) -> int:
        """
        Count the positions at which seed and feature hold the same character.

        Gap ('/') and padding ('-') positions simply do not match and therefore
        add nothing to the score, provided those characters occur in only one of
        the two strings. The strings are compared position by position up to the
        length of the shorter one.
        """
        return sum(seed_char == feature_char for seed_char, feature_char in zip(seed, feature))

    def identical_overlap(self, seed: str, feature: str) -> int:
        """
        Score an exact match between seed and feature, ignoring gap and padding positions.

        Positions where the seed has a gap ('/') are removed from both strings.
        Then the '-' padding at either end of the feature is trimmed, together
        with the seed characters at the same positions, so that a seed hanging
        over the edge of the feature can still match with its overlapping part.

        Returns:
            The number of compared positions if what remains is identical, otherwise 0.
        """
        if "/" in seed:
            # Drop exactly the gap positions, so residues between two separate gaps are kept.
            gap_positions = {idx for idx, seed_char in enumerate(seed) if seed_char == "/"}
            seed = "".join(char for idx, char in enumerate(seed) if idx not in gap_positions)
            feature = "".join(char for idx, char in enumerate(feature) if idx not in gap_positions)

        while feature.startswith("-"):
            feature = feature[1:]
            seed = seed[1:]

        while feature.endswith("-"):
            feature = feature[:-1]
            seed = seed[:-1]

        return int(seed == feature) * len(seed)

    def max_overlap_sliding(self, seed, feature, overlap_fn):
        """
        Slide the seed along the feature and return the best overlap score found.

        The seed is expanded once for every gap size configured for the current
        label (each '/' becomes that many '/' characters). The feature is padded
        with '-' on both sides so that partial overlaps at its ends are scored too.

        Args:
            seed: motif seed, optionally containing '/'.
            feature: k-mer feature name.
            overlap_fn: callable(seed, feature_slice) returning a score.

        Returns:
            The maximum score over all gap sizes and window positions (0 if nothing is scored).
        """
        max_score = 0
        gap_sizes = self.implanted_motifs_per_label[self.label.name]["gap_sizes"]

        # A seed without gaps expands to the same string for every gap size; score each distinct variant once.
        gap_adjusted_seeds = dict.fromkeys(seed.replace("/", "/" * gap_size) for gap_size in gap_sizes)

        for gap_adjusted_seed in gap_adjusted_seeds:
            window_size = len(gap_adjusted_seed)
            padding = "-" * (window_size - 1)
            padded_feature = padding + feature + padding

            for start_idx in range(len(feature) + len(padding)):
                feature_slice = padded_feature[start_idx:start_idx + window_size]
                max_score = max(max_score, overlap_fn(gap_adjusted_seed, feature_slice))

        return max_score

    def calculate_seed_overlap(self, motif_seeds, features, overlap_fn):
        """
        Compute, for every feature, its maximum overlap with any of the motif seeds.

        Args:
            motif_seeds: iterable of seed strings.
            features: feature names, one row is produced per feature.
            overlap_fn: scoring function handed to max_overlap_sliding.

        Returns:
            A DataFrame with the columns "features" and "max_seed_overlap".
        """
        seed_df = pd.DataFrame({"features": features})

        # One temporary column per seed, reduced to the row-wise maximum below.
        for seed in motif_seeds:
            seed_df[seed] = [self.max_overlap_sliding(seed, feature, overlap_fn) for feature in seed_df["features"]]

        seed_df["max_seed_overlap"] = seed_df.drop("features", axis=1).max(axis=1)

        return seed_df[["features", "max_seed_overlap"]]

    def check_prerequisites(self):
        """
        Check whether the report can be generated in the current context.

        Every violated requirement is reported (via logging or warnings) rather
        than stopping at the first one.

        Returns:
            True if the ML method is supported, motifs are specified for the
            current label, and the training data is encoded with the
            KmerFrequencyEncoder including feature names; False otherwise.
        """
        location = "MotifSeedRecovery"
        run_report = True

        if not isinstance(self.method, (RandomForestClassifier, LogisticRegression, SVM, SVC)):
            logging.warning(f"{location} report can only be created for RandomForestClassifier, LogisticRegression, SVC, or SVM, but got "
                            f"{type(self.method).__name__} instead. Report {self.name} will not be created.")
            run_report = False

        if self.label.name not in self.implanted_motifs_per_label:
            warnings.warn(
                f"{location}: no implanted motifs were specified for the label '{self.label}'. "
                f"These motifs should be specified under 'implanted_motifs_per_label'. Report {self.name} will not be created.")
            run_report = False

        # Missing training or encoded data must lead to a skipped report, not an AttributeError.
        encoded_data = self.train_dataset.encoded_data if self.train_dataset is not None else None

        if encoded_data is None or encoded_data.examples is None or encoded_data.feature_names is None:
            warnings.warn(
                f"{location}: this report can only be created for an encoded dataset with specified feature names. Report {self.name} will not be created.")
            run_report = False

        # The encoder can only be inspected when encoded data exists at all.
        if encoded_data is not None and encoded_data.encoding != "KmerFrequencyEncoder":
            warnings.warn(
                f"{location}: this report can only be created for a dataset encoded with the KmerFrequencyEncoder. Report {self.name} will not be created.")
            run_report = False

        return run_report