Source code for immuneML.reports.data_reports.RecoveredSignificantFeatures

import warnings
from pathlib import Path
from typing import List

import pandas as pd
import plotly.express as px

from immuneML.data_model.dataset.RepertoireDataset import RepertoireDataset
from immuneML.dsl.instruction_parsers.LabelHelper import LabelHelper
from immuneML.environment.EnvironmentSettings import EnvironmentSettings
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.data_reports.DataReport import DataReport
from immuneML.util.KmerHelper import KmerHelper
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder
from immuneML.util.SignificantFeaturesHelper import SignificantFeaturesHelper


[docs] class RecoveredSignificantFeatures(DataReport): """ Compares a given collection of groundtruth implanted signals (sequences or k-mers) to the significant label-associated k-mers or sequences according to Fisher's exact test. Internally uses the :py:obj:`~immuneML.encodings.abundance_encoding.KmerAbundanceEncoder.KmerAbundanceEncoder` for calculating significant k-mers, and :py:obj:`~immuneML.encodings.abundance_encoding.SequenceAbundanceEncoder.SequenceAbundanceEncoder` or :py:obj:`~immuneML.encodings.abundance_encoding.CompAIRRSequenceAbundanceEncoder.CompAIRRSequenceAbundanceEncoder` to calculate significant full sequences (depending on whether the argument compairr_path was set). This report creates two plots: - the first plot is a bar chart showing what percentage of the groundtruth implanted signals were found to be significant. - the second plot is a bar chart showing what percentage of the k-mers/sequences found to be significant match the groundtruth implanted signals. To compare k-mers or sequences of differing lengths, the groundtruth sequences or long k-mers are split into k-mers of the given size through a sliding window approach. When comparing 'full_sequences' to groundtruth sequences, a match is only registered if both sequences are of equal length. Arguments: groundtruth_sequences_path (str): Path to a file containing the true implanted (sub)sequences, e.g., full sequences or k-mers. The file should contain one sequence per line, without a header, and without V or J genes. trim_leading_trailing (bool): Whether to trim the leading and trailing first positions from the provided groundtruth sequences, e.g., the leading C and trailing Y/F amino acids. This is necessary for comparing full sequences when the main dataset is imported using settings that also trim the leading and trailing positions (specified by the region_type parameter). By default, trim_leading_trailing is False. p_values (list): The p value thresholds to be used by Fisher's exact test. Each p-value specified here will become one panel in the output figure. k_values (list): Length of the k-mers (number of amino acids) created by the :py:obj:`~immuneML.encodings.abundance_encoding.KmerAbundanceEncoder.KmerAbundanceEncoder`. When using a full sequence encoding (:py:obj:`~immuneML.encodings.abundance_encoding.SequenceAbundanceEncoder.SequenceAbundanceEncoder` or :py:obj:`~immuneML.encodings.abundance_encoding.CompAIRRSequenceAbundanceEncoder.CompAIRRSequenceAbundanceEncoder`), specify 'full_sequence' here. Each value specified under k_values will represent one bar in the output figure. label (dict): A label configuration. One label should be specified, and the positive_class for this label should be defined. See the YAML specification below for an example. compairr_path (str): If 'full_sequence' is listed under k_values, the path to the CompAIRR executable may be provided. If the compairr_path is specified, the :py:obj:`~immuneML.encodings.abundance_encoding.CompAIRRSequenceAbundanceEncoder.CompAIRRSequenceAbundanceEncoder` will be used to compute the significant sequences. If the path is not specified and 'full_sequence' is listed under k-values, :py:obj:`~immuneML.encodings.abundance_encoding.SequenceAbundanceEncoder.SequenceAbundanceEncoder` will be used. YAML specification: .. indent with spaces .. code-block:: yaml my_recovered_significant_features_report: RecoveredSignificantFeatures: groundtruth_sequences_path: path/to/groundtruth/sequences.txt trim_leading_trailing: False p_values: - 0.1 - 0.01 - 0.001 - 0.0001 k_values: - 3 - 4 - 5 - full_sequence compairr_path: path/to/compairr # can be specified if 'full_sequence' is listed under k_values label: # Define a label, and the positive class for that given label CMV: positive_class: + """
[docs] @classmethod def build_object(cls, **kwargs): location = RecoveredSignificantFeatures.__name__ kwargs = SignificantFeaturesHelper.parse_parameters(kwargs, location) kwargs = SignificantFeaturesHelper.parse_sequences_path(kwargs, "groundtruth_sequences_path", location) ParameterValidator.assert_keys_present(kwargs.keys(), ["trim_leading_trailing"], location, location) ParameterValidator.assert_type_and_value(kwargs["trim_leading_trailing"], bool, "RecoveredSignificantFeatures", "trim_leading_trailing") return RecoveredSignificantFeatures(**kwargs)
def __init__(self, dataset: RepertoireDataset = None, groundtruth_sequences_path: Path = None, trim_leading_trailing: bool = None, p_values: List[float] = None, k_values: List[int] = None, label: dict = None, compairr_path: Path = None, result_path: Path = None, name: str = None, number_of_processes: int = 1): super().__init__(dataset=dataset, result_path=result_path, number_of_processes=number_of_processes, name=name) self.groundtruth_sequences_path = groundtruth_sequences_path self.trim_leading_trailing = trim_leading_trailing self.groundtruth_sequences = SignificantFeaturesHelper.load_sequences(groundtruth_sequences_path, trim_leading_trailing) self.p_values = p_values self.k_values = k_values self.label = label self.compairr_path = compairr_path
[docs] def check_prerequisites(self): if isinstance(self.dataset, RepertoireDataset): return True else: warnings.warn(f"{RecoveredSignificantFeatures.__name__}: report can be generated only from RepertoireDataset. Skipping this report...") return False
def _generate(self) -> ReportResult: self.label_config = LabelHelper.create_label_config([self.label], self.dataset, RecoveredSignificantFeatures.__name__, f"{RecoveredSignificantFeatures.__name__}/label") plotting_data = self._compute_plotting_data() table_result = self._write_results_table(plotting_data) fig_significant = self._safe_plot(plotting_data=plotting_data, column_of_interest="n_significant", y_label="Percentage of significant features that match the ground truth") fig_true = self._safe_plot(plotting_data=plotting_data, column_of_interest="n_true", y_label="Percentage of ground truth features that match the significant features") output_figures = [figure for figure in [fig_significant, fig_true] if figure] return ReportResult(name=self.name, info="Compares a given collection of groundtruth implanted signals (sequences or k-mers) to the significant label-associated k-mers or sequences according to Fisher's exact test.", output_figures=output_figures, output_tables=[table_result]) def _compute_plotting_data(self): result = {"encoding": [], "p-value": [], "n_significant": [], "n_true": [], "n_intersect": []} for k in self.k_values: encoder_name = SignificantFeaturesHelper._get_encoder_name(k) for p_value in self.p_values: significant_features = self._compute_significant_features(k, p_value) true_features = self._compute_true_features(k) result["encoding"].append(encoder_name) result["p-value"].append(p_value) result["n_significant"].append(len(significant_features)) result["n_true"].append(len(true_features)) result["n_intersect"].append(len(significant_features.intersection(true_features))) return pd.DataFrame(result) def _get_encoder_name(self, k): encoder_name = f"{k}-mer" if type(k) == int else k return encoder_name def _get_encoder_result_path(self, k, p_value): result_path = self.result_path / f"{self._get_encoder_name(k)}_{p_value}" PathBuilder.build(result_path) return result_path def _write_results_table(self, data) -> ReportOutput: table_path = self.result_path / f"recovered_significant_features_report.csv" data.to_csv(table_path, index=False) return ReportOutput(table_path, "Number of true features found to be significant, and number of significant features found to be true.") def _plot(self, plotting_data, column_of_interest, y_label): plotting_data["percentage"] = plotting_data["n_intersect"] / plotting_data[column_of_interest] figure = px.bar(plotting_data, x="encoding", y="percentage", color=None, facet_row=None, facet_col="p-value", labels={ "percentage": y_label, "encoding": "Encoding", "class": "Repertoire class" }, template='plotly_white', color_discrete_sequence=px.colors.diverging.Tealrose, range_y=[0, 1]) figure.layout.yaxis.tickformat = ',.0%' file_path = self.result_path / f"{column_of_interest}_features_figure.html" figure.write_html(str(file_path)) return ReportOutput(file_path, name=y_label) def _compute_significant_features(self, k, p_value): encoder_result_path = self._get_encoder_result_path(k, p_value) encoder_params = SignificantFeaturesHelper._build_encoder_params(self.label_config, encoder_result_path) if type(k) == int: significant_features = self._compute_significant_kmers(k, p_value, encoder_params) elif self.compairr_path is None: significant_features = self._compute_significant_sequences(p_value, encoder_params) else: significant_features = self._compute_significant_compairr_sequences(p_value, encoder_params) return set(significant_features) def _compute_significant_kmers(self, k, p_value, encoder_params): encoder = SignificantFeaturesHelper._build_kmer_encoder(self.dataset, k, p_value, encoder_params) sequences = pd.read_csv(encoder.relevant_sequence_path) return list(sequences["k-mer"]) def _compute_significant_sequences(self, p_value, encoder_params): encoder = SignificantFeaturesHelper._build_sequence_encoder(self.dataset, p_value, encoder_params) sequences = pd.read_csv(encoder.relevant_sequence_path) return list(sequences[EnvironmentSettings.get_sequence_type().value]) def _compute_significant_compairr_sequences(self, p_value, encoder_params): encoder = SignificantFeaturesHelper._build_compairr_sequence_encoder(self.dataset, p_value, encoder_params, self.compairr_path) sequences = pd.read_csv(encoder.relevant_sequence_path) return list(sequences[EnvironmentSettings.get_sequence_type().value]) def _compute_true_features(self, k): if type(k) == int: return self._compute_true_kmers(k) else: return set(self.groundtruth_sequences) def _compute_true_kmers(self, k): kmers = set() for sequence in self.groundtruth_sequences: kmers = kmers.union(KmerHelper.create_kmers_from_string(sequence, k, overlap=True)) return kmers