import warnings
from pathlib import Path
from typing import List
import pandas as pd
import plotly.express as px
from immuneML.data_model.dataset.RepertoireDataset import RepertoireDataset
from immuneML.dsl.instruction_parsers.LabelHelper import LabelHelper
from immuneML.environment.EnvironmentSettings import EnvironmentSettings
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.data_reports.DataReport import DataReport
from immuneML.util.KmerHelper import KmerHelper
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder
from immuneML.util.SignificantFeaturesHelper import SignificantFeaturesHelper
[docs]
class RecoveredSignificantFeatures(DataReport):
"""
Compares a given collection of groundtruth implanted signals (sequences or k-mers) to the significant label-associated
k-mers or sequences according to Fisher's exact test.
Internally uses the :py:obj:`~immuneML.encodings.abundance_encoding.KmerAbundanceEncoder.KmerAbundanceEncoder` for calculating
significant k-mers, and
:py:obj:`~immuneML.encodings.abundance_encoding.SequenceAbundanceEncoder.SequenceAbundanceEncoder` or
:py:obj:`~immuneML.encodings.abundance_encoding.CompAIRRSequenceAbundanceEncoder.CompAIRRSequenceAbundanceEncoder`
to calculate significant full sequences (depending on whether the argument compairr_path was set).
This report creates two plots:
- the first plot is a bar chart showing what percentage of the groundtruth implanted signals were found to be significant.
- the second plot is a bar chart showing what percentage of the k-mers/sequences found to be significant match the
groundtruth implanted signals.
To compare k-mers or sequences of differing lengths, the groundtruth sequences or long k-mers are split into k-mers
of the given size through a sliding window approach. When comparing 'full_sequences' to groundtruth sequences, a match
is only registered if both sequences are of equal length.
Arguments:
groundtruth_sequences_path (str): Path to a file containing the true implanted (sub)sequences, e.g., full sequences or k-mers.
The file should contain one sequence per line, without a header, and without V or J genes.
trim_leading_trailing (bool): Whether to trim the leading and trailing first positions from the provided groundtruth sequences,
e.g., the leading C and trailing Y/F amino acids.
This is necessary for comparing full sequences when the main dataset is imported using settings that also trim
the leading and trailing positions (specified by the region_type parameter). By default, trim_leading_trailing is False.
p_values (list): The p value thresholds to be used by Fisher's exact test. Each p-value specified here will become one panel in the output figure.
k_values (list): Length of the k-mers (number of amino acids) created by the :py:obj:`~immuneML.encodings.abundance_encoding.KmerAbundanceEncoder.KmerAbundanceEncoder`.
When using a full sequence encoding (:py:obj:`~immuneML.encodings.abundance_encoding.SequenceAbundanceEncoder.SequenceAbundanceEncoder` or
:py:obj:`~immuneML.encodings.abundance_encoding.CompAIRRSequenceAbundanceEncoder.CompAIRRSequenceAbundanceEncoder`), specify 'full_sequence' here.
Each value specified under k_values will represent one bar in the output figure.
label (dict): A label configuration. One label should be specified, and the positive_class for this label should be defined. See the YAML specification below for an example.
compairr_path (str): If 'full_sequence' is listed under k_values, the path to the CompAIRR executable may be provided.
If the compairr_path is specified, the :py:obj:`~immuneML.encodings.abundance_encoding.CompAIRRSequenceAbundanceEncoder.CompAIRRSequenceAbundanceEncoder`
will be used to compute the significant sequences. If the path is not specified and 'full_sequence' is listed under
k-values, :py:obj:`~immuneML.encodings.abundance_encoding.SequenceAbundanceEncoder.SequenceAbundanceEncoder` will be used.
YAML specification:
.. indent with spaces
.. code-block:: yaml
my_recovered_significant_features_report:
RecoveredSignificantFeatures:
groundtruth_sequences_path: path/to/groundtruth/sequences.txt
trim_leading_trailing: False
p_values:
- 0.1
- 0.01
- 0.001
- 0.0001
k_values:
- 3
- 4
- 5
- full_sequence
compairr_path: path/to/compairr # can be specified if 'full_sequence' is listed under k_values
label: # Define a label, and the positive class for that given label
CMV:
positive_class: +
"""
[docs]
@classmethod
def build_object(cls, **kwargs):
location = RecoveredSignificantFeatures.__name__
kwargs = SignificantFeaturesHelper.parse_parameters(kwargs, location)
kwargs = SignificantFeaturesHelper.parse_sequences_path(kwargs, "groundtruth_sequences_path", location)
ParameterValidator.assert_keys_present(kwargs.keys(), ["trim_leading_trailing"], location, location)
ParameterValidator.assert_type_and_value(kwargs["trim_leading_trailing"], bool, "RecoveredSignificantFeatures", "trim_leading_trailing")
return RecoveredSignificantFeatures(**kwargs)
def __init__(self, dataset: RepertoireDataset = None, groundtruth_sequences_path: Path = None,
trim_leading_trailing: bool = None,
p_values: List[float] = None, k_values: List[int] = None, label: dict = None,
compairr_path: Path = None, result_path: Path = None, name: str = None, number_of_processes: int = 1):
super().__init__(dataset=dataset, result_path=result_path, number_of_processes=number_of_processes, name=name)
self.groundtruth_sequences_path = groundtruth_sequences_path
self.trim_leading_trailing = trim_leading_trailing
self.groundtruth_sequences = SignificantFeaturesHelper.load_sequences(groundtruth_sequences_path, trim_leading_trailing)
self.p_values = p_values
self.k_values = k_values
self.label = label
self.compairr_path = compairr_path
[docs]
def check_prerequisites(self):
if isinstance(self.dataset, RepertoireDataset):
return True
else:
warnings.warn(f"{RecoveredSignificantFeatures.__name__}: report can be generated only from RepertoireDataset. Skipping this report...")
return False
def _generate(self) -> ReportResult:
self.label_config = LabelHelper.create_label_config([self.label], self.dataset, RecoveredSignificantFeatures.__name__,
f"{RecoveredSignificantFeatures.__name__}/label")
plotting_data = self._compute_plotting_data()
table_result = self._write_results_table(plotting_data)
fig_significant = self._safe_plot(plotting_data=plotting_data, column_of_interest="n_significant", y_label="Percentage of significant features that match the ground truth")
fig_true = self._safe_plot(plotting_data=plotting_data, column_of_interest="n_true", y_label="Percentage of ground truth features that match the significant features")
output_figures = [figure for figure in [fig_significant, fig_true] if figure]
return ReportResult(name=self.name,
info="Compares a given collection of groundtruth implanted signals (sequences or k-mers) to the significant label-associated k-mers or sequences according to Fisher's exact test.",
output_figures=output_figures,
output_tables=[table_result])
def _compute_plotting_data(self):
result = {"encoding": [],
"p-value": [],
"n_significant": [],
"n_true": [],
"n_intersect": []}
for k in self.k_values:
encoder_name = SignificantFeaturesHelper._get_encoder_name(k)
for p_value in self.p_values:
significant_features = self._compute_significant_features(k, p_value)
true_features = self._compute_true_features(k)
result["encoding"].append(encoder_name)
result["p-value"].append(p_value)
result["n_significant"].append(len(significant_features))
result["n_true"].append(len(true_features))
result["n_intersect"].append(len(significant_features.intersection(true_features)))
return pd.DataFrame(result)
def _get_encoder_name(self, k):
encoder_name = f"{k}-mer" if type(k) == int else k
return encoder_name
def _get_encoder_result_path(self, k, p_value):
result_path = self.result_path / f"{self._get_encoder_name(k)}_{p_value}"
PathBuilder.build(result_path)
return result_path
def _write_results_table(self, data) -> ReportOutput:
table_path = self.result_path / f"recovered_significant_features_report.csv"
data.to_csv(table_path, index=False)
return ReportOutput(table_path, "Number of true features found to be significant, and number of significant features found to be true.")
def _plot(self, plotting_data, column_of_interest, y_label):
plotting_data["percentage"] = plotting_data["n_intersect"] / plotting_data[column_of_interest]
figure = px.bar(plotting_data, x="encoding", y="percentage", color=None,
facet_row=None, facet_col="p-value",
labels={
"percentage": y_label,
"encoding": "Encoding",
"class": "Repertoire class"
}, template='plotly_white',
color_discrete_sequence=px.colors.diverging.Tealrose,
range_y=[0, 1])
figure.layout.yaxis.tickformat = ',.0%'
file_path = self.result_path / f"{column_of_interest}_features_figure.html"
figure.write_html(str(file_path))
return ReportOutput(file_path, name=y_label)
def _compute_significant_features(self, k, p_value):
encoder_result_path = self._get_encoder_result_path(k, p_value)
encoder_params = SignificantFeaturesHelper._build_encoder_params(self.label_config, encoder_result_path)
if type(k) == int:
significant_features = self._compute_significant_kmers(k, p_value, encoder_params)
elif self.compairr_path is None:
significant_features = self._compute_significant_sequences(p_value, encoder_params)
else:
significant_features = self._compute_significant_compairr_sequences(p_value, encoder_params)
return set(significant_features)
def _compute_significant_kmers(self, k, p_value, encoder_params):
encoder = SignificantFeaturesHelper._build_kmer_encoder(self.dataset, k, p_value, encoder_params)
sequences = pd.read_csv(encoder.relevant_sequence_path)
return list(sequences["k-mer"])
def _compute_significant_sequences(self, p_value, encoder_params):
encoder = SignificantFeaturesHelper._build_sequence_encoder(self.dataset, p_value, encoder_params)
sequences = pd.read_csv(encoder.relevant_sequence_path)
return list(sequences[EnvironmentSettings.get_sequence_type().value])
def _compute_significant_compairr_sequences(self, p_value, encoder_params):
encoder = SignificantFeaturesHelper._build_compairr_sequence_encoder(self.dataset, p_value, encoder_params, self.compairr_path)
sequences = pd.read_csv(encoder.relevant_sequence_path)
return list(sequences[EnvironmentSettings.get_sequence_type().value])
def _compute_true_features(self, k):
if type(k) == int:
return self._compute_true_kmers(k)
else:
return set(self.groundtruth_sequences)
def _compute_true_kmers(self, k):
kmers = set()
for sequence in self.groundtruth_sequences:
kmers = kmers.union(KmerHelper.create_kmers_from_string(sequence, k, overlap=True))
return kmers