import warnings
from pathlib import Path
from typing import List
import pickle
import pandas as pd
import plotly.express as px
import numpy as np
from immuneML.data_model.SequenceParams import RegionType
from immuneML.data_model.datasets.RepertoireDataset import RepertoireDataset
from immuneML.dsl.instruction_parsers.LabelHelper import LabelHelper
from immuneML.encodings.abundance_encoding.AbundanceEncoderHelper import AbundanceEncoderHelper
from immuneML.encodings.abundance_encoding.CompAIRRSequenceAbundanceEncoder import CompAIRRSequenceAbundanceEncoder
from immuneML.encodings.abundance_encoding.KmerAbundanceEncoder import KmerAbundanceEncoder
from immuneML.environment.SequenceType import SequenceType
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.data_reports.DataReport import DataReport
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder
from immuneML.util.SignificantFeaturesHelper import SignificantFeaturesHelper
# (Sphinx "[docs]" source-link marker from the HTML rendering; not part of the module.)
class SignificantFeatures(DataReport):
    """
    Plots a boxplot of the number of significant features (label-associated k-mers or sequences) per Repertoire
    according to Fisher's exact test, across different classes for the given label.

    Internally uses the :py:obj:`~immuneML.encodings.abundance_encoding.KmerAbundanceEncoder.KmerAbundanceEncoder` for calculating
    significant k-mers, and
    :py:obj:`~immuneML.encodings.abundance_encoding.SequenceAbundanceEncoder.SequenceAbundanceEncoder` or
    :py:obj:`~immuneML.encodings.abundance_encoding.CompAIRRSequenceAbundanceEncoder.CompAIRRSequenceAbundanceEncoder`
    to calculate significant full sequences (depending on whether the argument compairr_path was set).

    **Specification arguments:**

    - p_values (list): The p value thresholds to be used by Fisher's exact test. Each p-value specified here will become
      one panel in the output figure.

    - k_values (list): Length of the k-mers (number of amino acids) created by the
      :py:obj:`~immuneML.encodings.abundance_encoding.KmerAbundanceEncoder.KmerAbundanceEncoder`.
      When using a full sequence encoding (:py:obj:`~immuneML.encodings.abundance_encoding.SequenceAbundanceEncoder.SequenceAbundanceEncoder` or
      :py:obj:`~immuneML.encodings.abundance_encoding.CompAIRRSequenceAbundanceEncoder.CompAIRRSequenceAbundanceEncoder`), specify 'full_sequence' here.
      Each value specified under k_values will represent one boxplot in the output figure.

    - label (dict): A label configuration. One label should be specified, and the positive_class for this label
      should be defined. See the YAML specification below for an example.

    - compairr_path (str): If 'full_sequence' is listed under k_values, the path to the CompAIRR executable may be provided.
      If the compairr_path is specified, the :py:obj:`~immuneML.encodings.abundance_encoding.CompAIRRSequenceAbundanceEncoder.CompAIRRSequenceAbundanceEncoder`
      will be used to compute the significant sequences. If the path is not specified and 'full_sequence' is listed under
      k_values, :py:obj:`~immuneML.encodings.abundance_encoding.SequenceAbundanceEncoder.SequenceAbundanceEncoder` will be used.

    - log_scale (bool): Whether to plot the y axis in log10 scale (log_scale = True) or continuous scale
      (log_scale = False). By default, log_scale is False.

    **YAML specification:**

    .. indent with spaces
    .. code-block:: yaml

        definitions:
            reports:
                my_significant_features_report:
                    SignificantFeatures:
                        p_values:
                            - 0.1
                            - 0.01
                            - 0.001
                            - 0.0001
                        k_values:
                            - 3
                            - 4
                            - 5
                            - full_sequence
                        compairr_path: path/to/compairr # can be specified if 'full_sequence' is listed under k_values
                        label: # Define a label, and the positive class for that given label
                            CMV:
                                positive_class: +
                        log_scale: False

    """

    @classmethod
    def build_object(cls, **kwargs):
        """Validate the specification arguments and build the report from them.

        The shared arguments are parsed by SignificantFeaturesHelper.parse_parameters; on top of that,
        'log_scale' must be present and must be a bool.
        """
        location = SignificantFeatures.__name__

        kwargs = SignificantFeaturesHelper.parse_parameters(kwargs, location)

        ParameterValidator.assert_keys_present(kwargs.keys(), ["log_scale"], location, location)
        ParameterValidator.assert_type_and_value(kwargs["log_scale"], bool, location, "log_scale")

        return SignificantFeatures(**kwargs)

    def __init__(self, dataset: RepertoireDataset = None, p_values: List[float] = None, k_values: List[int] = None,
                 label: dict = None, compairr_path: Path = None, log_scale: bool = False, result_path: Path = None,
                 name: str = None, number_of_processes: int = 1, region_type: RegionType = None,
                 sequence_type: SequenceType = None):
        super().__init__(dataset=dataset, result_path=result_path, number_of_processes=number_of_processes, name=name)
        self.p_values = p_values
        # entries are k-mer lengths (int); any non-int entry selects a full-sequence encoding
        self.k_values = k_values
        self.label = label
        # built lazily in _generate(), once the dataset is known
        self.label_config = None
        self.compairr_path = compairr_path
        self.log_scale = log_scale
        self.region_type = region_type
        self.sequence_type = sequence_type

    def check_prerequisites(self):
        """Return True if the report can run, i.e. the dataset is a RepertoireDataset; otherwise warn and return False."""
        if isinstance(self.dataset, RepertoireDataset):
            return True
        else:
            warnings.warn(f"{SignificantFeatures.__name__}: report can be generated only from RepertoireDataset. "
                          f"Skipping this report...")
            return False

    def _generate(self) -> ReportResult:
        """Compute the feature counts, write them as a CSV table and plot them.

        The figure is produced through _safe_plot; when that returns None, no figures are attached to the result.
        """
        self.label_config = LabelHelper.create_label_config([self.label], self.dataset,
                                                            SignificantFeatures.__name__,
                                                            f"{SignificantFeatures.__name__}/label")

        plotting_data = self._compute_plotting_data()
        table_result = self._write_results_table(plotting_data)

        report_output_fig = self._safe_plot(plotting_data=plotting_data)
        output_figures = None if report_output_fig is None else [report_output_fig]

        return ReportResult(name=self.name,
                            info="The number of significant features (label-associated k-mers or sequences) per "
                                 "Repertoire according to Fisher's exact test, across different classes for the "
                                 "given label.",
                            output_figures=output_figures,
                            output_tables=[table_result])

    def _compute_plotting_data(self):
        """Build a long-format DataFrame with one row per (encoding, p-value, repertoire).

        Columns: 'encoding', 'p-value', 'class' and 'significant_features'. For every combination of k and
        p-value, the positive-class repertoires are listed first, followed by the negative-class ones.
        """
        result = {"encoding": [],
                  "p-value": [],
                  "class": [],
                  "significant_features": []}

        positive_class, negative_class = self._get_positive_negative_classes()

        for k in self.k_values:
            encoder_name = SignificantFeaturesHelper._get_encoder_name(k)

            for p_value in self.p_values:
                pos_class_feature_counts, neg_class_feature_counts = self._compute_significant_feature_counts(k, p_value)
                n_positive = len(pos_class_feature_counts)
                n_negative = len(neg_class_feature_counts)
                n_examples = n_positive + n_negative

                result["encoding"].extend([encoder_name] * n_examples)
                result["p-value"].extend([p_value] * n_examples)
                result["class"].extend([positive_class] * n_positive + [negative_class] * n_negative)
                result["significant_features"].extend(list(pos_class_feature_counts) + list(neg_class_feature_counts))

        return pd.DataFrame(result)

    def _get_positive_negative_classes(self):
        """Return (positive_class, negative_class) of the first label object in the label configuration."""
        label = self.label_config.get_label_objects()[0]

        positive_class = label.positive_class
        negative_class = label.get_binary_negative_class()

        return positive_class, negative_class

    def _get_encoder_result_path(self, k, p_value):
        """Create (via PathBuilder.build) and return the result folder for one (k, p-value) combination."""
        result_path = self.result_path / f"{SignificantFeaturesHelper._get_encoder_name(k)}_{p_value}"
        PathBuilder.build(result_path)
        return result_path

    def _write_results_table(self, data) -> ReportOutput:
        """Write the plotting data to 'significant_features_report.csv' in the result folder."""
        table_path = self.result_path / "significant_features_report.csv"
        data.to_csv(table_path, index=False)
        return ReportOutput(table_path, "Significant features across different Repertoire classes")

    def _plot(self, plotting_data):
        """Draw one boxplot panel per p-value (x: encoding, colour: class) and save it as an HTML file."""
        figure = px.box(plotting_data, x="encoding", y="significant_features", color="class",
                        facet_row=None, facet_col="p-value", log_y=self.log_scale,
                        labels={
                            "significant_features": "Number of significant features per AIRR according to Fisher's exact test",
                            "encoding": "Encoding",
                            "class": "Repertoire class"
                        }, template='plotly_white',
                        color_discrete_sequence=px.colors.diverging.Tealrose)

        file_path = self.result_path / "significant_features_figure.html"
        figure.write_html(str(file_path))

        return ReportOutput(file_path, name="Significant features across different Repertoire classes")

    def _compute_significant_feature_counts(self, k, p_value):
        """Return (positive-class counts, negative-class counts) of significant features for one k and p-value.

        An integer k selects the k-mer encoder. Any other value selects a full-sequence encoder: the
        CompAIRR-based one when compairr_path is set, the plain sequence abundance encoder otherwise.
        """
        encoder_result_path = self._get_encoder_result_path(k, p_value)
        encoder_params = SignificantFeaturesHelper._build_encoder_params(self.label_config, encoder_result_path,
                                                                         self.region_type, self.sequence_type)

        if isinstance(k, int):
            return self._compute_significant_kmer_counts(k, p_value, encoder_params)

        if self.compairr_path is None:
            return self._compute_significant_sequence_counts(p_value, encoder_params)

        return self._compute_significant_compairr_sequence_counts(p_value, encoder_params)

    @staticmethod
    def _load_relevant_indices(encoder):
        """Unpickle the relevant-feature indices stored at encoder.relevant_indices_path.

        NOTE(review): pickle is only safe on trusted files. The path comes from an encoder built inside this
        report, so the file is presumably written by this pipeline itself -- confirm before reusing elsewhere.
        """
        with encoder.relevant_indices_path.open("rb") as file:
            return pickle.load(file)

    def _compute_significant_kmer_counts(self, k, p_value, encoder_params):
        """Count the significant k-mers per repertoire and split the counts by class."""
        encoder = SignificantFeaturesHelper._build_kmer_encoder(self.dataset, k, p_value, encoder_params)
        relevant_indices = self._load_relevant_indices(encoder)

        # select the rows of the relevant k-mers and sum over them (axis 0)
        relevant_feature_presence = np.sum(encoder.kmer_presence_matrix[relevant_indices], axis=0)

        return self._get_positive_negative_class(relevant_feature_presence, encoder.matrix_repertoire_ids)

    def _compute_significant_sequence_counts(self, p_value, encoder_params):
        """Count the significant full sequences per repertoire and split the counts by class."""
        encoder = SignificantFeaturesHelper._build_sequence_encoder(self.dataset, p_value, encoder_params)
        relevant_indices = self._load_relevant_indices(encoder)

        relevant_feature_presence = self._sum_sequence_vectors_iterable(relevant_indices,
                                                                        encoder.comparison_data.get_item_vectors())

        return self._get_positive_negative_class(relevant_feature_presence, self.dataset.get_example_ids())

    def _compute_significant_compairr_sequence_counts(self, p_value, encoder_params):
        """Count the significant full sequences (found with CompAIRR) per repertoire and split the counts by class."""
        encoder = SignificantFeaturesHelper._build_compairr_sequence_encoder(self.dataset, p_value, encoder_params,
                                                                             self.compairr_path)
        relevant_indices = self._load_relevant_indices(encoder)

        relevant_feature_presence = self._sum_sequence_vectors_iterable(relevant_indices,
                                                                        encoder.compairr_sequence_presence)

        return self._get_positive_negative_class(relevant_feature_presence, self.dataset.get_example_ids())

    def _sum_sequence_vectors_iterable(self, relevant_indices, sequence_vector_iterable):
        """Sum the vectors of the iterable whose position is flagged in relevant_indices.

        :param relevant_indices: indexable of truthy/falsy flags, one per vector of the iterable
        :param sequence_vector_iterable: iterable of equally-shaped presence vectors
        :return: float array holding the element-wise sum of the flagged vectors; all zeros if none is flagged
        """
        relevant_feature_presence = None

        for i, sequence_vector in enumerate(sequence_vector_iterable):
            if relevant_feature_presence is None:
                # Size the accumulator from the data: a fixed length only works for datasets of exactly that size.
                relevant_feature_presence = np.zeros(shape=np.shape(sequence_vector))

            if relevant_indices[i]:
                relevant_feature_presence += sequence_vector

        if relevant_feature_presence is None:
            # No vectors at all: fall back to one zero count per example of the dataset.
            relevant_feature_presence = np.zeros(shape=(len(self.dataset.get_example_ids()),))

        return relevant_feature_presence

    def _get_relevant_feature_presence(self, encoder, relevant_indices):
        """Sum the presence of the relevant features for any of the supported encoder types.

        K-mer and CompAIRR encoders expose a presence matrix that is indexed and summed directly; for any
        other encoder the item vectors of encoder.comparison_data are summed one by one.
        """
        if isinstance(encoder, KmerAbundanceEncoder):
            return np.sum(encoder.kmer_presence_matrix[relevant_indices], axis=0)

        if isinstance(encoder, CompAIRRSequenceAbundanceEncoder):
            return np.sum(encoder.sequence_presence_matrix[relevant_indices], axis=0)

        return self._sum_sequence_vectors_iterable(relevant_indices, encoder.comparison_data.get_item_vectors())

    def _get_positive_negative_class(self, relevant_feature_presence, repertoire_ids):
        """Split the feature counts into (positive-class counts, negative-class counts).

        The mask returned by AbundanceEncoderHelper.check_is_positive_class is used to index the counts and,
        negated, to select the remaining ones.
        """
        is_positive_class = AbundanceEncoderHelper.check_is_positive_class(self.dataset, repertoire_ids, self.label_config)

        pos_class_feature_counts = relevant_feature_presence[is_positive_class]
        neg_class_feature_counts = relevant_feature_presence[np.logical_not(is_positive_class)]

        return pos_class_feature_counts, neg_class_feature_counts