# Source code for immuneML.reports.data_reports.SignificantFeatures

import logging
from pathlib import Path
from typing import List

import pickle
import pandas as pd
import plotly.express as px
import numpy as np

from immuneML.data_model.SequenceParams import RegionType
from immuneML.data_model.datasets.RepertoireDataset import RepertoireDataset
from immuneML.dsl.instruction_parsers.LabelHelper import LabelHelper
from immuneML.encodings.abundance_encoding.AbundanceEncoderHelper import AbundanceEncoderHelper
from immuneML.encodings.abundance_encoding.CompAIRRSequenceAbundanceEncoder import CompAIRRSequenceAbundanceEncoder
from immuneML.encodings.abundance_encoding.KmerAbundanceEncoder import KmerAbundanceEncoder
from immuneML.environment.SequenceType import SequenceType
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.data_reports.DataReport import DataReport
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder
from immuneML.util.SignificantFeaturesHelper import SignificantFeaturesHelper



class SignificantFeatures(DataReport):
    """
    Plots a boxplot of the number of significant features (label-associated k-mers or sequences) per Repertoire according to Fisher's exact test,
    across different classes for the given label.

    Internally uses the :py:obj:`~immuneML.encodings.abundance_encoding.KmerAbundanceEncoder.KmerAbundanceEncoder` for calculating
    significant k-mers, and
    :py:obj:`~immuneML.encodings.abundance_encoding.SequenceAbundanceEncoder.SequenceAbundanceEncoder` or
    :py:obj:`~immuneML.encodings.abundance_encoding.CompAIRRSequenceAbundanceEncoder.CompAIRRSequenceAbundanceEncoder`
    to calculate significant full sequences (depending on whether the argument compairr_path was set).

    The report writes a CSV table (significant_features_report.csv) with one row per Repertoire, encoding and p-value,
    and an HTML figure (significant_features_figure.html) to the result path.

    **Specification arguments:**

    - p_values (list): The p-value thresholds to be used by Fisher's exact test. Each p-value specified here will become
      one panel in the output figure.

    - k_values (list): Length of the k-mers (number of amino acids) created by the
      :py:obj:`~immuneML.encodings.abundance_encoding.KmerAbundanceEncoder.KmerAbundanceEncoder`.
      When using a full sequence encoding (:py:obj:`~immuneML.encodings.abundance_encoding.SequenceAbundanceEncoder.SequenceAbundanceEncoder` or
      :py:obj:`~immuneML.encodings.abundance_encoding.CompAIRRSequenceAbundanceEncoder.CompAIRRSequenceAbundanceEncoder`), specify 'full_sequence' here.
      Each value specified under k_values will represent one boxplot in the output figure.

    - label (dict): A label configuration. One label should be specified, and the positive_class for this label should be defined. See the YAML specification below for an example.

    - compairr_path (str): If 'full_sequence' is listed under k_values, the path to the CompAIRR executable may be provided.
      If the compairr_path is specified, the :py:obj:`~immuneML.encodings.abundance_encoding.CompAIRRSequenceAbundanceEncoder.CompAIRRSequenceAbundanceEncoder`
      will be used to compute the significant sequences. If the path is not specified and 'full_sequence' is listed under
      k_values, :py:obj:`~immuneML.encodings.abundance_encoding.SequenceAbundanceEncoder.SequenceAbundanceEncoder` will be used.

    - log_scale (bool): Whether to plot the y axis in log10 scale (log_scale = True) or linear scale (log_scale = False). By default, log_scale is False.


    **YAML specification:**

    .. indent with spaces
    .. code-block:: yaml

        definitions:
            reports:
                my_significant_features_report:
                    SignificantFeatures:
                        p_values:
                            - 0.1
                            - 0.01
                            - 0.001
                            - 0.0001
                        k_values:
                            - 3
                            - 4
                            - 5
                            - full_sequence
                        compairr_path: path/to/compairr # can be specified if 'full_sequence' is listed under k_values
                        label: # Define a label, and the positive class for that given label
                            CMV:
                                positive_class: +
                        log_scale: False

    """


    @classmethod
    def build_object(cls, **kwargs):
        """
        Builds a SignificantFeatures report from its specification arguments.

        The arguments are first passed through SignificantFeaturesHelper.parse_parameters; the parsed result must
        contain the key 'log_scale', whose value is validated against bool by the ParameterValidator.

        Returns:
            a new SignificantFeatures instance constructed from the parsed arguments
        """
        report_name = SignificantFeatures.__name__
        parsed_kwargs = SignificantFeaturesHelper.parse_parameters(kwargs, report_name)

        # log_scale is the only argument validated here rather than by the shared helper
        ParameterValidator.assert_keys_present(parsed_kwargs.keys(), ["log_scale"], report_name, report_name)
        ParameterValidator.assert_type_and_value(parsed_kwargs["log_scale"], bool, "SignificantFeatures", "log_scale")

        return SignificantFeatures(**parsed_kwargs)


    def __init__(self, dataset: RepertoireDataset = None, p_values: List[float] = None, k_values: List[int] = None,
                 label: dict = None, compairr_path: Path = None, log_scale: bool = False, result_path: Path = None,
                 name: str = None, number_of_processes: int = 1, region_type: RegionType = None,
                 sequence_type: SequenceType = None):
        """
        Stores the report configuration; no computation is done here.

        dataset, result_path, number_of_processes and name are forwarded to the parent constructor. label_config
        starts out as None and is filled in by _generate().
        """
        super().__init__(name=name, dataset=dataset, result_path=result_path,
                         number_of_processes=number_of_processes)

        # label handling: label_config is derived from `label` when the report is generated
        self.label = label
        self.label_config = None

        # which features are tested and at which thresholds
        self.k_values = k_values
        self.p_values = p_values
        self.compairr_path = compairr_path
        self.region_type = region_type
        self.sequence_type = sequence_type

        # plotting option
        self.log_scale = log_scale


    def check_prerequisites(self):
        """
        Checks whether the report can run on the current dataset.

        Returns:
            True if self.dataset is a RepertoireDataset; otherwise a warning is logged and False is returned.
        """
        if not isinstance(self.dataset, RepertoireDataset):
            logging.warning(f"{SignificantFeatures.__name__}: report can be generated only from RepertoireDataset. "
                            f"Skipping this report...")
            return False

        return True


    def _generate(self) -> ReportResult:
        """
        Runs the report: builds the label configuration, computes the per-repertoire counts, writes the table and
        creates the figure.

        Returns:
            a ReportResult holding the results table and, if _safe_plot returned a figure, that figure
            (output_figures is None otherwise).
        """
        report_name = SignificantFeatures.__name__
        self.label_config = LabelHelper.create_label_config([self.label], self.dataset, report_name,
                                                            f"{report_name}/label")

        data = self._compute_plotting_data()

        # the table is written before plotting, so it exists even when no figure is produced
        table = self._write_results_table(data)
        figure = self._safe_plot(plotting_data=data)

        if figure is None:
            figures = None
        else:
            figures = [figure]

        description = ("The number of significant features (label-associated k-mers or sequences) per "
                       "Repertoire according to Fisher's exact test, across different classes for the "
                       "given label.")

        return ReportResult(name=self.name, info=description, output_figures=figures, output_tables=[table])

    def _compute_plotting_data(self):
        """
        Collects the significant feature counts for every combination of k value and p-value.

        Returns:
            a pandas DataFrame with the columns 'encoding', 'p-value', 'class' and 'significant_features'. For each
            (k, p-value) combination the positive-class counts come first, followed by the negative-class counts.
        """
        encodings, thresholds, classes, counts = [], [], [], []

        positive_class, negative_class = self._get_positive_negative_classes()

        for k in self.k_values:
            encoder_name = SignificantFeaturesHelper._get_encoder_name(k)

            for p_value in self.p_values:
                pos_counts, neg_counts = self._compute_significant_feature_counts(k, p_value)
                n_pos, n_neg = len(pos_counts), len(neg_counts)

                # encoding and p-value are constant for all examples of this combination
                encodings += [encoder_name] * (n_pos + n_neg)
                thresholds += [p_value] * (n_pos + n_neg)
                classes += [positive_class] * n_pos + [negative_class] * n_neg
                counts += list(pos_counts) + list(neg_counts)

        return pd.DataFrame({"encoding": encodings,
                             "p-value": thresholds,
                             "class": classes,
                             "significant_features": counts})

    def _get_positive_negative_classes(self):
        label = self.label_config.get_label_objects()[0]
        positive_class = label.positive_class
        negative_class = label.get_binary_negative_class()

        return positive_class, negative_class

    def _get_encoder_result_path(self, k, p_value):
        """
        Builds (via PathBuilder.build) and returns the sub-folder of self.result_path used by the encoder for the
        given k and p-value. The folder is named '<encoder name>_<p_value>'.
        """
        folder_name = f"{SignificantFeaturesHelper._get_encoder_name(k)}_{p_value}"
        encoder_path = self.result_path / folder_name
        PathBuilder.build(encoder_path)
        return encoder_path

    def _write_results_table(self, data) -> ReportOutput:
        """
        Writes the plotting data to 'significant_features_report.csv' in self.result_path (without the index column).

        Returns:
            a ReportOutput pointing to the written CSV file
        """
        csv_path = self.result_path / "significant_features_report.csv"
        data.to_csv(csv_path, index=False)

        return ReportOutput(csv_path, "Significant features across different Repertoire classes")

    def _plot(self, plotting_data):
        """
        Draws the boxplot figure and stores it as 'significant_features_figure.html' in self.result_path.

        One panel (facet column) is made per p-value, the encodings are on the x axis and the boxes are coloured by
        class. The y axis is logarithmic when self.log_scale is True.

        Returns:
            a ReportOutput pointing to the written HTML file
        """
        axis_labels = {
            "significant_features": "Number of significant features per AIRR according to Fisher's exact test",
            "encoding": "Encoding",
            "class": "Repertoire class"
        }

        figure = px.box(plotting_data,
                        x="encoding",
                        y="significant_features",
                        color="class",
                        facet_row=None,
                        facet_col="p-value",
                        log_y=self.log_scale,
                        labels=axis_labels,
                        template='plotly_white',
                        color_discrete_sequence=px.colors.diverging.Tealrose)

        html_path = self.result_path / "significant_features_figure.html"
        figure.write_html(str(html_path))

        return ReportOutput(html_path, name="Significant features across different Repertoire classes")

    def _compute_significant_feature_counts(self, k, p_value):
        """
        Dispatches to the encoder-specific computation for the given k and p-value.

        An int k selects the k-mer computation. Any other k selects a full-sequence computation: the CompAIRR
        variant when self.compairr_path is set, the plain sequence variant otherwise.

        Returns:
            a tuple (pos_class_feature_counts, neg_class_feature_counts)
        """
        params = SignificantFeaturesHelper._build_encoder_params(self.label_config,
                                                                 self._get_encoder_result_path(k, p_value),
                                                                 self.region_type, self.sequence_type)

        # exact type check: only plain ints are treated as k-mer lengths
        if type(k) is int:
            return self._compute_significant_kmer_counts(k, p_value, params)

        if self.compairr_path is not None:
            return self._compute_significant_compairr_sequence_counts(p_value, params)

        return self._compute_significant_sequence_counts(p_value, params)

    def _compute_significant_kmer_counts(self, k, p_value, encoder_params):
        """
        Counts the significant k-mers per repertoire using a k-mer encoder built by SignificantFeaturesHelper.

        The relevant indices are unpickled from the encoder's relevant_indices_path, used to select entries of the
        encoder's kmer_presence_matrix, and the selection is summed along axis 0.

        Returns:
            a tuple (pos_class_feature_counts, neg_class_feature_counts)
        """
        kmer_encoder = SignificantFeaturesHelper._build_kmer_encoder(self.dataset, k, p_value, encoder_params)

        # NOTE(review): the pickle file is presumably written by the encoder itself; never point this at untrusted files
        with kmer_encoder.relevant_indices_path.open("rb") as indices_file:
            is_relevant = pickle.load(indices_file)

        relevant_rows = kmer_encoder.kmer_presence_matrix[is_relevant]
        presence_per_repertoire = np.sum(relevant_rows, axis=0)

        return self._get_positive_negative_class(presence_per_repertoire, kmer_encoder.matrix_repertoire_ids)

    def _compute_significant_sequence_counts(self, p_value, encoder_params):
        """
        Counts the significant full sequences per repertoire using a sequence encoder built by
        SignificantFeaturesHelper.

        The relevant indices are unpickled from the encoder's relevant_indices_path and the item vectors of the
        encoder's comparison data are summed for the relevant sequences. The result is split by class using the
        dataset's example ids.

        Returns:
            a tuple (pos_class_feature_counts, neg_class_feature_counts)
        """
        sequence_encoder = SignificantFeaturesHelper._build_sequence_encoder(self.dataset, p_value, encoder_params)

        # NOTE(review): the pickle file is presumably written by the encoder itself; never point this at untrusted files
        with sequence_encoder.relevant_indices_path.open("rb") as indices_file:
            is_relevant = pickle.load(indices_file)

        item_vectors = sequence_encoder.comparison_data.get_item_vectors()
        presence_per_repertoire = self._sum_sequence_vectors_iterable(is_relevant, item_vectors)

        return self._get_positive_negative_class(presence_per_repertoire, self.dataset.get_example_ids())

    def _compute_significant_compairr_sequence_counts(self, p_value, encoder_params):
        """
        Counts the significant full sequences per repertoire using a CompAIRR sequence encoder built by
        SignificantFeaturesHelper with self.compairr_path.

        The relevant indices are unpickled from the encoder's relevant_indices_path and the encoder's
        compairr_sequence_presence vectors are summed for the relevant sequences. The result is split by class using
        the dataset's example ids.

        Returns:
            a tuple (pos_class_feature_counts, neg_class_feature_counts)
        """
        compairr_encoder = SignificantFeaturesHelper._build_compairr_sequence_encoder(self.dataset, p_value,
                                                                                      encoder_params,
                                                                                      self.compairr_path)

        # NOTE(review): the pickle file is presumably written by the encoder itself; never point this at untrusted files
        with compairr_encoder.relevant_indices_path.open("rb") as indices_file:
            is_relevant = pickle.load(indices_file)

        presence_vectors = compairr_encoder.compairr_sequence_presence
        presence_per_repertoire = self._sum_sequence_vectors_iterable(is_relevant, presence_vectors)

        return self._get_positive_negative_class(presence_per_repertoire, self.dataset.get_example_ids())

    def _sum_sequence_vectors_iterable(self, relevant_indices, sequence_vector_iterable):
        relevant_feature_presence = np.zeros(shape=(6,))

        for i, sequence_vector in enumerate(sequence_vector_iterable):
            if relevant_indices[i]:
                relevant_feature_presence += sequence_vector

        return relevant_feature_presence

    def _get_relevant_feature_presence(self, encoder, relevant_indices):
        """
        Sums the presence of the relevant features for the given encoder.

        For a KmerAbundanceEncoder or a CompAIRRSequenceAbundanceEncoder the relevant entries of the encoder's
        presence matrix are summed along axis 0. For any other encoder the item vectors of encoder.comparison_data
        are summed for the relevant sequences; the accumulator for that sum is sized from the first vector instead
        of being hard-coded to six entries, which failed for any other vector length.

        Args:
            encoder: the fitted abundance encoder to read the feature presence from
            relevant_indices: flags/indices of the relevant features

        Returns:
            a numpy array with the summed presence of the relevant features
        """
        if isinstance(encoder, KmerAbundanceEncoder):
            return np.sum(encoder.kmer_presence_matrix[relevant_indices], axis=0)

        if isinstance(encoder, CompAIRRSequenceAbundanceEncoder):
            # NOTE(review): _compute_significant_compairr_sequence_counts reads `compairr_sequence_presence` from
            # this encoder type instead -- confirm that `sequence_presence_matrix` exists on it
            return np.sum(encoder.sequence_presence_matrix[relevant_indices], axis=0)

        relevant_feature_presence = None

        for i, sequence_vector in enumerate(encoder.comparison_data.get_item_vectors()):
            if relevant_feature_presence is None:
                # size the accumulator from the data, also when this first vector is not relevant itself
                relevant_feature_presence = np.zeros(shape=np.shape(sequence_vector))

            if relevant_indices[i]:
                relevant_feature_presence += sequence_vector

        if relevant_feature_presence is None:
            # no item vectors at all: fall back to one entry per example
            # (assumes vectors hold one entry per repertoire -- TODO confirm)
            relevant_feature_presence = np.zeros(shape=(len(self.dataset.get_example_ids()),))

        return relevant_feature_presence

    def _get_positive_negative_class(self, relevant_feature_presence, repertoire_ids):
        """
        Splits the per-repertoire feature counts by class.

        The mask returned by AbundanceEncoderHelper.check_is_positive_class selects the positive-class entries of
        relevant_feature_presence; its logical negation selects the negative-class entries.

        Returns:
            a tuple (pos_class_feature_counts, neg_class_feature_counts)
        """
        positive_mask = AbundanceEncoderHelper.check_is_positive_class(self.dataset, repertoire_ids, self.label_config)
        negative_mask = np.logical_not(positive_mask)

        return relevant_feature_presence[positive_mask], relevant_feature_presence[negative_mask]