# Source code for immuneML.reports.data_reports.LabelOverlap

import logging
from pathlib import Path

import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

from immuneML.data_model.datasets.Dataset import Dataset
from immuneML.reports.PlotlyUtil import PlotlyUtil
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.data_reports.DataReport import DataReport
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder



class LabelOverlap(DataReport):
    """
    This report creates a heatmap where the columns are the values of one label and rows are the values of another label,
    and the cells contain the number of samples that have both label values. It works for any dataset type.

    The counts are also exported as a CSV table (label_overlap.csv) next to the heatmap (label_overlap.html).

    **Specification arguments:**

    - column_label (str): Name of the label to be used as columns in the heatmap.

    - row_label (str): Name of the label to be used as rows in the heatmap.

    **YAML specification:**

    .. code-block:: yaml

        my_data_report:
            LabelOverlap:
                column_label: epitope
                row_label: batch

    """

    def __init__(self, dataset: Dataset = None, result_path: Path = None, name: str = None,
                 number_of_processes: int = 1,
                 column_label: str = None, row_label: str = None):
        """
        Store the two label names; all remaining arguments are forwarded unchanged to DataReport.

        Args:
            dataset: dataset whose labels are cross-tabulated.
            result_path: directory where the CSV table and the heatmap are written.
            name: name of the report, used in the returned ReportResult.
            number_of_processes: forwarded to DataReport; not used by this class directly.
            column_label: label whose values become the heatmap columns.
            row_label: label whose values become the heatmap rows.
        """
        super().__init__(dataset, result_path, name, number_of_processes)
        self.column_label = column_label
        self.row_label = row_label

    @classmethod
    def build_object(cls, **kwargs):
        """
        Validate the specification arguments and create the report.

        Both `column_label` and `row_label` must be present in kwargs and are checked with
        ParameterValidator.assert_type_and_value against `str`. `name` is optional and defaults to "LabelOverlap".

        Returns:
            a new instance of `cls` (so subclasses get an object of their own type).
        """
        location = "LabelOverlap"
        label_keys = ["column_label", "row_label"]

        ParameterValidator.assert_keys_present(list(kwargs.keys()), label_keys, location, 'LabelOverlap')
        for key in label_keys:
            ParameterValidator.assert_type_and_value(kwargs[key], str, location, key)

        return cls(column_label=kwargs["column_label"], row_label=kwargs["row_label"],
                   name=kwargs.get("name", "LabelOverlap"))

    def check_prerequisites(self) -> bool:
        """
        Check that both configured labels exist in the dataset.

        Returns:
            True if both labels are among `self.dataset.get_label_names()`; otherwise a warning is logged and
            False is returned.
        """
        # query the dataset once and reuse the result for both membership checks
        available_labels = self.dataset.get_label_names()

        if self.column_label not in available_labels or self.row_label not in available_labels:
            logging.warning(f"One or both of the specified labels ({[self.column_label, self.row_label]}) do not exist in the dataset.")
            return False
        return True

    def _generate(self) -> ReportResult:
        """
        Cross-tabulate the two labels, export the counts as CSV and plot them as a heatmap.

        Returns:
            ReportResult with one figure (the heatmap) and one table (the counts).
        """
        # Get metadata for both labels
        metadata = self.dataset.get_metadata([self.column_label, self.row_label])

        # Create a cross-tabulation of the two labels. The axis names are passed explicitly so that the table is
        # labelled with the label names even when the metadata values are plain arrays without a name
        # (pandas would otherwise fall back to 'row_0' / 'col_0').
        overlap_matrix = pd.crosstab(metadata[self.row_label], metadata[self.column_label],
                                     rownames=[self.row_label], colnames=[self.column_label])

        # Save as CSV
        PathBuilder.build(self.result_path)
        csv_path = self.result_path / 'label_overlap.csv'
        overlap_matrix.to_csv(csv_path)

        # Save plot; the path handed back by PlotlyUtil is the one reported in the result
        fig = self._build_heatmap(overlap_matrix)
        plot_path = self.result_path / 'label_overlap.html'
        plot_path = PlotlyUtil.write_image_to_file(fig, plot_path)

        return ReportResult(
            name=self.name,
            info=f"Shows overlap between {self.row_label} and {self.column_label} labels.",
            output_figures=[ReportOutput(plot_path, "Label overlap heatmap")],
            output_tables=[ReportOutput(csv_path, "Label overlap counts")]
        )

    def _build_heatmap(self, overlap_matrix: pd.DataFrame) -> go.Figure:
        """Build the annotated plotly heatmap for the given count table (rows: row_label, columns: column_label)."""
        fig = go.Figure(data=go.Heatmap(
            z=overlap_matrix.values,
            x=overlap_matrix.columns,
            y=overlap_matrix.index,
            colorscale='Blues',
            # the counts are also printed inside the cells
            text=overlap_matrix.values,
            texttemplate="%{text}",
            hovertemplate=f"{self.row_label}: " + "%{y}<br>" + f"{self.column_label}: "
                          + "%{x}<br>Count: %{z}<extra></extra>",
            textfont={"size": 14},
            showscale=False,  # Hide the color scale legend
            hoverongaps=False,
        ))

        # Update layout for better readability; the figure grows by 40px per row, but is never shorter than 600px
        fig.update_layout(
            title=f"Label Overlap: {self.row_label} vs {self.column_label}",
            xaxis_title=self.column_label,
            yaxis_title=self.row_label,
            template="plotly_white",
            font=dict(size=12),
            height=max(600, 40 * len(overlap_matrix.index) + 200),
        )
        return fig