# Source code for immuneML.reports.clustering_reports.ExternalLabelClusterSummary

import logging
from pathlib import Path
from typing import List

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from immuneML.reports.PlotlyUtil import PlotlyUtil
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.clustering_reports.ClusteringReport import ClusteringReport
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder
from immuneML.workflows.instructions.clustering.ClusteringState import ClusteringState



# [docs]
class ExternalLabelClusterSummary(ClusteringReport):
    """
    This report summarizes the number of examples in a cluster with different values of external labels.
    For each external label, it creates:
    1. A contingency table showing the count of examples for each combination of cluster and label value
    2. A heatmap visualization of these counts

    It can be used in combination with the Clustering instruction.

    **Specification arguments:**

    - external_labels (list): the list of metadata columns in the dataset that should be compared against cluster
      assignment

    **YAML specification:**

    .. indent with spaces
    .. code-block:: yaml

        reports:
            my_external_label_cluster_summary:
                ExternalLabelClusterSummary:
                    external_labels: [disease, HLA]

    """

    # Prefixes of the ReportOutput names. They are shared by the code that creates the outputs and the code
    # that sorts them into tables and figures, so the two cannot drift apart.
    _TABLE_NAME_PREFIX = "Contingency table"
    _FIGURE_NAME_PREFIX = "Distribution heatmap"

    @classmethod
    def build_object(cls, **kwargs):
        """
        Validate the specification arguments and build the report object.

        The keys of ``kwargs`` are checked against ``external_labels`` and ``name``, and ``external_labels`` is
        checked to contain only strings, both via ParameterValidator.

        Returns:
            a new instance of this report class created from ``kwargs``
        """
        ParameterValidator.assert_keys(list(kwargs.keys()), ['external_labels', 'name'],
                                       cls.__name__, cls.__name__)
        ParameterValidator.assert_all_type_and_value(kwargs['external_labels'], str,
                                                     cls.__name__, 'external_labels')
        return cls(**kwargs)

    def __init__(self, external_labels: List[str], name: str = None, state: ClusteringState = None,
                 result_path: Path = None, number_of_processes: int = 1):
        """
        Args:
            external_labels: names of the metadata columns to compare against the cluster assignments
            name: name of the report; also used as the name of the output subdirectory
            state: clustering state whose ``clustering_items`` are summarized
            result_path: directory under which the report output directory is created
            number_of_processes: forwarded to the parent class
        """
        super().__init__(name, result_path, number_of_processes, state)
        self.external_labels = external_labels
        self.desc = "External Label Cluster Summary"

    def _generate(self) -> ReportResult:
        """
        Create contingency tables and heatmaps for every split and analysis type available in the state.

        For each entry of ``self.state.clustering_items`` the discovery, method-based validation and
        result-based validation results are processed when they are set.

        Returns:
            a ReportResult listing the tables and figures, or a ReportResult carrying only an explanatory
            message when nothing was produced
        """
        self.result_path = PathBuilder.build(self.result_path / self.name)
        report_outputs = []

        # Splits are numbered from 1 in file names and output names.
        for split_number, clustering_results in enumerate(self.state.clustering_items, start=1):
            analyses = (
                ("discovery", clustering_results.discovery),
                ("method_based_validation", clustering_results.method_based_validation),
                ("result_based_validation", clustering_results.result_based_validation),
            )
            for analysis_type, analysis_results in analyses:
                if analysis_results:
                    report_outputs.extend(self._process_analysis_results(
                        analysis_results,
                        f"{analysis_type}_split_{split_number}"
                    ))

        if not report_outputs:
            return ReportResult(
                name=f"{self.desc} ({self.name})",
                info="No results were generated. This could be because no external labels were found in the dataset "
                     "metadata."
            )

        # Classify by name prefix, not by substring: the output names embed user-provided label and setting
        # names, which may themselves contain words such as "table" or "heatmap".
        return ReportResult(
            name=f"{self.desc} ({self.name})",
            info="Summary of cluster assignments versus external labels",
            output_tables=[output for output in report_outputs
                           if output.name.startswith(self._TABLE_NAME_PREFIX)],
            output_figures=[output for output in report_outputs
                            if output.name.startswith(self._FIGURE_NAME_PREFIX)]
        )

    def _process_analysis_results(self, analysis_results, analysis_name: str) -> List[ReportOutput]:
        """
        Write one contingency table (CSV) and one heatmap per clustering setting and external label.

        Args:
            analysis_results: object whose ``items`` attribute maps a setting key to a result exposing
              ``item.predictions`` and ``item.dataset``
            analysis_name: identifier of the analysis and split, used in file names and output names

        Returns:
            the list of created ReportOutput objects; for each label the table comes first, then the heatmap
        """
        outputs = []
        readable_analysis_name = analysis_name.replace('_', ' ')

        for setting_key, item_result in analysis_results.items.items():
            predictions = item_result.item.predictions
            dataset = item_result.item.dataset
            labels = dataset.get_metadata(self.external_labels, return_df=True)

            for label in self.external_labels:
                # A label missing from the metadata is skipped, so the remaining labels are still reported.
                if label not in labels:
                    logging.warning(f"{type(self).__name__}: external label '{label}' was not found in the "
                                    f"dataset metadata for {analysis_name} ({setting_key}); skipping it.")
                    continue

                contingency_df = self._make_contingency_table(predictions, labels[label], label)
                file_stem = f"{analysis_name}_{setting_key}_{label}"
                output_context = f"{label} ({readable_analysis_name}, {setting_key})"

                table_path = self.result_path / f"{file_stem}_contingency.csv"
                contingency_df.to_csv(table_path)
                outputs.append(ReportOutput(
                    path=table_path,
                    name=f"{self._TABLE_NAME_PREFIX} for {output_context}"
                ))

                fig = self._make_heatmap(contingency_df, label)
                heatmap_path = self.result_path / f"{file_stem}_heatmap.html"
                # The number of clusters (table rows) is passed as the third argument.
                # NOTE(review): its exact meaning is defined by PlotlyUtil.write_image_to_file — confirm.
                plot_path = PlotlyUtil.write_image_to_file(fig,
                                                           heatmap_path,
                                                           contingency_df.shape[0])
                outputs.append(ReportOutput(
                    path=plot_path,
                    name=f"{self._FIGURE_NAME_PREFIX} for {output_context}"
                ))

        return outputs

    @staticmethod
    def _make_contingency_table(predictions, label_values, label: str) -> pd.DataFrame:
        """Cross-tabulate cluster assignments (rows) against the values of one external label (columns)."""
        # pd.crosstab only keeps the index values shared by the Series it receives. Both inputs are given a
        # fresh positional index so that the i-th prediction is always paired with the i-th label value,
        # whatever index the metadata had.
        clusters = pd.Series(predictions, name='cluster').reset_index(drop=True)
        values = pd.Series(label_values, name=label).reset_index(drop=True)
        return pd.crosstab(clusters, values)

    @staticmethod
    def _make_heatmap(contingency_df: pd.DataFrame, label: str) -> go.Figure:
        """Build a heatmap of the contingency table with the counts written in the cells."""
        fig = go.Figure(data=go.Heatmap(
            z=contingency_df.values,
            x=contingency_df.columns,
            y=contingency_df.index,
            colorscale='Tealrose',
            text=contingency_df.values,
            texttemplate='%{text}',
            hovertemplate='count: %{z}<br>cluster: %{y}<br>label: %{x}<extra></extra>',
            textfont={"size": 15},
            hoverongaps=False,
            colorbar=dict(
                title="Count"
            )
        ))

        # NOTE(review): the trace above defines its own colorbar and is not attached to a shared coloraxis,
        # so coloraxis_colorbar (and the legend title) probably have no visible effect — confirm intent.
        fig.update_layout(
            xaxis_title=label,
            yaxis_title='cluster',
            legend_title_text='Count',
            coloraxis_colorbar=dict(orientation="h")
        )

        # Treat cluster ids and label values as categories even when they are numeric.
        fig.update_xaxes(type='category')
        fig.update_yaxes(type='category')

        return fig

    def check_prerequisites(self):
        """Return True only if the state, its clustering items and the external labels are all set and non-empty."""
        return bool(self.state and self.state.clustering_items and self.external_labels)