Source code for immuneML.reports.clustering_reports.ExternalLabelClusterSummary

from pathlib import Path
from typing import List

import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

from immuneML.reports.PlotlyUtil import PlotlyUtil
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.clustering_reports.ClusteringReport import ClusteringReport
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder
from immuneML.workflows.instructions.clustering.ClusteringState import ClusteringState


class ExternalLabelClusterSummary(ClusteringReport):
    """
    This report summarizes the number of examples in a cluster with different values of external labels.
    For each external label, it creates:

    1. A contingency table showing the count of examples for each combination of cluster and label value

    2. A heatmap visualization of these counts

    It can be used in combination with Clustering instruction.

    **Specification arguments:**

    - external_labels (list): the list of metadata columns in the dataset that should be compared against cluster
      assignment

    **YAML specification:**

    .. indent with spaces
    .. code-block:: yaml

        reports:
            my_external_label_cluster_summary:
                ExternalLabelClusterSummary:
                    external_labels: [disease, HLA]

    """

    # Fixed prefixes of the ReportOutput names produced by this report. They are used both to build the names and
    # to sort outputs into tables and figures, so the classification cannot be confused by the user-supplied
    # label or setting key that follows the prefix.
    _TABLE_NAME_PREFIX = "Contingency table for "
    _FIGURE_NAME_PREFIX = "Distribution heatmap for "

    @classmethod
    def build_object(cls, **kwargs):
        """
        Validate the specification arguments and build the report.

        Args:
            **kwargs: specification arguments; checked against the keys ``external_labels`` and ``name``, and
                every entry of ``external_labels`` is checked to be a string.

        Returns:
            a new ExternalLabelClusterSummary built from ``kwargs``
        """
        location = ExternalLabelClusterSummary.__name__
        ParameterValidator.assert_keys(list(kwargs.keys()), ['external_labels', 'name'], location, location)
        ParameterValidator.assert_all_type_and_value(kwargs['external_labels'], str, location, 'external_labels')
        return ExternalLabelClusterSummary(**kwargs)

    def __init__(self, external_labels: List[str], name: str = None, state: ClusteringState = None,
                 result_path: Path = None, number_of_processes: int = 1):
        """
        Args:
            external_labels: metadata columns to compare against the cluster assignment
            name: report name, forwarded to the parent class
            state: clustering state holding the results to summarize, forwarded to the parent class
            result_path: directory under which the report writes its files, forwarded to the parent class
            number_of_processes: forwarded to the parent class
        """
        super().__init__(name, result_path, number_of_processes, state)
        self.external_labels = external_labels
        self.desc = "External Label Cluster Summary"

    def _generate(self) -> ReportResult:
        """
        Produce contingency tables and heatmaps for every split and every available analysis type.

        Returns:
            a ReportResult listing the written tables and figures, or a ReportResult with only an explanatory
            message if nothing was produced
        """
        self.result_path = PathBuilder.build(self.result_path / self.name)
        report_outputs = []

        for split_idx, clustering_results in enumerate(self.state.clustering_items):
            split_no = split_idx + 1
            # Discovery and both validation types are handled identically; each is skipped when not available.
            analyses = (
                (clustering_results.discovery, f"discovery_split_{split_no}"),
                (clustering_results.method_based_validation, f"method_based_validation_split_{split_no}"),
                (clustering_results.result_based_validation, f"result_based_validation_split_{split_no}"),
            )
            for analysis_results, analysis_name in analyses:
                if analysis_results:
                    report_outputs.extend(self._process_analysis_results(analysis_results, analysis_name))

        if not report_outputs:
            return ReportResult(
                name=f"{self.desc} ({self.name})",
                info="No results were generated. This could be because no external labels were found in the dataset "
                     "metadata."
            )

        # Classify by the fixed name prefix rather than by substring: the rest of the name contains the label
        # and the setting key, which may themselves contain words such as 'table' or 'heatmap'.
        output_tables = [output for output in report_outputs
                         if output.name.startswith(self._TABLE_NAME_PREFIX)]
        output_figures = [output for output in report_outputs
                          if output.name.startswith(self._FIGURE_NAME_PREFIX)]

        return ReportResult(
            name=f"{self.desc} ({self.name})",
            info="Summary of cluster assignments versus external labels",
            output_tables=output_tables,
            output_figures=output_figures
        )

    def _process_analysis_results(self, analysis_results, analysis_name: str) -> List[ReportOutput]:
        """
        Write one contingency table and one heatmap per (clustering setting, external label) pair.

        Args:
            analysis_results: results of one analysis type for one split; its ``items`` mapping is iterated
            analysis_name: identifier of the analysis and split, used in file names and output names

        Returns:
            list of ReportOutput objects, a table followed by its heatmap for each setting and label
        """
        outputs = []

        for setting_key, item_result in analysis_results.items.items():
            predictions = item_result.item.predictions
            dataset = item_result.item.dataset

            labels = dataset.get_metadata(self.external_labels, return_df=True)
            readable_analysis_name = analysis_name.replace('_', ' ')

            for label in self.external_labels:
                label_values = labels[label]

                # NOTE(review): pd.crosstab aligns Series inputs on their index; this presumably relies on the
                # metadata frame having a default positional index matching the predictions - confirm.
                contingency_df = pd.crosstab(
                    pd.Series(predictions, name='cluster'),
                    pd.Series(label_values, name=label)
                )

                table_path = self.result_path / f"{analysis_name}_{setting_key}_{label}_contingency.csv"
                contingency_df.to_csv(table_path)
                outputs.append(ReportOutput(
                    path=table_path,
                    name=f"{self._TABLE_NAME_PREFIX}{label} ({readable_analysis_name}, {setting_key})"
                ))

                fig = self._build_heatmap(contingency_df, label)
                heatmap_path = self.result_path / f"{analysis_name}_{setting_key}_{label}_heatmap.html"
                # The number of clusters (rows) is passed to the writer, presumably to size the figure.
                plot_path = PlotlyUtil.write_image_to_file(fig, heatmap_path, contingency_df.shape[0])
                outputs.append(ReportOutput(
                    path=plot_path,
                    name=f"{self._FIGURE_NAME_PREFIX}{label} ({readable_analysis_name}, {setting_key})"
                ))

        return outputs

    @staticmethod
    def _build_heatmap(contingency_df: pd.DataFrame, label: str) -> go.Figure:
        """Build the annotated count heatmap (clusters on the y axis, label values on the x axis)."""
        fig = go.Figure(data=go.Heatmap(
            z=contingency_df.values,
            x=contingency_df.columns,
            y=contingency_df.index,
            colorscale='Tealrose',
            text=contingency_df.values,
            texttemplate='%{text}',
            hovertemplate='count: %{z}<br>cluster: %{y}<br>label: %{x}<extra></extra>',
            textfont={"size": 15},
            hoverongaps=False,
            colorbar=dict(
                title="Count"
            )
        ))
        fig.update_layout(
            xaxis_title=label,
            yaxis_title='cluster',
            legend_title_text='Count'
        )
        fig.update_layout(coloraxis_colorbar=dict(orientation="h"))
        # Treat cluster ids and label values as categories so numeric values are not placed on a continuous axis.
        fig.update_xaxes(type='category')
        fig.update_yaxes(type='category')
        return fig

    def check_prerequisites(self):
        """Return True only if a state with clustering items and at least one external label are available."""
        if not self.state:
            return False
        if not self.state.clustering_items:
            return False
        if not self.external_labels:
            return False
        return True