class ExternalLabelClusterSummary(ClusteringMethodReport):
    """
    This report summarizes the number of examples in a cluster with different values of external labels.
    For each external label, it creates:

    1. A contingency table showing the count of examples for each combination of cluster and label value

    2. A heatmap visualization of these counts

    It can be used in combination with Clustering instruction.

    **Specification arguments:**

    - external_labels (list): the list of metadata columns in the dataset that should be compared against cluster
      assignment

    **YAML specification:**

    .. indent with spaces
    .. code-block:: yaml

        reports:
            my_external_label_cluster_summary:
                ExternalLabelClusterSummary:
                    external_labels: [disease, HLA]

    """

    # Fixed prefixes of the output names. They are used both when the outputs are created and when
    # `_generate` sorts them into tables and figures, so the two places cannot drift apart.
    _TABLE_NAME_PREFIX = "Contingency table"
    _FIGURE_NAME_PREFIX = "Distribution heatmap"

    def __init__(self, external_labels: List[str], name: str = None, item: ClusteringItem = None,
                 result_path: Path = None):
        """
        Args:
            external_labels: metadata columns to compare against the cluster assignment
            name: report name, forwarded to the parent class
            item: clustering item, forwarded to the parent class as `clustering_item`
            result_path: base output path, forwarded to the parent class
        """
        super().__init__(name=name, result_path=result_path, clustering_item=item)
        self.external_labels = external_labels
        self.desc = "External Label Cluster Summary"

    def _generate(self) -> ReportResult:
        """
        Build all per-label outputs under `<result_path>/<name>` and wrap them in a ReportResult.

        Returns:
            a ReportResult holding the contingency tables and heatmaps, or one with only an info message
            when no output was produced.
        """
        self.result_path = PathBuilder.build(self.result_path / self.name)
        report_outputs = self._process_analysis_results()
        result_name = f"{self.desc} ({self.name})"

        if not report_outputs:
            return ReportResult(name=result_name,
                                info="No results were generated. This could be because no external labels were found in the dataset "
                                     "metadata.")

        # Match on the fixed name prefix rather than on a substring: output names embed the user-supplied
        # label, so a label such as "stable_disease" would otherwise make a heatmap look like a table.
        return ReportResult(
            name=result_name,
            info="Summary of cluster assignments versus external labels",
            output_tables=[output for output in report_outputs
                           if output.name.startswith(self._TABLE_NAME_PREFIX)],
            output_figures=[output for output in report_outputs
                            if output.name.startswith(self._FIGURE_NAME_PREFIX)]
        )

    def _process_analysis_results(self) -> List[ReportOutput]:
        """
        Create one contingency table and one heatmap per external label.

        Labels that are absent from the metadata returned by the dataset are skipped with a warning.

        Returns:
            the list of created outputs, table first and heatmap second for each processed label.
        """
        import warnings

        dataset = self.item.dataset

        # Drop any index so clusters and label values are paired by position; pandas would otherwise
        # align the two series on their indices and silently drop or mis-pair examples.
        clusters = pd.Series(self.item.predictions, name='cluster').reset_index(drop=True)
        labels = dataset.get_metadata(self.external_labels, return_df=True)

        outputs = []
        for label in self.external_labels:
            if label not in labels:
                warnings.warn(f"{type(self).__name__}: external label '{label}' was not found in the dataset "
                              f"metadata and will be skipped.")
                continue

            label_values = pd.Series(labels[label], name=label).reset_index(drop=True)
            contingency_df = pd.crosstab(clusters, label_values)

            outputs.append(self._write_contingency_table(contingency_df, label))
            outputs.append(self._write_heatmap(contingency_df, label, dataset.get_example_count()))

        return outputs

    def _write_contingency_table(self, contingency_df, label: str) -> ReportOutput:
        """Store the cluster-by-label count table as `<label>_contingency.csv` and describe it as an output."""
        table_path = self.result_path / f"{label}_contingency.csv"
        contingency_df.to_csv(table_path)
        return ReportOutput(
            path=table_path,
            name=f"{self._TABLE_NAME_PREFIX} for {label} ({self.item.cl_setting.get_key()})"
        )

    def _write_heatmap(self, contingency_df, label: str, example_count) -> ReportOutput:
        """Plot the count table as a heatmap (clusters on y, label values on x) and describe it as an output."""
        fig = go.Figure(data=go.Heatmap(
            z=contingency_df.values,
            x=contingency_df.columns,
            y=contingency_df.index,
            colorscale='Viridis',
            text=contingency_df.values,
            texttemplate='%{text}',
            hovertemplate='count: %{z}<br>cluster: %{y}<br>' + label + ': %{x}<extra></extra>',
            hoverongaps=False
        ))
        fig.update_layout(xaxis_title=label, yaxis_title='cluster', template='plotly_white')

        # Force categorical axes so that numeric cluster ids / label values are shown as discrete ticks.
        fig.update_xaxes(type='category')
        fig.update_yaxes(type='category')

        heatmap_path = self.result_path / f"{label}_heatmap.html"
        # The third argument is the number of rows (clusters) in the table; how PlotlyUtil uses it is
        # not visible here - TODO confirm.
        plot_path = PlotlyUtil.write_image_to_file(fig, heatmap_path, contingency_df.shape[0])

        return ReportOutput(
            path=plot_path,
            name=f"{self._FIGURE_NAME_PREFIX} for {label} with example counts "
                 f"({example_count} total examples)"
        )