class ClusteringVisualization(ClusteringMethodReport):
    """
    A report that creates low-dimensional visualizations of clustering results using the specified dimensionality
    reduction method. For each dataset and clustering configuration, it creates a scatter plot where points are
    colored by their cluster assignments.

    Specification arguments:

    - dim_red_method (dict): specification of which dimensionality reduction to perform; valid options are presented
      under :ref:`**Dimensionality reduction methods**` and should be specified with the name of the method and its
      parameters, see the example below; if not specified, the report will use any dimensionality reduced data
      present in the dataset's encoded data; if the dataset does not contain dimensionality reduced data, and the
      encoded data has more than 2 dimensions, the report will be skipped.

    YAML specification:

    .. indent with spaces
    .. code-block:: yaml

        reports:
            my_report_with_pca:
                ClusteringVisualization:
                    dim_red_method:
                        PCA:
                            n_components: 2
            my_report_with_tsne:
                ClusteringVisualization:
                    dim_red_method:
                        TSNE:
                            n_components: 2
                            init: pca
            my_report_existing_dim_red:
                ClusteringVisualization:
                    dim_red_method: null

    """

    def __init__(self, dim_red_method: DimRedMethod = None, name: str = None, result_path: Path = None,
                 clustering_item: ClusteringItem = None):
        """
        Store the report configuration.

        Args:
            dim_red_method: dimensionality reduction method to fit on the dataset, or None to reuse whatever
                low-dimensional representation the encoded dataset already provides.
            name: name of the report; passed on to the parent class.
            result_path: directory under which the report output is written; passed on to the parent class.
            clustering_item: clustering result to visualize; passed on to the parent class.
        """
        super().__init__(name=name, result_path=result_path, clustering_item=clustering_item)
        self.dim_red_method = dim_red_method
        self.result_name = None
        self.desc = "Clustering Visualization"
        # Column names of the plotted coordinates; when no method is configured they are filled in by _make_plot.
        self._dimension_names = self.dim_red_method.get_dimension_names() if self.dim_red_method else None

    @classmethod
    def build_object(cls, **kwargs):
        """
        Create the report from parsed specification arguments.

        Recognized keys in kwargs are `name`, `result_path`, `clustering_item` and `dim_red_method`; all of them
        are optional. A non-empty `dim_red_method` is expected to be a dict whose first key is the method name; it
        is parsed with MLParser. If it is missing or empty, a warning is logged and the report is built without a
        dimensionality reduction method.
        """
        location = "ClusteringVisualization"
        dim_red_spec = kwargs.get("dim_red_method")

        if dim_red_spec:
            method_name = next(iter(dim_red_spec))
            dim_red_method = MLParser.parse_any_model("dim_red_method", dim_red_spec, method_name)[0]
        else:
            logging.warning(f"{location}: No dimensionality reduction method specified. "
                            "If the encoded dataset includes dimensionality reduction, it will be used.")
            dim_red_method = None

        return cls(dim_red_method=dim_red_method, name=kwargs.get("name"), result_path=kwargs.get("result_path"),
                   clustering_item=kwargs.get("clustering_item"))

    def _generate(self) -> ReportResult:
        """
        Create the output directory, write the plot and its data, and wrap the plot in a ReportResult.

        Raises:
            ValueError: propagated from _make_plot when no two-dimensional representation of the data is available.
        """
        PathBuilder.build(self.result_path)

        # Without a configured method, the class name of None ("nonetype") would leak into the directory name,
        # so a neutral label is used for that case; configured methods keep their lower-cased class name.
        if self.dim_red_method is not None:
            method_label = self.dim_red_method.__class__.__name__.lower()
        else:
            method_label = "encoded_data"
        self.result_name = f"clustering_{method_label}_plots"
        result_path = PathBuilder.build(self.result_path / self.result_name)

        plot_path = self._make_plot(result_path)
        report_output = ReportOutput(plot_path, f"Clustering visualization for {self.item.cl_setting.get_key()}")

        # _make_plot may have set self.dim_red_method to the clustering item's method, so the description is
        # assembled only after plotting.
        return ReportResult(f"{self.desc} ({self.name})",
                            info=f"Visualizations of clustering results using "
                                 f"{self.dim_red_method.__class__.__name__ if self.dim_red_method else 'encoded data directly'}.",
                            output_figures=[report_output])

    def _make_plot(self, result_path: Path) -> Path:
        """
        Build the scatter plot of the clustering result and store it, together with the plotted data, in result_path.

        The coordinates come from the configured dimensionality reduction method if there is one, and from the
        encoded dataset otherwise (see _embedding_from_encoded_data). The first two dimension names are used as the
        x and y axes; points are colored by cluster and show the example id on hover.

        Args:
            result_path: existing directory to write the CSV and HTML files to.

        Returns:
            the path returned by PlotlyUtil.write_image_to_file for the figure.

        Raises:
            ValueError: if there is no dimensionality reduction method and the encoded dataset offers no
                two-dimensional representation.
        """
        if self.dim_red_method is not None:
            transformed_data = self.dim_red_method.fit_transform(dataset=self.item.dataset)
        else:
            transformed_data = self._embedding_from_encoded_data()

        df = pd.DataFrame(transformed_data, columns=self._dimension_names)
        # Cluster labels are stored as strings so that plotly treats them as discrete categories.
        df['cluster'] = pd.Series(self.item.predictions).astype(str)
        df['id'] = self.item.dataset.get_example_ids()

        # Sort numerically (not lexicographically) so that the legend order is 0, 1, 2, ..., 10 rather than 0, 1, 10.
        unique_clusters = sorted(df['cluster'].astype(int).unique())
        color_palette = self.get_color_palette(len(unique_clusters))

        fig = px.scatter(df, x=self._dimension_names[0], y=self._dimension_names[1], color='cluster',
                         color_discrete_sequence=color_palette,
                         category_orders={'cluster': [str(c) for c in unique_clusters]},
                         hover_data=['id'])
        fig.update_layout(template="plotly_white")

        file_stem = f"clustering_visualization_{self.dim_red_method.name if self.dim_red_method else ''}"
        df.to_csv(result_path / f"{file_stem}.csv", index=False)
        plot_path = PlotlyUtil.write_image_to_file(fig, result_path / f"{file_stem}.html", df.shape[0])

        return plot_path

    def _embedding_from_encoded_data(self):
        """
        Return plot coordinates taken from the encoded dataset; used when no method is configured on the report.

        Dimensionality reduced data stored in the encoded data is preferred; in that case self.dim_red_method is
        set to the clustering item's method. Otherwise the encoded examples themselves are used, provided they
        have exactly two columns. In both cases self._dimension_names is updated to match the returned data.

        Raises:
            ValueError: if neither source provides two-dimensional data.
        """
        encoded_data = self.item.dataset.encoded_data

        if encoded_data.dimensionality_reduced_data is not None:
            self._dimension_names = encoded_data.dim_names if encoded_data.dim_names else ['dim1', 'dim2']
            self.dim_red_method = self.item.dim_red_method
            return encoded_data.dimensionality_reduced_data

        n_dimensions = encoded_data.examples.shape[1]

        if n_dimensions == 2:
            self._dimension_names = encoded_data.feature_names
            return encoded_data.get_examples_as_np_matrix()

        if n_dimensions < 2:
            # A scatter plot needs both an x and a y column; fail with a clear message instead of an IndexError
            # when the second dimension name is looked up.
            raise ValueError(f"ClusteringVisualization: the encoded data has {n_dimensions} dimension(s), but two "
                             f"are required for a scatter plot, and the dataset does not contain dimensionality "
                             f"reduced data.")

        raise ValueError("ClusteringVisualization: No dimensionality reduction method specified, and the dataset "
                         "does not contain dimensionality reduced data. Please specify a dimensionality reduction "
                         "method.")

    def get_color_palette(self, n_clusters: int) -> list:
        """
        Choose a list of colors for the given number of clusters.

        Up to 10 clusters use plotly's qualitative Vivid palette and up to 24 the Dark24 palette. For more
        clusters a warning is logged and n_clusters colors are sampled at evenly spaced positions from the
        continuous Plasma colorscale.

        Args:
            n_clusters: number of distinct clusters to be colored.

        Returns:
            a list of colors to pass to plotly as a discrete color sequence.
        """
        if n_clusters <= 10:
            return px.colors.qualitative.Vivid

        if n_clusters <= 24:
            return px.colors.qualitative.Dark24

        logging.warning(f"ClusteringVisualization: number of clusters is {n_clusters}, which is commonly too many to "
                        f"visualize effectively.")
        return plotly.colors.sample_colorscale('Plasma', [i / n_clusters for i in range(n_clusters)])

    def check_prerequisites(self) -> bool:
        """The results cannot be visualized in this report if the encoded data is precomputed distances"""
        # Imported locally, as in the original implementation.
        from immuneML.encodings.distance_encoding.DistanceEncoder import DistanceEncoder
        from immuneML.encodings.distance_encoding.TCRdistEncoder import TCRdistEncoder

        return not isinstance(self.item.encoder, (TCRdistEncoder, DistanceEncoder))