Source code for immuneML.reports.clustering_method_reports.ClusteringVisualization

import logging
from pathlib import Path

import pandas as pd
import plotly
import plotly.express as px

from immuneML.data_model.datasets.RepertoireDataset import RepertoireDataset
from immuneML.dsl.definition_parsers.MLParser import MLParser
from immuneML.ml_methods.dim_reduction.DimRedMethod import DimRedMethod
from immuneML.reports.PlotlyUtil import PlotlyUtil
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.clustering_method_reports.ClusteringMethodReport import ClusteringMethodReport
from immuneML.util.PathBuilder import PathBuilder
from immuneML.workflows.instructions.clustering.clustering_run_model import ClusteringItem



[docs]
class ClusteringVisualization(ClusteringMethodReport):
    """
    A report that creates low-dimensional visualizations of clustering results using the specified dimensionality reduction method.
    For each dataset and clustering configuration, it creates a scatter plot where points are colored by their cluster assignments.

    Specification arguments:

        - dim_red_method (dict): specification of which dimensionality reduction to perform; valid options are presented
          under :ref:`**Dimensionality reduction methods**` and should be specified with the name of the method and its
          parameters, see the example below; if not specified, the report will use any dimensionality reduced data
          present in the dataset's encoded data; if the dataset does not contain dimensionality reduced data, and the
          encoded data has more than 2 dimensions, the report will be skipped.

    YAML specification:

    .. indent with spaces
    .. code-block:: yaml

        reports:
            my_report_with_pca:
                ClusteringVisualization:
                    dim_red_method:
                        PCA:
                            n_components: 2
            my_report_with_tsne:
                ClusteringVisualization:
                    dim_red_method:
                        TSNE:
                            n_components: 2
                            init: pca
            my_report_existing_dim_red:
                ClusteringVisualization:
                    dim_red_method: null

    """

    def __init__(self, dim_red_method: DimRedMethod = None, name: str = None,
                 result_path: Path = None, clustering_item: ClusteringItem = None):
        """
        Store the report configuration.

        Args:
            dim_red_method (DimRedMethod): optional dimensionality reduction method used to obtain the plotted
                coordinates; may be None
            name (str): forwarded to the parent constructor
            result_path (Path): forwarded to the parent constructor
            clustering_item (ClusteringItem): forwarded to the parent constructor
        """
        super().__init__(name=name, result_path=result_path, clustering_item=clustering_item)
        self.desc = "Clustering Visualization"
        self.result_name = None  # assigned in _generate()
        self.dim_red_method = dim_red_method

        # the axis names are known upfront only if a dimensionality reduction method was provided
        if self.dim_red_method:
            self._dimension_names = self.dim_red_method.get_dimension_names()
        else:
            self._dimension_names = None


[docs]
    @classmethod
    def build_object(cls, **kwargs):
        """
        Create the report from keyword arguments.

        Args:
            **kwargs: recognized keys are ``name``, ``result_path``, ``clustering_item`` and ``dim_red_method``;
                a non-empty ``dim_red_method`` is parsed with ``MLParser.parse_any_model`` using its first key
                as the method name; missing keys default to None

        Returns:
            ClusteringVisualization: the configured report object
        """
        location = "ClusteringVisualization"
        report_name = kwargs.get("name")
        output_path = kwargs.get("result_path")
        dim_red_spec = kwargs.get("dim_red_method")

        if dim_red_spec:
            # the first key of the specification is the name of the method
            method_name = next(iter(dim_red_spec.keys()))
            parsed = MLParser.parse_any_model("dim_red_method", dim_red_spec, method_name)
            dim_red_method = parsed[0]
        else:
            dim_red_method = None
            logging.warning(f"{location}: No dimensionality reduction method specified. "
                            "If the encoded dataset includes dimensionality reduction, it will be used.")

        return cls(dim_red_method=dim_red_method, name=report_name, result_path=output_path,
                   clustering_item=kwargs.get('clustering_item'))


    def _generate(self) -> ReportResult:
        """
        Create the output folders, produce the plot and wrap it into a report result.

        Returns:
            ReportResult: result holding one output figure that points to the stored plot
        """
        PathBuilder.build(self.result_path)

        # the subfolder name is derived from the class name of the dimensionality reduction method
        # NOTE(review): without a method this evaluates to "clustering_nonetype_plots" - confirm this is intended
        method_class_name = self.dim_red_method.__class__.__name__.lower()
        self.result_name = f"clustering_{method_class_name}_plots"
        plots_dir = PathBuilder.build(self.result_path / self.result_name)

        figure = ReportOutput(self._make_plot(plots_dir),
                              f"Clustering visualization for {self.item.cl_setting.get_key()}")

        title = f"{self.desc} ({self.name})"
        return ReportResult(title,
                            info="Visualizations of clustering results",
                            output_figures=[figure])

    def _make_plot(self, result_path: Path) -> Path:
        """
        Build the scatter plot of the clustered examples and store it together with the plotted data.

        The coordinates are taken from the first available source, in this order:

        1. the configured ``dim_red_method``, fitted on and applied to the dataset of the clustering item;
        2. dimensionality reduced data already stored in the dataset's encoded data;
        3. the encoded examples themselves, if they have at most two features.

        The first two dimensions are used as x and y axes; points are colored by cluster and the example id is
        shown on hover.

        Args:
            result_path (Path): directory where the CSV with the plotted data and the HTML figure are written

        Returns:
            Path: the value returned by ``PlotlyUtil.write_image_to_file`` for the stored figure

        Raises:
            ValueError: if no source of coordinates is available, or if fewer than two dimensions are available
                for plotting
        """
        if self.dim_red_method is not None:
            transformed_data = self.dim_red_method.fit_transform(dataset=self.item.dataset)
        else:
            encoded_data = self.item.dataset.encoded_data

            if encoded_data.dimensionality_reduced_data is not None:
                transformed_data = encoded_data.dimensionality_reduced_data
                self._dimension_names = encoded_data.dim_names if encoded_data.dim_names else ['dim1', 'dim2']
            elif encoded_data.examples.shape[1] <= 2:
                transformed_data = encoded_data.get_examples_as_np_matrix()
                feature_names = encoded_data.feature_names
                if feature_names is None:
                    # same naming scheme as the fallback for missing dim_names above
                    feature_names = [f"dim{i + 1}" for i in range(encoded_data.examples.shape[1])]
                self._dimension_names = feature_names
            else:
                raise ValueError("ClusteringVisualization: No dimensionality reduction method specified, and the dataset "
                                 "does not contain dimensionality reduced data. Please specify a dimensionality reduction "
                                 "method.")

        # both axes are looked up by name below, so fail early with a clear message instead of an
        # IndexError/TypeError raised while indexing the dimension names
        if self._dimension_names is None or len(self._dimension_names) < 2:
            raise ValueError(f"ClusteringVisualization: at least two dimensions are needed for the scatter plot, "
                             f"but the available dimension names are: {self._dimension_names}.")

        df = pd.DataFrame(transformed_data, columns=self._dimension_names)
        df['cluster'] = pd.Series(self.item.predictions).astype(str)
        df['id'] = self.item.dataset.get_example_ids()

        fig = px.scatter(df, x=self._dimension_names[0], y=self._dimension_names[1], color='cluster',
                         color_discrete_sequence=plotly.colors.qualitative.Set2,
                         category_orders={'cluster': sorted(df.cluster.unique())},
                         hover_data=['id'])

        fig.update_layout(template="plotly_white")

        # the CSV and the HTML file share the same stem; the method name is empty if no method was configured
        method_name = self.dim_red_method.name if self.dim_red_method else ''
        file_stem = f"clustering_visualization_{method_name}"

        df.to_csv(result_path / f"{file_stem}.csv", index=False)

        plot_path = PlotlyUtil.write_image_to_file(fig, result_path / f"{file_stem}.html", df.shape[0])

        return plot_path


[docs]
    def get_ids(self):
        """
        Return the identifiers of the examples in the dataset of the clustering item.

        For a RepertoireDataset, the values of the 'subject_id' metadata column are returned as a list if that
        column is present in the retrieved metadata; in all other cases the dataset's example ids are returned.
        """
        dataset = self.item.dataset

        if isinstance(dataset, RepertoireDataset):
            metadata = dataset.get_metadata(['subject_id'], return_df=True)
            if 'subject_id' in metadata.columns:
                return metadata['subject_id'].tolist()

        # not a repertoire dataset, or no subject ids available in the metadata
        return dataset.get_example_ids()