Source code for immuneML.reports.encoding_reports.DimensionalityReduction

import logging
from pathlib import Path

import pandas as pd
import plotly.express as px

from immuneML.data_model.EncodedData import EncodedData
from immuneML.data_model.datasets.Dataset import Dataset
from immuneML.dsl.definition_parsers.MLParser import MLParser
from immuneML.ml_methods.dim_reduction.DimRedMethod import DimRedMethod
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.encoding_reports.EncodingReport import EncodingReport
from immuneML.util.PathBuilder import PathBuilder


[docs] class DimensionalityReduction(EncodingReport): """ This report visualizes the data obtained by dimensionality reduction. **Specification arguments:** - label (str): name of the label to use for highlighting data points; or None - dim_red_method (str): name of the dimensionality reduction method defined under ml_methods that will be used to transform the data for plotting; if None, it will visualize the encoded data of reduced dimensionality if set **YAML specification:** .. indent with spaces .. code-block:: yaml definitions: reports: rep1: DimensionalityReduction: label: epitope dim_red_method: PCA: n_components: 2 """
[docs] @classmethod def build_object(cls, **kwargs): if "dim_red_method" in kwargs: cls_name = list(kwargs['dim_red_method'].keys())[0] method = MLParser.parse_any_model("dim_red_method", kwargs['dim_red_method'], cls_name)[0] else: method = None return DimensionalityReduction(**{**kwargs, "dim_red_method": method})
def __init__(self, dataset: Dataset = None, batch_size: int = 1, result_path: Path = None, name: str = None, label: str = None, dim_red_method: DimRedMethod = None): super().__init__(dataset=dataset, result_path=result_path, name=name) self._label = label self._dim_red_method = dim_red_method self.info = (f"This report visualizes the encoded data after applying dimensionality reduction " f"({self._dim_red_method.__class__.__name__}).")
[docs] def check_prerequisites(self): return (isinstance(self.dataset.encoded_data, EncodedData) and self.dataset.encoded_data.dimensionality_reduced_data is not None)
def _generate(self) -> ReportResult: if self._dim_red_method: assert self.dataset.encoded_data.examples is not None, \ f"{DimensionalityReduction.__name__}: data not encoded, report will not be made." dim_reduced_data = self._dim_red_method.fit_transform(self.dataset) else: assert self.dataset.encoded_data.dimensionality_reduced_data is not None dim_reduced_data = self.dataset.encoded_data.dimensionality_reduced_data assert dim_reduced_data.shape[1] == 2 data_labels = None try: data_labels = self.dataset.get_attribute(self._label).tolist() except (AttributeError, TypeError) as e: logging.warning(f"Label {self._label} not found in the dataset. Skipping label coloring in the plot.") PathBuilder.build(self.result_path) df = pd.DataFrame({'example_id': self.dataset.get_example_ids(), "x": dim_reduced_data[:, 0], 'y': dim_reduced_data[:, 1]}) if self._label: df[self._label] = data_labels df.to_csv(self.result_path / 'dimensionality_reduced_data.csv', index=False) report_output_fig = self._safe_plot(df=df, output_written=True) output_figures = None if report_output_fig is None else [report_output_fig] return ReportResult(name=self.name, info=self.info, output_figures=output_figures, output_tables=[ReportOutput(self.result_path / 'dimensionality_reduced_data.csv', 'data after dimensionality reduction')]) def _plot(self, df: pd.DataFrame) -> ReportOutput: figure = px.scatter(df, x="x", y="y", color=self._label) figure.update_layout(template="plotly_white") PathBuilder.build(self.result_path) file_path = self.result_path / "dimensionality_reduction.html" figure.write_html(str(file_path)) return ReportOutput(path=file_path, name="Data visualization after dimensionality reduction")