Source code for immuneML.reports.encoding_reports.DesignMatrixExporter

import logging
import os
import warnings
import zipfile
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

import h5py
import numpy as np
import pandas as pd
import yaml

from immuneML.data_model.dataset.Dataset import Dataset
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.encoding_reports.EncodingReport import EncodingReport
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder


class DesignMatrixExporter(EncodingReport):
    """
    Exports the design matrix and related information of a given encoded Dataset to csv files.
    If the encoded data has more than 2 dimensions (such as when using the OneHot encoder with option Flatten=False),
    the data are then exported to different formats to facilitate their import with external software.

    Arguments:

        file_format (str): the format and extension of the file to store the design matrix. The supported formats are:
        npy, csv, hdf5, npy.zip, csv.zip or hdf5.zip.

    YAML specification:

    .. indent with spaces
    .. code-block:: yaml

        my_dme_report:
            DesignMatrixExporter:
                file_format: csv

    """

    def __init__(self, dataset: Dataset = None, result_path: Path = None, file_format: str = None,
                 number_of_processes: int = 1, name: str = None):
        super().__init__(dataset=dataset, result_path=result_path, number_of_processes=number_of_processes, name=name)
        self.file_format = file_format

    @classmethod
    def build_object(cls, **kwargs):
        """
        Validate the report parameters and build a DesignMatrixExporter from them.

        Requires the keys 'file_format' and 'name' to be present in kwargs and 'file_format' to be one of the
        supported formats; validation is delegated to ParameterValidator.
        """
        ParameterValidator.assert_keys_present(list(kwargs.keys()), ['file_format', 'name'],
                                               DesignMatrixExporter.__name__, DesignMatrixExporter.__name__)
        # 'hdf5' is accepted as well: it is documented as supported and _export_matrix handles it.
        ParameterValidator.assert_in_valid_list(kwargs['file_format'],
                                                ['npy', 'csv', 'hdf5', 'npy.zip', 'csv.zip', 'hdf5.zip'],
                                                DesignMatrixExporter.__name__, 'file_format')

        return DesignMatrixExporter(**kwargs)

    def _generate(self) -> ReportResult:
        """Export the design matrix, the encoding details and (if present) the labels, and collect the outputs."""
        PathBuilder.build(self.result_path)

        matrix_result = self._export_matrix()
        details_result = self._export_details()
        label_result = self._export_labels()

        # _export_labels returns None when the dataset has no labels; do not report a non-existent table.
        output_tables = [matrix_result]
        if label_result is not None:
            output_tables.append(label_result)

        return ReportResult(self.name, info="The design matrix and related information of a given encoded Dataset",
                            output_tables=output_tables, output_text=[details_result])

    def _export_matrix(self) -> ReportOutput:
        """
        Create a file for the design matrix in the desired format.

        The matrix is written as hdf5, csv or npy depending on self.file_format. If csv is requested but the data
        has more than 2 dimensions, or the format is not recognised, npy is used instead. If the format ends with
        '.zip', the written file is compressed into a zip archive and the uncompressed file is removed.
        """
        data = self._get_data()
        file_path = self.result_path / "design_matrix"
        # Strip the optional '.zip' part: 'csv.zip' -> 'csv', while plain 'csv' stays 'csv'.
        ext = os.path.splitext(self.file_format)[0]
        file_path = file_path.with_suffix('.' + ext)

        if ext == "hdf5":
            # Use h5py to create a hdf5 file.
            # NOTE(review): the dataset inside the file is named after the full file path; kept as is so that
            # existing readers of exported files keep working.
            with h5py.File(str(file_path), 'w') as hf_object:
                hf_object.create_dataset(str(file_path), data=data)
        elif len(data.shape) <= 2 and ext == "csv":
            # Use numpy to create a csv file; the header row lists the feature names when they are available.
            feature_names = self.dataset.encoded_data.feature_names
            header = ",".join(str(name) for name in feature_names) if feature_names is not None else ""
            np.savetxt(fname=str(file_path), X=data, delimiter=",", comments='', header=header)
        else:
            # Fall back to npy, which can store arrays of any dimensionality.
            if ext != "npy":
                logging.info('The selected Report format is not compatible, .npy is used instead')
                file_path = file_path.with_suffix(".npy")
                ext = "npy"
            np.save(str(file_path), data)

        # If requested, compress the file into a .zip.
        if self.file_format.endswith(".zip"):
            file_path_zip = file_path.with_suffix('.' + ext + '.zip')
            with zipfile.ZipFile(str(file_path_zip), 'w') as zipped_file:
                # arcname keeps only the file name, so the archive does not replicate the result directory tree.
                zipped_file.write(str(file_path), arcname=file_path.name, compress_type=zipfile.ZIP_DEFLATED)
            os.remove(str(file_path))
            file_path = file_path_zip

        return ReportOutput(file_path, "design matrix")

    def _get_data(self) -> np.ndarray:
        """Return the encoded examples as a numpy array."""
        examples = self.dataset.encoded_data.examples

        if isinstance(examples, np.ndarray):
            return examples
        if isinstance(examples, pd.DataFrame):
            return examples.to_numpy()
        # Anything else is assumed to be a scipy sparse matrix exposing toarray().
        return examples.toarray()

    def _export_details(self) -> ReportOutput:
        """Write the feature names, encoding and example ids of the encoded data to encoding_details.yaml."""
        file_path = self.result_path / "encoding_details.yaml"
        encoded_data = self.dataset.encoded_data

        details = {
            "feature_names": encoded_data.feature_names,
            "encoding": encoded_data.encoding,
            "example_ids": list(encoded_data.example_ids)
        }

        with file_path.open("w") as file:
            yaml.dump(details, file)

        return ReportOutput(file_path, "encoding details")

    def _export_labels(self) -> Optional[ReportOutput]:
        """
        Write the labels of the encoded data to labels.csv.

        Returns:
            the ReportOutput pointing to the exported file, or None if the encoded data has no labels.
        """
        labels = self.dataset.encoded_data.labels
        if labels is None:
            return None

        file_path = self.result_path / "labels.csv"
        pd.DataFrame(labels).to_csv(file_path, sep=",", index=False)

        return ReportOutput(file_path, "exported labels")

    def check_prerequisites(self):
        """Return True if the dataset is encoded; otherwise emit a warning and return False so the report is skipped."""
        if self.dataset.encoded_data is None or self.dataset.encoded_data.examples is None:
            warnings.warn("DesignMatrixExporter: the dataset is not encoded, skipping this report...")
            return False

        return True