Source code for immuneML.reports.encoding_reports.DesignMatrixExporter

import logging
import os
import warnings
import zipfile
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import pandas as pd
import yaml

from immuneML.data_model.dataset.Dataset import Dataset
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.encoding_reports.EncodingReport import EncodingReport
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder


class DesignMatrixExporter(EncodingReport):
    """
    Exports the design matrix and related information of a given encoded Dataset to csv files.
    If the encoded data has more than 2 dimensions (such as when using the OneHot encoder with option Flatten=False),
    the data are then exported to different formats to facilitate their import with external software.

    Arguments:

        file_format (str): the format and extension of the file to store the design matrix. The supported formats are:
        npy, csv, hdf5, npy.zip, csv.zip or hdf5.zip.

    Note: when using hdf5 or hdf5.zip output formats, make sure the 'hdf5' dependency (the h5py package, which is
    imported lazily at export time) is installed.

    YAML specification:

    .. indent with spaces
    .. code-block:: yaml

        my_dme_report:
            DesignMatrixExporter:
                file_format: csv

    """

    def __init__(self, dataset: Dataset = None, result_path: Path = None, file_format: str = None,
                 number_of_processes: int = 1, name: str = None):
        super().__init__(dataset=dataset, result_path=result_path, number_of_processes=number_of_processes, name=name)
        self.file_format = file_format

    @classmethod
    def build_object(cls, **kwargs):
        """Validate the report parameters and build a DesignMatrixExporter from them.

        Requires the 'file_format' and 'name' keys; 'file_format' must be one of the formats
        listed in the class docstring. Validation is delegated to ParameterValidator.
        """
        ParameterValidator.assert_keys_present(list(kwargs.keys()), ['file_format', 'name'],
                                               DesignMatrixExporter.__name__, DesignMatrixExporter.__name__)
        # 'hdf5' is accepted as well: it is documented as supported and _export_matrix handles it
        # exactly like 'hdf5.zip' minus the compression step.
        ParameterValidator.assert_in_valid_list(kwargs['file_format'],
                                                ['npy', 'csv', 'hdf5', 'npy.zip', 'csv.zip', 'hdf5.zip'],
                                                DesignMatrixExporter.__name__, 'file_format')
        return DesignMatrixExporter(**kwargs)

    def _generate(self) -> ReportResult:
        """Export the design matrix, the encoding details and the labels, and bundle them in a ReportResult."""
        PathBuilder.build(self.result_path)

        matrix_result = self._export_matrix()
        details_result = self._export_details()
        label_result = self._export_labels()

        # _export_labels returns None when the encoded data has no labels; keep None out of the outputs.
        output_tables = [table for table in (matrix_result, label_result) if table is not None]

        return ReportResult(self.name, info="The design matrix and related information of a given encoded Dataset",
                            output_tables=output_tables, output_text=[details_result])

    def _export_matrix(self) -> ReportOutput:
        """Create a file for the design matrix in the desired format.

        The matrix is written to '<result_path>/design_matrix.<ext>' where <ext> is the file format
        without a trailing '.zip'. csv output is only used for data with at most 2 dimensions;
        in any other non-hdf5 case the data is stored as .npy. If the requested format ends with
        '.zip', the written file is compressed and the uncompressed file is removed.
        """
        data = self._get_data()
        file_path = self.result_path / "design_matrix"
        # 'csv.zip' -> 'csv', 'csv' -> 'csv': strips only a trailing extension such as '.zip'.
        ext = os.path.splitext(self.file_format)[0]
        file_path = file_path.with_suffix('.' + ext)

        # Use h5py to create a hdf5 file.
        if ext == "hdf5":
            import h5py
            with h5py.File(str(file_path), 'w') as hf_object:
                # NOTE(review): the dataset inside the HDF5 file is named after the full output path;
                # kept as-is so that existing readers of these files keep working.
                hf_object.create_dataset(str(file_path), data=data)
        # Use numpy to create a csv or npy file.
        elif len(data.shape) <= 2 and ext == "csv":
            feature_names = self.dataset.encoded_data.feature_names
            header = ",".join(str(name) for name in feature_names) if feature_names is not None else ""
            # comments='' prevents numpy from prefixing the header line with '# '.
            np.savetxt(fname=str(file_path), X=data, delimiter=",", comments='', header=header)
        else:
            # Reached for 'npy', and for 'csv' when the data has more than 2 dimensions.
            if ext != "npy":
                logging.info('The selected Report format is not compatible, .npy is used instead')
                file_path = file_path.with_suffix(".npy")
                ext = "npy"
            np.save(str(file_path), data)

        # If requested, compress the file into a .zip.
        if self.file_format.endswith(".zip"):
            file_path_zip = file_path.with_suffix('.' + ext + '.zip')
            with zipfile.ZipFile(str(file_path_zip), 'w') as zipped_file:
                # NOTE(review): no arcname is passed, so the archive member name is derived from
                # the path given here rather than just the file name; kept for compatibility.
                zipped_file.write(str(file_path), compress_type=zipfile.ZIP_DEFLATED)
            os.remove(str(file_path))
            file_path = file_path_zip

        return ReportOutput(file_path, "design matrix")

    def _get_data(self) -> np.ndarray:
        """Return the encoded examples as a dense numpy array."""
        examples = self.dataset.encoded_data.examples

        if isinstance(examples, np.ndarray):
            return examples
        if isinstance(examples, pd.DataFrame):
            return examples.to_numpy()
        # Anything else is assumed to be a scipy sparse matrix exposing toarray().
        return examples.toarray()

    def _export_details(self) -> ReportOutput:
        """Write feature names, encoding and example ids to 'encoding_details.yaml'."""
        file_path = self.result_path / "encoding_details.yaml"
        encoded_data = self.dataset.encoded_data

        details = {
            "feature_names": encoded_data.feature_names,
            "encoding": encoded_data.encoding,
            "example_ids": list(encoded_data.example_ids)
        }

        with file_path.open("w") as file:
            yaml.dump(details, file)

        return ReportOutput(file_path, "encoding details")

    def _export_labels(self) -> ReportOutput:
        """Write the labels of the encoded data to 'labels.csv'.

        Returns the ReportOutput for the written file, or None when the encoded data has no labels.
        """
        labels = self.dataset.encoded_data.labels
        if labels is None:
            return None

        file_path = self.result_path / "labels.csv"
        pd.DataFrame(labels).to_csv(file_path, sep=",", index=False)
        return ReportOutput(file_path, "exported labels")

    def check_prerequisites(self):
        """Return True if the dataset is encoded; otherwise emit a warning and return False."""
        if self.dataset.encoded_data is None or self.dataset.encoded_data.examples is None:
            warnings.warn("DesignMatrixExporter: the dataset is not encoded, skipping this report...")
            return False
        return True