Source code for immuneML.data_model.EncodedData

import logging

import pandas as pd
import numpy as np


class EncodedData:
    """
    When a dataset is encoded, it is stored in an object of EncodedData class.

    Arguments:

        examples: a design matrix containing the encoded data. This is typically a numpy array, although other
            matrix formats such as scipy sparse matrix, pandas dataframe or pytorch tensors are also permitted as
            long as the numpy matrix can be retrieved using 'get_examples_as_np_matrix()'. The matrix is usually
            two-dimensional. The first dimension should be the examples, and the second (and higher) dimensions
            represent features. May be None, in which case all checks against the matrix shape are skipped.

        feature_names: a list of feature names. The length (dimensions) of this list should match the number of
            features in the examples matrix.

        feature_annotations: a data frame consisting of additional annotations for each feature. This can be used
            to add more information fields if feature_names is not sufficient. This data field is not used for
            machine learning, but may be used by some Reports.

        example_ids: a list of example (repertoire/sequence/receptor) IDs; it must be the same length as the
            example_count in the examples matrix. These can be retrieved using Dataset.get_example_ids()

        labels: a dict of labels where label names are keys and the values are lists of values for the label
            across examples: {'disease1': ['sick', 'healthy', 'sick']}
            During encoding, the labels can be computed using EncoderHelper.encode_dataset_labels()

        encoding: stored unchanged on the object (annotated as str).

        example_weights: optional list with one entry per example; its length is checked against the number of
            rows of the examples matrix when the matrix is available.

        info: stored unchanged on the object (annotated as dict).

        dimensionality_reduced_data: stored unchanged on the object (annotated as numpy array).

    Raises:

        AssertionError: if the lengths of the supplied arguments are inconsistent with each other.

    """

    def __init__(self, examples, labels: dict = None, example_ids: list = None, feature_names: list = None,
                 feature_annotations: pd.DataFrame = None, encoding: str = None, example_weights: list = None,
                 info: dict = None, dimensionality_reduced_data: np.ndarray = None):

        if feature_names is not None:
            # The matrix may legitimately be absent (the remaining checks allow examples=None), so the
            # feature dimension is compared only when there is a matrix to compare against.
            if examples is not None:
                assert examples.shape[1] == len(feature_names), \
                    (f"EncodedData: the length of feature_names ({len(feature_names)}) must match the feature dimension of the "
                     f"example matrix ({examples.shape[1]})")
            if feature_annotations is not None:
                assert feature_annotations.shape[0] == len(feature_names), \
                    (f"EncodedData: there are {feature_annotations.shape[0]} feature annotations, "
                     f"but {len(feature_names)} feature names")

        # Checks relative to example_ids are applied only when both ids and labels were supplied.
        if example_ids is not None and labels is not None:
            for label in labels.values():
                assert len(label) == len(example_ids), \
                    f"EncodedData: there are {len(label)} labels, but {len(example_ids)} examples"

            if examples is not None:
                assert len(example_ids) == examples.shape[0], \
                    f"EncodedData: there are {len(example_ids)} example ids, but {examples.shape[0]} examples."

            if example_weights is not None:
                assert len(example_weights) == len(example_ids), \
                    f"EncodedData: there are {len(example_weights)} example weights, but {len(example_ids)} example ids"

        # Checks relative to the number of rows of the examples matrix.
        if examples is not None:
            if labels is not None:
                assert all(len(values) == examples.shape[0] for values in labels.values()), \
                    f"EncodedData: every label must have one value per example ({examples.shape[0]} examples)"
            if example_weights is not None:
                assert len(example_weights) == examples.shape[0], \
                    f"EncodedData: there are {len(example_weights)} example weights, but {examples.shape[0]} examples"

        self.examples = examples
        self.labels = labels
        self.example_ids = example_ids
        self.feature_names = feature_names
        self.feature_annotations = feature_annotations
        self.encoding = encoding
        self.example_weights = example_weights
        self.info = info
        self.dimensionality_reduced_data = dimensionality_reduced_data

    def get_examples_as_np_matrix(self):
        """
        Returns the examples matrix as a numpy array.

        Numpy arrays are returned as they are (no copy); pandas data frames, scipy sparse matrices and
        pytorch tensors are converted. scipy and torch are imported lazily, and a warning is logged if
        either one cannot be imported.

        Raises:

            ValueError: if the examples are of a type that cannot be converted to a numpy matrix.

        """
        if isinstance(self.examples, np.ndarray):
            return self.examples
        if isinstance(self.examples, pd.DataFrame):
            return self.examples.to_numpy()

        # Only the import itself is guarded, so that errors raised during the conversion are not
        # mistaken for a missing optional dependency.
        try:
            from scipy.sparse import issparse
        except ImportError:
            logging.warning(f"{EncodedData.__name__}: scipy could not be imported.")
        else:
            if issparse(self.examples):
                return self.examples.toarray()

        try:
            import torch
        except ImportError:
            logging.warning(f"{EncodedData.__name__}: torch could not be imported.")
        else:
            if torch.is_tensor(self.examples):
                # Tensor.numpy() fails for tensors that require grad or are stored on another device;
                # detach() and cpu() are no-ops for plain CPU tensors.
                return self.examples.detach().cpu().numpy()

        raise ValueError(f"EncodedData: examples matrix of type '{type(self.examples)}' cannot be converted to a numpy matrix.")