class EncodedData:
    """
    When a dataset is encoded, it is stored in an object of EncodedData class.

    Arguments:

        examples: a design matrix containing the encoded data. This is typically a numpy array, although other
            matrix formats such as scipy sparse matrix, pandas dataframe or pytorch tensors are also permitted as
            long as the numpy matrix can be retrieved using 'get_examples_as_np_matrix()'. The matrix is usually
            two-dimensional. The first dimension should be the examples, and the second (and higher) dimensions
            represent features. May be None, in which case all shape-based checks are skipped.

        feature_names: a list of feature names. The length (dimensions) of this list should match the number of
            features in the examples matrix.

        feature_annotations: a data frame consisting of additional annotations for each feature. This can be used
            to add more information fields if feature_names is not sufficient. This data field is not used for
            machine learning, but may be used by some Reports.

        example_ids: a list of example (repertoire/sequence/receptor) IDs; it must be the same length as the
            example_count in the examples matrix. These can be retrieved using Dataset.get_example_ids()

        labels: a dict of labels where label names are keys and the values are lists of values for the label
            across examples: {'disease1': ['sick', 'healthy', 'sick']}
            During encoding, the labels can be computed using EncoderHelper.encode_dataset_labels()

        encoding: a string describing the encoding; stored as-is and not validated here.

        example_weights: a list with one weight per example; its length is checked against the example ids
            and/or the examples matrix when those are available.

        info: a dict of additional information; stored as-is and not validated here.

        dimensionality_reduced_data: a numpy array; stored as-is and not validated here.

    """

    def __init__(self, examples, labels: dict = None, example_ids: list = None, feature_names: list = None,
                 feature_annotations: pd.DataFrame = None, encoding: str = None, example_weights: list = None,
                 info: dict = None, dimensionality_reduced_data: np.ndarray = None):
        """
        Check that the supplied pieces agree in size and store them on the instance.

        Raises:
            AssertionError: if feature names, feature annotations, labels, example ids, example weights and the
                examples matrix do not have mutually consistent lengths.
        """
        if feature_names is not None:
            # examples may legitimately be None (see the checks below), so only touch its shape when present
            if examples is not None:
                assert examples.shape[1] == len(feature_names), \
                    (f"EncodedData: the length of feature_names ({len(feature_names)}) must match the feature dimension of the "
                     f"example matrix ({examples.shape[1]})")

            # len(feature_names) == examples.shape[1] is already established above, so one comparison suffices
            assert feature_annotations is None or feature_annotations.shape[0] == len(feature_names), \
                (f"EncodedData: the number of rows in feature_annotations ({feature_annotations.shape[0]}) must match "
                 f"the length of feature_names ({len(feature_names)})")

        if example_ids is not None and labels is not None:
            for label in labels.values():
                assert len(label) == len(example_ids), "EncodedData: there are {} labels, but {} examples" \
                    .format(len(label), len(example_ids))

            assert examples is None or len(example_ids) == examples.shape[0], \
                ("EncodedData: there are {} example ids, but {} examples.".format(len(example_ids), examples.shape[0]))

            if example_weights is not None:
                assert len(example_weights) == len(example_ids), \
                    (f"EncodedData: there are {len(example_weights)} example weights, "
                     f"but {len(example_ids)} example ids.")

        if examples is not None:
            if labels is not None:
                assert all(len(values) == examples.shape[0] for values in labels.values()), \
                    (f"EncodedData: every label must have exactly one value per example "
                     f"({examples.shape[0]} examples).")

            if example_weights is not None:
                assert len(example_weights) == examples.shape[0], \
                    (f"EncodedData: there are {len(example_weights)} example weights, "
                     f"but {examples.shape[0]} examples.")

        self.examples = examples
        self.labels = labels
        self.example_ids = example_ids
        self.feature_names = feature_names
        self.feature_annotations = feature_annotations
        self.encoding = encoding
        self.example_weights = example_weights
        self.info = info
        self.dimensionality_reduced_data = dimensionality_reduced_data

    def get_examples_as_np_matrix(self) -> np.ndarray:
        """
        Return the examples matrix as a numpy array.

        Numpy arrays are returned unchanged; pandas data frames, scipy sparse matrices and pytorch tensors are
        converted. scipy and torch are imported lazily and a warning is logged if either cannot be imported.

        Raises:
            ValueError: if the examples are of a type that none of the supported conversions can handle.
        """
        if isinstance(self.examples, np.ndarray):
            return self.examples

        if isinstance(self.examples, pd.DataFrame):
            return self.examples.to_numpy()

        # optional dependency: only the import is guarded, so errors raised by the conversion itself propagate
        try:
            from scipy.sparse import issparse
        except ImportError:
            logging.warning(f"{EncodedData.__name__}: scipy could not be imported.")
        else:
            if issparse(self.examples):
                return self.examples.toarray()

        try:
            import torch
        except ImportError:
            logging.warning(f"{EncodedData.__name__}: torch could not be imported.")
        else:
            if torch.is_tensor(self.examples):
                # Tensor.numpy() fails for tensors that require grad or live on another device;
                # detach() and cpu() are no-ops for a plain CPU tensor
                return self.examples.detach().cpu().numpy()

        raise ValueError(f"EncodedData: examples matrix of type '{type(self.examples)}' cannot be converted to a numpy matrix.")