from uuid import uuid4
from immuneML.data_model.dataset.Dataset import Dataset
from immuneML.data_model.encoded_data.EncodedData import EncodedData
from immuneML.data_model.receptor.ElementGenerator import ElementGenerator
class ElementDataset(Dataset):
    """
    This is the base class for ReceptorDataset and SequenceDataset which implements all the functionality for both classes. The only difference between
    these two classes is whether paired or single chain data is stored.
    """

    def __init__(self, labels: dict = None, encoded_data: EncodedData = None, filenames: list = None, identifier: str = None,
                 file_size: int = 50000, name: str = None):
        """
        Args:
            labels (dict): metadata fields of the dataset keyed by field name; may be None
            encoded_data (EncodedData): encoded representation of the dataset, if any
            filenames (list): files holding the elements of the dataset; stored sorted
            identifier (str): identifier of the dataset; a random hex UUID is generated when omitted
            file_size (int): passed to the ElementGenerator together with the file list
            name (str): name of the dataset
        """
        super().__init__()
        self.labels = labels
        self.encoded_data = encoded_data
        self.identifier = identifier if identifier is not None else uuid4().hex
        self._filenames = sorted(filenames) if filenames is not None else []
        self.element_generator = ElementGenerator(self._filenames, file_size)
        self.file_size = file_size
        # lazily filled cache of element identifiers, see get_example_ids()
        self.element_ids = None
        self.name = name

    def _sync_element_generator(self):
        """Sorts the current filenames in place and hands them to the element generator."""
        # set_filenames() only replaces self._filenames, so the generator has to be
        # refreshed before every use to avoid working on a stale or unsorted file list
        self._filenames.sort()
        self.element_generator.file_list = self._filenames

    def get_data(self, batch_size: int = 10000):
        """
        Returns the result of ElementGenerator.build_element_generator() for the current (sorted) filenames.

        Args:
            batch_size (int): currently unused; kept for interface compatibility
        """
        self._sync_element_generator()
        return self.element_generator.build_element_generator()

    def get_batch(self, batch_size: int = 10000):
        """
        Returns the result of ElementGenerator.build_batch_generator() for the current (sorted) filenames.

        Args:
            batch_size (int): currently unused; kept for interface compatibility
        """
        self._sync_element_generator()
        return self.element_generator.build_batch_generator()

    def get_filenames(self):
        """Returns the list of filenames backing this dataset."""
        return self._filenames

    def set_filenames(self, filenames):
        """
        Replaces the list of filenames backing this dataset.

        The list is stored as given (not copied); it is sorted in place the next time the data is accessed.
        """
        # NOTE(review): a previously filled element_ids cache is not invalidated here - confirm
        # whether callers rely on that before changing it
        self._filenames = filenames

    def get_example_count(self):
        """Returns the number of examples in the dataset, i.e. the number of example ids."""
        return len(self.get_example_ids())

    def get_example_ids(self):
        """Returns the identifiers of all elements in the dataset; computed on first use and cached in element_ids."""
        if self.element_ids is None or (isinstance(self.element_ids, list) and len(self.element_ids) == 0):
            # build the full list before caching it so that an error raised while iterating
            # cannot leave a partially filled list behind as the cached value
            self.element_ids = [element.identifier for element in self.get_data()]
        return self.element_ids

    def make_subset(self, example_indices, path, dataset_type: str):
        """
        Creates a new dataset object with only those examples (receptors or receptor sequences) available which were given by index in example_indices argument.

        Args:
            example_indices (list): a list of indices of examples (receptors or receptor sequences) to use in the new dataset
            path (Path): a path where to store the newly created dataset
            dataset_type (str): a type of the dataset used as a part of the name of the resulting dataset; the values are defined as constants in :py:obj:`~immuneML.data_model.dataset.Dataset.Dataset`

        Returns:
            a new dataset object (ReceptorDataset or SequenceDataset, as the original dataset) which includes only the examples specified under example_indices
        """
        new_dataset = self.__class__(labels=self.labels, file_size=self.file_size)
        # make sure the generator subsets the same (sorted) files that get_data() iterates over,
        # also when set_filenames() was called since the last data access
        self._sync_element_generator()
        batch_filenames = self.element_generator.make_subset(example_indices, path, dataset_type, new_dataset.identifier)
        new_dataset.set_filenames(batch_filenames)
        new_dataset.name = f"{self.name}_split_{dataset_type.lower()}"
        return new_dataset

    def get_label_names(self):
        """Returns the list of metadata fields which can be used as labels"""
        # labels defaults to None in the constructor; there is nothing to use as a label then
        if self.labels is None:
            return []
        excluded_fields = ('region_type', 'receptor_chains', 'organism')
        return [label for label in self.labels if label not in excluded_fields]

    def clone(self):
        """Not implemented in this base class; always raises NotImplementedError."""
        raise NotImplementedError