# quality: gold
import copy
import uuid
from pathlib import Path
import pandas as pd
from immuneML.data_model.dataset.Dataset import Dataset
from immuneML.data_model.encoded_data.EncodedData import EncodedData
from immuneML.data_model.repertoire.Repertoire import Repertoire
from immuneML.environment.Constants import Constants
class RepertoireDataset(Dataset):
    """
    Dataset whose examples are repertoires, optionally described by a metadata CSV file.

    The repertoires are kept in memory as a list (``self.repertoires``); the metadata file, when given, is read with pandas
    whenever a subset of the dataset is written out.

    NOTE(review): ``get_metadata_fields`` and ``get_metadata`` are called by methods of this class but are not defined in this
    section of the file; presumably they are provided elsewhere (e.g. by the base class) - confirm.
    """

    def __init__(self, labels: dict = None, encoded_data: EncodedData = None, repertoires: list = None, identifier: str = None,
                 metadata_file: Path = None, name: str = None):
        """
        Args:
            labels (dict): labels of the dataset, forwarded unchanged to the base class
            encoded_data (EncodedData): encoded representation of the dataset, if already available
            repertoires (list): repertoire objects making up the dataset
            identifier (str): identifier of the dataset; a random hexadecimal UUID4 string is generated when it is None
            metadata_file (Path): path to the metadata CSV file describing the repertoires
            name (str): name of the dataset, forwarded unchanged to the base class
        """
        super().__init__(encoded_data, name, identifier if identifier is not None else uuid.uuid4().hex, labels)
        self.metadata_file = metadata_file
        self.metadata_fields = None
        # filled lazily by get_repertoire_ids()
        self.repertoire_ids = None
        self.repertoires = repertoires

    def clone(self):
        """
        Returns a new RepertoireDataset with deep copies of the encoded data and of the repertoires.

        The labels object and the metadata file path are shared with the original dataset; no identifier is passed on, so the
        clone receives a newly generated one.
        """
        return RepertoireDataset(self.labels, copy.deepcopy(self.encoded_data), copy.deepcopy(self.repertoires),
                                 metadata_file=self.metadata_file, name=self.name)

    def add_encoded_data(self, encoded_data: EncodedData):
        """Sets (or replaces) the encoded data of this dataset"""
        self.encoded_data = encoded_data

    def get_data(self, batch_size: int = 1):
        """Returns the list of all repertoires; batch_size is accepted but not used"""
        return self.repertoires

    def get_batch(self, batch_size: int = 1):
        """Returns the list of all repertoires; batch_size is accepted but not used"""
        return self.repertoires

    def get_repertoire(self, index: int = -1, repertoire_identifier: str = "") -> Repertoire:
        """
        Returns a single repertoire, selected either by its position in the dataset or by its identifier.

        Args:
            index (int): position of the repertoire in the list of repertoires; -1 means "not set", so when both arguments are given the index takes precedence
            repertoire_identifier (str): identifier of the repertoire; used only if index is -1

        Returns:
            the repertoire at the given index, or the first repertoire with the given identifier

        Raises:
            AssertionError: if neither index nor repertoire_identifier is set
            IndexError: if index is out of range or no repertoire has the given identifier
        """
        assert index != -1 or repertoire_identifier != "", \
            "RepertoireDataset: cannot get repertoire since neither the index nor the identifier is set."

        if index != -1:
            return self.repertoires[index]

        # stop at the first match instead of collecting all matches first
        for repertoire in self.repertoires:
            if repertoire.identifier == repertoire_identifier:
                return repertoire

        # IndexError is what a failed lookup raised before; only the message is more informative now
        raise IndexError(f"RepertoireDataset: no repertoire with identifier '{repertoire_identifier}' was found in the dataset.")

    def get_example_count(self):
        """Returns the number of repertoires in the dataset"""
        return len(self.repertoires)

    def get_label_names(self, refresh=False):
        """
        Returns the set of metadata fields which can be used as labels, i.e. all metadata fields except subject_id, filename,
        repertoire_identifier and identifier; refresh is passed on to get_metadata_fields()
        """
        non_label_fields = {"subject_id", "filename", "repertoire_identifier", "identifier"}
        return set(self.get_metadata_fields(refresh)) - non_label_fields

    def get_filenames(self):
        """Returns the paths to files in which repertoire information is stored"""
        return [Path(filename) for filename in self.get_metadata(["filename"])["filename"]]

    def _build_new_metadata(self, indices, path: Path) -> Path:
        """
        Writes the rows of the metadata file selected by position in indices to a new CSV file at path.

        Returns:
            path of the newly written metadata file, or None if this dataset has no metadata file
        """
        if not self.metadata_file:
            return None

        df = pd.read_csv(self.metadata_file, comment=Constants.COMMENT_SIGN)
        df = df.iloc[indices, :]

        # to_csv does not create missing directories, so make sure the target directory exists first
        path.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(path, index=False)
        return path

    def make_subset(self, example_indices, path: Path, dataset_type: str):
        """
        Creates a new dataset object with only those examples (repertoires) available which were given by index in example_indices argument.

        The repertoire objects themselves are shared with this dataset, the labels are deep-copied, and the encoded data and the
        name are not carried over to the new dataset.

        Args:
            example_indices (list): a list of indices of examples (repertoires) to use in the new dataset
            path (Path): a path where to store the newly created dataset
            dataset_type (str): a type of the dataset used as a part of the name of the resulting dataset; the values are defined as constants in :py:obj:`~immuneML.data_model.dataset.Dataset.Dataset`

        Returns:
            a new RepertoireDataset object which includes only the repertoires specified under example_indices
        """
        metadata_file = self._build_new_metadata(example_indices, path / f"{dataset_type}_metadata.csv")
        new_dataset = RepertoireDataset(repertoires=[self.repertoires[i] for i in example_indices], labels=copy.deepcopy(self.labels),
                                        metadata_file=metadata_file, identifier=str(uuid.uuid1()))
        return new_dataset

    def get_repertoire_ids(self) -> list:
        """
        Returns a list of repertoire identifiers, same as get_example_ids()

        The list is built on the first call and cached in self.repertoire_ids; later calls return the cached list.
        """
        if self.repertoire_ids is None:
            self.repertoire_ids = [str(repertoire.identifier) for repertoire in self.repertoires]
        return self.repertoire_ids

    def get_example_ids(self):
        """Returns a list of example identifiers"""
        return self.get_repertoire_ids()