immuneML.data_model.datasets package

Submodules

immuneML.data_model.datasets.Dataset module

class immuneML.data_model.datasets.Dataset.Dataset(identifier: str = None, name: str = None, encoded_data: immuneML.data_model.EncodedData.EncodedData = None, labels: dict = <factory>, dataset_file: pathlib.Path = None)[source]

Bases: object

PREPROCESSED = 'preprocessed'
SUBSAMPLED = 'subsampled'
TEST = 'test'
TRAIN = 'train'
abstract classmethod build_from_objects(**kwargs)[source]
abstract clone(keep_identifier: bool = False)[source]
abstract classmethod create_metadata_dict(**kwargs)[source]
dataset_file: Path = None
encoded_data: EncodedData = None
abstract get_data(batch_size: int = 1)[source]
abstract get_data_from_index_range(start_index: int, end_index: int)[source]
abstract get_example_count()[source]
abstract get_example_ids()[source]
abstract get_label_names()[source]
abstract get_metadata(field_names: list, return_df: bool = False)[source]
identifier: str = None
labels: dict
abstract make_subset(example_indices, path, dataset_type: str)[source]
name: str = None

immuneML.data_model.datasets.ElementDataset module

class immuneML.data_model.datasets.ElementDataset.ElementDataset(identifier: str = None, name: str = None, encoded_data: immuneML.data_model.EncodedData.EncodedData = None, labels: dict = <factory>, dataset_file: pathlib.Path = None, filename: pathlib.Path = None, element_count: int = None, element_ids: list = None, dynamic_fields: dict = None, bnp_dataclass: Type = None)[source]

Bases: Dataset, ABC

bnp_dataclass: Type = None
property buffer_type
clone(keep_identifier: bool = False)[source]
classmethod create_metadata_dict(dataset_class, filename, type_dict, name, labels, identifier=None)[source]
property data: AIRRSequenceSet
dataset_file: Path = None
dynamic_fields: dict = None
element_count: int = None
element_ids: list = None
filename: Path = None
get_label_names()[source]
class immuneML.data_model.datasets.ElementDataset.ReceptorDataset(identifier: str = None, name: str = None, encoded_data: ~immuneML.data_model.EncodedData.EncodedData = None, labels: dict = <factory>, dataset_file: ~pathlib.Path = None, filename: ~pathlib.Path = None, element_count: int = None, element_ids: list = None, dynamic_fields: dict = None, bnp_dataclass: ~typing.Type = None)[source]

Bases: ElementDataset

A dataset class for receptor datasets (paired chain).

classmethod build(filename: Path, metadata_filename: Path, name: str = None, bnp_dc=None, labels: dict = None)[source]
classmethod build_from_objects(receptors: List[Receptor], path: Path, name: str = None, labels: dict = None, region_type: RegionType = RegionType.IMGT_CDR3)[source]
get_data(batch_size: int = 1, region_type: RegionType = RegionType.IMGT_CDR3)[source]
get_data_from_index_range(start_index: int, end_index: int)[source]
get_example_count()[source]
get_example_ids()[source]
get_metadata(field_names: list, return_df: bool = False)[source]

Returns a dict or an equivalent pandas DataFrame with metadata information from Receptor objects for the provided field names.

make_subset(example_indices, path, dataset_type: str)[source]
class immuneML.data_model.datasets.ElementDataset.SequenceDataset(identifier: str = None, name: str = None, encoded_data: ~immuneML.data_model.EncodedData.EncodedData = None, labels: dict = <factory>, dataset_file: ~pathlib.Path = None, filename: ~pathlib.Path = None, element_count: int = None, element_ids: list = None, dynamic_fields: dict = None, bnp_dataclass: ~typing.Type = None)[source]

Bases: ElementDataset

A dataset class for sequence datasets (single chain).

classmethod build(filename: Path, metadata_filename: Path, name: str = None, bnp_dc=None, labels: dict = None)[source]
classmethod build_from_objects(sequences: List[ReceptorSequence], path: Path, name: str = None, labels: dict = None, region_type: RegionType = RegionType.IMGT_CDR3)[source]
classmethod build_from_partial_df(df: pandas.DataFrame, path: Path, name: str = None, labels: dict = None, type_dict: dict = None)[source]
get_attribute(attribute_name)[source]
get_data(batch_size: int = 1, region_type: RegionType = RegionType.IMGT_CDR3)[source]
get_data_from_index_range(start_index: int, end_index: int)[source]
get_example_count()[source]
get_example_ids()[source]
get_metadata(field_names: list, return_df: bool = False)[source]

Returns a dict or an equivalent pandas DataFrame with metadata information from ReceptorSequence objects for the provided field names.

make_subset(example_indices, path, dataset_type: str)[source]
immuneML.data_model.datasets.ElementDataset.fill_in_neutral_vals(all_fields, airr_fields, sequences)[source]
immuneML.data_model.datasets.ElementDataset.fix_empty_strings_in_metadata(df: pandas.DataFrame)[source]
immuneML.data_model.datasets.ElementDataset.make_all_fields_dict_from_receptors(receptors: List[Receptor], region_type: RegionType = RegionType.IMGT_CDR3)[source]
immuneML.data_model.datasets.ElementDataset.make_all_fields_dict_from_sequences(sequences: List[ReceptorSequence], region_type: RegionType = RegionType.IMGT_CDR3)[source]

immuneML.data_model.datasets.RepertoireDataset module

class immuneML.data_model.datasets.RepertoireDataset.RepertoireDataset(labels: dict = None, encoded_data: EncodedData = None, repertoires: list = None, identifier: str = None, metadata_file: Path = None, name: str = None, metadata_fields: list = None, repertoire_ids: list = None, dataset_file: Path = None)[source]

Bases: Dataset

add_encoded_data(encoded_data: EncodedData)[source]
classmethod build(**kwargs)[source]
classmethod build_from_objects(**kwargs)[source]
clone(keep_identifier: bool = False)[source]
classmethod create_metadata_dict(metadata_file, labels, name, identifier=None)[source]
get_data(batch_size: int = 1)[source]
get_data_from_index_range(start_index: int, end_index: int)[source]
get_example_count()[source]
get_example_ids()[source]

Returns a list of example identifiers

get_filenames()[source]
get_label_names(refresh=False)[source]

Returns the list of metadata fields that can be used as labels; if refresh=True, the fields are reloaded from disk.

get_metadata(field_names: list, return_df: bool = False)[source]
get_metadata_fields(refresh=False)[source]

Returns the list of metadata fields, including fields that are typically not used as labels, such as filename or identifier.

get_repertoire(index: int = -1, repertoire_identifier: str = '') → Repertoire[source]
get_repertoire_ids() → list[source]

Returns a list of repertoire identifiers; equivalent to get_example_ids().

get_subject_ids()[source]

Returns a list of subject identifiers

make_subset(example_indices, path: Path, dataset_type: str)[source]

Creates a new dataset object containing only the examples (repertoires) whose indices are given in the example_indices argument.

Parameters:
  • example_indices (list) – a list of indices of examples (repertoires) to use in the new dataset

  • path (Path) – a path where to store the newly created dataset

  • dataset_type (str) – the type of the dataset, used as part of the name of the resulting dataset; the allowed values are defined as constants in Dataset (PREPROCESSED, SUBSAMPLED, TEST, TRAIN)

Returns:

a new RepertoireDataset object which includes only the repertoires specified under example_indices

Module contents