immuneML.data_model package

Subpackages

Submodules

immuneML.data_model.AIRRSequenceSet module

immuneML.data_model.EncodedData module

class immuneML.data_model.EncodedData.EncodedData(examples, labels: dict = None, example_ids: list = None, feature_names: list = None, feature_annotations: pandas.DataFrame = None, encoding: str = None, example_weights: list = None, info: dict = None, dimensionality_reduced_data: ndarray = None)[source]

Bases: object

When a dataset is encoded, the result is stored in an object of the EncodedData class.

Parameters:
  • examples – a design matrix containing the encoded data. This is typically a numpy array, although other matrix formats such as a scipy sparse matrix, a pandas DataFrame, or a PyTorch tensor are also permitted, as long as a numpy matrix can be retrieved using ‘get_examples_as_np_matrix()’. The matrix is usually two-dimensional. The first dimension should index the examples, and the second (and higher) dimensions represent features.

  • feature_names – a list of feature names. The length (dimensions) of this list should match the number of features in the examples matrix.

  • feature_annotations – a data frame consisting of additional annotations for each feature. This can be used to add more information fields if feature_names is not sufficient. This data field is not used for machine learning, but may be used by some Reports.

  • example_ids – a list of example (repertoire/sequence/receptor) IDs; its length must equal the number of examples in the examples matrix. These can be retrieved using Dataset.get_example_ids().

  • labels – a dict of labels where the label names are keys and the values are lists of that label's values across examples, e.g. {‘disease1’: [‘sick’, ‘healthy’, ‘sick’]}. During encoding, the labels can be computed using EncoderHelper.encode_dataset_labels().

get_examples_as_np_matrix()[source]

immuneML.data_model.SequenceParams module

class immuneML.data_model.SequenceParams.Chain(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)[source]

Bases: Enum

ALPHA = 'TRA'
BETA = 'TRB'
DELTA = 'TRD'
GAMMA = 'TRG'
HEAVY = 'IGH'
KAPPA = 'IGK'
LIGHT = 'IGL'
static get_chain(item: str)[source]
static get_chain_value(item: str)[source]
to_string()[source]
class immuneML.data_model.SequenceParams.ChainPair(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)[source]

Bases: Enum

IGH_IGK = ('IGH', 'IGK')
IGH_IGL = ('IGH', 'IGL')
TRA_TRB = ('TRA', 'TRB')
TRG_TRD = ('TRG', 'TRD')
static get_chain_pair(chains: List[Chain])[source]

Given a list of two Chain objects, returns the corresponding ChainPair.

class immuneML.data_model.SequenceParams.RegionType(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)[source]

Bases: Enum

FULL_SEQUENCE = 'full_sequence'
IMGT_CDR1 = 'cdr1'
IMGT_CDR2 = 'cdr2'
IMGT_CDR3 = 'cdr3'
IMGT_FR1 = 'fwr1'
IMGT_FR2 = 'fwr2'
IMGT_FR3 = 'fwr3'
IMGT_FR4 = 'fwr4'
IMGT_JUNCTION = 'junction'
classmethod get_object(rt)[source]
to_string()[source]

immuneML.data_model.SequenceSet module

class immuneML.data_model.SequenceSet.Receptor(chain_pair: immuneML.data_model.SequenceParams.ChainPair, chain_1: immuneML.data_model.SequenceSet.ReceptorSequence, chain_2: immuneML.data_model.SequenceSet.ReceptorSequence, receptor_id: str = None, cell_id: str = None, metadata: dict = <factory>)[source]

Bases: object

cell_id: str = None
chain_1: ReceptorSequence
chain_2: ReceptorSequence
chain_pair: ChainPair
metadata: dict
receptor_id: str = None
class immuneML.data_model.SequenceSet.ReceptorSequence(sequence_id: str = '', sequence: AlphabetEncoding('ACGT') = '', sequence_aa: AlphabetEncoding('ACDEFGHIKLMNPQRSTVWY*') = '', productive: str = 'T', vj_in_frame: str = 'T', stop_codon: str = 'F', locus: str = '', locus_species: str = '', v_call: str = '', d_call: str = '', j_call: str = '', c_call: str = '', metadata: dict = <factory>, duplicate_count: int = -1, cell_id: str = '')[source]

Bases: object

c_call: str = ''
cell_id: str = ''
d_call: str = ''
duplicate_count: int = -1
get_attribute(attr_name)[source]
get_sequence(sequence_type: SequenceType = SequenceType.AMINO_ACID)[source]
j_call: str = ''
locus: str = ''
locus_species: str = ''
metadata: dict
productive: str = 'T'
sequence: AlphabetEncoding('ACGT') = ''
sequence_aa: AlphabetEncoding('ACDEFGHIKLMNPQRSTVWY*') = ''
sequence_id: str = ''
stop_codon: str = 'F'
v_call: str = ''
property v_gene
vj_in_frame: str = 'T'
class immuneML.data_model.SequenceSet.Repertoire(data_filename: pathlib.Path = None, metadata_filename: pathlib.Path = None, metadata: dict = None, identifier: str = None, dynamic_fields: dict = None, element_count: int = None, _bnp_dataclass: bytes = None, _buffer_type: bytes = None)[source]

Bases: object

property bnp_dataclass
property buffer_type
classmethod build(path: Path, metadata: dict, filename_base: str = None, identifier: str = None, **kwargs)[source]
classmethod build_from_dc_object(path: Path, metadata: dict, filename_base: str = None, identifier: str = None, data=None, type_dict: dict = None)[source]
classmethod build_from_sequences(sequences: List[ReceptorSequence], result_path: Path, filename_base: str = None, metadata: dict = None, region_type: RegionType = RegionType.IMGT_CDR3)[source]
classmethod build_like(repertoire: Repertoire, indices_to_keep, result_path: Path, filename_base: str)[source]
property data: AIRRSequenceSet
data_filename: Path = None
dynamic_fields: dict = None
element_count: int = None
get_element_count()[source]
identifier: str = None
metadata: dict = None
metadata_filename: Path = None
receptors(region_type: RegionType) → List[Receptor][source]
sequences(region_type: RegionType = RegionType.IMGT_CDR3) → List[ReceptorSequence][source]
immuneML.data_model.SequenceSet.build_dynamic_airr_sequence_set_dataclass(all_fields_dict: Dict[str, Any])[source]
immuneML.data_model.SequenceSet.get_sequence_value(el: AIRRSequenceSet, region_type: RegionType = RegionType.IMGT_CDR3)[source]
immuneML.data_model.SequenceSet.make_airr_seq_set_object_from_sequences(sequences: List[ReceptorSequence], region_type: RegionType = RegionType.IMGT_CDR3)[source]
immuneML.data_model.SequenceSet.make_receptors_from_data(data: AIRRSequenceSet, dynamic_fields: dict, location, region_type: RegionType = RegionType.IMGT_CDR3)[source]
immuneML.data_model.SequenceSet.make_sequences_from_data(data, dynamic_fields: dict, region_type: RegionType = RegionType.IMGT_CDR3)[source]

immuneML.data_model.bnp_util module

immuneML.data_model.bnp_util.add_neutral_values(field_values: dict, types: dict) → dict[source]
immuneML.data_model.bnp_util.bnp_read_from_file(filename: Path, buffer_type: DelimitedBuffer = None, dataclass=None)[source]
immuneML.data_model.bnp_util.bnp_write_to_file(filename: Path, bnp_object)[source]
immuneML.data_model.bnp_util.build_dynamic_bnp_dataclass(all_fields_dict: Dict[str, Any])[source]
immuneML.data_model.bnp_util.build_dynamic_bnp_dataclass_obj(all_fields_dict: Dict[str, Any])[source]
immuneML.data_model.bnp_util.convert_enums_to_str(field_values: dict) → dict[source]
immuneML.data_model.bnp_util.convert_to_expected_types(all_fields_dict, types) → dict[source]
immuneML.data_model.bnp_util.extend_dataclass_with_dynamic_fields(cls, fields: List[Tuple[str, type]], cls_name: str = None)[source]
immuneML.data_model.bnp_util.get_field_type_from_values(values)[source]
immuneML.data_model.bnp_util.get_row_by_index(self, index) → dict[source]
immuneML.data_model.bnp_util.get_rows_by_indices(self, index1, index2) → dict[source]
immuneML.data_model.bnp_util.get_sequence_field_name(region_type: RegionType = RegionType.IMGT_CDR3, sequence_type: SequenceType = SequenceType.AMINO_ACID)[source]
immuneML.data_model.bnp_util.get_single_row_value(self, attr_name: str)[source]
immuneML.data_model.bnp_util.get_type_dict_from_bnp_object(bnp_object) → dict[source]
immuneML.data_model.bnp_util.load_type_dict(full_dict: dict) → dict[source]
immuneML.data_model.bnp_util.make_buffer_type_from_dataset_file(dataset_file: Path)[source]
immuneML.data_model.bnp_util.make_element_dataset_objects(bnp_object, class_name) → list[source]
immuneML.data_model.bnp_util.make_full_airr_seq_set_df(df)[source]
immuneML.data_model.bnp_util.merge_dataclass_objects(objects: list, fill_unmatched: bool = False)[source]
immuneML.data_model.bnp_util.prepare_values_for_bnp(field_values: dict, types: dict) dict[source]
immuneML.data_model.bnp_util.read_yaml(filename: Path) → dict[source]
immuneML.data_model.bnp_util.write_dataset_yaml(filename: Path, yaml_dict)[source]
immuneML.data_model.bnp_util.write_yaml(filename: Path, yaml_dict)[source]

Module contents