Source code for immuneML.data_model.repertoire.Repertoire

# quality: gold
import ast
import logging
import shutil
import weakref
from pathlib import Path
from typing import List
from uuid import uuid4

import numpy as np
import yaml

from immuneML.data_model.DatasetItem import DatasetItem
from immuneML.data_model.cell.Cell import Cell
from immuneML.data_model.cell.CellList import CellList
from immuneML.data_model.receptor.Receptor import Receptor
from immuneML.data_model.receptor.ReceptorBuilder import ReceptorBuilder
from immuneML.data_model.receptor.receptor_sequence.Chain import Chain
from immuneML.data_model.receptor.receptor_sequence.ReceptorSequence import ReceptorSequence
from immuneML.data_model.receptor.receptor_sequence.ReceptorSequenceList import ReceptorSequenceList
from immuneML.data_model.receptor.receptor_sequence.SequenceAnnotation import SequenceAnnotation
from immuneML.data_model.receptor.receptor_sequence.SequenceMetadata import SequenceMetadata
from immuneML.environment.EnvironmentSettings import EnvironmentSettings
from immuneML.simulation.implants.ImplantAnnotation import ImplantAnnotation
from immuneML.util.NumpyHelper import NumpyHelper
from immuneML.util.PathBuilder import PathBuilder


[docs]class Repertoire(DatasetItem): """ Repertoire object consisting of sequence objects, each sequence attribute is stored as a list across all sequences and can be loaded separately. Internally, this class relies on numpy to store/import_dataset the data. """ FIELDS = tuple( "sequence_aas,sequences,v_genes,j_genes,v_subgroups,j_subgroups,v_alleles,j_alleles,chains,counts,region_types,frame_types," "sequence_identifiers,cell_ids".split(","))
[docs] @staticmethod def process_custom_lists(custom_lists): if custom_lists: field_list = list(custom_lists.keys()) values = [[NumpyHelper.get_numpy_representation(el) for el in custom_lists[field]] for field in custom_lists.keys()] dtype = [(field, np.array(values[index]).dtype) for index, field in enumerate(custom_lists.keys())] else: field_list, values, dtype = [], [], [] return field_list, values, dtype
[docs] @staticmethod def check_count(sequence_aas: list = None, sequences: list = None, custom_lists: dict = None) -> int: sequence_count = len(sequence_aas) if sequence_aas is not None else len(sequences) if sequences is not None else 0 if sequences is not None and sequence_aas is not None: assert len(sequences) == len(sequence_aas), \ f"Repertoire: there is a mismatch between number of nucleotide sequences ({len(sequences)}) and the number of amino acid " \ f"sequences ({len(sequence_aas)})." assert all(len(custom_lists[key]) == sequence_count for key in custom_lists) if custom_lists else True, \ f"Repertoire: there is a mismatch between the number of sequences and the number of attributes listed in " \ f"{str(list(custom_lists.keys()))[1:-1]}" return sequence_count
[docs] @classmethod def build(cls, sequence_aas: list = None, sequences: list = None, v_genes: list = None, j_genes: list = None, v_subgroups: list = None, j_subgroups: list = None, v_alleles: list = None, j_alleles: list = None, chains: list = None, counts: list = None, region_types: list = None, frame_types: list = None, custom_lists: dict = None, sequence_identifiers: list = None, path: Path = None, metadata: dict = None, signals: dict = None, cell_ids: List[str] = None, filename_base: str = None): sequence_count = Repertoire.check_count(sequence_aas, sequences, custom_lists) if sequence_identifiers is None or len(sequence_identifiers) == 0 or any(identifier is None for identifier in sequence_identifiers): sequence_identifiers = np.arange(sequence_count).astype(str) identifier = uuid4().hex filename_base = filename_base if filename_base is not None else identifier data_filename = path / f"{filename_base}.npy" field_list, values, dtype = Repertoire.process_custom_lists(custom_lists) if signals: signals_filtered = {signal: signals[signal] for signal in signals if signal not in metadata["field_list"]} field_list_signals, values_signals, dtype_signals = Repertoire.process_custom_lists(signals_filtered) field_list.extend(field_list_signals) values.extend(values_signals) dtype.extend(dtype_signals) for field in Repertoire.FIELDS: if eval(field) is not None and not all(el is None for el in eval(field)): field_list.append(field) values.append([NumpyHelper.get_numpy_representation(val) if val is not None else np.nan for val in eval(field)]) dtype.append((field, np.array(values[-1]).dtype)) repertoire_matrix = np.array(list(map(tuple, zip(*values))), order='F', dtype=dtype) np.save(str(data_filename), repertoire_matrix, allow_pickle=False) metadata_filename = path / f"{filename_base}_metadata.yaml" metadata = {} if metadata is None else metadata metadata["field_list"] = field_list with metadata_filename.open("w") as file: yaml.dump(metadata, file) repertoire = Repertoire(data_filename, metadata_filename, identifier) return repertoire
[docs] @classmethod def build_like(cls, repertoire, indices_to_keep: list, result_path: Path, filename_base: str = None): if indices_to_keep is not None and len(indices_to_keep) > 0: PathBuilder.build(result_path) data = repertoire.load_data() data = data[indices_to_keep] identifier = uuid4().hex filename_base = filename_base if filename_base is not None else identifier data_filename = result_path / f"{filename_base}.npy" np.save(str(data_filename), data) metadata_filename = result_path / f"{filename_base}_metadata.yaml" shutil.copyfile(repertoire.metadata_filename, metadata_filename) new_repertoire = Repertoire(data_filename, metadata_filename, identifier) return new_repertoire else: return None
[docs] @classmethod def build_from_sequence_objects(cls, sequence_objects: list, path: Path, metadata: dict, filename_base: str = None): assert all(isinstance(sequence, ReceptorSequence) for sequence in sequence_objects), \ "Repertoire: all sequences have to be instances of ReceptorSequence class." sequence_aas, sequences, v_genes, j_genes, v_subgroups, j_subgroups, v_alleles, j_alleles, chains, counts, region_types, frame_types, sequence_identifiers, cell_ids = [], [], [], [], [], [], [], [], [], [], [], [], [], [] custom_lists = {key: [] for key in sequence_objects[0].metadata.custom_params} if sequence_objects[0].metadata else {} signals = {} for index, sequence in enumerate(sequence_objects): sequence_identifiers.append(sequence.identifier) sequence_aas.append(sequence.amino_acid_sequence) sequences.append(sequence.nucleotide_sequence) if sequence.metadata: v_genes.append(sequence.metadata.v_gene) j_genes.append(sequence.metadata.j_gene) v_subgroups.append(sequence.metadata.v_subgroup) j_subgroups.append(sequence.metadata.j_subgroup) v_alleles.append(sequence.metadata.v_allele) j_alleles.append(sequence.metadata.j_allele) chains.append(sequence.metadata.chain) counts.append(sequence.metadata.count) region_types.append(sequence.metadata.region_type) frame_types.append(sequence.metadata.frame_type) cell_ids.append(sequence.metadata.cell_id) for param in sequence.metadata.custom_params.keys(): custom_lists[param].append(sequence.metadata.custom_params[param] if param in sequence.metadata.custom_params else None) if sequence.annotation and sequence.annotation.implants and len(sequence.annotation.implants) > 0: for implant in sequence.annotation.implants: if implant.signal_id in signals: signals[implant.signal_id].append(str(implant)) else: signals[implant.signal_id] = [None for _ in range(index)] + [str(implant)] return cls.build(sequence_aas=sequence_aas, sequences=sequences, v_genes=v_genes, j_genes=j_genes, v_subgroups=v_subgroups, j_subgroups=j_subgroups, v_alleles=v_alleles, j_alleles=j_alleles, chains=chains, counts=counts, region_types=region_types, frame_types=frame_types, custom_lists=custom_lists, sequence_identifiers=sequence_identifiers, path=path, metadata=metadata, signals=signals, cell_ids=cell_ids, filename_base=filename_base)
def __init__(self, data_filename: Path, metadata_filename: Path, identifier: str): data_filename = Path(data_filename) metadata_filename = Path(metadata_filename) if metadata_filename is not None else None assert data_filename.suffix == ".npy", \ f"Repertoire: the file representing the repertoire has to be in numpy binary format. Got {data_filename.suffix} instead." self.data_filename = data_filename if metadata_filename: with metadata_filename.open("r") as file: self.metadata = yaml.safe_load(file) self.fields = self.metadata["field_list"] self.metadata_filename = metadata_filename self.identifier = identifier self.data = None self.element_count = None
[docs] def get_sequence_aas(self): return self.get_attribute("sequence_aas")
[docs] def get_sequence_identifiers(self): return self.get_attribute("sequence_identifiers")
[docs] def get_v_genes(self): return self.get_attribute("v_genes")
[docs] def get_j_genes(self): return self.get_attribute("j_genes")
[docs] def get_counts(self): counts = self.get_attribute("counts") if counts is not None: counts = np.array([int(count) if not NumpyHelper.is_nan_or_empty(count) else None for count in counts]) return counts
[docs] def get_chains(self): chains = self.get_attribute("chains") if chains is not None: chains = np.array([Chain.get_chain(chain_str) if chain_str is not None else None for chain_str in chains]) return chains
[docs] def load_data(self): if self.data is None or (isinstance(self.data, weakref.ref) and self.data() is None): data = np.load(self.data_filename, allow_pickle=False) self.data = weakref.ref(data) if EnvironmentSettings.low_memory else data data = self.data() if EnvironmentSettings.low_memory else self.data self.element_count = data.shape[0] return data
[docs] def get_attribute(self, attribute): data = self.load_data() if attribute in data.dtype.names: tmp = data[attribute] return tmp else: return None
[docs] def get_attributes(self, attributes: list): data = self.load_data() result = {} for attribute in attributes: if attribute in data.dtype.names: result[attribute] = data[attribute] else: logging.warning(f"{Repertoire.__name__}: attribute {attribute} is not present in the repertoire {self.identifier}, skipping...") return result
[docs] def free_memory(self): self.data = None
def __getstate__(self): state = self.__dict__.copy() del state['data'] return state def __setstate__(self, state): self.__dict__.update(state) self.data = None
[docs] def get_element_count(self): if self.element_count is None: self.load_data() return self.element_count
def _make_sequence_object(self, row, load_implants: bool = False): fields = row.dtype.names implants = [] if load_implants: keys = [key for key in row.dtype.names if key not in Repertoire.FIELDS] for key in keys: value_dict = row[key] if value_dict: try: implants.append(ImplantAnnotation(**ast.literal_eval(value_dict))) except (SyntaxError, ValueError, TypeError) as e: pass seq = ReceptorSequence(amino_acid_sequence=row["sequence_aas"] if "sequence_aas" in fields else None, nucleotide_sequence=row["sequences"] if "sequences" in fields else None, identifier=row["sequence_identifiers"] if "sequence_identifiers" in fields else None, metadata=SequenceMetadata(v_gene=row["v_genes"] if "v_genes" in fields else None, j_gene=row["j_genes"] if "j_genes" in fields else None, v_subgroup=row["v_subgroups"] if "v_subgroups" in fields else None, j_subgroup=row["j_subgroups"] if "j_subgroups" in fields else None, v_allele=row["v_alleles"] if "v_alleles" in fields else None, j_allele=row["j_alleles"] if "j_alleles" in fields else None, chain=row["chains"] if "chains" in fields else None, count=row["counts"] if "counts" in fields and not NumpyHelper.is_nan_or_empty(row['counts']) else None, region_type=row["region_types"] if "region_types" in fields else None, frame_type=row["frame_types"] if "frame_types" in fields else "IN", cell_id=row["cell_ids"] if "cell_ids" in fields else None, custom_params={key: row[key] if key in fields else None for key in set(self.fields) - set(Repertoire.FIELDS)}), annotation=SequenceAnnotation(implants=implants)) return seq def _prepare_cell_lists(self): data = self.load_data() assert "cell_ids" in data.dtype.names and data["cell_ids"] is not None, \ f"Repertoire: cannot return receptor objects in repertoire {self.identifier} since cell_ids are not specified. " \ f"Existing fields are: {str(data.dtype.names)[1:-1]}" same_cell_lists = NumpyHelper.group_structured_array_by(data, "cell_ids") return same_cell_lists def _make_receptors(self, cell_content): sequences = ReceptorSequenceList() for item in cell_content: sequences.append(self._make_sequence_object(item)) return ReceptorBuilder.build_objects(sequences)
[docs] def get_sequence_objects(self, load_implants: bool = False) -> List[ReceptorSequence]: """ Lazily loads sequences from disk to reduce RAM consumption Args: load_implants: whether implants should be parsed to objects and converted to ImplantAnnotations; if True, might slow down the loading Returns: a list of ReceptorSequence objects """ seqs = [] data = self.load_data() for i in range(len(self.get_sequence_identifiers())): seq = self._make_sequence_object(data[i], load_implants) seqs.append(seq) return seqs
@property def sequences(self): return self.get_sequence_objects(False) @property def receptors(self) -> List[Receptor]: """ A property that creates a list of Receptor objects based on the cell_ids field in the following manner: - all sequences that have the same cell_id are grouped together - they are divided into groups based on the chain - all valid combinations of chains are created and used to make a receptor object - this means that if a cell has two beta (b1 and b2) and one alpha chain (a1), two receptor objects will be created: receptor1 (b1, a1), receptor2 (b2, a1) To avoid have multiple receptors in the same cell, use some of the preprocessing classes which could merge/eliminate multiple sequences. See the documentation of the preprocessing module for more information. Returns: ReceptorList: a list of objects of Receptor class """ receptors = [] same_cell_lists = self._prepare_cell_lists() for cell_content in same_cell_lists: receptors.extend(self._make_receptors(cell_content)) return receptors @property def cells(self) -> CellList: """ A property that creates a list of Cell objects based on the cell_ids field in the following manner: - all sequences that have the same cell_id are grouped together - they are divided into groups based on the chain - all valid combinations of chains are created and used to make a receptor object - this means that if a cell has two beta (b1 and b2) and one alpha chain (a1), two receptor objects will be created: receptor1 (b1, a1), receptor2 (b2, a1) - an object of the Cell class is created from all receptors with the same cell_id created as described in the previous steps To avoid have multiple receptors in the same cell, use some of the preprocessing classes which could merge/eliminate multiple sequences. See the documentation of the preprocessing module for more information. Returns: CellList: a list of objects of Cell class """ cells = CellList() cell_lists = self._prepare_cell_lists() for cell_content in cell_lists: receptors = self._make_receptors(cell_content) cells.append(Cell(receptors)) return cell_lists