Source code for immuneML.util.RepertoireBuilder

import uuid
from dataclasses import fields
from datetime import datetime
from pathlib import Path

import pandas as pd

from immuneML.data_model.AIRRSequenceSet import AIRRSequenceSet
from immuneML.data_model.SequenceSet import ReceptorSequence, Repertoire
from immuneML.data_model.bnp_util import write_yaml, build_dynamic_bnp_dataclass_obj, make_full_airr_seq_set_df
from immuneML.data_model.datasets.RepertoireDataset import RepertoireDataset
from immuneML.util.PathBuilder import PathBuilder


class RepertoireBuilder:
    """
    Test helper that turns nested lists of sequences into repertoire files on disk.

    Each inner list of ``sequences`` becomes one repertoire, stored as a TSV file plus a YAML
    metadata file under ``<path>/repertoires``.
    """

    @staticmethod
    def build(sequences: list, path: Path, labels: dict = None, seq_metadata: list = None, subject_ids: list = None, name: str = "d1"):
        """
        Write one repertoire per entry of ``sequences`` and a dataset-level metadata CSV.

        Args:
            sequences: list with one list of CDR3 amino acid sequences per repertoire.
            path: output folder; ``<path>/repertoires`` must not exist yet.
            labels: optional mapping from label name to a per-repertoire list of values.
            seq_metadata: optional per-repertoire list of per-sequence records, passed to
                ``pd.DataFrame.from_records`` (presumably dicts of extra columns - confirm with callers).
                Its lengths must match ``sequences``. When omitted, fixed defaults for ``v_call``,
                ``j_call``, ``duplicate_count`` and ``locus`` are used.
            subject_ids: optional list of subject ids, one per repertoire; defaults to ``rep_<index>``.
            name: prefix of the metadata CSV file name (``<name>_metadata.csv``).

        Returns:
            A tuple of the list of created ``Repertoire`` objects and the path of the metadata CSV.

        Raises:
            AssertionError: if the lengths of the inputs disagree or ``<path>/repertoires`` already exists.
        """
        if subject_ids is not None:
            assert len(subject_ids) == len(sequences)

        if seq_metadata is not None:
            assert len(sequences) == len(seq_metadata)
            for index, sequence_list in enumerate(sequences):
                assert len(sequence_list) == len(seq_metadata[index])

        assert not (path / "repertoires").is_dir(), \
            f"RepertoireBuilder: attempted to store new repertoires at {path / 'repertoires'} but this folder already exists. " \
            f"Please remove this folder or specify a different path. "

        PathBuilder.build(path)
        rep_path = PathBuilder.build(path / "repertoires")

        if subject_ids is None:
            subject_ids = []

        total_repertoires = len(sequences)
        repertoires = []

        for rep_index, sequence_list in enumerate(sequences):

            # Fill in a generated subject id whenever fewer ids than repertoires are available.
            if len(subject_ids) < total_repertoires:
                subject_ids.append("rep_" + str(rep_index))

            seq_count = len(sequence_list)
            seq_df = pd.DataFrame({
                'cdr3_aa': sequence_list,
                'cdr3': [''] * seq_count,
                'sequence_id': [uuid.uuid4().hex for _ in range(seq_count)],
                'productive': ['T'] * seq_count,
                'vj_in_frame': ['T'] * seq_count,
                'stop_codon': ['F'] * seq_count,
            })

            if seq_metadata is not None:
                extra_columns = pd.DataFrame.from_records(seq_metadata[rep_index])
                seq_df = pd.concat([seq_df, extra_columns], axis=1)
            else:
                # No per-sequence metadata given: use fixed gene calls, count and locus.
                seq_df['v_call'] = "TRBV1-1*01"
                seq_df['j_call'] = "TRBJ1-1*01"
                seq_df['duplicate_count'] = 1
                seq_df['locus'] = "TRB"

            seq_df = make_full_airr_seq_set_df(seq_df)

            tsv_path = rep_path / f'rep_{rep_index}.tsv'
            yaml_path = rep_path / f"rep_{rep_index}.yaml"

            seq_df.to_csv(str(tsv_path), sep='\t', index=False)

            metadata = {}
            if labels is not None:
                for label_name in labels.keys():
                    metadata[label_name] = labels[label_name][rep_index]

            # Record the type of every column that is not a predefined AIRRSequenceSet field.
            metadata['type_dict_dynamic_fields'] = {
                column: AIRRSequenceSet.TYPE_TO_STR[seq_df[column].dtype]
                for column in seq_df.columns
                if column not in AIRRSequenceSet.get_field_type_dict()
            }
            metadata['subject_id'] = subject_ids[rep_index]

            write_yaml(yaml_path, metadata)

            bnp_dc_obj, _ = build_dynamic_bnp_dataclass_obj(seq_df.to_dict(orient='list'))

            repertoires.append(Repertoire(tsv_path, yaml_path, metadata,
                                          _bnp_dataclass=type(bnp_dc_obj),
                                          identifier=uuid.uuid4().hex))

        metadata_columns = {
            "filename": [rep.data_filename for rep in repertoires],
            "subject_id": subject_ids,
            "identifier": [rep.identifier for rep in repertoires],
        }
        if labels is not None:
            metadata_columns.update(labels)

        metadata_path = path / f"{name}_metadata.csv"
        pd.DataFrame(metadata_columns).to_csv(metadata_path, index=False)

        return repertoires, metadata_path

    @staticmethod
    def build_dataset(sequences: list, path: Path, labels: dict = None, seq_metadata: list = None, subject_ids: list = None, name: str = "d1"):
        """
        Build the repertoires via :meth:`build` and wrap them in a ``RepertoireDataset``.

        Besides the files written by :meth:`build`, a dataset description ``<name>.yaml`` is written
        to ``path``. The arguments are forwarded unchanged to :meth:`build`.

        Returns:
            A ``RepertoireDataset`` referring to the created repertoires, metadata CSV and dataset YAML.
        """
        reps, metadata_file = RepertoireBuilder.build(sequences, path, labels, seq_metadata, subject_ids, name)

        # Merge the dynamic field types of all repertoires; later repertoires overwrite earlier entries.
        type_dict = {}
        for field_types in [rep.metadata['type_dict_dynamic_fields'] for rep in reps]:
            type_dict.update(field_types)

        if isinstance(labels, dict):
            labels_unique = {label_name: list(set(values)) for label_name, values in labels.items()}
        else:
            labels_unique = {}

        identifier = uuid.uuid4().hex
        dataset_file = path / f'{name}.yaml'

        metadata_yaml = RepertoireDataset.create_metadata_dict(type_dict=type_dict,
                                                               labels=labels_unique,
                                                               identifier=identifier,
                                                               metadata_file=str(metadata_file.name),
                                                               name=name)
        write_yaml(dataset_file, metadata_yaml)

        return RepertoireDataset(repertoires=reps, metadata_file=metadata_file, name=name,
                                 labels=labels_unique, dataset_file=dataset_file,
                                 identifier=identifier)