import logging
from typing import Tuple, List
import pandas as pd
from immuneML.caching.CacheHandler import CacheHandler
from immuneML.data_model.SequenceParams import RegionType, Chain
from immuneML.data_model.datasets.ElementDataset import ElementDataset
class TCRdistHelper:
    """
    Static helpers that reshape a dataset into the dataframe layout passed to tcrdist's TCRrep and that
    compute (and cache) the resulting TCRrep object with pairwise distances.
    """

    @staticmethod
    def get_chains(dataset: ElementDataset) -> list:
        """
        Returns the chains used in the dataset.

        Args:
            dataset: receptor dataset for which all pairwise distances between receptors will be computed

        Returns:
            a list of lower-cased chain names, one per distinct locus value in the dataset
        """
        return [str(Chain.get_chain(el)).lower() for el in set(dataset.data.locus.tolist())]

    @staticmethod
    def compute_tcr_dist(dataset: ElementDataset, label_names: list, cores: int = 1, cdr3_only: bool = False):
        """
        Cached entry point for computing tcrdist distances for a dataset.

        Args:
            dataset: receptor dataset for which all pairwise distances between receptors will be computed
            label_names: a list of label names (e.g., specific epitopes) to be used for later classification or reports
            cores: how many cpus to use for computation
            cdr3_only: if True, only the CDR3-related metrics are kept when computing the distances

        Returns:
            the object produced by `_compute_tcr_dist` (a TCRrep instance), possibly served from the cache
        """
        # Everything that changes the computed object has to be part of the cache key; otherwise a call with
        # cdr3_only=True could be answered with a cached full-distance result (and vice versa). `cores` only
        # affects how the result is computed, so it is deliberately left out.
        # NOTE(review): assumes CacheHandler.memo_by_params derives its key only from this tuple - confirm.
        cache_params = (('dataset_identifier', dataset.identifier), ("type", "TCRrep"),
                        ('label_names', tuple(label_names)), ('cdr3_only', cdr3_only))
        return CacheHandler.memo_by_params(
            cache_params, lambda: TCRdistHelper._compute_tcr_dist(dataset, label_names, cores, cdr3_only))

    @staticmethod
    def _compute_tcr_dist(dataset: ElementDataset, label_names: list, cores: int, cdr3_only: bool = False):
        """
        Computes the tcrdist distances by creating a TCRrep object and calling compute_distances() function.

        The TCRrep object is created with `compute_distances=False` so that its per-chain settings can be
        restricted before the distances are computed. `deduplicate` parameter is set to False as we assume
        that we work with clones in immuneML, and not individual receptors.

        Args:
            dataset: receptor dataset for which all pairwise distances between receptors will be computed
            label_names: a list of label names (e.g., specific epitopes) to be used for later classification or reports
            cores: how many cpus to use for computation
            cdr3_only: if True, the `metrics_*`, `kargs_*` and `weights_*` dictionaries of the TCRrep object are
                reduced to the entries whose key contains 'cdr3' before distances are computed

        Returns:
            an instance of TCRrep object with computed pairwise distances between all receptors in the dataset
        """
        # imported lazily so that tcrdist is only required when distances are actually computed
        from tcrdist.repertoire import TCRrep

        df, chains = TCRdistHelper.prepare_tcr_dist_dataframe(dataset, label_names)

        tcr_rep = TCRrep(cell_df=df, chains=chains, organism=dataset.labels["organism"], cpus=cores,
                         deduplicate=False, compute_distances=False)

        if cdr3_only:
            for chain in chains:
                for attr_prefix in ['metrics', 'kargs', 'weights']:
                    # chain[0] is the first letter of the chain name, giving e.g. `metrics_a` for "alpha"
                    attr_name = f'{attr_prefix}_{chain[0]}'
                    cdr3_settings = {key: value for key, value in getattr(tcr_rep, attr_name).items()
                                     if 'cdr3' in key}
                    setattr(tcr_rep, attr_name, cdr3_settings)

        tcr_rep.compute_distances()

        return tcr_rep

    @staticmethod
    def add_default_allele_to_v_gene(v_gene: str):
        """
        Appends the default allele `*01` to a gene name that does not specify an allele.

        Despite the name, this is applied to both V and J gene calls.

        Args:
            v_gene: gene name, with or without allele (e.g. "TRBV7-9" or "TRBV7-9*02")

        Returns:
            the gene name with `*01` appended if it was a non-empty string without an allele; any other value
            (None, missing values, empty string, gene with an allele) is returned unchanged
        """
        # Non-string values (None/NaN) and empty strings are passed through: "*01" alone is not a gene name.
        if isinstance(v_gene, str) and v_gene and "*" not in v_gene:
            return f"{v_gene}*01"
        else:
            return v_gene

    @staticmethod
    def _add_default_alleles(genes: pd.Series) -> pd.Series:
        """Returns the gene calls with the default allele added to every entry that lacks one."""
        return genes.map(TCRdistHelper.add_default_allele_to_v_gene)

    @staticmethod
    def _fill_missing_counts(counts) -> list:
        """Returns the counts as a list where missing values (-1, None, NaN) are replaced by 1."""
        # pd.isna is checked first so that pd.NA is never compared with == (its truth value is ambiguous)
        return [1 if pd.isna(el) or el == -1 else el for el in counts]

    @staticmethod
    def _extract_chain(df: pd.DataFrame, locus: str, chain_letter: str, cols_to_keep: list) -> pd.DataFrame:
        """
        Selects the rows of one locus and renames the columns to the chain-specific names used for tcrdist.

        Args:
            df: dataframe with one row per sequence, including a `locus` column and all of `cols_to_keep`
            locus: locus value to select (e.g. "TRA")
            chain_letter: letter inserted in the renamed columns (e.g. "a" gives `cdr3_a_aa`, `v_a_gene`)
            cols_to_keep: columns to retain before renaming

        Returns:
            a new dataframe restricted to the given locus, with renamed columns and missing counts set to 1
        """
        chain_df = (df[df.locus == locus][cols_to_keep]
                    .rename(columns={"cdr3_aa": f"cdr3_{chain_letter}_aa", "cdr3": f"cdr3_{chain_letter}_nucseq",
                                     "v_call": f"v_{chain_letter}_gene", "j_call": f"j_{chain_letter}_gene",
                                     "duplicate_count": "count"}))
        chain_df.loc[:, 'count'] = TCRdistHelper._fill_missing_counts(chain_df['count'])
        return chain_df

    @staticmethod
    def prepare_tcr_dist_dataframe(dataset: ElementDataset, label_names: list) -> Tuple[pd.DataFrame, List[str]]:
        """
        Builds the dataframe and the list of chains that are passed to TCRrep.

        Args:
            dataset: dataset whose `data.topandas()` provides one row per sequence
            label_names: names of the label columns to carry over into the resulting dataframe

        Returns:
            a tuple of (dataframe, chains): for a single alpha or beta chain the dataframe holds that chain's
            renamed columns; for two chains the alpha and beta dataframes are merged into one row per clone
        """
        df = dataset.data.topandas()
        if "subject" not in df:
            df['subject'] = "sub" + df['cell_id']

        # Gene calls without an explicit allele get the default one; calls that already have an allele
        # are left untouched.
        for gene_column in ('v_call', 'j_call'):
            df[gene_column] = TCRdistHelper._add_default_alleles(df[gene_column])

        unique_chains = [str(Chain.get_chain(el)).lower() for el in df['locus'].unique().tolist()]

        # paired data: both chains of a receptor share the cell id; single chain: each sequence is its own clone
        df['clone_id'] = df['cell_id' if len(unique_chains) == 2 else 'sequence_id']

        cols_to_keep = ['cdr3', 'cdr3_aa', 'v_call', 'j_call', 'duplicate_count', 'subject', 'clone_id'] + label_names

        df_alpha, df_beta = None, None
        if 'alpha' in unique_chains:
            df_alpha = TCRdistHelper._extract_chain(df, 'TRA', 'a', cols_to_keep)
        if 'beta' in unique_chains:
            df_beta = TCRdistHelper._extract_chain(df, 'TRB', 'b', cols_to_keep)

        if len(unique_chains) == 2:
            # NOTE(review): 'count' is a merge key and merge defaults to an inner join, so a clone whose two
            # chains carry different counts does not appear in the result - confirm this is intended.
            df = df_alpha.merge(df_beta, on=['clone_id', 'subject', 'count'] + label_names)
        elif len(unique_chains) == 1:
            if df_alpha is not None:
                df = df_alpha
            elif df_beta is not None:
                df = df_beta

        return df, unique_chains