import pandas as pd
from immuneML import Constants
from immuneML.IO.dataset_import.DatasetImportParams import DatasetImportParams
from immuneML.data_model.SequenceParams import RegionType, Chain
from immuneML.environment.EnvironmentSettings import EnvironmentSettings
from immuneML.util.ImportHelper import ImportHelper
[docs]
class AdaptiveImportHelper:
    """Static helpers that reshape an Adaptive-style table and normalise its gene names.

    The methods work on a pandas data frame, add or rewrite columns on it and
    hand the same frame back to the caller.
    """

    @staticmethod
    def preprocess_dataframe(dataframe: pd.DataFrame, params: DatasetImportParams) -> pd.DataFrame:
        """Derive sequence, frame and count columns, then normalise gene calls and the locus.

        The frame is modified in place (columns are added, ``frame_type`` is
        dropped) and is also returned.

        Args:
            dataframe: table with a ``junction_aa`` column; ``sequence``,
                ``frame_type`` and ``duplicate_count`` are processed only when present.
            params: import parameters; only ``params.organism`` is read here.

        Returns:
            The processed data frame.
        """
        if "sequence" in dataframe.columns:
            # NOTE(review): this slice is 3 * len(junction_aa) - 6 nucleotides long, i.e. six
            # fewer than a junction of that many residues, and 'cdr3' below trims six more.
            # Confirm the 84/78 offsets against the Adaptive sequence layout.
            # NOTE(review): for junction_aa longer than 28 residues the start offset becomes
            # negative and therefore counts from the end of the sequence.
            dataframe['junction'] = [
                nt_seq[(84 - 3 * len(aa_seq)): 78] if is_valid_sequence_str(aa_seq) else ''
                for aa_seq, nt_seq in zip(dataframe['junction_aa'], dataframe['sequence'])
            ]
            dataframe['cdr3'] = dataframe['junction'].str[3:-3]

        # drop the first and the last residue of the amino acid junction
        dataframe['cdr3_aa'] = dataframe['junction_aa'].str[1:-1]

        if "frame_type" in dataframe.columns:
            frame_type = dataframe['frame_type'].str.upper()
            # bool -> 'True'/'False' -> first character, so the flags are stored as 'T' / 'F'
            dataframe['vj_in_frame'] = (frame_type == 'IN').astype(str).str[:1]
            dataframe['stop_codon'] = (frame_type == 'STOP').astype(str).str[:1]
            dataframe['productive'] = dataframe['junction_aa'].notnull().astype(str).str[:1]
            dataframe.drop(columns=['frame_type'], inplace=True)

        if 'duplicate_count' in dataframe.columns:
            # -1 marks a missing count so that the column can be cast to int
            dataframe.loc[dataframe['duplicate_count'].isna(), 'duplicate_count'] = -1
            dataframe['duplicate_count'] = dataframe['duplicate_count'].astype(int)

        dataframe = AdaptiveImportHelper.parse_adaptive_germline_to_imgt(dataframe, params.organism)
        dataframe = set_locus_column(dataframe)

        return dataframe

    @staticmethod
    def parse_adaptive_germline_to_imgt(dataframe, organism):
        """Rewrite the gene columns of ``dataframe`` using the bundled conversion table.

        Args:
            dataframe: table whose ``v``/``j`` gene columns should be rewritten.
            organism: value matched against the ``Species`` column of the conversion table.

        Returns:
            The data frame returned by :meth:`parse_germline`.
        """
        conversion_table = pd.read_csv(
            EnvironmentSettings.root_path / "immuneML/IO/dataset_import/conversion/imgt_adaptive_conversion.csv")
        conversion_table = conversion_table[conversion_table.Species == organism]
        gene_name_replacement = dict(zip(conversion_table.Adaptive, conversion_table.IMGT))

        # remove C and extra 0 from gene name but not from allele (e.g., TCRBV03-01*01 -> TRBV3-1*01) to follow IMGT
        # naming
        germline_value_replacement = {
            "TCRB": "TRB",
            "TCRA": "TRA",
            **{f"-0{i}": f"-{i}" for i in range(10)},
            "J0": "J",
            "V0": "V",
        }

        return AdaptiveImportHelper.parse_germline(dataframe, gene_name_replacement, germline_value_replacement)

    @staticmethod
    def parse_germline(dataframe: pd.DataFrame, gene_name_replacement: dict,
                       germline_value_replacement: dict) -> pd.DataFrame:
        """Produce normalised ``v_call`` / ``j_call`` columns.

        For each of ``v`` and ``j`` the first matching case applies:

        * ``<gene>_call`` exists: both replacement mappings are applied to it as
          regular expressions; if ``<gene>_allele`` also exists, calls without
          ``*`` get ``*<allele>`` appended where the allele contains two digits.
        * ``<gene>_gene`` and ``<gene>_allele`` exist: the call is assembled from them.
        * only ``<gene>_gene`` exists: it is rewritten and renamed to ``<gene>_call``.

        Finally the ``*_gene`` and ``*_allele`` columns are dropped if present.

        Args:
            dataframe: table to update; it is modified in place.
            gene_name_replacement: regex pattern -> replacement for whole gene names.
            germline_value_replacement: regex pattern -> replacement for name fragments.

        Returns:
            The updated data frame.
        """
        for gene in ["v", "j"]:
            call_col, gene_col, allele_col = f"{gene}_call", f"{gene}_gene", f"{gene}_allele"

            if call_col in dataframe.columns:
                dataframe = replace_nans_with_empty_str(dataframe, call_col)

                # Assign the result back instead of `df[col].replace(..., inplace=True)`:
                # the chained in-place form only updates a temporary object under
                # pandas Copy-on-Write and would leave the column unchanged.
                dataframe[call_col] = (dataframe[call_col]
                                       .replace(gene_name_replacement, regex=True)
                                       .replace(germline_value_replacement, regex=True))

                if allele_col in dataframe.columns:
                    allele_as_str = dataframe[allele_col].astype(str)
                    # na=False keeps the masks strictly boolean even if a call is missing
                    call_has_allele = dataframe[call_col].str.contains(r"\*", na=False)
                    allele_is_usable = allele_as_str.str.contains(r"[0-9]{2}", na=False)
                    rows_to_add_allele = ~call_has_allele & allele_is_usable

                    dataframe.loc[rows_to_add_allele, call_col] = (
                        dataframe.loc[rows_to_add_allele, call_col] + '*' + allele_as_str[rows_to_add_allele])

            elif gene_col in dataframe.columns and allele_col in dataframe.columns:
                dataframe = parse_allele(dataframe, gene)
                dataframe = parse_gene_column(dataframe, gene, gene_name_replacement, germline_value_replacement)
                dataframe = make_gene_call_from_gene_and_allele(dataframe, gene)

            elif gene_col in dataframe.columns:
                dataframe = parse_gene_column(dataframe, gene, gene_name_replacement, germline_value_replacement)
                dataframe.rename(columns={gene_col: call_col}, inplace=True)

        dataframe.drop(columns=['v_gene', 'j_gene', 'v_allele', 'j_allele'], inplace=True, errors='ignore')

        return dataframe
[docs]
def set_locus_column(df: pd.DataFrame):
    """Fill the ``locus`` column of ``df`` with values from ``Chain.get_chain_value``.

    An existing ``locus`` column is converted value by value. Otherwise the
    first three characters of ``v_call`` (or, failing that, ``j_call``) are
    converted. When none of the three columns exists, ``df`` is left as it is.

    Args:
        df: table to update in place.

    Returns:
        The same data frame.
    """
    if 'locus' in df.columns:
        df['locus'] = [Chain.get_chain_value(value) for value in df['locus']]
        return df

    for call_column in ('v_call', 'j_call'):
        if call_column in df.columns:
            # only the three-character prefix of the gene call is handed to Chain
            df['locus'] = [Chain.get_chain_value(call[:3]) for call in df[call_column]]
            break

    return df
[docs]
def make_gene_call_from_gene_and_allele(df: pd.DataFrame, gene: str):
    """Create ``<gene>_call`` by combining ``<gene>_gene`` and ``<gene>_allele``.

    The allele is left-padded with zeros to two characters and appended after
    ``*`` (``"1"`` -> ``"*01"``, ``"10"`` -> ``"*10"``). Rows whose allele is
    missing or empty keep the bare gene name instead of getting a dangling
    ``*0`` suffix.

    Args:
        df: table holding string columns ``<gene>_gene`` and ``<gene>_allele``;
            the new column is written to it in place.
        gene: column prefix, e.g. ``"v"`` or ``"j"``.

    Returns:
        The same data frame, now with a ``<gene>_call`` column.
    """
    gene_names = df[f"{gene}_gene"]
    alleles = df[f"{gene}_allele"]

    has_allele = alleles.notna() & (alleles != '')
    with_allele = gene_names + '*' + alleles.str.zfill(2)

    # keep the combined name where an allele is known, the plain gene name elsewhere
    df[f"{gene}_call"] = with_allele.where(has_allele, gene_names)
    return df
[docs]
def parse_allele(df: pd.DataFrame, gene: str):
    """Convert ``<gene>_allele`` to text, blanking missing values.

    Values are rendered as strings and a trailing ``.0`` (as produced when a
    numeric column is rendered as text, e.g. ``1.0``) is removed. Values that
    carry no such suffix, such as ``"01"``, are kept intact rather than being
    shortened by two characters. Missing values and the literal ``'nan'``
    become the empty string.

    Args:
        df: table to update in place; returned unchanged when it has no
            ``<gene>_allele`` column.
        gene: column prefix, e.g. ``"v"`` or ``"j"``.

    Returns:
        The same data frame.
    """
    column = f"{gene}_allele"
    if column not in df.columns:
        return df

    raw = df[column]
    as_text = raw.astype(str)
    # detect missing values both before and after the string conversion, because
    # astype(str) may or may not turn NaN into the literal 'nan'
    missing = raw.isna() | (as_text == 'nan')

    cleaned = as_text.str.replace(r"\.0$", "", regex=True)
    df[column] = cleaned.mask(missing, '')
    return df
[docs]
def parse_gene_column(df: pd.DataFrame, gene, gene_name_replacement, germline_value_replacement):
    """Rewrite ``<gene>_gene`` with the two replacement mappings.

    ``germline_value_replacement`` is applied first and ``gene_name_replacement``
    second; the keys of both are treated as regular expressions.

    Args:
        df: table holding the ``<gene>_gene`` column; it is updated in place.
        gene: column prefix, e.g. ``"v"`` or ``"j"``.
        gene_name_replacement: regex pattern -> replacement, applied second.
        germline_value_replacement: regex pattern -> replacement, applied first.

    Returns:
        The same data frame.
    """
    column = f"{gene}_gene"
    # Assign the result back instead of `df[col].replace(..., inplace=True)`: the
    # chained in-place form acts on a temporary object under pandas Copy-on-Write
    # and would leave the column untouched.
    df[column] = (df[column]
                  .replace(germline_value_replacement, regex=True)
                  .replace(gene_name_replacement, regex=True))
    return df
[docs]
def replace_nans_with_empty_str(df: pd.DataFrame, col: str):
    """Replace missing entries of ``df[col]`` with the empty string.

    Both the literal string ``'nan'`` and real missing values (``NaN`` /
    ``None``) are blanked, so that string operations on the column do not
    encounter non-string entries.

    Args:
        df: table to update in place.
        col: name of the column to clean.

    Returns:
        The same data frame.
    """
    column = df[col]
    # Assign back rather than calling `df[col].replace(..., inplace=True)`, which
    # only changes a temporary object under pandas Copy-on-Write.
    df[col] = column.where(column.notna(), '').replace('nan', '')
    return df
[docs]
def is_valid_sequence_str(x):
    """Tell whether ``x`` is a string other than a known missing-value placeholder.

    Non-string values are rejected. The placeholders ``"unresolved"``,
    ``"no data"``, ``"na"``, ``"unknown"`` and ``"nan"`` are rejected as well;
    the comparison is exact and case-sensitive.

    Args:
        x: value to check.

    Returns:
        ``True`` for any other string (including the empty string), else ``False``.
    """
    if not isinstance(x, str):
        return False

    placeholders = {"unresolved", "no data", "na", "unknown", 'nan'}
    return x not in placeholders