from typing import List
import pandas as pd
from immuneML.IO.dataset_export.ImmuneMLExporter import ImmuneMLExporter
from immuneML.IO.dataset_import.DataImport import DataImport
from immuneML.IO.dataset_import.DatasetImportParams import DatasetImportParams
from immuneML.data_model.dataset.ReceptorDataset import ReceptorDataset
from immuneML.data_model.receptor.ChainPair import ChainPair
from immuneML.data_model.receptor.ReceptorBuilder import ReceptorBuilder
from immuneML.data_model.receptor.RegionType import RegionType
from immuneML.data_model.receptor.receptor_sequence.Chain import Chain
from immuneML.data_model.receptor.receptor_sequence.ReceptorSequence import ReceptorSequence
from immuneML.data_model.receptor.receptor_sequence.SequenceMetadata import SequenceMetadata
from immuneML.util.ImportHelper import ImportHelper
from immuneML.util.PathBuilder import PathBuilder
from scripts.specification_util import update_docs_per_mapping
[docs]class SingleLineReceptorImport(DataImport):
"""
Imports data from a tabular file (where each line contains a pair of immune receptor sequences) into a ReceptorDataset.
If you instead want to import a ReceptorDataset from a tabular file that contains one receptor sequence per line,
see :ref:`Generic` import.
Arguments:
path (str): Required parameter. This is the path to a directory with files to import.
receptor_chains (str): Required parameter. Determines which pair of chains to import for each Receptor.
Valid values for receptor_chains are the names of the :py:obj:`~immuneML.data_model.receptor.ChainPair.ChainPair` enum.
import_empty_nt_sequences (bool): imports sequences which have an empty nucleotide sequence field; can be True or False.
By default, import_empty_nt_sequences is set to True.
import_empty_aa_sequences (bool): imports sequences which have an empty amino acid sequence field; can be True or False; for analysis on
amino acid sequences, this parameter should be False (import only non-empty amino acid sequences). By default, import_empty_aa_sequences is set to False.
region_type (str): Which part of the sequence to import. When IMGT_CDR3 is specified, immuneML assumes the IMGT
junction (including leading C and trailing Y/F amino acids) is used in the input file, and the first and last
amino acids will be removed from the sequences to retrieve the IMGT CDR3 sequence. Specifying any other value
will result in importing the sequences as they are.
Valid values for region_type are the names of the :py:obj:`~immuneML.data_model.receptor.RegionType.RegionType` enum.
column_mapping (dict): A mapping where the keys are the column names in the input file, and the values must be
mapped to the following fields: <chain>_amino_acid_sequence, <chain>_nucleotide_sequence, <chain>_v_gene,
<chain>_j_gene, identifier, epitope.
The possible names that can be filled in for <chain> are given in :py:obj:`~immuneML.data_model.receptor.receptor_sequence.Chain.Chain`
Any column other than the sequence, v/j gene and identifier columns will be set as a metadata field on the
Receptors, and can subsequently be used as labels in immuneML instructions.
For TCR alpha-beta receptor import, a column mapping could for example look like this:
.. indent with spaces
.. code-block:: yaml
cdr3_a_aa: alpha_amino_acid_sequence
cdr3_b_aa: beta_amino_acid_sequence
cdr3_a_nucseq: alpha_nucleotide_sequence
cdr3_b_nucseq: beta_nucleotide_sequence
v_a_gene: alpha_v_gene
v_b_gene: beta_v_gene
j_a_gene: alpha_j_gene
j_b_gene: beta_j_gene
clone_id: identifier
epitope: epitope # metadata field
column_mapping_synonyms (dict): This is a column mapping that can be used if a column could have alternative names.
The formatting is the same as column_mapping. If some columns specified in column_mapping are not found in the file,
the columns specified in column_mapping_synonyms are instead attempted to be loaded.
columns_to_load (list): Optional; specifies which columns to load from the input file. This may be useful if
the input files contain many unused columns. If no value is specified, all columns are loaded.
separator (str): Required parameter. Column separator, for example "\\t" or ",".
organism (str): The organism that the receptors came from. This will be set as a parameter in the ReceptorDataset object.
YAML specification:
.. indent with spaces
.. code-block:: yaml
my_receptor_dataset:
format: SingleLineReceptor
params:
path: path/to/files/
receptor_chains: TRA_TRB # what chain pair to import
separator: "\\t" # column separator
import_empty_nt_sequences: True # keep sequences even though the nucleotide sequence might be empty
import_empty_aa_sequences: False # filter out sequences if they don't have sequence_aa set
region_type: IMGT_CDR3 # what part of the sequence to import
columns_to_load: # which subset of columns to load from the file
- subject
- epitope
- count
- v_a_gene
- j_a_gene
- cdr3_a_aa
- v_b_gene
- j_b_gene
- cdr3_b_aa
- clone_id
column_mapping: # column mapping file: immuneML
cdr3_a_aa: alpha_amino_acid_sequence
cdr3_b_aa: beta_amino_acid_sequence
cdr3_a_nucseq: alpha_nucleotide_sequence
cdr3_b_nucseq: beta_nucleotide_sequence
v_a_gene: alpha_v_gene
v_b_gene: beta_v_gene
j_a_gene: alpha_j_gene
j_b_gene: beta_j_gene
clone_id: identifier
epitope: epitope
organism: mouse
"""
@staticmethod
def import_dataset(params, dataset_name: str) -> ReceptorDataset:
    """
    Import the receptor files found under the configured path as one ReceptorDataset.

    Arguments:
        params: either a dict of import parameters (expanded into DatasetImportParams.build_object)
            or an already built parameters object, which is then used as it is.
        dataset_name (str): name assigned to the resulting dataset; it is also handed to
            ImportHelper.get_sequence_filenames when the input files are looked up.

    Returns:
        the imported ReceptorDataset, after it has been exported to the result path.
    """
    if isinstance(params, dict):
        import_params = DatasetImportParams.build_object(**params)
    else:
        import_params = params

    input_files = ImportHelper.get_sequence_filenames(import_params.path, dataset_name)

    # the result folder has to exist before the dataset is built and exported into it
    PathBuilder.build(import_params.result_path, warn_if_exists=True)

    receptor_dataset = SingleLineReceptorImport._import_from_files(input_files, import_params)
    receptor_dataset.name = dataset_name
    receptor_dataset.labels = ImportHelper.extract_sequence_dataset_params(params=import_params)

    ImmuneMLExporter.export(receptor_dataset, import_params.result_path)
    return receptor_dataset
@staticmethod
def _import_from_files(filenames: List[str], generic_params: DatasetImportParams) -> ReceptorDataset:
    """
    Read every input file and turn each row into one paired-chain Receptor.

    Arguments:
        filenames (list): paths of the tabular files to read; every row describes one receptor.
        generic_params (DatasetImportParams): import parameters; separator, columns_to_load,
            column_mapping, receptor_chains, region_type, sequence_file_size and result_path are used.

    Returns:
        a ReceptorDataset built from the receptors of all files.

    Notes:
        - The leading/trailing junction positions (1 amino acid, 3 nucleotides) are removed only when
          region_type is IMGT_CDR3, and for every chain of the selected chain pair. Other region types
          are imported as they are.
        - Exact duplicate rows are dropped. Rows with missing values are kept on purpose: dropping
          them would also discard receptors that merely lack e.g. a nucleotide sequence.
        - Gene and count columns that are absent from the file are passed on as None
          (assumes SequenceMetadata accepts None for these fields - TODO confirm).
    """
    chain_vals = list(generic_params.receptor_chains.value)
    chain_names = [Chain.get_chain(ch).name.lower() for ch in chain_vals]

    trim_junction = generic_params.region_type == RegionType.IMGT_CDR3
    # (column suffix, number of characters removed at each end) to go from IMGT junction to IMGT CDR3
    junction_trims = (("_amino_acid_sequence", 1), ("_nucleotide_sequence", 3))

    # columns whose name contains any of these substrings are not receptor-level metadata
    non_metadata_parts = ["v_gene", "j_gene", "count", "identifier"] + chain_names

    elements = []

    for file in filenames:
        df = pd.read_csv(file, sep=generic_params.separator, usecols=generic_params.columns_to_load)
        # drop_duplicates returns a new frame; the result has to be kept for the call to have any effect
        df = df.drop_duplicates()
        df = df.rename(columns=generic_params.column_mapping)

        if trim_junction:
            for chain_name in chain_names:
                for suffix, width in junction_trims:
                    column = chain_name + suffix
                    if column in df:
                        df[column] = df[column].str[width:-width]

        for chain_name in chain_names:
            df = SingleLineReceptorImport.make_gene_columns(df, ["v", "j"], chain_name)

        metadata_columns = [key for key in df.columns
                            if all(part not in key for part in non_metadata_parts)]

        for _, row in df.iterrows():
            sequences = {}
            for chain_val, chain_name in zip(chain_vals, chain_names):
                # Series.get returns None for columns that are not present in the file
                metadata = SequenceMetadata(
                    v_gene=row.get(f"{chain_name}_v_gene"), v_allele=row.get(f"{chain_name}_v_allele"),
                    v_subgroup=row.get(f"{chain_name}_v_subgroup"),
                    j_gene=row.get(f"{chain_name}_j_gene"), j_allele=row.get(f"{chain_name}_j_allele"),
                    j_subgroup=row.get(f"{chain_name}_j_subgroup"),
                    chain=chain_name, count=row.get("count"), region_type=generic_params.region_type.value)
                sequences[chain_val] = ReceptorSequence(
                    amino_acid_sequence=row.get(f"{chain_name}_amino_acid_sequence"),
                    nucleotide_sequence=row.get(f"{chain_name}_nucleotide_sequence"),
                    metadata=metadata)

            elements.append(ReceptorBuilder.build_object(sequences, row["identifier"],
                                                         {key: row[key] for key in metadata_columns}))

    return ReceptorDataset.build_from_objects(elements, generic_params.sequence_file_size, generic_params.result_path)
@staticmethod
def make_gene_columns(df: pd.DataFrame, genes: list, chain_name=None):
    """
    Derive allele and subgroup columns from the gene columns of the given data frame.

    For every gene in `genes` whose column (`<gene>_gene`, or `<chain_name>_<gene>_gene` when a chain
    name is given) is present: the original value is copied to the matching `_allele` column, the gene
    column is cut at the first "*", and the `_subgroup` column holds the gene cut at the first "-".
    Values that are not strings (e.g. NaN for a missing gene) are passed through unchanged.

    Arguments:
        df (pd.DataFrame): data frame to extend; it is modified in place.
        genes (list): gene prefixes to process, for example ["v", "j"].
        chain_name (str): optional chain prefix of the column names.

    Returns:
        the same data frame object, including the added columns.
    """

    def _before(separator, value):
        # missing genes are read as NaN/None, which have no split method
        return value.split(separator)[0] if isinstance(value, str) else value

    for gene in genes:
        prefix = gene if chain_name is None else f"{chain_name}_{gene}"
        column_name = f"{prefix}_gene"
        if column_name in df.columns:
            df[f"{prefix}_allele"] = df[column_name]
            df[column_name] = [_before("*", item) for item in df[column_name]]
            df[f"{prefix}_subgroup"] = [_before("-", item) for item in df[column_name]]
    return df
@staticmethod
def get_documentation():
    """
    Return the class docstring with the enum cross-references replaced by the actual enum names.

    The three sentences of the class docstring that point to the Chain, ChainPair and RegionType
    enums are swapped (through update_docs_per_mapping) for sentences listing the member names,
    each name wrapped in backticks.
    """

    def backticked_names(enum_class):
        # "'A', 'B'" -> "`A`, `B`"
        return str([member.name for member in enum_class])[1:-1].replace("'", "`")

    chain_names = backticked_names(Chain)
    chain_pair_names = backticked_names(ChainPair)
    region_type_names = backticked_names(RegionType)

    replacements = {}
    replacements["The possible names that can be filled in for <chain> are given in :py:obj:`~immuneML.data_model.receptor.receptor_sequence.Chain.Chain`"] = \
        f"The possible names that can be filled in for <chain> are: {chain_names}."
    replacements["Valid values for receptor_chains are the names of the :py:obj:`~immuneML.data_model.receptor.ChainPair.ChainPair` enum."] = \
        f"Valid values for receptor_chains are: {chain_pair_names}."
    replacements["Valid values for region_type are the names of the :py:obj:`~immuneML.data_model.receptor.RegionType.RegionType` enum."] = \
        f"Valid values are {region_type_names}."

    class_doc = str(SingleLineReceptorImport.__doc__)
    return update_docs_per_mapping(class_doc, replacements)