import os
import subprocess
import warnings
from pathlib import Path
import pandas as pd
from immuneML.data_model.receptor.RegionType import RegionType
from immuneML.environment.EnvironmentSettings import EnvironmentSettings
from immuneML.util.CompAIRRParams import CompAIRRParams
# Helper utilities for locating, invoking and reading the output of the CompAIRR command-line tool.
class CompAIRRHelper:
    """Static helpers for working with the CompAIRR command-line tool.

    The methods cover locating and validating the executable, building its
    command-line arguments, exporting repertoires to a tab-separated input
    file and reading the resulting output file back into pandas.
    """

    @staticmethod
    def determine_compairr_path(compairr_path):
        """Return a validated path to the CompAIRR executable.

        Args:
            compairr_path: path to the executable, or None to try the defaults.

        Returns:
            The path that passed :meth:`check_compairr_path`. When
            ``compairr_path`` is None, ``"compairr"`` is tried first and
            ``"/usr/local/bin/compairr"`` is used as a fallback.

        Raises:
            Exception: if the given path (or, for None, the fallback path)
                does not pass the check.
        """
        if compairr_path is not None:
            return CompAIRRHelper.check_compairr_path(compairr_path)

        try:
            return CompAIRRHelper.check_compairr_path("compairr")
        except Exception:
            # deliberate best-effort: fall back to a fixed install location
            return CompAIRRHelper.check_compairr_path("/usr/local/bin/compairr")

    @staticmethod
    def check_compairr_path(compairr_path):
        """Check that ``compairr_path`` runs and reports version 1.3.2 or later.

        The executable is called with ``--version``; the second
        whitespace-separated token written to stderr is parsed as a
        ``major.minor.patch`` version string.

        Args:
            compairr_path: path (or command name) of the CompAIRR executable.

        Returns:
            ``compairr_path`` unchanged, if the check succeeded.

        Raises:
            Exception: if the executable cannot be called, exits with a
                non-zero code, prints an unparsable version, or is older than
                the required version. The original error is chained as cause.
        """
        required_version = (1, 3, 2)
        required_str = ".".join(str(part) for part in required_version)

        try:
            compairr_result = subprocess.run([str(Path(compairr_path)), "--version"], capture_output=True)
            if compairr_result.returncode != 0:
                raise RuntimeError("exit code was non-zero.")

            # stderr is captured as bytes; decode it instead of str()-ing the bytes object,
            # which would leave b'...' / escaped-newline residue in the tokens
            output = compairr_result.stderr.decode(errors="replace").split()
            found_str = output[1]
            major, minor, patch = (int(part) for part in found_str.split("."))

            # tuples compare lexicographically, which is exactly "version >= required"
            if (major, minor, patch) < required_version:
                raise RuntimeError(f"CompAIRR version {required_str} or higher is required, found version {found_str}")
        except Exception as e:
            raise Exception(f"CompAIRRHelper: failed to call CompAIRR: {e}\n"
                            f"Please ensure the correct version of CompAIRR has been installed (version {required_str} or later), "
                            f"or provide the path to the CompAIRR executable.") from e

        return compairr_path

    @staticmethod
    def get_cmd_args(compairr_params: "CompAIRRParams", input_file_list, result_path):
        """Build the argument list for a CompAIRR subprocess call.

        Args:
            compairr_params: object holding the CompAIRR settings read here
                (compairr_path, differences, threads, indels, ignore_counts,
                ignore_genes, output_filename, log_filename, pairs_filename,
                output_pairs, is_cdr3, do_repertoire_overlap,
                do_sequence_matching).
            input_file_list: input file arguments, inserted after the
                output/log arguments.
            result_path: directory (``Path``) in which the output, log and
                pairs files are placed.

        Returns:
            list: the executable path followed by its arguments.
        """
        # '-m' only when repertoire overlap is requested without sequence matching, otherwise '-x'
        overlap_only = compairr_params.do_repertoire_overlap and not compairr_params.do_sequence_matching
        cmd_args = [str(compairr_params.compairr_path), "-m" if overlap_only else "-x",
                    "-d", str(compairr_params.differences),
                    "-t", str(compairr_params.threads)]

        if compairr_params.indels:
            cmd_args.append("-i")
        if compairr_params.ignore_counts:
            cmd_args.append("-f")
        if compairr_params.ignore_genes:
            cmd_args.append("-g")

        cmd_args += ["-o", str(result_path / compairr_params.output_filename),
                     "-l", str(result_path / compairr_params.log_filename)]
        cmd_args.extend(input_file_list)

        if compairr_params.output_pairs:
            cmd_args += ["-p", str(result_path / compairr_params.pairs_filename)]
        if compairr_params.is_cdr3:
            cmd_args.append("--cdr3")

        return cmd_args

    @staticmethod
    def write_repertoire_file(repertoire_dataset=None, filename=None, compairr_params=None, repertoires: list = None,
                              export_sequence_id: bool = False):
        """Write the contents of several repertoires to one tab-separated file.

        Args:
            repertoire_dataset: dataset whose ``get_data()`` supplies the
                repertoires when ``repertoires`` is not given.
            filename: path of the file to write.
            compairr_params: settings forwarded to
                :meth:`get_repertoire_contents`.
            repertoires: explicit list of repertoires; takes precedence over
                ``repertoire_dataset``.
            export_sequence_id: forwarded to :meth:`get_repertoire_contents`.
        """
        if repertoire_dataset is not None and repertoires is None:
            repertoires = repertoire_dataset.get_data()

        columns_in_order = []
        for ind, repertoire in enumerate(repertoires):
            repertoire_contents = CompAIRRHelper.get_repertoire_contents(repertoire, compairr_params, export_sequence_id)

            is_first = ind == 0
            if is_first:
                # the column order of the first repertoire is reused for all following ones
                columns_in_order = sorted(repertoire_contents.columns)

            # first repertoire creates the file with a header, the rest are appended without one
            repertoire_contents[columns_in_order].to_csv(filename, mode="w" if is_first else "a", header=is_first,
                                                         index=False, sep="\t")

    @staticmethod
    def get_repertoire_contents(repertoire, compairr_params, export_sequence_id=False):
        """Collect one repertoire's attributes into a DataFrame with renamed columns.

        Rows with missing values in the sequence column (and in the counts /
        gene columns unless those are ignored via ``compairr_params``) are
        dropped with a warning. When counts are ignored, every count is set
        to 1.

        Args:
            repertoire: repertoire object providing ``get_attributes``,
                ``get_attribute``, ``get_region_type`` and ``identifier``.
            compairr_params: settings object; ``ignore_genes`` and
                ``ignore_counts`` are read here.
            export_sequence_id: if True, add a ``sequence_id`` column taken
                from the repertoire's ``sequence_identifiers`` attribute.

        Returns:
            pandas.DataFrame with the sequence column renamed to ``cdr3_aa``
            (IMGT_CDR3 region type) or ``junction_aa``, and ``v_call``,
            ``j_call``, ``duplicate_count`` and ``repertoire_id`` columns
            where the corresponding source columns are present.
        """
        sequence_column = EnvironmentSettings.get_sequence_type().value
        gene_columns = [] if compairr_params.ignore_genes else ["v_genes", "j_genes"]

        attributes = [sequence_column, "counts"] + gene_columns
        repertoire_contents = repertoire.get_attributes(attributes)
        repertoire_contents = pd.DataFrame({**repertoire_contents, "identifier": repertoire.identifier})

        if export_sequence_id:
            repertoire_contents['sequence_id'] = repertoire.get_attribute('sequence_identifiers')

        check_na_rows = [sequence_column]
        check_na_rows += [] if compairr_params.ignore_counts else ["counts"]
        check_na_rows += gene_columns

        n_rows_before = len(repertoire_contents)
        repertoire_contents.dropna(inplace=True, subset=check_na_rows)

        if n_rows_before > len(repertoire_contents):
            warnings.warn(
                f"CompAIRRHelper: removed {n_rows_before - len(repertoire_contents)} entries from repertoire {repertoire.identifier} due to missing values.")

        if compairr_params.ignore_counts:
            repertoire_contents["counts"] = 1

        sequence_name = "cdr3_aa" if repertoire.get_region_type() == RegionType.IMGT_CDR3 else 'junction_aa'
        repertoire_contents.rename(columns={sequence_column: sequence_name,
                                            "v_genes": "v_call", "j_genes": "j_call",
                                            "counts": "duplicate_count", "identifier": "repertoire_id"},
                                   inplace=True)

        return repertoire_contents

    @staticmethod
    def verify_compairr_output_path(subprocess_result, compairr_params, result_path):
        """Check that the CompAIRR output file exists and is not empty.

        Args:
            subprocess_result: result of the CompAIRR call; its ``stderr`` is
                included in the error message when the file is missing.
            compairr_params: settings object; ``output_filename``,
                ``log_filename`` and ``compairr_path`` are read here.
            result_path: directory (``Path``) containing the output file.

        Returns:
            Path of the verified output file.

        Raises:
            RuntimeError: if the output file does not exist or has size 0.
        """
        output_file = result_path / compairr_params.output_filename

        if not output_file.is_file():
            raise RuntimeError(
                f"CompAIRRHelper: failed to calculate the distance matrix with CompAIRR ({compairr_params.compairr_path}). "
                f"The following error occurred: {subprocess_result.stderr}")

        if os.path.getsize(output_file) == 0:
            raise RuntimeError(
                f"CompAIRRHelper: failed to calculate the distance matrix with CompAIRR ({compairr_params.compairr_path}), output matrix is empty. "
                f"For details see the log file at {result_path / compairr_params.log_filename}")

        return output_file

    @staticmethod
    def read_compairr_output_file(output_file):
        """Read a tab-separated output file into a DataFrame, using the first column as index."""
        return pd.read_csv(output_file, sep="\t", index_col=0)

    @staticmethod
    def process_compairr_output_file(subprocess_result, compairr_params, result_path):
        """Verify the CompAIRR output file and return its contents as a DataFrame.

        Raises:
            RuntimeError: if the output file is missing or empty (see
                :meth:`verify_compairr_output_path`).
        """
        output_file = CompAIRRHelper.verify_compairr_output_path(subprocess_result, compairr_params, result_path)
        return CompAIRRHelper.read_compairr_output_file(output_file)