import warnings
from pathlib import Path
import pandas as pd
from immuneML.data_model.dataset.Dataset import Dataset
from immuneML.data_model.dataset.ReceptorDataset import ReceptorDataset
from immuneML.data_model.dataset.RepertoireDataset import RepertoireDataset
from immuneML.data_model.receptor.receptor_sequence.ReceptorSequence import ReceptorSequence
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.data_reports.DataReport import DataReport
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder
[docs]
class CytoscapeNetworkExporter(DataReport):
"""
This report exports the Receptor sequences to .sif format, such that they can directly be
imported as a network in Cytoscape, to visualize chain sharing between the different receptors
in a dataset (for example, for TCRs: how often one alpha chain is shared with multiple beta chains, and vice versa).
The Receptor sequences can be provided as a ReceptorDataset, or a RepertoireDataset (containing paired sequence
information). In the latter case, one .sif file is exported per Repertoire.
YAML specification:
.. indent with spaces
.. code-block:: yaml
my_cyto_export: CytoscapeNetworkExporter
"""
[docs]
@classmethod
def build_object(cls, **kwargs):
if kwargs["additional_node_attributes"] is None:
kwargs["additional_node_attributes"] = []
if kwargs["additional_edge_attributes"] is None:
kwargs["additional_edge_attributes"] = []
ParameterValidator.assert_type_and_value(kwargs["additional_node_attributes"], list, "CytoscapeNetworkExporter", "additional_node_attributes")
ParameterValidator.assert_type_and_value(kwargs["additional_edge_attributes"], list, "CytoscapeNetworkExporter", "additional_edge_attributes")
return CytoscapeNetworkExporter(**kwargs)
def __init__(self, dataset: Dataset = None, result_path: Path = None,
chains=("alpha", "beta"), drop_duplicates=True,
additional_node_attributes=[], additional_edge_attributes=[], number_of_processes: int = 1, name: str = None,):
super().__init__(dataset=dataset, result_path=result_path, number_of_processes=number_of_processes, name=name)
self.chains = chains
self.drop_duplicates = drop_duplicates
self.additional_node_attributes = additional_node_attributes
self.additional_edge_attributes = additional_edge_attributes
[docs]
def check_prerequisites(self):
if isinstance(self.dataset, RepertoireDataset) or isinstance(self.dataset, ReceptorDataset):
return True
else:
warnings.warn(
"CytoscapeNetworkExporter: report can be generated only from a ReceptorDataset or a RepertoireDataset "
"(with repertoires containing Receptors). Skipping this report...")
return False
def _generate(self):
report_output_tables = []
if isinstance(self.dataset, RepertoireDataset):
for repertoire in self.dataset.get_data():
result_path = self.result_path / repertoire.identifier
PathBuilder.build(result_path)
report_output_tables = self.export_receptorlist(repertoire.receptors, result_path)
elif isinstance(self.dataset, ReceptorDataset):
receptors = self.dataset.get_data()
result_path = self.result_path / self.dataset.identifier
PathBuilder.build(result_path)
report_output_tables = self.export_receptorlist(receptors, result_path=result_path)
return ReportResult(name=self.name,
info="This report exports the Receptor sequences to .sif format, such that they can directly be imported as a network in Cytoscape, to visualize chain sharing between the different receptors in a dataset (for example, for TCRs: how often one alpha chain is shared with multiple beta chains, and vice versa).",
output_tables=report_output_tables)
[docs]
def export_receptorlist(self, receptors, result_path: Path):
export_list = []
node_metadata_list = []
edge_metadata_list = []
for receptor in receptors:
first_chain = receptor.get_chain(self.chains[0])
second_chain = receptor.get_chain(self.chains[1])
first_chain_name = self.get_shared_name(first_chain)
second_chain_name = self.get_shared_name(second_chain)
export_list.append([first_chain_name, "pair", second_chain_name])
node_metadata_list.append([first_chain_name, self.chains[0]] + self.get_formatted_node_metadata(first_chain))
node_metadata_list.append([second_chain_name, self.chains[1]] + self.get_formatted_node_metadata(second_chain))
edge_metadata_list.append(
[f"{first_chain_name} (pair) {second_chain_name}"] + self.get_formatted_edge_metadata(first_chain, second_chain))
full_df = pd.DataFrame(export_list, columns=[self.chains[0], "relationship", self.chains[1]])
node_meta_df = pd.DataFrame(node_metadata_list, columns=["shared_name", "chain", "sequence", "v_subgroup", "v_gene", "j_subgroup",
"j_gene"] + self.additional_node_attributes)
edge_meta_df = pd.DataFrame(edge_metadata_list, columns=["shared_name"] + self.additional_edge_attributes)
node_cols = list(node_meta_df.columns)
node_meta_df["n_duplicates"] = 1
node_meta_df = node_meta_df.groupby(node_cols, as_index=False)["n_duplicates"].sum()
edge_meta_df.drop_duplicates(inplace=True)
node_meta_df.to_csv(result_path / "node_metadata.tsv", sep="\t", index=0, header=True)
edge_meta_df.to_csv(result_path / "edge_metadata.tsv", sep="\t", index=0, header=True)
if self.drop_duplicates:
full_df.drop_duplicates(inplace=True)
full_df.to_csv(result_path / "all_chains.sif", sep="\t", index=0, header=False)
shared_df = full_df[(full_df.duplicated(["alpha"], keep=False)) | (full_df.duplicated(["beta"], keep=False))]
shared_df.to_csv(result_path / "shared_chains.sif", sep="\t", index=0, header=False)
return [ReportOutput(path=result_path / "node_metadata.tsv"),
ReportOutput(path=result_path / "edge_metadata.tsv"),
ReportOutput(path=result_path / "all_chains.sif"),
ReportOutput(path=result_path / "shared_chains.sif")]
[docs]
def get_shared_name(self, seq: ReceptorSequence):
"""Returns a string containing a representation of the given receptor chain, with the chain, sequence, v and j genes.
For example: *a*s=AMREGPEHSGYALN*v=V7-3*j=J41"""
return f"*{seq.get_attribute('chain').value.lower()}" \
f"*s={seq.get_sequence()}" \
f"*v={seq.get_attribute('v_gene')}" \
f"*j={seq.get_attribute('j_gene')}"