# quality: gold
import copy
import os
import platform
import shutil
from enum import Enum
from pathlib import Path
from typing import List
import pandas as pd
import yaml
from immuneML.IO.dataset_export.DataExporter import DataExporter
from immuneML.data_model.dataset.Dataset import Dataset
from immuneML.data_model.dataset.ReceptorDataset import ReceptorDataset
from immuneML.data_model.dataset.RepertoireDataset import RepertoireDataset
from immuneML.data_model.dataset.SequenceDataset import SequenceDataset
from immuneML.data_model.repertoire.Repertoire import Repertoire
from immuneML.environment.Constants import Constants
from immuneML.util.PathBuilder import PathBuilder
class ImmuneMLExporter(DataExporter):
    """
    Exports a dataset to a folder in immuneML's own format.

    After an export the target folder contains copies of the dataset's data
    files, a YAML file named ``<dataset name>.iml_dataset`` describing the
    dataset object, and an ``info.txt`` file recording the immuneML and Python
    versions that produced the export.
    """

    @staticmethod
    def export(dataset: Dataset, path: Path, number_of_processes: int = 1):
        """
        Export `dataset` into the folder `path`.

        Args:
            dataset: the dataset to export. Data files are copied only for
                RepertoireDataset, SequenceDataset and ReceptorDataset; for any
                other type just the description files are written.
            path: target folder; it is created through PathBuilder.build.
            number_of_processes: not used by this implementation.
                NOTE(review): presumably kept for compatibility with the
                DataExporter interface - confirm.

        Returns:
            A clone of `dataset` (made with keep_identifier=True) whose file
            references point to the copies inside `path`.

        Raises:
            RuntimeError: if a data file that should be copied does not exist.
        """
        PathBuilder.build(path)

        exported_dataset = dataset.clone(keep_identifier=True)
        dataset_filename = f"{exported_dataset.name}.iml_dataset"

        if isinstance(dataset, RepertoireDataset):
            repertoires_path = PathBuilder.build(path / "repertoires")
            exported_dataset.repertoires = ImmuneMLExporter._export_repertoires(dataset.repertoires, repertoires_path)
            exported_dataset.metadata_file = ImmuneMLExporter._export_metadata(dataset, path, dataset_filename,
                                                                               repertoires_path)
        elif isinstance(dataset, (SequenceDataset, ReceptorDataset)):
            exported_dataset.set_filenames(ImmuneMLExporter._export_receptors(exported_dataset.get_filenames(), path))

        # Build the description before opening the file so that a failure while
        # converting attributes does not leave an empty .iml_dataset file behind.
        skipped_attributes = ('repertoires', 'element_generator', 'encoded_data')
        yaml_dict = {key: ImmuneMLExporter._parse_val_for_export(val)
                     for key, val in vars(exported_dataset).items()
                     if key not in skipped_attributes}
        yaml_dict['dataset_class'] = type(exported_dataset).__name__

        with (path / dataset_filename).open("w") as file:
            yaml.dump(yaml_dict, file)

        with (path / "info.txt").open("w") as file:
            file.write(f"immuneML_version: {Constants.VERSION}\n"
                       f"Python_version: {platform.python_version()}\n")

        return exported_dataset

    @staticmethod
    def _parse_val_for_export(val):
        """
        Convert an attribute value into a form suitable for the YAML description.

        Paths and enum members are replaced by their ``name`` (for a path this is
        the final component only), lists consisting solely of paths are converted
        element-wise, dicts are converted recursively, and any other value is
        returned unchanged.
        """
        if isinstance(val, (Path, Enum)):
            return str(val.name)
        if isinstance(val, list) and all(isinstance(v, Path) for v in val):
            return [str(v.name) for v in val]
        if isinstance(val, dict):
            return {inner_key: ImmuneMLExporter._parse_val_for_export(inner_val)
                    for inner_key, inner_val in val.items()}
        return val

    @staticmethod
    def _export_metadata(dataset, metadata_folder_path: Path, dataset_filename, repertoires_path):
        """
        Place the dataset's metadata file in the export folder and update its content.

        The file is copied to ``<dataset name>_metadata.csv`` unless a file with
        that name already exists there. Its ``filename`` column is then rewritten
        to point into `repertoires_path`, and a comment line naming the dataset
        file is put at the top.

        Args:
            dataset: dataset whose ``metadata_file`` and ``name`` are used.
            metadata_folder_path: folder that receives the metadata file.
            dataset_filename: name of the dataset file recorded in the comment line.
            repertoires_path: folder holding the exported repertoire files.

        Returns:
            Path of the exported metadata file, or None when the dataset has no
            metadata file or that file does not exist.
        """
        if dataset.metadata_file is None or not dataset.metadata_file.is_file():
            return None

        metadata_file = metadata_folder_path / f"{dataset.name}_metadata.csv"
        if not metadata_file.is_file():
            shutil.copyfile(dataset.metadata_file, metadata_file)

        ImmuneMLExporter._update_repertoire_paths_in_metadata(metadata_file, repertoires_path)
        ImmuneMLExporter._add_dataset_to_metadata(metadata_file, dataset_filename)

        # NOTE(review): presumably cleans up a metadata file left under a
        # previously used name - confirm before relying on it.
        old_metadata_file = metadata_folder_path / "metadata.csv"
        if old_metadata_file.is_file():
            os.remove(str(old_metadata_file))

        return metadata_file

    @staticmethod
    def _update_repertoire_paths_in_metadata(metadata_file: Path, repertoires_path: Path):
        """
        Rewrite the ``filename`` column of `metadata_file` in place.

        Every entry is reduced to its base name and prefixed with the location of
        `repertoires_path` relative to the folder containing the metadata file.
        Comment lines are skipped on reading and are therefore not present in the
        rewritten file.
        """
        metadata = pd.read_csv(metadata_file, comment=Constants.COMMENT_SIGN)
        relative_repertoires_path = Path(os.path.relpath(str(repertoires_path), str(metadata_file.parent)))
        metadata["filename"] = [str(relative_repertoires_path / os.path.basename(name))
                                for name in metadata["filename"].tolist()]
        metadata.to_csv(metadata_file, index=False)

    @staticmethod
    def _add_dataset_to_metadata(metadata_file: Path, dataset_filename: str):
        """
        Put a comment line containing `dataset_filename` at the top of `metadata_file`.

        The table is read, the file is truncated and started with the comment
        line, and the table is then appended after it.
        """
        # Skip comment lines on reading (as the path-updating helper does) so a
        # file that already carries a dataset line is not parsed with that line
        # as its header and does not accumulate several comment lines.
        metadata = pd.read_csv(metadata_file, comment=Constants.COMMENT_SIGN)

        with metadata_file.open("w") as file:
            file.write(f"{Constants.COMMENT_SIGN}{dataset_filename}\n")

        # Append only after the handle above is closed, so the comment line is
        # already on disk when pandas reopens the file.
        metadata.to_csv(metadata_file, mode="a", index=False)

    @staticmethod
    def _export_receptors(filenames_old: List[Path], path: Path) -> List[Path]:
        """
        Copy each of the given files into `path` and return the new locations.

        Raises:
            RuntimeError: if any of the files does not exist.
        """
        return [ImmuneMLExporter._copy_if_exists(filename_old, path) for filename_old in filenames_old]

    @staticmethod
    def _export_repertoires(repertoires: List[Repertoire], repertoires_path: Path) -> List[Repertoire]:
        """
        Copy the data and metadata file of every repertoire into `repertoires_path`.

        Returns:
            Deep copies of the given repertoires whose ``data_filename`` and
            ``metadata_filename`` point to the copied files; the repertoires
            passed in are left untouched.

        Raises:
            RuntimeError: if a data or metadata file does not exist.
        """
        new_repertoires = []
        for repertoire_old in repertoires:
            repertoire = copy.deepcopy(repertoire_old)
            repertoire.data_filename = ImmuneMLExporter._copy_if_exists(repertoire_old.data_filename,
                                                                        repertoires_path)
            repertoire.metadata_filename = ImmuneMLExporter._copy_if_exists(repertoire_old.metadata_filename,
                                                                            repertoires_path)
            new_repertoires.append(repertoire)
        return new_repertoires

    @staticmethod
    def _copy_if_exists(old_file: Path, path: Path) -> Path:
        """
        Copy `old_file` into the folder `path`, keeping its file name.

        A file of the same name that already exists in `path` is left as it is
        and not overwritten.

        Returns:
            Path of the file inside `path`.

        Raises:
            RuntimeError: if `old_file` is None or is not an existing file.
        """
        if old_file is None or not old_file.is_file():
            raise RuntimeError(f"{ImmuneMLExporter.__name__}: tried exporting file {old_file}, but it does not exist.")

        new_file = path / old_file.name
        if not new_file.is_file():
            shutil.copyfile(old_file, new_file)
        return new_file