class DatasetExportInstruction(Instruction):
    """
    DatasetExport instruction takes a list of datasets as input, optionally applies preprocessing steps, and outputs
    the data in specified formats.

    **Specification arguments:**

    - datasets (list): a list of datasets to export in all given formats

    - preprocessing_sequence (list): which preprocessing sequence to use on the dataset(s), this item is optional and does not have to be specified. When specified, the same preprocessing sequence will be applied to all datasets.

    - exporters (list): a list of formats in which to export the datasets. Valid formats are class names of any non-abstract class inheriting :py:obj:`~immuneML.IO.dataset_export.DataExporter.DataExporter`.

    - number_of_processes (int): how many processes to use during repertoire export (not used for sequence datasets)

    **YAML specification:**

    .. indent with spaces
    .. code-block:: yaml

        instructions:
            my_dataset_export_instruction: # user-defined instruction name
                type: DatasetExport # which instruction to execute
                datasets: # list of datasets to export
                    - my_generated_dataset
                    - my_dataset_from_adaptive
                preprocessing_sequence: my_preprocessing_sequence
                number_of_processes: 4
                export_formats: # list of formats to export the datasets to
                    - AIRR
                    - ImmuneML

    """

    # Class-name suffix removed to obtain the user-facing format name
    # (e.g. a class named "AIRRExporter" yields "AIRR").
    _EXPORTER_SUFFIX = "Exporter"

    def __init__(self, datasets: List[Dataset], exporters: List[DataExporter], number_of_processes: int = 1,
                 preprocessing_sequence: List[Preprocessor] = None, result_path: Path = None, name: str = None):
        """
        Store the export configuration; no work is done until :py:meth:`run` is called.

        Args:
            datasets: datasets to export.
            exporters: exporter classes; each is used via its ``export`` attribute and ``__name__``.
            number_of_processes: forwarded to every exporter's ``export`` call.
            preprocessing_sequence: optional preprocessors applied, in order, to every dataset before export.
            result_path: stored as-is; overwritten by :py:meth:`run`.
            name: instruction name, used by :py:meth:`run` as the output sub-directory.
        """
        self.datasets = datasets
        self.exporters = exporters
        self.preprocessing_sequence = preprocessing_sequence
        self.result_path = result_path
        self.number_of_processes = number_of_processes
        self.name = name

    @classmethod
    def _format_name(cls, exporter: type) -> str:
        """Return the export format name for an exporter class: its class name without the 'Exporter' suffix."""
        class_name = exporter.__name__
        suffix = cls._EXPORTER_SUFFIX
        # Only strip when the suffix is really there, so an unconventionally named class is not truncated.
        if class_name.endswith(suffix):
            return class_name[:-len(suffix)]
        return class_name

    def _preprocess(self, dataset, dataset_name):
        """Run every configured preprocessing step on the dataset, in order, and return the resulting dataset."""
        steps = self.preprocessing_sequence or []
        step_count = len(steps)

        for index, preprocessing in enumerate(steps):
            print_log(f"For dataset {dataset_name}, started preprocessing step {index+1}/{step_count} "
                      f"with {preprocessing.__class__.__name__}", include_datetime=True)
            # NOTE(review): the step directory does not include the dataset name, so all datasets share the same
            # step_<i> folder; confirm preprocessors write dataset-specific files there.
            dataset = preprocessing.process_dataset(dataset, self.result_path / f"step_{index+1}")
            print_log(f"Preprocessed {dataset.__class__.__name__.split('Dataset')[0].lower()} dataset {dataset.name} with {preprocessing.__class__.__name__}:\n"
                      f"- Example count: {dataset.get_example_count()}\n"
                      f"- Labels: {dataset.get_label_names()}", True)

        return dataset

    def _export(self, dataset, dataset_name) -> dict:
        """Export the dataset with every configured exporter and return a mapping of format name -> output path."""
        format_paths = {}
        # Used only in the log message, e.g. a class name "RepertoireDataset" becomes "repertoires".
        contains = str(dataset.__class__.__name__).replace("Dataset", "s").lower()

        for exporter in self.exporters:
            export_format = self._format_name(exporter)
            path = self.result_path / dataset_name / export_format
            exporter.export(dataset, path, number_of_processes=self.number_of_processes)
            format_paths[export_format] = path
            print_log(f"Exported dataset {dataset_name} containing {dataset.get_example_count()} "
                      f"{contains} in {export_format} format.", include_datetime=True)

        return format_paths

    def run(self, result_path: Path) -> DatasetExportState:
        """
        Preprocess (if configured) and export every dataset in every configured format.

        Output is written under ``result_path / self.name``; each export goes to
        ``<that directory> / <dataset name> / <format name>``.

        Args:
            result_path: parent directory under which the instruction's own directory is placed.

        Returns:
            DatasetExportState holding the original datasets, the format names, the preprocessing sequence,
            and a nested dict ``paths[dataset_name][format_name]`` with the export locations.
        """
        self.result_path = result_path / self.name
        paths = {}

        for dataset in self.datasets:
            # Fall back to the identifier when the dataset has no name.
            dataset_name = dataset.name if dataset.name is not None else dataset.identifier
            processed_dataset = self._preprocess(dataset, dataset_name)
            paths[dataset_name] = self._export(processed_dataset, dataset_name)

        return DatasetExportState(datasets=self.datasets,
                                  formats=[self._format_name(exporter) for exporter in self.exporters],
                                  preprocessing_sequence=self.preprocessing_sequence, paths=paths,
                                  result_path=self.result_path, name=self.name)

    @staticmethod
    def get_documentation():
        """
        Return the class docstring adapted for user-facing (YAML) documentation.

        The generic sentence about valid exporter classes is replaced by the list of format names obtained from
        ReflectionHandler, and argument names/types are rewritten to the form used in the YAML specification.
        """
        doc = str(DatasetExportInstruction.__doc__)

        format_names = ReflectionHandler.all_nonabstract_subclass_basic_names(
            DataExporter, DatasetExportInstruction._EXPORTER_SUFFIX, "dataset_export/")
        valid_strategy_values = ", ".join(f"`{format_name}`" for format_name in format_names)

        mapping = {
            "Valid formats are class names of any non-abstract class inheriting "
            ":py:obj:`~immuneML.IO.dataset_export.DataExporter.DataExporter`.": f"Valid values are: {valid_strategy_values}.",
            "preprocessing_sequence (list)": "preprocessing_sequence (str)",
            # The YAML example in the class docstring uses the key `export_formats`, so document that name.
            "exporters (list)": "export_formats (list)"
        }

        doc = update_docs_per_mapping(doc, mapping)
        return doc