from pathlib import Path
from immuneML.data_model.dataset.Dataset import Dataset
from immuneML.encodings.EncoderParams import EncoderParams
from immuneML.reports.ReportResult import ReportResult
from immuneML.util.Logger import print_log
from immuneML.util.PathBuilder import PathBuilder
from immuneML.workflows.instructions.Instruction import Instruction
from immuneML.workflows.instructions.exploratory_analysis.ExploratoryAnalysisState import ExploratoryAnalysisState
from immuneML.workflows.instructions.exploratory_analysis.ExploratoryAnalysisUnit import ExploratoryAnalysisUnit
from immuneML.workflows.steps.DataEncoder import DataEncoder
from immuneML.workflows.steps.DataEncoderParams import DataEncoderParams
class ExploratoryAnalysisInstruction(Instruction):
    """
    Allows exploratory analysis of different datasets using encodings and reports.

    Analysis is defined by a dictionary of ExploratoryAnalysisUnit objects that encapsulate a dataset, an encoding [optional]
    and a report to be executed on the [encoded] dataset. Each analysis specified under `analyses` is completely independent from all
    others.

    Arguments:

        analyses (dict): a dictionary of analyses to perform. The keys are the names of different analyses, and the values for each
        of the analyses are:

        - dataset: dataset on which to perform the exploratory analysis

        - preprocessing_sequence: which preprocessings to use on the dataset, this item is optional and does not have to be specified.

        - encoding: how to encode the dataset before running the report, this item is optional and does not have to be specified.

        - labels: if encoding is specified, the relevant labels must be specified here.

        - report: which report to run on the dataset. Reports specified here may be of the category :ref:`Data reports` or :ref:`Encoding reports`, depending on whether 'encoding' was specified.

        number_of_processes: (int): how many processes should be created at once to speed up the analysis. For personal machines, 4 or 8 is usually a good choice.

    YAML specification:

    .. indent with spaces
    .. code-block:: yaml

        my_expl_analysis_instruction: # user-defined instruction name
            type: ExploratoryAnalysis # which instruction to execute
            analyses: # analyses to perform
                my_first_analysis: # user-defined name of the analysis
                    dataset: d1 # dataset to use in the first analysis
                    preprocessing_sequence: p1 # preprocessing sequence to use in the first analysis
                    report: r1 # which report to generate using the dataset d1
                my_second_analysis: # user-defined name of another analysis
                    dataset: d1 # dataset to use in the second analysis - can be the same or different from other analyses
                    encoding: e1 # encoding to apply on the specified dataset (d1)
                    report: r2 # which report to generate in the second analysis
                    labels: # labels present in the dataset d1 which will be included in the encoded data on which report r2 will be run
                        - celiac # name of the first label as present in the column of dataset's metadata file
                        - CMV # name of the second label as present in the column of dataset's metadata file
            number_of_processes: 4 # number of parallel processes to create (could speed up the computation)

    """

    def __init__(self, exploratory_analysis_units: dict, name: str = None):
        """
        Store the analyses to run in a new ExploratoryAnalysisState.

        Args:
            exploratory_analysis_units: dictionary mapping the analysis name to its ExploratoryAnalysisUnit.
            name: optional name of the instruction; also used by `run` as the name of the result subfolder.

        Raises:
            AssertionError: if any value of `exploratory_analysis_units` is not an ExploratoryAnalysisUnit.
        """
        assert all(isinstance(unit, ExploratoryAnalysisUnit) for unit in exploratory_analysis_units.values()), \
            "ExploratoryAnalysisInstruction: not all elements passed to init method are instances of ExploratoryAnalysisUnit."

        self.state = ExploratoryAnalysisState(exploratory_analysis_units, name=name)
        self.name = name

    def run(self, result_path: Path):
        """
        Run every analysis unit in turn and attach the resulting report result to the unit.

        Results of the analysis named `key` go under `result_path / <instruction name> / analysis_<key>`, where the
        instruction name falls back to "exploratory_analysis" when no name was given.

        Args:
            result_path: parent folder under which the instruction's result folder is placed.

        Returns:
            the instruction state (`self.state`), whose units have `report_result` set.
        """
        name = self.name if self.name is not None else "exploratory_analysis"
        self.state.result_path = result_path / name

        units = self.state.exploratory_analysis_units
        unit_count = len(units)  # loop-invariant, only used in the progress messages

        for index, (key, unit) in enumerate(units.items()):
            print_log(f"Started analysis {key} ({index+1}/{unit_count}).", include_datetime=True)

            path = self.state.result_path / f"analysis_{key}"
            PathBuilder.build(path)  # presumably creates the folder if it is missing; see PathBuilder
            unit.report_result = self.run_unit(unit, path)

            print_log(f"Finished analysis {key} ({index+1}/{unit_count}).\n", include_datetime=True)

        return self.state

    def run_unit(self, unit: ExploratoryAnalysisUnit, result_path: Path) -> ReportResult:
        """
        Preprocess and (optionally) encode the unit's dataset, then generate the unit's report on it.

        Note that this replaces `unit.dataset` with the preprocessed dataset and configures `unit.report`
        (dataset, result path, number of processes) in place.

        Args:
            unit: the analysis to execute.
            result_path: folder of this analysis; the subfolders "preprocessed_dataset", "encoded_dataset" and "report"
                are handed to the respective steps.

        Returns:
            whatever `unit.report.generate_report()` returns.
        """
        unit.dataset = self.preprocess_dataset(unit, result_path / "preprocessed_dataset")
        encoded_dataset = self.encode(unit, result_path / "encoded_dataset")

        unit.report.dataset = encoded_dataset
        unit.report.result_path = result_path / "report"
        unit.report.number_of_processes = unit.number_of_processes

        return unit.report.generate_report()

    def preprocess_dataset(self, unit: ExploratoryAnalysisUnit, result_path: Path) -> Dataset:
        """
        Apply the unit's preprocessing steps to its dataset, in order.

        Args:
            unit: the analysis whose `dataset` and `preprocessing_sequence` are used.
            result_path: path passed on to each preprocessing's `process_dataset`.

        Returns:
            the dataset after the last preprocessing, or `unit.dataset` unchanged when no preprocessing is defined.
        """
        dataset = unit.dataset

        # an empty sequence simply results in zero iterations, so only None needs a guard
        if unit.preprocessing_sequence is not None:
            for preprocessing in unit.preprocessing_sequence:
                dataset = preprocessing.process_dataset(dataset, result_path)

        return dataset

    def encode(self, unit: ExploratoryAnalysisUnit, result_path: Path) -> Dataset:
        """
        Encode the unit's dataset with the unit's encoder, if one is set.

        Args:
            unit: the analysis whose `dataset`, `encoder`, `label_config` and `number_of_processes` are used.
            result_path: path passed to the encoder through EncoderParams.

        Returns:
            the result of DataEncoder.run, or `unit.dataset` unchanged when the unit has no encoder.
        """
        if unit.encoder is None:
            return unit.dataset

        encoder_params = EncoderParams(result_path=result_path,
                                       label_config=unit.label_config,
                                       filename="encoded_dataset.pkl",
                                       pool_size=unit.number_of_processes,
                                       learn_model=True,
                                       # labels can only be encoded when a label configuration was provided
                                       encode_labels=unit.label_config is not None)

        return DataEncoder.run(DataEncoderParams(dataset=unit.dataset, encoder=unit.encoder,
                                                 encoder_params=encoder_params))