class ExperimentalImport(GenerativeModel):
    """
    Imports existing experimental data so that annotations and simulations can be done on top of it.
    This model should be used only for LIgO simulation and not with TrainGenModel instruction.

    Sequences are handed out in dataset order: every call to generate_sequences() continues right after
    the sequences returned by the previous calls, and a RuntimeError is raised once all of them were used.

    **Specification arguments:**

    - import_format (str): format of the files to import; forwarded to the dataset import as 'format'.

    - import_params (dict): parameters of the import; forwarded to the dataset import as 'params'. The 'path'
      entry is additionally stored on the model as the original input file.

    - tmp_import_path (str): directory where temporary files can be stored; must not be an existing file.

    **YAML specification:**

    .. indent with spaces
    .. code-block:: yaml

        definitions:
            ml_methods:
                generative_model:
                    type: ExperimentalImport
                    import_format: AIRR
                    tmp_import_path: ./tmp/
                    import_params:
                        path: path/to/files/
                        region_type: IMGT_CDR3 # what part of the sequence to import
                        column_mapping: # column mapping AIRR: immuneML
                            junction: sequence
                            junction_aa: sequence_aa
                            locus: chain

    """

    def __init__(self, dataset: SequenceDataset, original_input_file: Path = None):
        """
        Args:
            dataset: imported sequences that will be served by generate_sequences()
            original_input_file: location the dataset was imported from
        """
        self._dataset = dataset
        # index of the first sequence that was not yet handed out by generate_sequences()
        self._counter = 0
        self._original_input_file = original_input_file

    @classmethod
    def build_object(cls, **kwargs):
        """
        Builds the model from the specification parameters by importing the experimental dataset.

        Args:
            **kwargs: must contain exactly the keys 'import_format', 'import_params' and 'tmp_import_path'
                (checked through ParameterValidator.assert_keys)

        Returns:
            an ExperimentalImport object wrapping the imported dataset

        Raises:
            AssertionError: if 'tmp_import_path' points to an existing file instead of a directory
        """
        ParameterValidator.assert_keys(kwargs.keys(), ['import_format', 'import_params', "tmp_import_path"],
                                       ExperimentalImport.__name__, 'ExperimentalImport')
        ParameterValidator.assert_type_and_value(kwargs['tmp_import_path'], str, cls.__name__, 'tmp_import_path')

        tmp_import_path = Path(kwargs['tmp_import_path'])

        # Raised explicitly instead of using an `assert` statement so that the check is not stripped when
        # Python runs with -O; AssertionError is kept so that callers see the same exception type as before.
        if tmp_import_path.is_file():
            raise AssertionError(f"{cls.__name__}: parameter 'tmp_import_path' has to point to a directory where temporary files can be stored.")

        # presumably creates the directory if it is missing -- confirm against PathBuilder.build
        PathBuilder.build(tmp_import_path, False)

        import_specs = {'format': kwargs['import_format'], 'params': kwargs['import_params']}
        dataset = ImportParser.parse_dataset("experimental_dataset", import_specs, tmp_import_path)

        print(f"Imported dataset with {dataset.get_example_count()} sequences.")

        return ExperimentalImport(dataset, kwargs['import_params']['path'])

    def generate_sequences(self, count: int, seed: int, path: Path, sequence_type: SequenceType, compute_p_gen: bool):
        """
        Takes the next batch of not-yet-used sequences from the imported dataset and passes them,
        converted to BackgroundSequences, to write_bnp_data together with path.

        Args:
            count: number of sequences requested
            seed: not used by this model (nothing is sampled)
            path: forwarded to write_bnp_data as the output location
            sequence_type: not used by this model
            compute_p_gen: if True, only a warning is logged, since generation probabilities
                cannot be computed for experimental data

        Raises:
            RuntimeError: if all sequences of the dataset were already used by previous calls
        """
        if compute_p_gen:
            logging.warning(f"{ExperimentalImport.__name__}: generation probabilities cannot be computed for experimental data, skipping...")

        if self._counter >= self._dataset.get_example_count():
            raise RuntimeError(f"{ExperimentalImport.__name__}: all sequences provided to the generative model were already used in the simulation, "
                               f"no more new sequences can be imported. Try increasing the number of sequences in the provided files or reduce the "
                               f"number of sequences or repertoires to be generated.")

        # NOTE(review): the end index is presumably inclusive (hence the -1) -- confirm against
        # SequenceDataset.get_data_from_index_range
        last_index = self._counter + count - 1
        sequences = self._dataset.get_data_from_index_range(self._counter, last_index)

        # advance by what was actually returned, which may differ from count (e.g. near the end of the dataset)
        self._counter += len(sequences)

        write_bnp_data(path, BackgroundSequences.build_from_receptor_sequences(sequences))