@staticmethod
def split_dataset(input_params: DataSplitterParams):
    """Split ``input_params.dataset`` into train and test parts.

    The concrete dataset class decides which subset factory is used:
    repertoire datasets go through ``_make_repertoire_dataset`` and element
    datasets through ``_make_element_dataset``. The actual work is delegated
    to ``ManualSplitter._split_dataset``.

    :param input_params: splitter parameters; its ``dataset`` attribute is
        inspected here and the whole object is forwarded unchanged.
    :return: whatever ``ManualSplitter._split_dataset`` returns for the
        selected factory.
    :raises ValueError: if the dataset is neither a ``RepertoireDataset`` nor
        an ``ElementDataset``.
    """
    dataset = input_params.dataset

    if isinstance(dataset, RepertoireDataset):
        subset_factory = ManualSplitter._make_repertoire_dataset
    elif isinstance(dataset, ElementDataset):
        subset_factory = ManualSplitter._make_element_dataset
    else:
        actual_class_name = type(input_params.dataset).__name__
        # Strip the surrounding brackets of the list's string representation.
        expected_class_names = str(ReflectionHandler.all_nonabstract_subclass_basic_names(Dataset, '', 'dataset/'))[1:-1]
        raise ValueError(f"DataSplitter: dataset is unexpected class: {actual_class_name}, "
                         f"expected one of {expected_class_names}")

    return ManualSplitter._split_dataset(input_params, subset_factory)
@staticmethod
def _split_dataset(input_params, make_dataset_func):
    """Build the train and test subsets from the manually configured metadata files.

    :param input_params: splitter parameters; the metadata paths are read from
        ``input_params.split_config.manual_config``.
    :param make_dataset_func: callable invoked as
        ``make_dataset_func(input_params, metadata_path, dataset_type)`` that
        returns the subset for one metadata file.
    :return: a tuple ``([train_dataset], [test_dataset])`` of two
        single-element lists.
    """
    train_metadata_path = input_params.split_config.manual_config.train_metadata_path
    test_metadata_path = input_params.split_config.manual_config.test_metadata_path

    train_dataset = make_dataset_func(input_params, train_metadata_path, Dataset.TRAIN)
    test_dataset = make_dataset_func(input_params, test_metadata_path, Dataset.TEST)

    return [train_dataset], [test_dataset]


@staticmethod
def _make_element_dataset(input_params, metadata_path, dataset_type: str) -> ElementDataset:
    """Make a subset of an element dataset, matching on the ``example_id`` column.

    :param input_params: splitter parameters holding the dataset to subset.
    :param metadata_path: path of the CSV file listing the examples to keep.
    :param dataset_type: label of the subset being built (the caller passes
        ``Dataset.TRAIN`` or ``Dataset.TEST``).
    :return: the result of ``ManualSplitter._make_subset``.
    """
    example_ids = input_params.dataset.get_example_ids()
    return ManualSplitter._make_subset(input_params, metadata_path, dataset_type, example_ids, 'example_id')


@staticmethod
def _make_repertoire_dataset(input_params, metadata_path, dataset_type: str) -> RepertoireDataset:
    """Make a subset of a repertoire dataset, matching on the ``subject_id`` column.

    :param input_params: splitter parameters holding the dataset to subset.
    :param metadata_path: path of the CSV file listing the subjects to keep.
    :param dataset_type: label of the subset being built (the caller passes
        ``Dataset.TRAIN`` or ``Dataset.TEST``).
    :return: the result of ``ManualSplitter._make_subset``.
    """
    subject_ids = input_params.dataset.get_metadata(["subject_id"])["subject_id"]
    return ManualSplitter._make_subset(input_params, metadata_path, dataset_type, subject_ids, 'subject_id')


@staticmethod
def _make_subset(input_params, metadata_path, dataset_type, example_ids, col_name):
    """Select the examples listed in a metadata file and build a dataset from them.

    Identifiers are compared as strings, so numeric identifiers in the CSV
    match their string counterparts in the dataset (and vice versa).

    :param input_params: splitter parameters holding the dataset to subset.
    :param metadata_path: path of the CSV file listing the identifiers to keep.
    :param dataset_type: label of the subset being built; used in the log
        message and forwarded to ``Util.make_dataset``.
    :param example_ids: identifiers of all examples in the original dataset,
        in dataset order.
    :param col_name: column of the metadata file that holds the identifiers.
    :return: the dataset produced by ``Util.make_dataset`` for the matching
        positions.
    :raises AssertionError: if the identifiers are not unique or the metadata
        file lacks ``col_name``.
    """
    ManualSplitter._check_unique_count(example_ids, input_params.dataset)

    metadata_df = ManualSplitter._get_metadata(metadata_path, dataset_type, col_name)

    # A set gives constant-time membership tests; the list used previously made
    # the selection below quadratic in the number of examples.
    ids_of_interest = set(metadata_df[col_name].astype(str).values.tolist())
    indices = [position for position, example_id in enumerate(example_ids) if str(example_id) in ids_of_interest]

    logging.info(f"{ManualSplitter.__name__}: Making {dataset_type} dataset subset with {len(indices)} elements.")

    # NOTE(review): the literal 0 is presumably the split index expected by
    # Util.make_dataset - confirm against its signature.
    return Util.make_dataset(input_params.dataset, indices, input_params, 0, dataset_type)


@staticmethod
def _check_unique_count(example_ids: list, dataset):
    """Verify that every example identifier occurs exactly once.

    :param example_ids: identifiers of all examples in the original dataset.
    :param dataset: the original dataset; only its ``name`` is used, for the
        error message.
    :raises AssertionError: if the number of unique identifiers differs from
        the number of identifiers.
    """
    unique_example_count = np.unique(example_ids).shape[0]

    # Raised explicitly (rather than via ``assert``) so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if len(example_ids) != unique_example_count:
        raise AssertionError(f"DataSplitter: there are {len(example_ids)} elements, but {unique_example_count} "
                             f"unique identifiers. Check the metadata for the original dataset {dataset.name}.")


@staticmethod
def _get_metadata(metadata_path, dataset_type: str, col_name: str) -> pd.DataFrame:
    """Read a metadata CSV file and make sure it has the identifier column.

    :param metadata_path: path of the CSV file to read.
    :param dataset_type: label of the subset being built; used only in the
        error message.
    :param col_name: column that must be present in the file.
    :return: the file content as a data frame.
    :raises AssertionError: if ``col_name`` is not a column of the file.
    """
    metadata_df = pd.read_csv(metadata_path)

    # Raised explicitly (rather than via ``assert``) so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if col_name not in metadata_df:
        raise AssertionError(f"DataSplitter: {dataset_type} metadata {os.path.basename(metadata_path)} is missing column "
                             f"'{col_name}' which should be used for matching examples when splitting to train and test data.")

    return metadata_df