# Source code for immuneML.api.galaxy.build_clustering_yaml

import argparse
import sys
from pathlib import Path

from immuneML.api.galaxy.Util import Util
from immuneML.data_model.bnp_util import write_yaml
from immuneML.util.PathBuilder import PathBuilder

from immuneML.ml_metrics.ClusteringMetric import INTERNAL_EVAL_METRICS, EXTERNAL_EVAL_METRICS



[docs]
def build_labels(labels_str):
    """Split a comma-separated label string into a list of clean label names.

    Each item has its surrounding whitespace removed first and then any
    leading/trailing single or double quote characters.

    Args:
        labels_str: comma-separated label names, e.g. ``"label_1, 'label_2'"``.

    Returns:
        A list with one entry per comma-separated item, in input order.
        Empty items are kept as empty strings (``""`` yields ``[""]``).
    """
    cleaned_labels = []
    for raw_label in labels_str.split(","):
        without_whitespace = raw_label.strip()
        cleaned_labels.append(without_whitespace.strip("'\""))
    return cleaned_labels




[docs]
def parse_command_line_arguments(args):
    """Parse the command-line options of the clustering YAML builder.

    Args:
        args: list of argument strings, without the program name
            (e.g. ``sys.argv[1:]``).

    Returns:
        The ``argparse.Namespace`` with the attributes ``labels``,
        ``eval_metrics``, ``encoding_k``, ``n_clusters``, ``dim_red_method``,
        ``training_percentage``, ``output_path`` and ``file_name``.

    Raises:
        SystemExit: raised by argparse when required options are missing or
            a value is not among the allowed choices.
    """
    parser = argparse.ArgumentParser(
        description="Tool for building a YAML specification for clustering analysis in Galaxy")

    parser.add_argument("-l", "--labels", type=str, default="",
                        help="Metadata labels to use in the clustering analysis (separated by comma). "
                             "The first label is also passed to the dimensionality reduction report.")
    # The choices cover both internal and external metrics, so the help text must not claim "external" only.
    parser.add_argument("-e", "--eval_metrics", type=str, choices=INTERNAL_EVAL_METRICS + EXTERNAL_EVAL_METRICS,
                        default=[], nargs="+",
                        help="Evaluation metrics to use for clustering (internal and/or external). "
                             "For external metrics, clusters are compared to a provided label.")

    parser.add_argument("-k", "--encoding_k", type=int, required=True,
                        help="Value of k (k-mer length) for the KmerFrequency encoding.")
    parser.add_argument("-n", "--n_clusters", type=int, required=True,
                        help="Number of clusters for the KMeans clustering method.")
    parser.add_argument("-d", "--dim_red_method", type=str, choices=["PCA", "UMAP", "TSNE", "None"], default="None",
                        help="Dimensionality reduction method used by the dimensionality reduction report. "
                             "With 'None' (default) the report is not added to the instruction.")
    parser.add_argument("-t", "--training_percentage", type=int, default=100,
                        help="Percentage of the dataset to use for training, given as an integer "
                             "(it is divided by 100 in the generated specification). Default is 100.")

    parser.add_argument("-o", "--output_path", required=True,
                        help="Output location for the generated yaml file (directory).")
    parser.add_argument("-f", "--file_name", default="specs.yaml",
                        help="Output file name for the yaml file. Default name is 'specs.yaml' if not specified.")

    return parser.parse_args(args)




[docs]
def build_specs(parsed_args):
    """Build the specification dictionary for a clustering run.

    The specification defines one AIRR dataset (path obtained from
    ``Util.discover_dataset_path()``), a KmerFrequency encoding, a KMeans
    clustering method, a 2-component PCA (``pca_2``) used in the clustering
    setting, and a single ``Clustering`` instruction with a random split.

    Args:
        parsed_args: namespace providing ``encoding_k``, ``n_clusters``,
            ``eval_metrics``, ``training_percentage``, ``labels`` and
            ``dim_red_method``.

    Returns:
        A nested dict with the top-level keys ``definitions`` and
        ``instructions``.
    """
    specs = {
        "definitions": {
            "datasets": {
                "dataset": {
                    "format": "AIRR",
                    "params": {"dataset_file": Util.discover_dataset_path()}
                }
            },
            "reports": {
                "dim_reduction": {"DimensionalityReduction": {"dim_red_method": {}}}
            },
            "encodings": {
                "kmer": {
                    "KmerFrequency": {
                        "k": parsed_args.encoding_k
                    }
                }
            },
            "ml_methods": {
                "kmeans": {
                    "KMeans": {
                        "n_clusters": parsed_args.n_clusters
                    }
                },
                "pca_2": {
                    "PCA": {
                        "n_components": 2
                    }
                }
            }
        },
        "instructions": {
            "clustering": {
                "type": "Clustering",
                "dataset": "dataset",
                "metrics": parsed_args.eval_metrics,
                "clustering_settings": [
                    {"encoding": "kmer", "dim_reduction": "pca_2", "method": "kmeans"},
                ],
                "split_config": {
                    "split_strategy": "random",
                    # the command line takes a percentage, the specification a fraction
                    "training_percentage": parsed_args.training_percentage / 100
                }
            }
        }
    }

    # Short handles to the nested parts that are extended below.
    ml_methods = specs["definitions"]["ml_methods"]
    report_params = specs["definitions"]["reports"]["dim_reduction"]["DimensionalityReduction"]
    clustering = specs["instructions"]["clustering"]

    if parsed_args.labels != "":
        labels = build_labels(parsed_args.labels)
        clustering["labels"] = labels
        # the report takes a single label, so the first one is used
        report_params["label"] = labels[0]

    method = parsed_args.dim_red_method
    if method in ("PCA", "TSNE", "UMAP"):
        method_params = {"n_components": 2}
        if method == "TSNE":
            method_params["init"] = "random"
        ml_methods[method.lower()] = {method: method_params}

        # A separate dict on purpose: the report parameters never get 'init',
        # and sharing one object would make the YAML dump emit aliases.
        report_params["dim_red_method"] = {method: {"n_components": 2}}
        clustering["reports"] = ["dim_reduction"]

    return specs




[docs]
def main(args):
    """Generate the clustering YAML specification file from command-line arguments.

    Args:
        args: list of argument strings, without the program name.

    Returns:
        The path of the written YAML file, as a string.
    """
    options = parse_command_line_arguments(args)
    yaml_content = build_specs(options)

    # Prepare the output directory before composing the target file path.
    PathBuilder.build(options.output_path)
    target_file = Path(options.output_path) / options.file_name
    write_yaml(target_file, yaml_content)

    return str(target_file)



# Script entry point: pass the command-line arguments (without the program name) to main().
if __name__ == "__main__":
    main(sys.argv[1:])