Source code for immuneML.api.galaxy.build_dataset_overview_yaml

import argparse
import sys
from pathlib import Path

import yaml
import os

from immuneML.api.galaxy.Util import Util
from immuneML.util.PathBuilder import PathBuilder
from immuneML.data_model.SequenceParams import RegionType




[docs]
def get_dataset_specs(args):
    """
    Build the dataset definition for a dataset that is imported from files.

    The import parameters always contain the IMGT CDR3 region type and use the
    current directory ("./") as both the data path and the result path.

    Args:
        args: parsed command-line arguments; reads ``format``, ``is_repertoire``
            and, depending on the dataset kind, ``metadata_file``,
            ``metadata_columns``, ``paired`` and ``receptor_chains``.

    Returns:
        dict: a mapping with the single key ``"dataset"`` whose value holds the
        ``"format"`` and ``"params"`` of the dataset.
    """
    import_params = {
        "region_type": RegionType.IMGT_CDR3.name,
        "result_path": "./",
        "path": "./",
    }

    # The flags arrive as the strings "True"/"False", not as booleans.
    is_repertoire = args.is_repertoire == "True"
    import_params["is_repertoire"] = is_repertoire

    if is_repertoire:
        import_params["metadata_file"] = args.metadata_file
    else:
        # Label columns are passed as a single comma-separated string.
        if args.metadata_columns != "":
            import_params["label_columns"] = args.metadata_columns.split(",")

        is_paired = args.paired == "True"
        import_params["paired"] = is_paired
        if is_paired:
            import_params["receptor_chains"] = args.receptor_chains

    return {"dataset": {"format": args.format, "params": import_params}}





[docs]
def add_report_with_label(specs, args, report_name, report_key):
    """
    Register a report and the analysis that runs it in ``specs`` (modified in place).

    The report is stored under ``"<report_key>_report"`` and the analysis under
    ``"<report_key>_analysis"``. When ``args.label_name`` is not an empty string,
    the report is defined with a ``"label"`` parameter set to that name;
    otherwise only the report name is stored.

    Args:
        specs: specification dict containing ``definitions.reports`` and
            ``instructions.my_dataset_generation_instruction.analyses``.
        args: parsed command-line arguments; only ``label_name`` is read.
        report_name: name of the report class to run.
        report_key: prefix used to derive the report and analysis identifiers.
    """
    report_id = f"{report_key}_report"
    analysis_id = f"{report_key}_analysis"

    if args.label_name == "":
        report_definition = report_name
    else:
        report_definition = {report_name: {"label": args.label_name}}

    specs["definitions"]["reports"][report_id] = report_definition

    analyses = specs["instructions"]["my_dataset_generation_instruction"]["analyses"]
    analyses[analysis_id] = {"dataset": "dataset", "report": report_id}




[docs]
def build_specs(args):
    """
    Assemble the complete specification dict for an ExploratoryAnalysis instruction.

    The dataset is either an existing AIRR dataset, located through
    ``Util.discover_dataset_path()``, or a dataset described by
    ``get_dataset_specs(args)``. One report and one analysis are added for each
    report flag that equals the string ``"True"``. If no report was requested,
    a single ``"dataset_overview"`` analysis without a report is used.

    Args:
        args: parsed command-line arguments (see ``parse_commandline_arguments``).

    Returns:
        dict: the specification with ``"definitions"`` and ``"instructions"`` sections.
    """
    definitions = {"datasets": dict(), "reports": dict()}
    instruction = {"type": "ExploratoryAnalysis", "analyses": dict()}
    specs = {
        "definitions": definitions,
        "instructions": {"my_dataset_generation_instruction": instruction},
    }

    if args.existing_dataset == "True":
        definitions["datasets"]["dataset"] = {
            "format": "AIRR",
            "params": {"dataset_file": Util.discover_dataset_path()},
        }
    else:
        definitions["datasets"] = get_dataset_specs(args)

    # The sequence length report is always registered without a label.
    if args.sequence_length_report == "True":
        definitions["reports"]["sequence_length_report"] = "SequenceLengthDistribution"
        instruction["analyses"]["sequence_length_analysis"] = {
            "dataset": "dataset",
            "report": "sequence_length_report",
        }

    # (report_name, report_key) pairs, in the order in which they are registered.
    labelled_reports = []

    if args.sequence_count_report == "True":
        labelled_reports.append(("SequenceCountDistribution", "sequence_count"))

        # For an 'existing' dataset it is not known in advance whether it is a
        # repertoire dataset. 'is_repertoire' is True by default in Galaxy, and
        # running the repertoire report on a non-repertoire dataset at worst makes
        # that report fail with a warning.
        if args.is_repertoire == "True":
            labelled_reports.append(("RepertoireClonotypeSummary", "repertoire_clone_count"))

    if args.vj_gene_report == "True":
        labelled_reports.append(("VJGeneDistribution", "vj_gene"))

    if args.amino_acid_report == "True":
        labelled_reports.append(("AminoAcidFrequencyDistribution", "amino_acid"))

    for name, key in labelled_reports:
        add_report_with_label(specs, args, report_name=name, report_key=key)

    # Without any requested report, fall back to one analysis that has no report.
    if not instruction["analyses"]:
        instruction["analyses"] = {"dataset_overview": {"dataset": "dataset", "report": None}}

    return specs




[docs]
def parse_commandline_arguments(args):
    """
    Parse the command-line arguments of the YAML-building tool.

    All boolean-like options are passed as the strings ``"True"`` or ``"False"``.
    Only ``--output_path`` is required.

    Args:
        args: list of argument strings, without the program name.

    Returns:
        argparse.Namespace: the parsed arguments.
    """
    true_false = ["True", "False"]

    # (flags, keyword options) in the order in which the options are registered.
    argument_table = [
        (("-x", "--existing_dataset"),
         dict(choices=true_false, default="False",
              help="Whether to use an already existing dataset from the current working directory (use the 'dataset_name' parameter to pass the name).")),
        (("-r", "--format"),
         dict(help="The format of the repertoire/receptor dataset")),
        (("-m", "--metadata_file"),
         dict(default="",
              help="The metadata file when using a repertoire dataset. When using a receptor dataset, you may supply an empty string.")),
        (("-i", "--is_repertoire"),
         dict(choices=true_false,
              help="Whether to import a RepertoireDataset")),
        (("-p", "--paired"),
         dict(choices=true_false, default="False",
              help="When the data is not repertoire data (metadata file = ''), this specifies whether the data is paired (ReceptorDataset) or unpaired (SequenceDataset)")),
        (("-c", "--receptor_chains"),
         dict(choices=["TRA_TRB", "TRG_TRD", "IGH_IGL", "IGH_IGK"], default="TRA_TRB",
              help="When the data is a ReceptorDataset, this specifies the type of receptor chains that are used.")),
        (("-a", "--metadata_columns"),
         dict(default="",
              help="The name of metadata columns of a Sequence- or ReceptorDataset.")),
        (("-l", "--label_name"),
         dict(default="", help="The label name to be used for reports.")),
        (("-s", "--sequence_length_report"),
         dict(choices=true_false, default="False",
              help="Whether to run the SequenceLengthDistribution report.")),
        (("-u", "--sequence_count_report"),
         dict(choices=true_false, default="False",
              help="Whether to run the SequenceCountDistribution report.")),
        (("-g", "--vj_gene_report"),
         dict(choices=true_false, default="False",
              help="Whether to run the VJGeneDistribution report.")),
        (("-q", "--amino_acid_report"),
         dict(choices=true_false, default="False",
              help="Whether to run the AminoAcidFrequencyDistribution report.")),
        (("-o", "--output_path"),
         dict(required=True,
              help="Output location for the generated yaml file (directiory).")),
        (("-f", "--file_name"),
         dict(default="specs.yaml",
              help="Output file name for the yaml file. Default name is 'specs.yaml' if not specified.")),
    ]

    parser = argparse.ArgumentParser(
        description="Tool for building immuneML defintion YAML for Galaxy Create Dataset tool")
    for flags, options in argument_table:
        parser.add_argument(*flags, **options)

    return parser.parse_args(args)




[docs]
def main(args):
    """
    Build the specification from command-line arguments and write it to a YAML file.

    Args:
        args: list of command-line argument strings, without the program name.

    Returns:
        str: location of the written YAML file (``output_path`` joined with ``file_name``).
    """
    options = parse_commandline_arguments(args)
    yaml_specs = build_specs(options)

    # Prepare the output directory before the file is opened for writing.
    PathBuilder.build(options.output_path)
    target_file = Path(options.output_path) / options.file_name

    with target_file.open("w") as stream:
        yaml.dump(yaml_specs, stream)

    return str(target_file)



# Script entry point: pass the command-line arguments (without the program name) to main().
if __name__ == "__main__":
    main(sys.argv[1:])