Source code for immuneML.reports.encoding_reports.RelevantSequenceExporter

import logging
import os
from pathlib import Path

import pandas as pd

from immuneML.data_model.datasets.RepertoireDataset import RepertoireDataset
from immuneML.data_model.SequenceParams import RegionType
from immuneML.encodings.abundance_encoding.CompAIRRSequenceAbundanceEncoder import CompAIRRSequenceAbundanceEncoder
from immuneML.encodings.abundance_encoding.SequenceAbundanceEncoder import SequenceAbundanceEncoder
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.encoding_reports.EncodingReport import EncodingReport
from immuneML.util.PathBuilder import PathBuilder



[docs]
class RelevantSequenceExporter(EncodingReport):
    """
    Exports the sequences that are extracted as label-associated when using the :py:obj:`~immuneML.encodings.abundance_encoding.SequenceAbundanceEncoder.SequenceAbundanceEncoder` or
    :py:obj:`~immuneML.encodings.abundance_encoding.CompAIRRSequenceAbundanceEncoder.CompAIRRSequenceAbundanceEncoder` in AIRR-compliant format.


    **YAML specification:**

    .. indent with spaces
    .. code-block:: yaml

        definitions:
            reports:
                my_relevant_sequences: RelevantSequenceExporter

    """

    def __init__(self, dataset: RepertoireDataset = None, result_path: Path = None, name: str = None, number_of_processes: int = 1):
        super().__init__(dataset=dataset, result_path=result_path, name=name, number_of_processes=number_of_processes)


[docs]
    @classmethod
    def build_object(cls, **kwargs):
        return RelevantSequenceExporter(**kwargs)


    def _generate(self) -> ReportResult:

        df = pd.read_csv(self.dataset.encoded_data.info["relevant_sequence_path"])

        PathBuilder.build(self.result_path)
        filename = self.result_path / "relevant_sequences.csv"
        df.to_csv(filename, index=False)

        return ReportResult(self.name,
                            info=f"Exports the sequences that are extracted as label-associated using the {self.dataset.encoded_data.encoding} in AIRR-compliant format.",
                            output_tables=[ReportOutput(filename, "relevant sequences")])


[docs]
    def check_prerequisites(self):
        valid_encodings = [SequenceAbundanceEncoder.__name__, CompAIRRSequenceAbundanceEncoder.__name__]
        if self.dataset.encoded_data is None or self.dataset.encoded_data.info is None:
            logging.warning("RelevantSequenceExporter: the dataset is not encoded, skipping this report...")
            return False
        elif self.dataset.encoded_data.encoding not in valid_encodings:
            logging.warning(f"RelevantSequenceExporter: the dataset encoding ({self.dataset.encoded_data.encoding}) was not in the list of valid "
                            f"encodings ({valid_encodings}), skipping this report...")
            return False
        elif "relevant_sequence_path" not in self.dataset.encoded_data.info or not os.path.isfile(self.dataset.encoded_data.info['relevant_sequence_path']):
            logging.warning(f"RelevantSequenceExporter: the relevant sequences were not set for this encoded data, skipping this report...")
            return False
        else:
            return True