import logging
from pathlib import Path
import pandas as pd
import plotly.express as px
from immuneML.data_model.datasets.ElementDataset import SequenceDataset
from immuneML.encodings.motif_encoding.MotifEncoder import MotifEncoder
from immuneML.encodings.motif_encoding.PositionalMotifHelper import PositionalMotifHelper
from immuneML.reports.ReportOutput import ReportOutput
from immuneML.reports.ReportResult import ReportResult
from immuneML.reports.encoding_reports.EncodingReport import EncodingReport
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder
[docs]
class NonMotifSequenceSimilarity(EncodingReport):
"""
Plots the similarity of positions outside the motifs of interest. This report can be used to investigate if the
motifs of interest as determined by the :py:obj:`~immuneML.encodings.motif_encoding.MotifEncoder.MotifEncoder`
have a tendency occur in sequences that are naturally very similar or dissimilar.
For each motif, the subset of sequences containing the motif is selected, and the hamming distances are computed
between all sequences in this subset. Finally, a plot is created showing the distribution of hamming distances
between the sequences containing the motif. For motifs occurring in sets of very similar sequences, this distribution
will lean towards small hamming distances. Likewise, for motifs occurring in a very diverse set of sequences, the
distribution will lean towards containing more large hamming distances.
**Specification arguments:**
- motif_color_map (dict): An optional mapping between motif sizes and colors. If no mapping is given, default colors will be chosen.
**YAML specification:**
.. indent with spaces
.. code-block:: yaml
definitions:
reports:
my_motif_sim:
NonMotifSimilarity:
motif_color_map:
3: "#66C5CC"
4: "#F6CF71"
5: "#F89C74"
"""
[docs]
@classmethod
def build_object(cls, **kwargs):
if "motif_color_map" in kwargs:
ParameterValidator.assert_type_and_value(kwargs["motif_color_map"], dict, NonMotifSequenceSimilarity.__name__, "motif_color_map")
kwargs["motif_color_map"] = {str(key): value for key, value in kwargs["motif_color_map"].items()}
return NonMotifSequenceSimilarity(**kwargs)
def __init__(self, dataset: SequenceDataset = None, result_path: Path = None,
motif_color_map: dict = None, name: str = None, number_of_processes: int = 1):
super().__init__(dataset=dataset, result_path=result_path, name=name, number_of_processes=number_of_processes)
self.sequence_length = 0
self.motif_color_map = motif_color_map
def _generate(self):
PathBuilder.build(self.result_path)
raw_counts = self.get_hamming_distance_counts()
plotting_data = self.get_plotting_data(raw_counts)
table_1 = self._write_output_table(raw_counts, self.result_path / "sequence_hamming_distances_raw.tsv",
"Hamming distances between sequences sharing the same motif, raw counts.")
table_2 = self._write_output_table(plotting_data, self.result_path / "sequence_hamming_distances_percentage.tsv",
"Hamming distances between sequences sharing the same motif, expressed as percentage.")
output_figure = self._safe_plot(plotting_data=plotting_data)
return ReportResult(
name=self.name,
output_figures=[output_figure],
output_tables=[table_1, table_2],
)
[docs]
def get_hamming_distance_counts(self):
np_sequences = PositionalMotifHelper.get_numpy_sequence_representation(self.dataset)
self.sequence_length = len(np_sequences[0])
raw_counts = pd.DataFrame([self._make_hamming_distance_hist_for_motif(motif_presence, np_sequences)
for motif_presence in self.dataset.encoded_data.examples.T])
### Original code with multiprocessing (fails with bionumpy + pickle error?)
# with Pool(processes=self.number_of_processes) as pool:
# partial_func = partial(self._make_hamming_distance_hist_for_motif, np_sequences=np_sequences)
# raw_counts = pd.DataFrame(pool.map(partial_func, self.dataset.encoded_data.examples.T))
raw_counts["motif"] = self.dataset.encoded_data.feature_names
return raw_counts
def _make_hamming_distance_hist_for_motif(self, motif_presence, np_sequences):
positive_sequences = np_sequences[motif_presence]
return self._make_hamming_distance_hist(positive_sequences)
def _make_hamming_distance_hist(self, sequences):
counts = {i: 0 for i in range(self.sequence_length)}
for dist in self._calculate_all_hamming_distances(sequences):
counts[dist] += 1
return counts
def _calculate_all_hamming_distances(self, sequences):
for i in range(len(sequences)):
for j in range(i + 1, len(sequences)):
yield sum(sequences[i] != sequences[j])
[docs]
def get_plotting_data(self, raw_counts):
motif_col = raw_counts.loc[:,"motif"]
counts = raw_counts.loc[:, raw_counts.columns != "motif"]
total = counts.apply(sum, axis=1)
plotting_data = counts.div(total, axis=0)
plotting_data["motif"] = motif_col
plotting_data = plotting_data[total > 0]
plotting_data = plotting_data.loc[::-1]
plotting_data = pd.melt(plotting_data, id_vars=["motif"], var_name="Hamming", value_name="Percentage")
plotting_data["Hamming"] = plotting_data["Hamming"].astype(str)
plotting_data["motif_size"] = plotting_data["motif"].apply(lambda motif: len(motif.split("-")[0].split("&")))
return plotting_data
def _plot(self, plotting_data) -> ReportOutput:
if self.motif_color_map is not None:
color_discrete_map = self.motif_color_map
color_discrete_sequence = None
else:
color_discrete_map = None
color_discrete_sequence = px.colors.sequential.Sunsetdark
plotting_data["motif_size"] = plotting_data["motif_size"].astype(str)
fig = px.line(plotting_data, x='Hamming', y='Percentage', markers=True, line_group='motif', color="motif_size",
template='plotly_white', color_discrete_sequence=color_discrete_sequence, color_discrete_map=color_discrete_map,
labels={"Percentage": "Percentage of sequence pairs containing motif",
"Hamming": "Hamming distance between sequences",
"motif_size": "Motif length<br>(number of<br>amino acids)"})
fig.layout.yaxis.tickformat = ',.0%'
fig.update_traces(opacity=0.7)
fig.update_layout(
font=dict(
size=14,
)
)
output_path = self.result_path / "sequence_hamming_distances.html"
fig.write_html(str(output_path))
return ReportOutput(path=output_path, name="Hamming distances between sequences sharing the same motif")
[docs]
def check_prerequisites(self):
valid_encodings = [MotifEncoder.__name__]
if self.dataset.encoded_data is None or self.dataset.encoded_data.info is None:
logging.warning(f"{self.__class__.__name__}: the dataset is not encoded, skipping this report...")
return False
elif self.dataset.encoded_data.encoding not in valid_encodings:
logging.warning(f"{self.__class__.__name__}: the dataset encoding ({self.dataset.encoded_data.encoding}) was not in the list of valid "
f"encodings ({valid_encodings}), skipping this report...")
return False
else:
return True