Source code for libgunshotmatch.project

#!/usr/bin/env python3
#
#  project.py
"""
Represents a collection of repeat analyses.

.. latex:vspace:: -3mm
"""
#
#  Copyright © 2020-2023 Dominic Davis-Foster <dominic@davis-foster.co.uk>
#
#  Permission is hereby granted, free of charge, to any person obtaining a copy
#  of this software and associated documentation files (the "Software"), to deal
#  in the Software without restriction, including without limitation the rights
#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#  copies of the Software, and to permit persons to whom the Software is
#  furnished to do so, subject to the following conditions:
#
#  The above copyright notice and this permission notice shall be included in all
#  copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
#  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
#  DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
#  OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
#  OR OTHER DEALINGS IN THE SOFTWARE.
#

# stdlib
import os
from typing import Any, Dict, List, Mapping, MutableSequence, Optional, Type

# 3rd party
import attr
import pandas  # type: ignore[import-untyped]
import pyms_nist_search
from domdf_python_tools.typing import PathLike
from pyms.DPA.Alignment import Alignment
from pyms.Peak.Class import Peak

# this package
from libgunshotmatch import gzip_util
from libgunshotmatch.consolidate import (
		ConsolidatedPeak,
		ConsolidatedPeakFilter,
		match_counter,
		pairwise_ms_comparisons
		)
from libgunshotmatch.datafile import Repeat
from libgunshotmatch.peak import PeakList, QualifiedPeak, peak_from_dict
from libgunshotmatch.utils import create_alignment

__all__ = ("Project", "consolidate")


[docs]@attr.define class Project: """ A project represents the aligned peaks from multiple datafiles. .. latex:vspace:: -5mm """ #: The name of the project. name: str #: Peak alignment for the repeats in this project. alignment: Alignment # datafile_data: Dict[str, DatafileDataElement] #: Mapping of repeat names to :class:`~.Repeat` objects. datafile_data: Dict[str, Repeat] #: List of peaks after :meth:`~.consolidate` is performed. :py:obj:`None` initially. consolidated_peaks: Optional[List[ConsolidatedPeak]] = attr.field(default=None)
[docs] def to_dict(self) -> Dict[str, Any]: """ Returns a dictionary representation of this :class:`~.Project`. All keys are native, JSON-serializable, Python objects. """ alignment_as_dict = { "peaks": [PeakList(x).to_list() for x in self.alignment.peakpos], "expr_code": self.alignment.expr_code, "similarity": self.alignment.similarity, } if self.consolidated_peaks is None: consolidated_peaks_as_list = None else: consolidated_peaks_as_list = [cp.to_dict() for cp in self.consolidated_peaks] datafile_data_as_dict = {k: v.to_dict() for k, v in self.datafile_data.items()} return { "name": self.name, "alignment": alignment_as_dict, "datafile_data": datafile_data_as_dict, # "datafile_data": list(self.datafile_data.keys()), "consolidated_peaks": consolidated_peaks_as_list, }
[docs] def export(self, output_dir: PathLike) -> str: """ Export as a ``gsmp`` file. :param output_dir: :returns: The output filename. """ export_filename = os.path.join(output_dir, f"{self.name}.gsmp") gzip_util.write_gzip_json(export_filename, self.to_dict(), indent=None) return export_filename
[docs] @classmethod def from_file(cls: Type["Project"], filename: PathLike) -> "Project": """ Parse a ``gsmp`` file. :param filename: The input filename. """ as_dict: Dict[str, Any] = gzip_util.read_gzip_json(filename) # type: ignore[assignment] return cls.from_dict(as_dict)
[docs] @classmethod def from_dict(cls: Type["Project"], d: Mapping[str, Any]) -> "Project": """ Construct a :class:`~.Project` from a dictionary. :param d: """ alignment_as_dict = d["alignment"] alignment_peaks: List[MutableSequence[Optional[Peak]]] = [] for row in alignment_as_dict["peaks"]: alignment_peaks.append([]) for peak in row: # print(peak) if peak is None: alignment_peaks[-1].append(None) else: alignment_peaks[-1].append(peak_from_dict(peak)) alignment = create_alignment( alignment_peaks, alignment_as_dict["expr_code"], alignment_as_dict["similarity"], ) consolidated_peaks_as_list = d["consolidated_peaks"] if consolidated_peaks_as_list is None: consolidated_peaks = None else: consolidated_peaks = [ConsolidatedPeak.from_dict(cp) for cp in consolidated_peaks_as_list] datafile_data = {k: Repeat.from_dict(v) for k, v in d["datafile_data"].items()} return cls( name=d["name"], alignment=alignment, consolidated_peaks=consolidated_peaks, datafile_data=datafile_data, )
[docs] def consolidate( self, engine: pyms_nist_search.Engine, peak_filter: Optional[ConsolidatedPeakFilter] = None, ) -> pandas.DataFrame: """ Consolidate the compound identification from the experiments into a single dataset. :param engine: :param peak_filter: Filter for the consolidated peaks. :returns: :class:`pandas.DataFrame` giving the results of pairwise mass spectral comparisons between the repeats for each aligned peak. """ consolidated_peaks, ms_comparison_df = consolidate(self, engine) if peak_filter is None: self.consolidated_peaks = consolidated_peaks else: self.consolidated_peaks = peak_filter.filter(consolidated_peaks) return ms_comparison_df
# chart_data = make_chart_data(self)
[docs]def consolidate( project: Project, engine: pyms_nist_search.Engine, ) -> pandas.DataFrame: """ Consolidate the compound identification from the experiments into a single dataset. :param project: :param engine: :returns: List of consolidated peaks and :class:`pandas.DataFrame` giving the results of pairwise mass spectral comparisons between the repeats for each aligned peak. .. versionadded:: 0.10.0 """ ms_comparison_df = pairwise_ms_comparisons(project.alignment) peak_numbers: List[int] = [] peak: Optional[QualifiedPeak] qualified_peak_array = [] # for experiment in project.alignment.expr_code: for experiment in project.datafile_data: qualified_peaks = project.datafile_data[experiment].qualified_peaks assert qualified_peaks is not None for peak in qualified_peaks: assert peak.peak_number is not None peak_numbers.append(peak.peak_number) qualified_peak_array.append(qualified_peaks) # Convert peak_numbers to a set and sort smallest to largest peak_numbers = sorted(set(peak_numbers)) consolidated_peaks = match_counter( engine=engine, peak_numbers=peak_numbers, qualified_peaks=qualified_peak_array, ms_comp_data=ms_comparison_df, ) return consolidated_peaks, ms_comparison_df