Source code for libgunshotmatch.project

#!/usr/bin/env python3
#
#  project.py
"""
Represents a collection of repeat analyses.

.. latex:vspace:: -3mm
"""
#
#  Copyright © 2020-2023 Dominic Davis-Foster <dominic@davis-foster.co.uk>
#
#  Permission is hereby granted, free of charge, to any person obtaining a copy
#  of this software and associated documentation files (the "Software"), to deal
#  in the Software without restriction, including without limitation the rights
#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#  copies of the Software, and to permit persons to whom the Software is
#  furnished to do so, subject to the following conditions:
#
#  The above copyright notice and this permission notice shall be included in all
#  copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
#  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
#  DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
#  OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
#  OR OTHER DEALINGS IN THE SOFTWARE.
#

# stdlib
import os
from typing import Any, Dict, List, Mapping, MutableSequence, Optional, Tuple, Type

# 3rd party
import attr
import pandas  # type: ignore[import-untyped]
import pyms_nist_search
from domdf_python_tools.typing import PathLike
from pyms.DPA.Alignment import Alignment
from pyms.Peak.Class import Peak

# this package
from libgunshotmatch import gzip_util
from libgunshotmatch.consolidate import (
		ConsolidatedPeak,
		ConsolidatedPeakFilter,
		match_counter,
		pairwise_ms_comparisons
		)
from libgunshotmatch.datafile import Repeat
from libgunshotmatch.peak import PeakList, QualifiedPeak, peak_from_dict
from libgunshotmatch.utils import create_alignment

__all__ = ("Project", "consolidate")


@attr.define
class Project:
	"""
	A project represents the aligned peaks from multiple datafiles.

	.. latex:vspace:: -5mm
	"""

	#: The name of the project.
	name: str

	#: Peak alignment for the repeats in this project.
	alignment: Alignment

	#: Mapping of repeat names to :class:`~.Repeat` objects.
	datafile_data: Dict[str, Repeat]

	#: List of peaks after :meth:`~.consolidate` is performed. :py:obj:`None` initially.
	consolidated_peaks: Optional[List[ConsolidatedPeak]] = attr.field(default=None)

	def to_dict(self) -> Dict[str, Any]:
		"""
		Returns a dictionary representation of this :class:`~.Project`.

		All keys are native, JSON-serializable, Python objects.

		The result has the keys ``"name"``, ``"alignment"``, ``"datafile_data"`` and ``"consolidated_peaks"``;
		the latter is :py:obj:`None` if the project has not been consolidated.
		"""

		# Only the parts of the alignment needed by ``from_dict`` to rebuild it are stored.
		alignment_as_dict = {
				"peaks": [PeakList(x).to_list() for x in self.alignment.peakpos],
				"expr_code": self.alignment.expr_code,
				"similarity": self.alignment.similarity,
				}

		if self.consolidated_peaks is None:
			consolidated_peaks_as_list = None
		else:
			consolidated_peaks_as_list = [cp.to_dict() for cp in self.consolidated_peaks]

		datafile_data_as_dict = {k: v.to_dict() for k, v in self.datafile_data.items()}

		return {
				"name": self.name,
				"alignment": alignment_as_dict,
				"datafile_data": datafile_data_as_dict,
				"consolidated_peaks": consolidated_peaks_as_list,
				}

	def export(self, output_dir: PathLike) -> str:
		"""
		Export as a ``gsmp`` file.

		The file is named after the project (``<name>.gsmp``) and contains
		the output of :meth:`~.to_dict` as gzipped JSON.

		:param output_dir: The directory to write the file into.

		:returns: The output filename.
		"""

		export_filename = os.path.join(output_dir, f"{self.name}.gsmp")
		gzip_util.write_gzip_json(export_filename, self.to_dict(), indent=None)
		return export_filename

	@classmethod
	def from_file(cls: Type["Project"], filename: PathLike) -> "Project":
		"""
		Parse a ``gsmp`` file.

		:param filename: The input filename.
		"""

		as_dict: Dict[str, Any] = gzip_util.read_gzip_json(filename)  # type: ignore[assignment]
		return cls.from_dict(as_dict)

	@classmethod
	def from_dict(cls: Type["Project"], d: Mapping[str, Any]) -> "Project":
		"""
		Construct a :class:`~.Project` from a dictionary.

		:param d: Dictionary with the keys produced by :meth:`~.to_dict`
			(``"name"``, ``"alignment"``, ``"datafile_data"`` and ``"consolidated_peaks"``).
		"""

		alignment_as_dict = d["alignment"]

		# Rebuild the table of aligned peaks row by row, keeping ``None`` placeholders as-is.
		alignment_peaks: List[MutableSequence[Optional[Peak]]] = [
				[None if peak is None else peak_from_dict(peak) for peak in row]
				for row in alignment_as_dict["peaks"]
				]

		alignment = create_alignment(
				alignment_peaks,
				alignment_as_dict["expr_code"],
				alignment_as_dict["similarity"],
				)

		consolidated_peaks_as_list = d["consolidated_peaks"]
		if consolidated_peaks_as_list is None:
			consolidated_peaks = None
		else:
			consolidated_peaks = [ConsolidatedPeak.from_dict(cp) for cp in consolidated_peaks_as_list]

		datafile_data = {k: Repeat.from_dict(v) for k, v in d["datafile_data"].items()}

		return cls(
				name=d["name"],
				alignment=alignment,
				consolidated_peaks=consolidated_peaks,
				datafile_data=datafile_data,
				)

	def consolidate(
			self,
			engine: pyms_nist_search.Engine,
			peak_filter: Optional[ConsolidatedPeakFilter] = None,
			) -> pandas.DataFrame:
		"""
		Consolidate the compound identification from the experiments into a single dataset.

		The resulting peaks are stored in :attr:`~.consolidated_peaks`,
		after being passed through ``peak_filter`` if one is given.

		:param engine: The NIST search engine, passed through to the module-level ``consolidate`` function.
		:param peak_filter: Filter for the consolidated peaks.

		:returns: :class:`pandas.DataFrame` giving the results of pairwise mass spectral comparisons
			between the repeats for each aligned peak.
		"""

		# Within this method the name ``consolidate`` resolves to the module-level function, not this method.
		consolidated_peaks, ms_comparison_df = consolidate(self, engine)

		if peak_filter is None:
			self.consolidated_peaks = consolidated_peaks
		else:
			self.consolidated_peaks = peak_filter.filter(consolidated_peaks)

		return ms_comparison_df


def consolidate(
		project: Project,
		engine: pyms_nist_search.Engine,
		) -> Tuple[List[ConsolidatedPeak], pandas.DataFrame]:
	"""
	Consolidate the compound identification from the experiments into a single dataset.

	Unlike :meth:`Project.consolidate` this function does not modify ``project``;
	the consolidated peaks are returned to the caller.

	:param project: The project whose repeats should be consolidated.
		Every repeat must have its ``qualified_peaks`` set, and every qualified peak a ``peak_number``.
	:param engine: The NIST search engine, passed through to ``match_counter``.

	:returns: List of consolidated peaks and :class:`pandas.DataFrame`
		giving the results of pairwise mass spectral comparisons between the repeats for each aligned peak.

	.. versionadded:: 0.10.0
	"""

	ms_comparison_df = pairwise_ms_comparisons(project.alignment)

	peak_numbers: List[int] = []
	peak: Optional[QualifiedPeak]

	# One entry per repeat, in the iteration order of ``project.datafile_data``.
	# NOTE(review): presumably this order needs to agree with ``project.alignment.expr_code`` -- confirm.
	qualified_peak_array = []

	for repeat in project.datafile_data.values():
		qualified_peaks = repeat.qualified_peaks
		assert qualified_peaks is not None
		for peak in qualified_peaks:
			assert peak.peak_number is not None
			peak_numbers.append(peak.peak_number)
		qualified_peak_array.append(qualified_peaks)

	# Deduplicate the peak numbers and sort smallest to largest
	peak_numbers = sorted(set(peak_numbers))

	consolidated_peaks = match_counter(
			engine=engine,
			peak_numbers=peak_numbers,
			qualified_peaks=qualified_peak_array,
			ms_comp_data=ms_comparison_df,
			)

	return consolidated_peaks, ms_comparison_df