Source code for mwtab.mwextract

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
mwtab.mwextract
~~~~~~~~~~~

This module provides a number of functions and classes for extracting and saving data and metadata
stored in ``mwTab`` formatted files in the form of :class:`~mwtab.mwtab.MWTabFile`.
"""
import csv
import json
import os
import re


[docs]class ItemMatcher(object): """ItemMatcher class that can be called to match items from ``mwTab`` formatted files in the form of :class:`~mwtab.mwtab.MWTabFile`. """ section_conversion = { "PR": "PROJECT", "ST": "STUDY", "SU": "SUBJECT", "CO": "COLLECTION", "TR": "TREATMENT", "SP": "SAMPLEPREP", "CH": "CHROMATOGRAPHY", "AN": "ANALYSIS", "MS": "MS", "NM": "NMR", } def __init__(self, full_key, value_comparison): """ItemMatcher initializer. :param str full_key: Key to match in :class:`~mwtab.mwtab.MWTabFile`. :param value_comparison: Value to match in :class:`~mwtab.mwtab.MWTabFile`. :type value_comparison: :class:`re.Pattern` or :py:class:`str` """ self.full_key = full_key self.section, self.key = self.full_key.split(":") self.section = ItemMatcher.section_conversion[self.section] self.value_comparison = value_comparison def __call__(self, mwtabfile): """Match key value pair in :class:`~mwtab.mwtab.MWTabFile`. :param mwtabfile: Instance of :class:`~mwtab.mwtab.MWTabFile`. :type mwtabfile: :class:`~mwtab.mwtab.MWTabFile` :return: True if key and value are present, False otherwise. :rtype: :py:obj:`True` or :py:obj:`False` """ return mwtabfile[self.section][self.key] == self.value_comparison
[docs]class ReGeXMatcher(ItemMatcher): """ReGeXMatcher class that can be called to match items from ``mwTab`` formatted files in the form of :class:`~mwtab.mwtab.MWTabFile` using regular expressions. """ def __init__(self, full_key, value_comparison): """ItemMatcher initializer. :param str full_key: Key to match in :class:`~mwtab.mwtab.MWTabFile`. :param value_comparison: Value, in the form of a regular expression, to match in :class:`~mwtab.mwtab.MWTabFile`. :type value_comparison: :class:`re.Pattern` """ super(ReGeXMatcher, self).__init__(full_key, value_comparison) def __call__(self, mwtabfile): """Match key value pair in :class:`~mwtab.mwtab.MWTabFile`. :param mwtabfile: Instance of :class:`~mwtab.mwtab.MWTabFile`. :type mwtabfile: :class:`~mwtab.mwtab.MWTabFile` :return: True if key and value are present, False otherwise. :rtype: :py:obj:`True` or :py:obj:`False` """ return re.search(self.value_comparison, mwtabfile[self.section][self.key])
[docs]def generate_matchers(items): """Construct a generator that yields Matchers :class:`~mwtab.mwtab.ItemMatcher` or :class:`~mwtab.mwtab.ReGeXMatcher`. :param iterable items: Iterable object containing key value pairs to match. :return: Yields a Matcher object for each given item. :rtype: :class:`~mwtab.mwtab.ItemMatcher` or :class:`~mwtab.mwtab.ReGeXMatcher` """ for item in items: if type(item[1]) == re.Pattern: yield ReGeXMatcher(item[0], item[1]) else: yield ItemMatcher(item[0], item[1])
[docs]def extract_metabolites(sources, matchers): """Extract metabolite data from ``mwTab`` formatted files in the form of :class:`~mwtab.mwtab.MWTabFile`. :param generator sources: Generator of mwtab file objects (:class:`~mwtab.mwtab.MWTabFile`). :param generator matchers: Generator of matcher objects (:class:`~mwtab.mwextract.ItemMatcher` or :class:`~mwtab.mwextract.ReGeXMatcher`). :return: Extracted metabolites dictionary. :rtype: :py:class:`dict` """ metabolites = dict() for mwtabfile in sources: if all(matcher(mwtabfile) for matcher in matchers): data_section_key = list(set(mwtabfile.keys()) & {"MS_METABOLITE_DATA", "NMR_METABOLITE_DATA", "NMR_BINNED_DATA"})[0] for data_list in mwtabfile[data_section_key]["Data"]: for test_key in (key for key in data_list.keys() if key != "Metabolite"): try: if float(data_list[test_key]) > 0: metabolites.setdefault(data_list["Metabolite"], dict())\ .setdefault(mwtabfile.study_id, dict())\ .setdefault(mwtabfile.analysis_id, set())\ .add(test_key) except Exception as e: pass return metabolites
[docs]def extract_metadata(mwtabfile, keys): """Extract metadata data from ``mwTab`` formatted files in the form of :class:`~mwtab.mwtab.MWTabFile`. :param mwtabfile: mwTab file object for metadata to be extracted from. :type mwtabfile: :class:`~mwtab.mwtab.MWTabFile` :param list keys: List of metadata field keys for metadata values to be extracted. :return: Extracted metadata dictionary. :rtype: :py:class:`dict` """ extracted_values = {} for section in mwtabfile: for metadata in mwtabfile[section]: for key in keys: if metadata == key: # TODO: Allow for partial match, ReGeX, etc. extracted_values.setdefault(key, set()).add(mwtabfile[section][metadata]) return extracted_values
[docs]def write_metadata_csv(to_path, extracted_values, no_header=False): """Write extracted metadata :py:class:`dict` into csv file. Example: "metadata","value1","value2" "SUBJECT_TYPE","Human","Plant" :param str to_path: Path to output file. :param dict extracted_values: Metadata dictionary to be saved. :param bool no_header: If true header is not included, otherwise header is included. :return: None :rtype: :py:obj:`None` """ if not os.path.exists(os.path.dirname(os.path.splitext(to_path)[0])): dirname = os.path.dirname(to_path) if dirname: os.makedirs(dirname) if not os.path.splitext(to_path)[1]: to_path += ".csv" with open(to_path, "w", newline="") as outfile: wr = csv.writer(outfile, quoting=csv.QUOTE_ALL) if not no_header: max_value_num = max([len(extracted_values[key]) for key in extracted_values.keys()]) line_list = ["metadata"] line_list.extend(["value{}".format(num) for num in range(max_value_num)]) wr.writerow(line_list) for key in extracted_values: line_list = [key] line_list.extend([val for val in sorted(extracted_values[key])]) wr.writerow(line_list)
[docs]def write_metabolites_csv(to_path, extracted_values, no_header=False): """Write extracted metabolites data :py:class:`dict` into csv file. Example: "metabolite_name","num-studies","num_analyses","num_samples" "1,2,4-benzenetriol","1","1","24" "1-monostearin","1","1","24" ... :param str to_path: Path to output file. :param dict extracted_values: Metabolites data dictionary to be saved. :param bool no_header: If true header is not included, otherwise header is included. :return: None :rtype: :py:obj:`None` """ csv_list = [] for metabolite_key in extracted_values.keys(): num_analyses = 0 num_samples = 0 for study_key in extracted_values[metabolite_key]: num_analyses += len(extracted_values[metabolite_key][study_key]) for analysis_key in extracted_values[metabolite_key][study_key]: num_samples += len(extracted_values[metabolite_key][study_key][analysis_key]) csv_list.append([ metabolite_key, len(extracted_values[metabolite_key]), num_analyses, num_samples ]) if not os.path.exists(os.path.dirname(os.path.splitext(to_path)[0])): dirname = os.path.dirname(to_path) if dirname: os.makedirs(dirname) if not os.path.splitext(to_path)[1]: to_path += ".csv" with open(to_path, "w", newline="") as outfile: wr = csv.writer(outfile, quoting=csv.QUOTE_ALL) if not no_header: wr.writerow(["metabolite_name", "num-studies", "num_analyses", "num_samples"]) for line_list in csv_list: wr.writerow(line_list)
[docs]class SetEncoder(json.JSONEncoder): """SetEncoder class for encoding Python sets :py:class:`set` into json serializable objects :py:class:`list`. """
[docs] def default(self, obj): """Method for encoding Python objects. If object passed is a set, converts the set to JSON serializable lists or calls base implementation. :param object obj: Python object to be json encoded. :return: JSON serializable object. :rtype: :py:class:`dict`, :py:class:`list`, :py:class:`tuple`, :py:class:`str`, :py:class:`int`, :py:class:`float`, :py:obj:`bool`, or :py:obj:`None` """ if isinstance(obj, set): return list(obj) return json.JSONEncoder.default(self, obj)
[docs]def write_json(to_path, extracted_dict): """Write extracted data or metadata :py:class:`dict` into json file. Metabolites example: { "1,2,4-benzenetriol": { "ST000001": { "AN000001": [ "LabF_115816", ... ] } } } Metadata example: { "SUBJECT_TYPE": [ "Plant", "Human" ] } :param str to_path: Path to output file. :param dict extracted_dict: Metabolites data or metadata dictionary to be saved. :return: None :rtype: :py:obj:`None` """ if not os.path.exists(os.path.dirname(os.path.splitext(to_path)[0])): dirname = os.path.dirname(to_path) if dirname: os.makedirs(dirname) if not os.path.splitext(to_path)[1]: to_path += ".json" with open(to_path, "w") as outfile: json.dump(extracted_dict, outfile, sort_keys=True, indent=4, cls=SetEncoder)