Source code for mwtab.mwrest

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# TODO: Add qualifications for output items per specific context input item.
"""
mwtab.mwrest
~~~~~~~~~~~~

This module provides routines for accessing the Metabolomics Workbench REST API.

See https://www.metabolomicsworkbench.org/tools/MWRestAPIv1.0.pdf for details.
"""

from collections import OrderedDict
from . import fileio
import re
import json


VERBOSE = False
BASE_URL = "https://www.metabolomicsworkbench.org/rest/"


[docs]def analysis_ids(base_url=BASE_URL): """ Method for retrieving a list of analysis ids for every current analysis in Metabolomics Workbench. :param str base_url: Base url to Metabolomics Workbench REST API. :return: List of every available Metabolomics Workbench analysis identifier. :rtype: :py:class:`list` """ st_an_dict = _pull_study_analysis(base_url) analyses = list() [analyses.extend(st_an_dict[k]) for k in st_an_dict.keys()] if VERBOSE: print("Found {} Analysis Files to be Downloaded".format(len(analyses))) return analyses
[docs]def study_ids(base_url=BASE_URL): """ Method for retrieving a list of study ids for every current study in Metabolomics Workbench. :param str base_url: Base url to Metabolomics Workbench REST API. :return: List of every available Metabolomics Workbench study identifier. :rtype: :py:class:`list` """ st_an_dict = _pull_study_analysis(base_url) studies = list(st_an_dict.keys()) if VERBOSE: print("Found {} Study Files to be Downloaded".format(len(studies))) return studies
def _pull_study_analysis(base_url=BASE_URL): """ Method for requesting a JSON string containing all study ids and analysis ids from Metabolomics Workbench's REST API. Requests a JSON file which contains a list of studies and their accompanying analyses. The JSON file is converted into a python object (dict) which can then be parsed to create a dictionary with the form study id (key): analysis id(s) (values). :param str base_url: Base url to Metabolomics Workbench REST API. :return: Dictionary of study ids (keys) and lists of analyses (value). :rtype: :py:class:`dict` """ url = GenericMWURL( {"context": "study", "input_item": "study_id", "input_value": "ST", "output_item": "analysis"}, base_url ).url mwrestfile = next(fileio.read_mwrest(url, **{"convertJSON": True})) json_object = mwrestfile._is_json(mwrestfile.text) study_analysis_dict = dict() for k in json_object.keys(): if study_analysis_dict.get(json_object[k]["study_id"]): study_analysis_dict[json_object[k]["study_id"]].append(json_object[k]["analysis_id"]) else: study_analysis_dict[json_object[k]["study_id"]] = [json_object[k]["analysis_id"]] return study_analysis_dict
[docs]def generate_mwtab_urls(input_items, base_url=BASE_URL, output_format='txt'): """ Method for generating URLS to be used to retrieve `mwtab` files for analyses and studies through the REST API of the Metabolomics Workbench database. :param list input_items: List of Metabolomics Workbench input values for mwTab files. :param str base_url: Base url to Metabolomics Workbench REST API. :param str output_format: Output format for the mwTab files to be retrieved in. :return: Metabolomics Workbench REST URL string(s). :rtype: :py:class:`str` """ for input_item in input_items: if input_item.isdigit(): analysis_id = "AN{}".format(input_item.zfill(6)) yield GenericMWURL({ "context": "study", "input_item": "analysis_id", "input_value": analysis_id, "output_item": "mwtab", "output_format": output_format }, base_url).url elif re.match(r'(AN[0-9]{6}$)', input_item): yield GenericMWURL({ "context": "study", "input_item": "analysis_id", "input_value": input_item, "output_item": "mwtab", "output_format": output_format }, base_url).url elif re.match(r'(ST[0-9]{1,6}$)', input_item): yield GenericMWURL({ "context": "study", "input_item": "study_id", "input_value": input_item, "output_item": "mwtab", "output_format": output_format }, base_url).url
[docs]def generate_urls(input_items, base_url=BASE_URL, **kwds): """ Method for creating a generator which yields validated Metabolomics Workbench REST urls. :param list input_items: List of Metabolomics Workbench input values for mwTab files. :param str base_url: Base url to Metabolomics Workbench REST API. :param dict kwds: Keyword arguments of Metabolomics Workbench URL Path items. :return: Metabolomics Workbench REST URL string(s). :rtype: :py:class:`str` """ for input_item in input_items: params = dict(kwds) params["input_item"] = input_item yield GenericMWURL(params, base_url).url
[docs]class GenericMWURL(object): """GenericMWURL class that stores and validates parameters specifying a Metabolomics Workbench REST URL. Metabolomics REST API requests are performed using URL requests in the form of https://www.metabolomicsworkbench.org/rest/context/input_specification/output_specification where: if context = "study" | "compound" | "refmet" | "gene" | "protein" input_specification = input_item/input_value output_specification = output_item/[output_format] elif context = "moverz" input_specification = input_item/input_value1/input_value2/input_value3 input_item = "LIPIDS" | "MB" | "REFMET" input_value1 = m/z_value input_value2 = ion_type_value input_value3 = m/z_tolerance_value output_specification = output_format output_format = "txt" elif context = "exactmass" input_specification = input_item/input_value1/input_value2 input_item = "LIPIDS" | "MB" | "REFMET" input_value1 = LIPID_abbreviation input_value2 = ion_type_value output_specification = None """ context = { "study": { "input_item": { 'study_id', 'study_title', 'institute', 'last_name', 'analysis_id', 'metabolite_id' }, "output_item": { 'summary', 'factors', 'analysis', 'metabolites', 'mwtab', 'source', 'species', 'disease', 'number_of_metabolites', 'data', 'datatable', 'untarg_studies', 'untarg_factors', 'untarg_data' } }, "compound": { "input_item": { 'regno', 'formula', 'inchi_key', 'lm_id', 'pubchem_cid', 'hmdb_id', 'kegg_id', 'chebi_id', 'metacyc_id', 'abbrev' }, "output_item": { 'all', 'regno', 'formula', 'exactmass', 'inchi_key', 'name', 'sys_name', 'smiles', 'lm_id', 'pubchem_cid', 'hmdb_id', 'kegg_id', 'chebi_id', 'metacyc_id', 'classification', 'molfile', 'png' } }, "refmet": { "input_item": { 'all', 'match', 'name', 'inchi_key', 'regno', 'pubchem_cid', 'formula', 'main_class', 'sub_class' }, "output_item": { 'all', 'name', 'inchi_key', 'regno', 'pubchem_cid', 'exactmass', 'formula', 'synonyms', 'sys_name', 'main_class', 'sub_class' } }, "gene": { "input_item": { 'mgp_id', 'gene_id', 'gene_name', 'gene_symbol', 'taxid' }, "output_item": { 'all', 'lmp_id', 'mgp_id', 'gene_name', 'gene_symbol', 'gene_synonyms', 'alt_names', 'chromosome', 'map_location', 'summary', 'taxid', 'species', 'species_long' } }, "protein": { "input_item": { 'mgp_id', 'gene_id', 'gene_name', 'gene_symbol', 'taxid', 'mrna_id', 'refseq_id', 'protein_gi', 'uniprot_id', 'protein_entry', 'protein_name' }, "output_item": { 'all', 'mgp_id', 'gene_id', 'gene_name', 'gene_symbol', 'taxid', 'species', 'species_long', 'mrna_id', 'refseq_id', 'protein_gi', 'uniprot_id', 'protein_entry', 'protein_name', 'seqlength', 'seq', 'is_identical_to' } }, "moverz": { "input_item": { 'LIPIDS', 'MB', 'REFMET' }, "ion_type_value": { 'M+H', 'M+H-H2O', 'M+2H', 'M+3H', 'M+4H', 'M+K', 'M+2K', 'M+Na', 'M+2Na', 'M+Li', 'M+2Li', 'M+NH4', 'M+H+CH3CN', 'M+Na+CH3CN', 'M.NaFormate+H', 'M.NH4Formate+H', 'M.CH3', 'M.TMSi', 'M.tBuDMSi', 'M-H', 'M-H-H2O', 'M+Na-2H', 'M+K-2H', 'M-2H', 'M-3H', 'M4H', 'M.Cl', 'M.F', 'M.HF2', 'M.OAc', 'M.Formate', 'M.NaFormate-H', 'M.NH4Formate-H', 'Neutral' } }, "exactmass": { "LIPID_abbreviation": { 'ArthroCer', 'asialoGM2Cer', 'CAR', 'CE', 'Cer', 'CerP', 'CoA', 'DG', 'DGDG', 'FA', 'GalCer', 'GB3Cer', 'GlcCer', 'GM3Cer', 'GM4Cer', 'iGB3Cer', 'LacCer', 'Lc3Cer', 'Manb1-4GlcCer', 'MG', 'MGDG', 'MolluCer', 'PA', 'PC', 'PE', 'PE-Cer', 'PG', 'PGP', 'PI', 'PI-Cer', 'PIP', 'PIP2', 'PIP3', 'PS', 'SM', 'SQDG', 'TG' }, "ion_type_value": { 'Neutral', 'M+H', 'M+H-H2O', 'M+2H', 'M+3H', 'M+4H', 'M+K', 'M+2K', 'M+2K-H', 'M+Na', 'M+2Na', 'M+2Na-H', 'M+Li', 'M+2Li', 'M+Ag', 'M+NH4', 'M-H', 'M-CH3', 'M2H', 'M-3H', 'M-4H', 'M.Cl', 'M.OAc', 'M.Formate' } } } def __init__(self, rest_params, base_url=BASE_URL): """URL initializer. :param dict rest_params: Dictionary of Metabolomics Workbench URL Path items. :param str base_url: Base url to Metabolomics Workbench REST API. """ self.rest_params = rest_params self.base_url = base_url self._validate() self.url = self._create_url() def _validate(self): """Validate URL parameters. Raises error if self.rest_params is lacking a "context" keyword. Sub-functions raise error if missing or invalid parameter(s) in self.rest_params. :return: None :rtype: :py:obj:`None` """ if not self.rest_params["context"] in self.context.keys(): raise KeyError("Error: Invalid/missing context") elif self.rest_params["context"] in {"study", "compound", "refmet", "gene", "protein"}: self._validate_generic() elif self.rest_params["context"] == "moverz": self._validate_moverz() elif self.rest_params["context"] == "exactmass": self._validate_exactmass() def _validate_generic(self): """Validates keyword arguments for study, compound, refmet, gene, and protein contexts. Raises error if missing or invalid parameter(s) in self.rest_params. context = "study" input_item = "study_id" | "study_title" | "institute" | "last_name" | "analysis_id" | "metabolite_id" input_value = input_value output_item = "summary" | "factors" | "analysis" | "metabolites" | "mwtab" | "source" | "species" | "disease" | "number_of_metabolites" | "data" | "datatable" | "untarg_studies" | "untarg_factors" | "untarg_data" output_format = "txt" | "json" [Default: "json"] context = "compound" input_item = "regno" | "formula" | "inchi_key" | "lm_id" | "pubchem_cid" | "hmdb_id" | "kegg_id" | "chebi_id" | "metacyc_id" | "abbrev" input_value = input_value output_item = "all" | "regno" | "formula" | "exactmass" | "inchi_key" | "name" | "sys_name" | "smiles" | "lm_id" | "pubchem_cid" | "hmdb_id" | "kegg_id" | "chebi_id" | "metacyc_id" | "classification" | "molfile" | "png" | "regno,formula,exactmass,..." output_format = "txt" | "json" [Default: "json"] context = "refmet" input_item = "all" | "match" | "name" | "inchi_key" | "regno" | "pubchem_cid" | "formula" | "main_class" | "sub_class" input_value = input_value output_item = "all" | "name" | "inchi_key" | "regno" | "pubchem_cid" | "exactmass" | "formula" | "synonyms" | "sys_name" | "main_class" | "sub_class" | "name,inchi_key,regno,..." output_format = "txt" | "json" [Default: "json"] context = "gene" input_item = "mgp_id" | "gene_id" | "gene_name" | "gene_symbol" | "taxid" input_value = input_value output_item = "all" | "lmp_id" | "mgp_id" | "gene_name" | "gene_symbol" | "gene_synonyms" | "alt_names" | "chromosome" | "map_location" | "summary" | "taxid" | "species" | "species_long" | "mgp_id,gene_id,gene_name,..." output_format = "txt" | "json" [Default: "json"] context = "protein" input_item = "mgp_id" | "gene_id" | "gene_name" | "gene_symbol" | "taxid" | "mrna_id" | "refseq_id" | "protein_gi" | "uniprot_id" | "protein_entry" | "protein_name" input_value = input_value output_item = "all" | "mgp_id" | "gene_id" | "gene_name" | "gene_symbol" | "taxid" | "species" | "species_long" | "mrna_id" | "refseq_id" | "protein_gi" | "uniprot_id" | "protein_entry" | "protein_name" | "seqlength" | "seq" | "is_identical_to" | "mgp_id,gene_id,gene_name,..." output_format = "txt" | "json" [Default: "json"] Uses static method self._validate_id() to validate input_value parameter. :return: None :rtype: :py:obj:`None` """ keywords = {"input_item", "input_value", "output_item"} # validate all required parameters are present if not all(k in self.rest_params.keys() for k in keywords): raise KeyError("Missing input item(s): " + str(keywords.difference(self.rest_params.keys()))) # validate input_item if not any(k in self.rest_params["input_item"] for k in self.context[self.rest_params["context"]]["input_item"]): raise ValueError("Invalid input item") # validate input_item self._validate_input(self.rest_params["input_item"], self.rest_params["input_value"]) # validate output_item(s) if type(self.rest_params["output_item"]) == list: if self.rest_params["context"] == "study": raise ValueError("Invalid output items. Study only takes a single output item.") elif not all(k in self.context[self.rest_params["context"]]["output_item"] for k in self.rest_params["output_item"]): raise ValueError("Invalid output item(s): " + str(set(self.rest_params["output_item"]).difference(self.context[self.rest_params["context"]]["output_item"]))) elif not any(k in self.rest_params["output_item"] for k in self.context[self.rest_params["context"]]["output_item"]): raise ValueError("Invalid output item") def _validate_moverz(self): """Validate keyword arguments for moverz context. Raises error if missing or invalid parameter(s) in self.rest_params. context = "moverz" input_item = "LIPIDS" | "MB" | "REFMET" input_value1 = m/z_value input_value2 = ion_type_value input_value3 = m/z_tolerance_value output_format = "txt" m/z_value range: 50-2000 See CONTEXT variable for supported ion type values. m/z_tolerance_value range: 0.0001-1 :return: None :rtype: :py:obj:`None` """ keywords = {"input_item", "m/z_value", "ion_type_value", "m/z_tolerance_value"} if not all(k in self.rest_params.keys() for k in keywords): raise KeyError("Missing input item(s): " + str(keywords.difference(self.rest_params.keys()))) elif not any(k in self.rest_params["input_item"] for k in self.context["moverz"]["input_item"]): raise ValueError("Invalid input item") elif not 50 <= float(self.rest_params["m/z_value"]) <= 2000: raise ValueError("m/z value outside of range: 50-2000") elif not self.rest_params["ion_type_value"] in self.context["moverz"]["ion_type_value"]: raise ValueError("Invalid ion type value") elif not 0.0001 <= float(self.rest_params["m/z_tolerance_value"]) <= 1: raise ValueError("m/z tolerance value outside of range: 0.0001-1") def _validate_exactmass(self): """Validate keyword arguments for exactmass context. Raises error if missing or invalid parameter(s) in self.rest_params. context = "exactmass" LIPID_abbreviation = ... ion_type_value = ... See :class:`~mwtab.mwrest.GenericMWURL.context` variable for full list of possible values for LIPID_abbreviation and ion_type_value. :return: None :rtype: :py:obj:`None` """ keywords = {"LIPID_abbreviation", "ion_type_value"} if not all(k in self.rest_params.keys() for k in keywords): raise KeyError("Missing input item(s): " + str(keywords.difference(self.rest_params.keys()))) elif not any(k in self.rest_params["LIPID_abbreviation"] for k in self.context["exactmass"]["LIPID_abbreviation"]): raise ValueError("Invalid LIPID abbreviation") elif not self.rest_params['ion_type_value'] in self.context["exactmass"]["ion_type_value"]: raise ValueError("Invalid ion type value") @staticmethod def _validate_input(input_item, input_value): """Validate keyword arguments for input item where an id is used (ie. study, compound, refmet, gene, and protein). If invalid, raises value error. :param str input_item: String representing the input item from the input specification. :param str input_value: String representing the input vlaue from the input specification. :return: None :rtype: :py:obj:`None` """ if input_item == "study_id": # allows for pulling a range of entries (ie. ST0001 pulls studies 100-199) if not re.match(r"(ST[0-9]{0,6}$)", input_value): raise ValueError("Invalid Metabolomics Workbench (MW) study ID (ST<6-digit integer>)") elif input_item in ["study_title", "institute", "last_name", "formula", "gene_name", "gene_symbol", "protein_entry", "protein_name"]: if not type(input_value) == str: raise ValueError("Invalid {} (<string>)".format(input_item.replace("_", " "))) elif input_item == "analysis_id": if not re.match(r'(AN[0-9]{6}$)', input_value): raise ValueError("Invalid Metabolomics Workbench analysis ID for a study (AN<6-digit integer>)") elif input_item == "metabolite_id": if not re.match(r"(ME[0-9]{6}$)", input_value): raise ValueError("Invalid Metabolomics Workbench metabolite ID for a study (ME<6-digit integer>)") elif input_item == "regno": if not input_value.isdigit(): raise ValueError("Invalid Metabolomics Workbench Metabolite database ID (<integer>)") elif input_item == "inchi_key": if not re.match(r"([A-Z,\-]{27}$)", input_value): raise ValueError("Invalid InChIKey (27-character string)") elif input_item == "lm_id": if not re.match(r"(LM[A-Z]{2}[0-9]{8,10}$)", input_value): raise ValueError("Invalid LIPID MAPS ID (LM<2-character LIPID MAPS category><8-10 character string>)") elif input_item == "pubchem_cid": if not input_value.isdigit(): raise ValueError("Invalid PubChem Compound ID (<integer>)") elif input_item == "hmdb_id": if not re.match(r"(HMDB[0-9]+$)", input_value): raise ValueError("Invalid Human Metabolome Database ID (HMDB<integer>)") elif input_item == "kegg_id": if not re.match(r"(CO[0-9]+$)", input_value): raise ValueError("Invalid KEGG compound ID (CO<integer>)") elif input_item == "chebi_id": if not input_value.isdigit(): raise ValueError("Invalid ChEBI compound id (<integer>)") # TODO: update the following two input types. elif input_item == "metacyc_id": if not type(input_value) == str: raise ValueError("Invalid METACYC compound ID (<string>)") elif input_item == "abbrev": if not type(input_value) == str: raise ValueError("Invalid : Lipid bulk abbreviation (<string>)") elif input_item == "mgp_id": if not re.match(r"(MGP[0-9]{6}$)", input_value): raise ValueError("Invalid Human Metabolome Gene/Protein (MGP) database gene ID (MGP<6-digit integer>)") elif input_item == "gene_id": if not input_value.isdigit(): raise ValueError("Invalid Entrez gene ID (<integer>)") elif input_item == "taxid": if not input_value.isdigit(): raise ValueError("Invalid NCBI taxonomy ID (<integer>)") elif input_item == "mrna_id": if not re.match(r"(NM_[0-9]+$)", input_value): raise ValueError("Invalid mRNA ID (NM_<integer>)") elif input_item == "refseq_id": if not re.match(r"(NP_[0-9]+$)", input_value): raise ValueError("Invalid mRNA ID (NP_<integer>)") elif input_item == "protein_gi": if not input_value.isdigit(): raise ValueError("Invalid NCBI protein GI (<integer>)") elif input_item == "uniprot_id": # regex from https://www.uniprot.org/help/accession_numbers if not re.match(r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}", input_value): raise ValueError("Invalid UniProt ID (see uniprot.org/help/accession_numbers)") def _create_url(self): """Method for constructing a formatted Metabolomics Workbench REST API URL from the given class parameters. :return: Formatted Metabolomics Workbench REST API URL. :rtype: :py:class:`str` """ if self.rest_params["context"] in {"study", "compound", "refmet", "gene", "protein"}: # allows for url to include or not include output_format if self.rest_params.get("output_format"): return self.base_url + "/".join([self.rest_params.get(p) for p in [ "context", "input_item", "input_value", "output_item", "output_format" ]]) else: return self.base_url + "/".join([self.rest_params.get(p) for p in [ "context", "input_item", "input_value", "output_item" ]]) elif self.rest_params["context"] == "moverz": rest_params = [self.rest_params.get(p) for p in [ "context", "input_item", "m/z_value", "ion_type_value", "m/z_tolerance_value" ]] rest_params.append("txt") return self.base_url + "/".join(rest_params) elif self.rest_params["context"] == "exactmass": return self.base_url + "/".join([self.rest_params.get(p) for p in [ "context", "LIPID_abbreviation", "ion_type_value" ]])
[docs]class MWRESTFile(object): """MWRESTFile class that stores data from a single file download through Metabolomics Workbench's REST API. Mirrors :class:`~mwtab.mwtab.MWTabFile`. """ def __init__(self, source): """File initializer. :param str source: Source a `MWRESTFile` instance was created from. """ self.source = source self.text = ""
[docs] def read(self, filehandle): """Read data into a :class:`~mwtab.mwrest.MWRESTFile` instance. :param filehandle: file-like object. :type filehandle: :py:class:`io.TextIOWrapper`, :py:class:`gzip.GzipFile`, :py:class:`bz2.BZ2File`, :py:class:`zipfile.ZipFile` :return: None :rtype: :py:obj:`None` """ self.text = filehandle.read().decode("utf-8") # input_str = input_str.replace("\r\n", "\n") # self.text = re.sub(r"</br>", "", self.text) # included to remove remaining HTML tags filehandle.close()
[docs] def write(self, filehandle): """Write :class:`~mwtab.mwrest.MWRESTFile` data into file. :param filehandle: file-like object. :type filehandle: :py:class:`io.TextIOWrapper` :return: None :rtype: :py:obj:`None` """ try: filehandle.write(self.text) except IOError: raise IOError('"filehandle" parameter must be writable.') filehandle.close()
@staticmethod def _is_json(string): """Test if input string is in JSON format. :param string: Input string. :type string: :py:class:`str` or :py:class:`bytes` :return: Input string if in JSON format or False otherwise. :rtype: :py:class:`str` or :py:obj:`False` """ try: if isinstance(string, bytes): json_str = json.loads(string.decode("utf-8"), object_pairs_hook=OrderedDict) elif isinstance(string, str): json_str = json.loads(string, object_pairs_hook=OrderedDict) else: raise TypeError("Expecting <class 'str'> or <class 'bytes'>, but {} was passed".format(type(string))) return json_str except ValueError: return False