Source code for mwtab.validator

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
mwtab.validator
~~~~~~~~~~~~~~~

This module contains routines to validate consistency of the ``mwTab``
formatted files, e.g. make sure that ``Samples`` and ``Factors``
identifiers are consistent across the file, make sure that all
required key-value pairs are present.
"""

from copy import deepcopy
from collections import OrderedDict
from datetime import datetime
from .mwschema import section_schema_mapping
from re import match
import io
import sys
import mwtab


VERBOSE = False
LOG = None

METABOLITES_REGEXS = {
    "hmdb_id": {
        r"(?i)[\s|\S]{,}(HMDB)",
        r"(?i)(Human Metabolome D)[\S]{,}",
    },
    "inchi_key": {
        r"(?i)(inchi)[\S]{,}",
    },
    "kegg_id": {
        r"(?i)(kegg)$",
        r"(?i)(kegg)(\s|_)(i)",
    },
    "moverz": {
        r"(?i)(m/z)",
    },
    "moverz_quant": {
        r"(?i)(moverz)(\s|_)(quant)",
        r"(?i)(quan)[\S]{,}(\s|_)(m)[\S]{,}(z)",
    },
    "other_id": {
        r"(?i)(other)(\s|_)(id)$",
    },
    "other_id_type": {
        r"(?i)(other)(\s|_)(id)(\s|_)(type)$",
    },
    "pubchem_id": {
        r"(?i)(pubchem)[\S]{,}",
    },
    "retention_index": {
        r"(?i)(ri)$",
        r"(?i)(ret)[\s|\S]{,}(index)",
    },
    "retention_index_type": {
        r"(?i)(ri)(\s|_)(type)",
    },
    "retention_time": {
        r"(?i)(r)[\s|\S]{,}(time)[\S]{,}",
    },
}

ITEM_SECTIONS = {
    "METABOLOMICS WORKBENCH",
    "PROJECT",
    "STUDY",
    "ANALYSIS",
    "SUBJECT",
    "COLLECTION",
    "TREATMENT",
    "SAMPLEPREP",
    "CHROMATOGRAPHY",
    "MS",
    "NM",
}

VALIDATION_LOG_HEADER = \
"""Validation Log
{}
mwtab Python Library Version: {}
Source:        {}
Study ID:      {}
Analysis ID:   {}
File format:   {}"""


def validate_subject_samples_factors(mwtabfile):
    """Validate ``SUBJECT_SAMPLE_FACTORS`` section.

    :param mwtabfile: Instance of :class:`~mwtab.mwtab.MWTabFile`.
    :type mwtabfile: :class:`~mwtab.mwtab.MWTabFile` or
                     :py:class:`collections.OrderedDict`
    """
    subject_samples_factors_errors = list()

    for index, subject_sample_factor in enumerate(mwtabfile["SUBJECT_SAMPLE_FACTORS"]):
        if not subject_sample_factor["Subject ID"]:
            subject_samples_factors_errors.append(
                "SUBJECT_SAMPLE_FACTORS: Entry #{} missing Subject ID.".format(index+1)
            )
        if not subject_sample_factor["Sample ID"]:
            subject_samples_factors_errors.append(
                "SUBJECT_SAMPLE_FACTORS: Entry #{} missing Sample ID.".format(index + 1)
            )
        if subject_sample_factor.get("Factors"):
            for factor_key in subject_sample_factor["Factors"]:
                if not subject_sample_factor["Factors"][factor_key]:
                    subject_samples_factors_errors.append(
                        "SUBJECT_SAMPLE_FACTORS: Entry #{} missing value for Factor {}.".format(index + 1, factor_key)
                    )
        if subject_sample_factor.get("Additional sample data"):
            for additional_key in subject_sample_factor["Additional sample data"]:
                if not subject_sample_factor["Additional sample data"][additional_key]:
                    subject_samples_factors_errors.append(
                        "SUBJECT_SAMPLE_FACTORS: Entry #{} missing value for Additional sample data {}.".format(
                            index + 1, additional_key
                        )
                    )

    return subject_samples_factors_errors


def validate_data(mwtabfile, data_section_key, null_values):
    """Validates ``MS_METABOLITE_DATA``, ``NMR_METABOLITE_DATA``, and ``NMR_BINNED_DATA`` sections.

    :param mwtabfile: Instance of :class:`~mwtab.mwtab.MWTabFile`.
    :type mwtabfile: :class:`~mwtab.mwtab.MWTabFile` or
                     :py:class:`collections.OrderedDict`
    :param data_section_key: Section key (either MS_METABOLITE_DATA, NMR_METABOLITE_DATA, or NMR_BINNED_DATA)
    :type data_section_key: :py:class:`str`
    :param bool null_values: whether null values are present.
    """
    data_errors = list()

    subject_sample_factors_sample_id_set = {subject_sample_factor["Sample ID"] for subject_sample_factor in mwtabfile["SUBJECT_SAMPLE_FACTORS"]}
    data_sample_id_set = set(list(mwtabfile[data_section_key]["Data"][0].keys())[1:])

    # Removed for mwTab File Spec. 1.5
    # if subject_sample_factors_sample_id_set - data_sample_id_set:
    #     data_errors.append("{}: Section missing data entry for sample(s): {}.".format(
    #         data_section_key,
    #         subject_sample_factors_sample_id_set - data_sample_id_set
    #     ))
    if data_sample_id_set - subject_sample_factors_sample_id_set:
        data_errors.append("SUBJECT_SAMPLE_FACTORS: Section missing sample ID(s) {} found in {} section.".format(
            data_sample_id_set - subject_sample_factors_sample_id_set,
            data_section_key
        ))

    for index, metabolite in enumerate(mwtabfile[data_section_key]["Data"]):
        # if set(list(metabolite.keys())[1:]) != subject_sample_factors_sample_id_set:
        #     print(len(subject_sample_factors_sample_id_set), len(metabolite) - 1)
        #     print(
        #         "{}: Metabolite \"{}\" missing data entry for {} samples".format(
        #             data_section_key,
        #             metabolite[list(metabolite.keys())[0]],
        #             len(subject_sample_factors_sample_id_set - set(list(metabolite.keys())[1:]))
        #         ),
        #         file=error_stream
        #     )
        if null_values:
            for data_point_key in metabolite.keys():
                if data_point_key != "Metabolite":
                    try:
                        float(metabolite[data_point_key])
                    except ValueError as e:
                        metabolite[data_point_key] = ""
                        data_errors.append(
                            "{}: Data entry #{} contains non-numeric value converted to \"\".".format(data_section_key, index + 1))

    return data_errors


def validate_metabolites(mwtabfile, data_section_key):
    """Validate ``METABOLITES`` section.

    :param mwtabfile: Instance of :class:`~mwtab.mwtab.MWTabFile`.
    :type mwtabfile: :class:`~mwtab.mwtab.MWTabFile` or
                     :py:class:`collections.OrderedDict`
    :param data_section_key: Section key (either MS_METABOLITE_DATA, NMR_METABOLITE_DATA, or NMR_BINNED_DATA)
    :type data_section_key: :py:class:`str`
    """
    metabolites_errors = list()

    for index, metabolite in enumerate(mwtabfile[data_section_key]["Metabolites"]):
        for field_key in list(metabolite.keys())[1:]:
            if not any(k == field_key for k in METABOLITES_REGEXS.keys()):
                for regex_key in METABOLITES_REGEXS.keys():
                    if any(match(p, field_key) for p in METABOLITES_REGEXS[regex_key]):
                        metabolites_errors.append("METABOLITES: Data entry #{} contains field name \"{}\" which matches a commonly used field name \"{}\".".format(index + 1, field_key, regex_key))
                        field_key = regex_key
                        break

    return metabolites_errors


def validate_extended(mwtabfile, data_section_key):
    """Validate ``EXTENDED_MS_METABOLITE_DATA``, ``EXTENDED_NMR_METABOLITE_DATA``, and ``EXTENDED_NMR_BINNED_DATA`` sections.

    :param mwtabfile: Instance of :class:`~mwtab.mwtab.MWTabFile`.
    :type mwtabfile: :class:`~mwtab.mwtab.MWTabFile` or
                     :py:class:`collections.OrderedDict`
    :param data_section_key: Section key (either MS_METABOLITE_DATA, NMR_METABOLITE_DATA, or NMR_BINNED_DATA)
    :type data_section_key: :py:class:`str`
    """
    extended_errors = list()

    sample_id_set = {subject_sample_factor["Sample ID"] for subject_sample_factor in
                     mwtabfile["SUBJECT_SAMPLE_FACTORS"]}

    for index, extended_data in enumerate(mwtabfile[data_section_key]["Extended"]):
        if "sample_id" not in extended_data.keys():
            extended_errors.append("EXTENDED_{}: Data entry #{} missing Sample ID.".format(data_section_key, index + 1))
        elif not extended_data["sample_id"] in sample_id_set:
            extended_errors.append(
                "EXTENDED_{}: Data entry #{} contains Sample ID \"{}\" not found in SUBJECT_SAMPLE_FACTORS section.".format(
                    data_section_key, index + 1, extended_data["sample_id"]
                ))

    return extended_errors


def validate_section_schema(section, schema, section_key):
    """Validate section of ``mwTab`` formatted file.

    :param section: Section of :class:`~mwtab.mwtab.MWTabFile`.
    :type section: :py:class:`collections.OrderedDict`
    :param schema: Schema definition.
    :type schema: :py:class:`~schema.schema`
    :param str section_key: Section key.

    :return: Validated section.
    :rtype: :py:class:`collections.OrderedDict`
    """
    schema_errors = list()

    if section_key in ITEM_SECTIONS:
        for key in section.keys():
            if not section[key]:
                schema_errors.append("{}: Contains item \"{}\" with null value.".format(section_key, key))
                del section[key]

    return schema.validate(section), schema_errors


[docs]def validate_file(mwtabfile, section_schema_mapping=section_schema_mapping, verbose=False, metabolites=True): """Validate ``mwTab`` formatted file. :param mwtabfile: Instance of :class:`~mwtab.mwtab.MWTabFile`. :type mwtabfile: :class:`~mwtab.mwtab.MWTabFile` or :py:class:`collections.OrderedDict` :param dict section_schema_mapping: Dictionary that provides mapping between section name and schema definition. :param bool verbose: whether to be verbose or not. :param bool metabolites: whether to validate metabolites section. :return: Validated file. :rtype: :py:class:`collections.OrderedDict` """ # setup if not verbose: error_stout = io.StringIO() else: error_stout = sys.stdout validated_mwtabfile = deepcopy(OrderedDict(mwtabfile)) # generate validation log header(s) file_format = mwtabfile.source.split("/")[-1] if "https://www.metabolomicsworkbench.org/" in mwtabfile.source else \ mwtabfile.source.split(".")[1] print(VALIDATION_LOG_HEADER.format( str(datetime.now()), mwtab.__version__, mwtabfile.source, mwtabfile.study_id, mwtabfile.analysis_id, file_format ), file=error_stout) # create list to collect validation errors errors = list() # validate PROJECT, STUDY, ANALYSIS... and Schemas for section_key, section in mwtabfile.items(): try: schema = section_schema_mapping[section_key] # section = validate_section_schema(section, schema, section_key, error_stout) section, schema_errors = validate_section_schema(section, schema, section_key) errors.extend(schema_errors) validated_mwtabfile[section_key] = section except Exception as e: errors.append("SCHEMA: Section \"{}\" does not match the allowed schema. ".format(section_key) + str(e)) # validate SUBJECT_SAMPLE_FACTORS # validate_subject_samples_factors(validated_mwtabfile, error_stout) errors.extend(validate_subject_samples_factors(validated_mwtabfile)) # validate ..._DATA sections data_section_key = list(set(validated_mwtabfile.keys()) & {"MS_METABOLITE_DATA", "NMR_METABOLITE_DATA", "NMR_BINNED_DATA"}) if data_section_key: data_section_key = data_section_key[0] # validate_data(validated_mwtabfile, data_section_key, error_stout, False) errors.extend(validate_data(validated_mwtabfile, data_section_key, False)) if data_section_key in ("MS_METABOLITE_DATA", "NMR_METABOLITE_DATA"): # temp for testing if metabolites: if "Metabolites" in validated_mwtabfile[data_section_key].keys(): errors.extend(validate_metabolites(validated_mwtabfile, data_section_key)) else: errors.append("DATA: Missing METABOLITES section.") if "Extended" in validated_mwtabfile[data_section_key].keys(): errors.extend(validate_extended(validated_mwtabfile, data_section_key)) else: if "MS" in validated_mwtabfile.keys(): if not validated_mwtabfile["MS"].get("MS_RESULTS_FILE"): errors.append("DATA: Missing MS_METABOLITE_DATA section or MS_RESULTS_FILE item in MS section.") elif "NM" in validated_mwtabfile.keys(): if not validated_mwtabfile['NM'].get('NMR_RESULTS_FILE'): errors.append("DATA: Missing either NMR_METABOLITE_DATA or NMR_BINNED_DATA section or NMR_RESULTS_FILE item in NM secction.") # finish writing validation/error log if errors: print("Status: Contains Validation Errors", file=error_stout) print("Number Errors: {}\n".format(len(errors)), file=error_stout) print("Error Log:\n" + "\n".join(errors), file=error_stout) else: print("Status: Passing", file=error_stout) if verbose: return validated_mwtabfile, None else: return validated_mwtabfile, error_stout.getvalue()