Source code for mwtab.tokenizer

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
mwtab.tokenizer
~~~~~~~~~~~~~~~

This module provides the :func:`~mwtab.tokenizer.tokenizer` lexical analyzer for
`mwTab` format syntax. It is implemented as a Python generator-based state
machine which generates (yields) tokens one at a time when :py:func:`next()`
is invoked on a :func:`~mwtab.tokenizer.tokenizer` instance.

Each token is a tuple of "key-value"-like pairs, tuple of
``SUBJECT_SAMPLE_FACTORS`` or tuple of data deposited between
``*_START`` and ``*_END`` blocks.
"""

from __future__ import print_function, division, unicode_literals
from collections import deque, namedtuple, OrderedDict


# Token emitted by the tokenizer: a key and its value. Structural markers
# (section headers, block start/end, end of file) carry "\n" as the value.
KeyValue = namedtuple("KeyValue", ["key", "value"])
# Token variant with an additional ``extra`` field; kept as part of the
# module's public names, the tokenizer below does not emit it.
KeyValueExtra = namedtuple("KeyValueExtra", ["key", "value", "extra"])


def _split_pairs(text, item_separator, pair_separator):
    """Parse ``name<pair_separator>value`` items into a dictionary.

    :param text: String holding items delimited by ``item_separator``.
    :param item_separator: Delimiter between items.
    :param pair_separator: Delimiter between the name and value of one item.
    :return: Dictionary mapping stripped names to stripped values.
    :rtype: :py:class:`dict`
    :raises IndexError: If an item does not contain ``pair_separator``.
    """
    pairs = {}
    for item in text.split(item_separator):
        # Split on the first separator only, so that a value which itself
        # contains the separator (e.g. "Time:10:30") is kept whole.
        parts = item.split(pair_separator, 1)
        pairs[parts[0].strip()] = parts[1].strip()
    return pairs


def _subject_sample_factors(line):
    """Build the token for one tab-separated ``SUBJECT_SAMPLE_FACTORS`` line.

    Columns are: tag, subject id, sample id, ``|``-separated ``name:value``
    factors and, optionally, ``;``-separated ``name=value`` additional data.

    :param line: Line starting with ``SUBJECT_SAMPLE_FACTORS``.
    :return: Token whose value is an ordered dictionary of the parsed columns.
    :rtype: :py:class:`~collections.namedtuple`
    :raises IndexError: If a required column or a pair separator is missing.
    """
    line_items = line.split("\t")
    # Build from a list of pairs: constructing an OrderedDict from a dict
    # literal does not guarantee key order on interpreters with unordered dicts.
    subject_sample_factors_dict = OrderedDict([
        ("Subject ID", line_items[1]),
        ("Sample ID", line_items[2]),
        ("Factors", _split_pairs(line_items[3], "|", ":")),
    ])
    # The fifth column is only used when present and non-empty.
    if len(line_items) > 4 and line_items[4]:
        subject_sample_factors_dict["Additional sample data"] = _split_pairs(line_items[4], ";", "=")
    return KeyValue(line_items[0].strip(), subject_sample_factors_dict)


def tokenizer(text):
    """A lexical analyzer for the `mwtab` formatted files.

    The text is split on ``"\\n"`` and consumed line by line. Every section
    header is preceded by a ``#ENDSECTION`` token, and the stream always ends
    with ``#ENDSECTION`` followed by ``!#ENDFILE``.

    :param text: `mwTab` formatted text.
    :type text: :py:class:`str`
    :return: Tuples of data.
    :rtype: :py:class:`~collections.namedtuple`
    :raises IndexError: If a line lacks an expected column or separator, or a
        ``*_START`` block is not closed by a ``*_END`` line.
    :raises ValueError: If an item line is not exactly one tab-separated
        key/value pair.
    """
    stream = deque(text.split("\n"))

    while stream:
        line = stream.popleft()
        try:

            # header
            if line.startswith("#METABOLOMICS WORKBENCH"):
                yield KeyValue("#METABOLOMICS WORKBENCH", "\n")
                for identifier in line.split(" "):
                    if ":" in identifier:
                        # Split once so a value containing ":" stays intact.
                        key, value = identifier.split(":", 1)
                        yield KeyValue(key, value)

            # SUBJECT_SAMPLE_FACTORS header (reached new section)
            elif line.startswith("#SUBJECT_SAMPLE_FACTORS:"):
                yield KeyValue("#ENDSECTION", "\n")
                yield KeyValue("#SUBJECT_SAMPLE_FACTORS", "\n")

            # section header (reached new section)
            elif line.startswith("#"):
                yield KeyValue("#ENDSECTION", "\n")
                yield KeyValue(line.strip(), "\n")

            # SUBJECT_SAMPLE_FACTORS line
            elif line.startswith("SUBJECT_SAMPLE_FACTORS"):
                yield _subject_sample_factors(line)

            # data start header
            elif line.endswith("_START"):
                yield KeyValue(line, "\n")

                # tokenize lines in data section till line ending with "_END" is reached;
                # running out of lines raises IndexError, which is reported below
                while not line.endswith("_END"):
                    line = stream.popleft()
                    if line.endswith("_END"):
                        yield KeyValue(line.strip(), "\n")
                    else:
                        data = line.split("\t")
                        yield KeyValue(data[0], tuple(data))

            # item line in item section (e.g. PROJECT, SUBJECT, etc..)
            elif line:
                if "_RESULTS_FILE" in line:
                    line_items = line.split("\t")
                    # Drop the 3-character section prefix (e.g. "MS:") from the key and
                    # join every remaining column into a single space-separated value.
                    yield KeyValue(line_items[0].strip()[3:], " ".join(line_items[1:]))
                else:
                    # Exactly one tab is required; anything else raises ValueError.
                    key, value = line.split("\t")
                    if ":" in key:
                        if ":UNITS" in key:
                            yield KeyValue("Units", value)
                        else:
                            # Drop the 3-character section prefix (e.g. "PR:").
                            yield KeyValue(key.strip()[3:], value)
                    else:
                        yield KeyValue(key.strip(), value)

        # Re-raise with the offending line attached, keeping the exception type.
        except IndexError as e:
            raise IndexError("LINE WITH ERROR:\n\t", repr(line), e)
        except ValueError as e:
            raise ValueError("LINE WITH ERROR:\n\t", repr(line), e)

    # end of file
    yield KeyValue("#ENDSECTION", "\n")
    yield KeyValue("!#ENDFILE", "\n")  # This is to ensure that tokenizer terminates when #END is missing.
Source code for mwtab.tokenizer

mwtab

Navigation

Related Topics