# Source code for mwtab.tokenizer
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
mwtab.tokenizer
~~~~~~~~~~~~~~~
This module provides the :func:`~mwtab.tokenizer.tokenizer` lexical analyzer for
the `mwTab` format syntax. It is implemented as a Python generator-based state
machine which generates (yields) tokens one at a time when :py:func:`next()`
is invoked on a :func:`~mwtab.tokenizer.tokenizer` instance.
Each token is a tuple of "key-value"-like pairs, a tuple of
``SUBJECT_SAMPLE_FACTORS`` or a tuple of data deposited between
``*_START`` and ``*_END`` blocks.
"""
from __future__ import print_function, division, unicode_literals
from collections import deque, namedtuple, OrderedDict
# Token types produced by :func:`tokenizer`.
KeyValue = namedtuple("KeyValue", ["key", "value"])
KeyValueExtra = namedtuple("KeyValueExtra", ["key", "value", "extra"])


def _parse_pairs(text, item_separator, pair_separator):
    """Parse ``key<pair_separator>value`` items into a dictionary.

    :param text: String holding items delimited by ``item_separator``.
    :param item_separator: Delimiter between items (e.g. ``"|"``).
    :param pair_separator: Delimiter between a key and its value (e.g. ``":"``).
    :return: Dictionary of whitespace-stripped keys and values.
    :rtype: :py:class:`dict`
    :raises IndexError: If an item does not contain ``pair_separator``.
    """
    pairs = {}
    for item in text.split(item_separator):
        # Split on the first separator only, so that a value which itself
        # contains the separator (e.g. "Time:12:30") is not truncated.
        parts = item.split(pair_separator, 1)
        # parts[1] raises IndexError when the separator is missing.
        pairs[parts[0].strip()] = parts[1].strip()
    return pairs


def _subject_sample_factors_token(line):
    """Build the token for a single ``SUBJECT_SAMPLE_FACTORS`` data line.

    :param line: Tab-separated line starting with ``SUBJECT_SAMPLE_FACTORS``.
    :return: Token whose value maps "Subject ID", "Sample ID", "Factors" and,
        when present, "Additional sample data".
    :rtype: :py:class:`~collections.namedtuple`
    :raises IndexError: If a mandatory column or a key/value separator is missing.
    """
    line_items = line.split("\t")
    # A list of pairs keeps the key order on every interpreter version; a dict
    # literal passed to OrderedDict is only ordered on newer Pythons.
    subject_sample_factors = OrderedDict([
        ("Subject ID", line_items[1]),
        ("Sample ID", line_items[2]),
        ("Factors", _parse_pairs(line_items[3], "|", ":")),
    ])
    # The fifth column is optional: it may be empty or absent altogether
    # (for example when trailing whitespace was stripped from the file).
    if len(line_items) > 4 and line_items[4]:
        subject_sample_factors["Additional sample data"] = _parse_pairs(line_items[4], ";", "=")
    return KeyValue(line_items[0].strip(), subject_sample_factors)


def _item_token(line):
    """Build the token for a ``key<TAB>value`` line of an item section.

    :param line: Non-empty line that is not a header or a data block line.
    :return: Token for the line.
    :rtype: :py:class:`~collections.namedtuple`
    :raises ValueError: If a regular item line does not hold exactly one tab.
    """
    if "_RESULTS_FILE" in line:
        line_items = line.split("\t")
        # Any extra tab-separated fields are folded into the value, joined by
        # a single space; the first three characters of the key are dropped.
        return KeyValue(line_items[0].strip()[3:], " ".join(line_items[1:]))

    key, value = line.split("\t")
    if ":UNITS" in key:
        return KeyValue("Units", value)
    if ":" in key:
        # Drop the first three characters of the key (e.g. "PR:").
        return KeyValue(key.strip()[3:], value)
    return KeyValue(key.strip(), value)


def tokenizer(text):
    """A lexical analyzer for the `mwtab` formatted files.

    Tokens are yielded lazily, one line (or one data block row) at a time.
    A ``("#ENDSECTION", "\\n")`` token is emitted before every section header,
    and the stream always finishes with ``("#ENDSECTION", "\\n")`` followed by
    ``("!#ENDFILE", "\\n")``.

    :param text: `mwTab` formatted text.
    :type text: :py:class:`str`
    :return: Tuples of data.
    :rtype: :py:class:`~collections.namedtuple`
    :raises IndexError: If a line lacks an expected field or a ``*_START``
        block is never closed by a ``*_END`` line.
    :raises ValueError: If a line cannot be split into a key and a value.
    """
    # Normalise Windows line endings first; a trailing "\r" would otherwise
    # stop the "_START" / "_END" markers from being recognised.
    stream = deque(text.replace("\r\n", "\n").split("\n"))

    while stream:
        line = stream.popleft()
        try:
            # header
            if line.startswith("#METABOLOMICS WORKBENCH"):
                yield KeyValue("#METABOLOMICS WORKBENCH", "\n")
                for identifier in line.split(" "):
                    if ":" in identifier:
                        key, value = identifier.split(":")
                        yield KeyValue(key, value)

            # SUBJECT_SAMPLE_FACTORS header (reached new section)
            elif line.startswith("#SUBJECT_SAMPLE_FACTORS:"):
                yield KeyValue("#ENDSECTION", "\n")
                yield KeyValue("#SUBJECT_SAMPLE_FACTORS", "\n")

            # section header (reached new section)
            elif line.startswith("#"):
                yield KeyValue("#ENDSECTION", "\n")
                yield KeyValue(line.strip(), "\n")

            # SUBJECT_SAMPLE_FACTORS line
            elif line.startswith("SUBJECT_SAMPLE_FACTORS"):
                yield _subject_sample_factors_token(line)

            # data start header
            elif line.endswith("_START"):
                yield KeyValue(line, "\n")
                # tokenize lines in data section till line ending with "_END"
                # is reached; running out of lines raises IndexError below
                while not line.endswith("_END"):
                    line = stream.popleft()
                    if line.endswith("_END"):
                        yield KeyValue(line.strip(), "\n")
                    else:
                        data = line.split("\t")
                        yield KeyValue(data[0], tuple(data))

            # item line in item section (e.g. PROJECT, SUBJECT, etc..)
            elif line:
                yield _item_token(line)

        except IndexError as e:
            raise IndexError("LINE WITH ERROR:\n\t", repr(line), e)
        except ValueError as e:
            raise ValueError("LINE WITH ERROR:\n\t", repr(line), e)

    # end of file
    yield KeyValue("#ENDSECTION", "\n")
    # This is to ensure that tokenizer terminates when #END is missing.
    yield KeyValue("!#ENDFILE", "\n")