Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#178 support for fods #179

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added horror/cell_starting_with_a_space.ods
Binary file not shown.
273 changes: 273 additions & 0 deletions horror/simple.fods

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions messytables/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from messytables.core import Cell, TableSet, RowSet, seekable_stream
from messytables.commas import CSVTableSet, CSVRowSet
from messytables.ods import ODSTableSet, ODSRowSet
from messytables.fods import FODSTableSet, FODSRowSet
from messytables.excel import XLSTableSet, XLSRowSet

# XLSXTableSet has been deprecated and its functionality is now provided by
Expand Down
6 changes: 4 additions & 2 deletions messytables/any.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from messytables import (ZIPTableSet, PDFTableSet, CSVTableSet, XLSTableSet,
HTMLTableSet, ODSTableSet)
HTMLTableSet, ODSTableSet, FODSTableSet)
import messytables
import re

Expand Down Expand Up @@ -39,6 +39,7 @@ def TABTableSet(fileobj):
'HTML': HTMLTableSet,
'CSV': CSVTableSet,
'ODS': ODSTableSet,
'FODS': FODSTableSet,
'PDF': PDFTableSet}


Expand Down Expand Up @@ -107,7 +108,8 @@ def guess_ext(ext):
'xlsm': 'XLS',
'xltx': 'XLS',
'xltm': 'XLS',
'ods': 'ODS'}
'ods': 'ODS',
'fods': 'FODS'}
if ext in lookup:
return lookup.get(ext, None)

Expand Down
215 changes: 215 additions & 0 deletions messytables/fods.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
import io
import re

from lxml import etree

from messytables.core import RowSet, TableSet, Cell
from messytables.types import (StringType, DecimalType,
DateType, BoolType, CurrencyType,
TimeType, PercentageType)


# All patterns operate on bytes. They are written as raw literals (``br``) so
# that the regex escape ``\/`` is not treated as an (invalid) Python string
# escape; the compiled patterns are unchanged.

# Opening tag of the root <office:document ...> element, captured whole so
# that its attributes (the xmlns declarations) can be reused as a wrapper.
FODS_NAMESPACES_TAG_MATCH = re.compile(
    br"(<office:document[^>]*>)", re.MULTILINE)
# One <table:table ...> ... </...:table> element, matched lazily.
ODS_TABLE_MATCH = re.compile(
    br".*?(<table:table.*?<\/.*?:table>).*?", re.DOTALL)
# Value of the first table:name="..." attribute on the first line of a sheet.
ODS_TABLE_NAME = re.compile(br'.*?table:name="(.*?)".*?')
# One <table:table-row ...> ... </...:table-row> element, matched lazily.
ODS_ROW_MATCH = re.compile(
    br".*?(<table:table-row.*?<\/.*?:table-row>).*?", re.DOTALL)

# Namespace URN templates; "%s" is replaced by "<name>:<version>".
NS_OPENDOCUMENT_PTTN = u"urn:oasis:names:tc:opendocument:xmlns:%s"
NS_CAL_PTTN = u"urn:org:documentfoundation:names:experimental:calc:xmlns:%s"
NS_OPENDOCUMENT_TABLE = NS_OPENDOCUMENT_PTTN % "table:1.0"
NS_OPENDOCUMENT_OFFICE = NS_OPENDOCUMENT_PTTN % "office:1.0"

# Local (un-namespaced) element and attribute names used when reading cells.
TABLE_CELL = 'table-cell'
VALUE_TYPE = 'value-type'
COLUMN_REPEAT = 'number-columns-repeated'
# Value given to cells that carry no value type; also used to detect blank
# rows.
EMPTY_CELL_VALUE = ''

# Maps a cell's value type to the local name of the attribute that holds its
# value. Types not listed here are looked up with a fallback of 'value' by
# the cell reader.
ODS_VALUE_TOKEN = {
    'float': 'value',
    'date': 'date-value',
    'time': 'time-value',
    'boolean': 'boolean-value',
    'percentage': 'value',
    'currency': 'value',
}

# Maps a cell's value type to the messytables type instance attached to the
# resulting Cell. 'string' and 'currency' are absent because the cell reader
# handles them in dedicated branches; any other unlisted type falls back to
# StringType there.
ODS_TYPES = {
    "float": DecimalType(),
    "date": DateType("%Y-%m-%d"),
    "boolean": BoolType(),
    "percentage": PercentageType(),
    "time": TimeType(),
}


class FODSTableSet(TableSet):
    """
    A table set over a flat ODS (``.fods``) document.

    The whole document is read into memory as a single byte string and the
    individual sheets are cut out of it with regular expressions; no zip
    handling or seeking is involved.
    """

    def __init__(self, fileobj, window=None, **kw):
        '''Initialize the object.

        :param fileobj: a file path or a file-like object. A file-like
          object must be opened in binary mode, because the content is
          matched against bytes patterns. A path is opened in binary mode
          here and closed again once read.
        :param window: stored and handed on to every FODSRowSet created by
          make_tables().
        :param kw: accepted for signature compatibility and ignored.
        '''
        if hasattr(fileobj, 'read'):
            content = fileobj.read()
        else:
            # A path was given: read it ourselves rather than calling
            # .read() on a string.
            with open(fileobj, 'rb') as handle:
                content = handle.read()

        self.window = window
        self.content = content

    def make_tables(self):
        """
        Return the sheets in the workbook as a list of FODSRowSet.

        Each sheet is located with a regex over the in-memory content, so
        the document is not parsed as XML at this stage. Every row set is
        given the root element's opening/closing tags so it can wrap row
        fragments in the document's own namespace declarations.

        :raises ValueError: if the content has no ``<office:document>``
          root tag.
        """
        namespace_tags = self._get_namespace_tags()
        return [FODSRowSet(match.group(1), self.window, namespace_tags)
                for match in ODS_TABLE_MATCH.finditer(self.content)]

    def _get_namespace_tags(self):
        """
        Return ``(tag_open, tag_close)`` for the document root element.

        ``tag_open`` is the root opening tag exactly as it appears in the
        content (including its attributes); ``tag_close`` is the matching
        literal closing tag.

        :raises ValueError: if no ``<office:document ...>`` tag is found.
        """
        match = FODS_NAMESPACES_TAG_MATCH.search(self.content)
        if match is None:
            # Raised explicitly: an assert would be stripped under -O.
            raise ValueError(
                'Not a flat ODS document: no <office:document> root tag found')
        tag_open = match.group(1)
        tag_close = b'</office:document>'
        return tag_open, tag_close


class FODSRowSet(RowSet):
    """ Flat ODS support for a single sheet of the workbook. Unlike the CSV
    row set this is not a streaming operation: the sheet is held in memory
    as one byte string. """

    def __init__(self, sheet, window=None, namespace_tags=None):
        """
        :param sheet: bytes of one ``<table:table>`` element.
        :param window: stored as ``self.window``; falls back to 1000 when
          not given (or falsy).
        :param namespace_tags: optional ``(open, close)`` pair of byte
          strings wrapped around each row fragment before it is parsed.
          When omitted, a ``<wrapper>`` element declaring a default set of
          namespaces is used instead.
        """
        self.sheet = sheet

        self.name = "Unknown"
        name_match = ODS_TABLE_NAME.match(self.sheet)
        if name_match:
            self.name = name_match.group(1)

        self.window = window or 1000

        # We must wrap the XML fragments in a valid header otherwise iterparse
        # will explode with certain (undefined) versions of libxml2. The
        # namespaces are in the document itself, and change with the
        # libreoffice version saving it, so use the ones taken from the
        # document if they were supplied. The default namespaces remain as
        # a fallback for callers that do not pass any.
        if namespace_tags:
            self.namespace_tags = namespace_tags
        else:
            namespaces = {
                "dc": u"http://purl.org/dc/elements/1.1/",
                "draw": NS_OPENDOCUMENT_PTTN % u"drawing:1.0",
                "number": NS_OPENDOCUMENT_PTTN % u"datastyle:1.0",
                "office": NS_OPENDOCUMENT_PTTN % u"office:1.0",
                "svg": NS_OPENDOCUMENT_PTTN % u"svg-compatible:1.0",
                "table": NS_OPENDOCUMENT_PTTN % u"table:1.0",
                "text": NS_OPENDOCUMENT_PTTN % u"text:1.0",
                "calcext": NS_CAL_PTTN % u"calcext:1.0",
            }

            # .items() rather than .iteritems(): the latter does not exist
            # on Python 3.
            ods_header = u"<wrapper {0}>"\
                .format(" ".join('xmlns:{0}="{1}"'.format(k, v)
                        for k, v in namespaces.items())).encode('utf-8')
            ods_footer = u"</wrapper>".encode('utf-8')
            self.namespace_tags = (ods_header, ods_footer)

        super(FODSRowSet, self).__init__(typed=True)

    def raw(self, sample=False):
        """ Iterate over all rows in this sheet.

        Yields one list of Cell objects per row. Rows in which every cell
        has the empty cell value are skipped. A cell carrying a column
        repeat count is expanded into that many cells.

        :param sample: accepted for interface compatibility; not used here.
        """
        # Loop-invariant qualified names.
        cell_tag = _tag(NS_OPENDOCUMENT_TABLE, TABLE_CELL)
        repeat_attr = _tag(NS_OPENDOCUMENT_TABLE, COLUMN_REPEAT)
        header, footer = self.namespace_tags

        for row in ODS_ROW_MATCH.findall(self.sheet):
            row_data = []
            empty_row = True

            # Wrap the fragment so that its namespace prefixes resolve.
            partial = io.BytesIO(header + row + footer)

            for _action, element in etree.iterparse(partial, ('end',)):
                if element.tag != cell_tag:
                    continue

                cell = _read_cell(element)
                if cell.value != EMPTY_CELL_VALUE:
                    empty_row = False

                repeat = element.attrib.get(repeat_attr)
                count = int(repeat) if repeat else 1
                if count > 0:
                    row_data.append(cell)
                    # Build a separate Cell for every further repeated
                    # column instead of reusing one object, so that code
                    # which later modifies a cell in place does not change
                    # all the repeated columns at once.
                    row_data.extend(
                        _read_cell(element) for _ in range(count - 1))

            if empty_row:
                # ignore blank lines
                continue

            yield row_data


def _read_cell(element):
    """Build a Cell from a table-cell element.

    The cell's value type attribute selects the branch:

    * no value type: an empty StringType cell;
    * ``string``: the text content, via ``_read_text_cell``;
    * ``currency``: ``"<value> <currency>"`` with CurrencyType, or just the
      value when either attribute is missing;
    * anything else: the raw attribute value, typed through ``ODS_TYPES``
      with StringType as the fallback.

    :param element: the parsed table-cell element.
    :returns: a Cell.
    """
    cell_type = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, VALUE_TYPE))
    if cell_type is None:
        return Cell(EMPTY_CELL_VALUE, type=StringType())
    if cell_type == 'string':
        return _read_text_cell(element)

    value_token = ODS_VALUE_TOKEN.get(cell_type, 'value')
    value = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, value_token))

    if cell_type == 'currency':
        currency = element.attrib.get(_tag(NS_OPENDOCUMENT_OFFICE, 'currency'))
        # Either attribute may be absent; only join them when both are
        # present, instead of failing on None + str.
        if value is not None and currency:
            value = value + ' ' + currency
        return Cell(value, type=CurrencyType())

    return Cell(value, type=ODS_TYPES.get(cell_type, StringType()))


def _read_text_cell(element):
    """Build a StringType Cell from the direct children of a string cell.

    The ``.text`` of each direct child contributes one line (an empty line
    when the child has no text); lines are joined with a newline. A cell
    with no children yields the empty cell value.

    :param element: the parsed table-cell element.
    :returns: a Cell of StringType.
    """
    # Iterate the element directly; getchildren() is deprecated.
    # NOTE(review): only each child's own leading text is used, so text
    # inside nested elements (and text following them) is not included --
    # confirm whether that is intended.
    cell_value = '\n'.join(
        child.text if child.text else EMPTY_CELL_VALUE for child in element)
    return Cell(cell_value, type=StringType())


def _tag(namespace, tag):
return '{%s}%s' % (namespace, tag)
3 changes: 2 additions & 1 deletion test/test_any.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,15 @@
from nose.tools import assert_equal
from nose.plugins.skip import SkipTest
from messytables import (any_tableset, XLSTableSet, ZIPTableSet, PDFTableSet,
CSVTableSet, ODSTableSet,
CSVTableSet, ODSTableSet, FODSTableSet,
ReadError)

suite = [{'filename': 'simple.csv', 'tableset': CSVTableSet},
{'filename': 'simple.xls', 'tableset': XLSTableSet},
{'filename': 'simple.xlsx', 'tableset': XLSTableSet},
{'filename': 'simple.zip', 'tableset': ZIPTableSet},
{'filename': 'simple.ods', 'tableset': ODSTableSet},
{'filename': 'simple.fods', 'tableset': FODSTableSet},
{'filename': 'bian-anal-mca-2005-dols-eng-1011-0312-tab3.xlsm',
'tableset': XLSTableSet},
]
Expand Down