From 91300fe726fe06ed022d20f62a4b4f054a078ff8 Mon Sep 17 00:00:00 2001
From: iuvbio
Date: Wed, 18 Aug 2021 22:51:47 +0200
Subject: [PATCH] add fods parser and example

---
 pandas_ods_reader/parsers/fods.py | 100 +++++++
 tests/rsc/example_headers.fods    | 441 ++++++++++++++++++++++++++++++
 2 files changed, 541 insertions(+)
 create mode 100644 pandas_ods_reader/parsers/fods.py
 create mode 100644 tests/rsc/example_headers.fods

diff --git a/pandas_ods_reader/parsers/fods.py b/pandas_ods_reader/parsers/fods.py
new file mode 100644
index 0000000..b40d38a
--- /dev/null
+++ b/pandas_ods_reader/parsers/fods.py
@@ -0,0 +1,100 @@
+"""Parser for flat OpenDocument spreadsheet (``.fods``) files."""
+
+from lxml import etree
+import pandas as pd
+
+
+BODY_TAG = "office:body"
+SPREADSHEET_TAG = "office:spreadsheet"
+TABLE_TAG = "table:table"
+TABLE_ROW_TAG = "table:table-row"
+TABLE_CELL_TAG = "table:table-cell"
+TABLE_CELL_TEXT_TAG = "text:p"
+
+
+def get_sheet(spreadsheet, sheet_id):
+    """Return the table element selected by name or by 1-based position.
+
+    :param spreadsheet: the ``office:spreadsheet`` element to search
+    :param sheet_id: sheet name (``str``) or 1-based position (``int``)
+    :returns: the matching ``table:table`` element
+    :raises KeyError: if no sheet carries the given name
+    :raises IndexError: if the position is not within ``1..sheet count``
+    """
+    namespaces = spreadsheet.nsmap
+    tables = spreadsheet.findall(TABLE_TAG, namespaces=namespaces)
+    if isinstance(sheet_id, str):
+        # Compare the attribute value directly rather than interpolating the
+        # name into a path predicate, which breaks on names containing quotes.
+        name_attr = f"{{{namespaces['table']}}}name"
+        for table in tables:
+            if table.get(name_attr) == sheet_id:
+                return table
+        raise KeyError(f"There is no sheet named {sheet_id}")
+    # Positions are 1-based. Zero and negative values must be rejected here,
+    # otherwise negative indexing would silently select a sheet from the end.
+    if sheet_id < 1 or sheet_id > len(tables):
+        raise IndexError(f"There is no sheet at index {sheet_id}.")
+    return tables[sheet_id - 1]
+
+
+def _cell_text(cell, namespaces):
+    """Return the text of the cell's first ``text:p`` child, or ``None``."""
+    paragraph = cell.find(TABLE_CELL_TEXT_TAG, namespaces=namespaces)
+    return None if paragraph is None else paragraph.text
+
+
+def load_fods(doc, sheet_id, headers=True, columns=None):
+    """Read one sheet of a parsed flat-ODS document into a DataFrame.
+
+    Each value is the text of the cell's first ``text:p`` child; a cell
+    without such a child yields ``None``.
+
+    :param doc: parsed document tree, e.g. the result of ``etree.parse``
+    :param sheet_id: sheet name (``str``) or 1-based position (``int``)
+    :param headers: if true, the first row is removed from the data and used
+        as column names
+    :param columns: explicit column names; when given they take precedence
+        over the header row
+    :returns: ``pandas.DataFrame`` with one row per data row of the sheet
+    :raises ValueError: if ``sheet_id`` is neither ``str`` nor ``int``
+    """
+    if not isinstance(sheet_id, (str, int)):
+        raise ValueError("Sheet id has to be either `str` or `int`")
+    root = doc.getroot()
+    namespaces = root.nsmap
+    spreadsheet = doc.find(BODY_TAG, namespaces=namespaces).find(
+        SPREADSHEET_TAG, namespaces=namespaces
+    )
+    sheet = get_sheet(spreadsheet, sheet_id)
+    data = [
+        [
+            _cell_text(cell, namespaces)
+            for cell in row.findall(TABLE_CELL_TAG, namespaces=namespaces)
+        ]
+        for row in sheet.findall(TABLE_ROW_TAG, namespaces=namespaces)
+    ]
+    # An empty sheet has no header row to take.
+    orig_columns = data.pop(0) if headers and data else None
+    if columns is None:
+        if orig_columns:
+            columns = orig_columns
+        else:
+            width = max((len(row) for row in data), default=0)
+            columns = [f"column.{i}" for i in range(width)]
+    # ``data`` holds rows, not columns: pass it row-wise. Mapping each column
+    # name to an entry of ``data`` would transpose the sheet.
+    return pd.DataFrame(data, columns=columns)
+
+
+def read_fods(file_or_path, sheet=1, headers=True, columns=None):
+    """Parse a flat-ODS file and return the selected sheet as a DataFrame.
+
+    :param file_or_path: source handed unchanged to ``etree.parse``
+    :param sheet: sheet name or 1-based position; defaults to the first sheet
+    :param headers: forwarded to :func:`load_fods`
+    :param columns: forwarded to :func:`load_fods`
+    :returns: ``pandas.DataFrame`` built by :func:`load_fods`
+    """
+    doc = etree.parse(file_or_path)
+    return load_fods(doc, sheet, headers=headers, columns=columns)
diff --git a/tests/rsc/example_headers.fods b/tests/rsc/example_headers.fods
new file mode 100644
index 0000000..250985d
--- /dev/null
+++ b/tests/rsc/example_headers.fods
@@ -0,0 +1,441 @@
+ + + + Lukas Jansen2019-01-27T03:31:08.9314826322019-01-27T03:33:20.620959045Lukas JansenPT2M14S1LibreOffice/7.1.4.2$Linux_X86_64 LibreOffice_project/10$Build-2 + + + 0 + 0 + 11288 + 4967 + + + view1 + + + 6 + 20 + 0 + 0 + 0 + 0 + 2 + 0 + 0 + 0 + 0 + 0 + 100 + 60 + true + false + + + Sheet1 + 1849 + 0 + 100 + 60 + false + true + true + true + 12632256 + true + true + true + true + false + false + false + 1000 + 1000 + 1 + 1 + true + false + + + + + true + true + true + false + 1000 + true + 1 + 12632256 + true + true + true + jQH+/01GQ0o0OTFEVwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQ1VQUzpNRkNKNDkxRFcAAAAAAAAAAAAAAAAAAAAAAAAWAAMArgAAAAAAAAAEAAhSAAAEdAAASm9iRGF0YSAxCnByaW50ZXI9TUZDSjQ5MURXCm9yaWVudGF0aW9uPVBvcnRyYWl0CmNvcGllcz0xCmNvbGxhdGU9ZmFsc2UKbWFyZ2luZGFqdXN0bWVudD0wLDAsMCwwCmNvbG9yZGVwdGg9MjQKcHNsZXZlbD0wCnBkZmRldmljZT0xCmNvbG9yZGV2aWNlPTAKUFBEQ29udGV4RGF0YQpQYWdlU2l6ZTpBNAAAEgBDT01QQVRfRFVQTEVYX01PREUPAER1cGxleE1vZGU6Ok9mZg== + 1000 + 7 + false + true + true + 1 + true + false + true + false + true + true + MFCJ491DW + false + 0 + 3 + true + false + false + false + true + false + true + + + Sheet1 + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ??? + + + + Page 1 + + + + + + + ???(???) + + + 00/00/0000, 00:00:00 + + + + + Page 1/ 99 + + + + + + + + + + + + A + + + B + + + C + + + D + + + E + + + + + 62 + + + 22 + + + 83 + + + 35 + + + 35 + + + + + 100 + + + 56 + + + 91 + + + 29 + + + 57 + + + + + 11 + + + 71 + + + 68 + + + 53 + + + 32 + + + + + 84 + + + 26 + + + 3 + + + 21 + + + 17 + + + + + 11 + + + 56 + + + 26 + + + 25 + + + 30 + + + + + 61 + + + 3 + + + 35 + + + 98 + + + 62 + + + + + 22 + + + 96 + + + 10 + + + 53 + + + 34 + + + + + 25 + + + 33 + + + 86 + + + 38 + + + 89 + + + + + 25 + + + 93 + + + 31 + + + 72 + + + 60 + + + + + 19 + + + 64 + + + 42 + + + 38 + + + 28 + + + + + + + \ No newline at end of file