rename modules and factor out common functionality

This commit is contained in:
iuvbio 2021-08-22 18:05:47 +02:00
parent dbd8dd8bbd
commit c379503b1c
6 changed files with 105 additions and 127 deletions

View File

@ -1,6 +1,6 @@
import pkg_resources
from .parser import read_ods
from .main import read_ods
__version__ = pkg_resources.get_distribution("pandas_ods_reader").version

66
pandas_ods_reader/algo.py Normal file
View File

@ -0,0 +1,66 @@
from collections import OrderedDict
import pandas as pd
from .utils import sanitize_df
def parse_data(backend, rows, headers=True, columns=None):
    """Assemble raw sheet rows into a pandas.DataFrame.

    Parameters
    ----------
    backend : module
        Backend exposing ``get_value(cell, parsed=False)`` which returns a
        ``(value, n_repeated)`` pair for a backend-specific cell.
    rows : iterable
        Iterable of rows, each a sequence of backend-specific cells.
    headers : bool
        When True, the first row supplies the column names (or, when
        ``columns`` is given, the first row is simply skipped).
    columns : list or None
        Explicit column names; overrides header detection.

    Returns
    -------
    pandas.DataFrame
        One column per detected/declared name; short rows padded with None.
    """
    frame = OrderedDict()
    position_to_name = {}
    for row_no, row in enumerate(rows):
        is_first = row_no == 0
        if is_first and headers and not columns:
            # Derive column names from the first row.
            columns = []
            stop_repeat_at = -1
            carried_value = None
            for pos, cell in enumerate(row):
                value, n_repeated = backend.get_value(cell)
                if n_repeated > 0:
                    # Remember the value so trailing empty cells inherit it.
                    carried_value = value
                    stop_repeat_at = n_repeated + pos
                if not value and pos <= stop_repeat_at:
                    value = carried_value
                if pos == stop_repeat_at:
                    # Reset so a later repeated column can start fresh.
                    stop_repeat_at = -1
                if value and value not in columns:
                    columns.append(value)
                else:
                    # Disambiguate duplicates/empties with a numeric suffix.
                    base = value if value else "unnamed"
                    suffix = 1
                    while f"{base}.{suffix}" in columns:
                        suffix += 1
                    columns.append(f"{base}.{suffix}")
        elif is_first and not columns:
            # No headers and no explicit names: generic numbered columns.
            columns = [f"column.{j}" for j in range(len(row))]
        if is_first:
            frame = OrderedDict((name, []) for name in columns)
            # Map cell position -> column name for the data rows below.
            position_to_name = dict(enumerate(columns))
            if headers:
                continue  # the header row carries no data
        for pos, cell in enumerate(row):
            # Cells beyond the known columns are silently dropped.
            if pos < len(position_to_name):
                value, _ = backend.get_value(cell, parsed=True)
                frame[position_to_name[pos]].append(value)
    # Pad shorter columns so the DataFrame constructor accepts the dict.
    longest = max(len(values) for values in frame.values())
    for values in frame.values():
        if len(values) < longest:
            values.extend([None] * (longest - len(values)))
    return pd.DataFrame(frame)
def read_data(backend, file_or_path, sheet_id, headers=True, columns=None):
    """Load a document with *backend*, parse one sheet, and sanitize it.

    ``backend`` must expose ``get_doc`` and ``get_rows``; the resulting rows
    are fed through :func:`parse_data` and the frame through ``sanitize_df``.
    """
    document = backend.get_doc(file_or_path)
    sheet_rows = backend.get_rows(document, sheet_id)
    return sanitize_df(
        parse_data(backend, sheet_rows, headers=headers, columns=columns)
    )

View File

@ -2,6 +2,7 @@
from pathlib import Path
from .parsers import fods, ods
from . import algo
EXT_MAP = {".ods": ods, ".fods": fods}
@ -28,7 +29,9 @@ def read_ods(file_or_path, sheet=1, headers=True, columns=None):
pandas.DataFrame
The content of the specified sheet as a DataFrame.
"""
loader = EXT_MAP.get(Path(file_or_path).suffix)
if not loader:
backend = EXT_MAP.get(Path(file_or_path).suffix)
if not backend:
raise ValueError("Unknown filetype.")
return loader.read(file_or_path, sheet, headers=headers, columns=columns)
return algo.read_data(
backend, file_or_path, sheet, headers=headers, columns=columns
)

View File

@ -1,9 +1,4 @@
from collections import defaultdict
from lxml import etree
import pandas as pd
from ..tools import sanitize_df
BODY_TAG = "office:body"
@ -18,6 +13,10 @@ TABLE_CELL_REPEATED_ATTRIB = "number-columns-repeated"
VALUE_TYPE_ATTRIB = "value-type"
def get_doc(file_or_path):
    """Load the flat-ODS XML tree for *file_or_path* via lxml."""
    source = str(file_or_path)  # etree.parse needs a str, not a Path
    return etree.parse(source)
def get_sheet(spreadsheet, sheet_id):
namespaces = spreadsheet.nsmap
if isinstance(sheet_id, str):
@ -33,52 +32,7 @@ def get_sheet(spreadsheet, sheet_id):
return tables[sheet_id - 1]
def parse_columns(cells, headers=True, columns=None):
    """Derive column names for a flat-ODS sheet.

    Returns ``(columns, cells)``; when ``headers`` is True the header row has
    been popped off ``cells`` (NOTE: this mutates the caller's list).
    """
    # Consume the first row as the header row when headers are expected.
    orig_columns = cells.pop(0) if headers else None
    if columns is None:
        if orig_columns:
            repeated_val = None
            columns = []
            # Tracks how often a repeated attribute was seen per name.
            repeated_dict = defaultdict(lambda: 0)
            for i, col in enumerate(orig_columns):
                text = col.find(TABLE_CELL_TEXT_TAG, namespaces=col.nsmap)
                if text is not None:
                    value = text.text
                elif text is None and repeated_val:
                    # Empty cell after a repeated header: reuse that value.
                    value = repeated_val
                else:
                    # No text at all: synthesize a unique "unnamed.N" name.
                    value = "unnamed"
                    idx = 1
                    while "{}.{}".format(value, idx) in columns:
                        idx += 1
                    value = f"{value}.{idx}"
                # table:number-columns-repeated on the header cell, if any.
                repeated = col.attrib.get(
                    f"{{{col.nsmap[TABLE_KEY]}}}{TABLE_CELL_REPEATED_ATTRIB}"
                )
                if repeated:
                    # NOTE(review): the repeat count attribute value itself is
                    # ignored; only its presence bumps the counter — confirm
                    # this matches the intended repeated-column semantics.
                    repeated_dict[value] += 1
                    repeated_val = f"{value}.{repeated_dict[value]}"
                # Duplicate names get the positional index appended.
                column = value if value not in columns else f"{value}.{i}"
                columns.append(column)
        else:
            # No header row available: fall back to generic numbered names.
            columns = [f"column.{i}" for i in range(len(cells[0]))]
    return columns, cells
def parse_value(cell):
    """Extract a cell's text, converting it when the cell declares float."""
    # office:value-type tells us whether the text encodes a float.
    type_attr = f"{{{cell.nsmap[OFFICE_KEY]}}}{VALUE_TYPE_ATTRIB}"
    wants_float = cell.attrib.get(type_attr) == "float"
    node = cell.find(TABLE_CELL_TEXT_TAG, namespaces=cell.nsmap)
    if node is None:
        return None
    return float(node.text) if wants_float else node.text
def load_fods(doc, sheet_id, headers=True, columns=None):
def get_rows(doc, sheet_id):
if not isinstance(sheet_id, (str, int)):
raise ValueError("Sheet id has to be either `str` or `int`")
root = doc.getroot()
@ -88,28 +42,24 @@ def load_fods(doc, sheet_id, headers=True, columns=None):
)
sheet = get_sheet(spreadsheet, sheet_id)
rows = sheet.findall(TABLE_ROW_TAG, namespaces=namespaces)
allcells = []
for row in rows:
cells = row.findall(TABLE_CELL_TAG, namespaces=namespaces)
allcells.append(cells)
columns, values = parse_columns(allcells, headers, columns)
data = []
for row in values:
rowvalues = [parse_value(cell) for cell in row]
data.append(rowvalues)
final_rows = []
for row in data:
final_row = []
for i in range(len(columns)):
if i < len(row):
final_row.append(row[i])
else:
final_row.append(None)
final_rows.append(final_row)
return pd.DataFrame(final_rows, columns=columns)
return rows
def read(file_or_path, sheet=1, headers=True, columns=None):
    """Parse a .fods file and return the selected sheet, sanitized."""
    tree = etree.parse(str(file_or_path))
    frame = load_fods(tree, sheet, headers=headers, columns=columns)
    return sanitize_df(frame)
def is_float(cell):
    """True when the cell's office:value-type attribute is ``"float"``."""
    key = f"{{{cell.nsmap[OFFICE_KEY]}}}{VALUE_TYPE_ATTRIB}"
    return cell.attrib.get(key) == "float"
def get_value(cell, parsed=False):
    """Return ``(value, n_repeated)`` for a flat-ODS cell element.

    ``value`` is the cell's text (float-converted when ``parsed`` is True and
    the cell declares a float type); ``n_repeated`` is the integer value of
    table:number-columns-repeated, or 0 when the attribute is absent.
    """
    node = cell.find(TABLE_CELL_TEXT_TAG, namespaces=cell.nsmap)
    if node is None:
        # Empty cells still count once and carry no repeat information.
        return None, 0
    value = node.text
    if parsed and is_float(cell):
        value = float(value)
    repeat_attr = cell.attrib.get(
        f"{{{cell.nsmap[TABLE_KEY]}}}{TABLE_CELL_REPEATED_ATTRIB}"
    )
    repeat_count = int(repeat_attr) if repeat_attr is not None else 0
    return value, repeat_count

View File

@ -1,13 +1,11 @@
from collections import OrderedDict
import ezodf
import pandas as pd
from ..tools import sanitize_df
def load_ods(doc, sheet_id, headers=True, columns=None):
# convert the sheet to a pandas.DataFrame
def get_doc(file_or_path):
    """Open an .ods document through ezodf."""
    document = ezodf.opendoc(file_or_path)
    return document
def get_rows(doc, sheet_id):
if not isinstance(sheet_id, (int, str)):
raise ValueError("Sheet id has to be either `str` or `int`")
if isinstance(sheet_id, str):
@ -16,47 +14,8 @@ def load_ods(doc, sheet_id, headers=True, columns=None):
raise KeyError("There is no sheet named {}".format(sheet_id))
sheet_id = sheets.index(sheet_id) + 1
sheet = doc.sheets[sheet_id - 1]
df_dict = OrderedDict()
col_index = {}
for i, row in enumerate(sheet.rows()):
# row is a list of cells
if headers and i == 0 and not columns:
# columns as lists in a dictionary
columns = []
for cell in row:
if cell.value and cell.value not in columns:
columns.append(cell.value)
else:
column_name = cell.value if cell.value else "unnamed"
# add count to column name
idx = 1
while "{}.{}".format(column_name, idx) in columns:
idx += 1
columns.append("{}.{}".format(column_name, idx))
df_dict = OrderedDict((column, []) for column in columns)
# create index for the column headers
col_index = {j: column for j, column in enumerate(columns)}
continue
elif i == 0:
columns = columns if columns else ([f"column.{j}" for j in range(len(row))])
# columns as lists in a dictionary
df_dict = OrderedDict((column, []) for column in columns)
# create index for the column headers
col_index = {j: column for j, column in enumerate(columns)}
if headers:
continue
for j, cell in enumerate(row):
if j < len(col_index):
# use header instead of column index
df_dict[col_index[j]].append(cell.value)
else:
continue
df = pd.DataFrame(df_dict)
return df
return sheet.rows()
def read(file_or_path, sheet=1, headers=True, columns=None):
    """Load an .ods file and return the selected sheet, sanitized."""
    document = ezodf.opendoc(file_or_path)
    frame = load_ods(document, sheet, headers, columns)
    return sanitize_df(frame)
def get_value(cell, parsed=False):
    """Return the ezodf cell's value paired with a repeat count of 0.

    The ``parsed`` flag exists only for signature parity with the fods
    backend; ezodf cells already expose typed values.
    """
    return (cell.value, 0)