From c379503b1c72d5c207a02f1ab118a69b4e8bce33 Mon Sep 17 00:00:00 2001 From: iuvbio Date: Sun, 22 Aug 2021 18:05:47 +0200 Subject: [PATCH] rename modules and factor out common functionality --- pandas_ods_reader/__init__.py | 2 +- pandas_ods_reader/algo.py | 66 ++++++++++++++++ pandas_ods_reader/{parser.py => main.py} | 9 ++- pandas_ods_reader/parsers/fods.py | 98 ++++++------------------ pandas_ods_reader/parsers/ods.py | 57 ++------------ pandas_ods_reader/{tools.py => utils.py} | 0 6 files changed, 105 insertions(+), 127 deletions(-) create mode 100644 pandas_ods_reader/algo.py rename pandas_ods_reader/{parser.py => main.py} (82%) rename pandas_ods_reader/{tools.py => utils.py} (100%) diff --git a/pandas_ods_reader/__init__.py b/pandas_ods_reader/__init__.py index eb856a8..a2e31fa 100644 --- a/pandas_ods_reader/__init__.py +++ b/pandas_ods_reader/__init__.py @@ -1,6 +1,6 @@ import pkg_resources -from .parser import read_ods +from .main import read_ods __version__ = pkg_resources.get_distribution("pandas_ods_reader").version diff --git a/pandas_ods_reader/algo.py b/pandas_ods_reader/algo.py new file mode 100644 index 0000000..8c872cf --- /dev/null +++ b/pandas_ods_reader/algo.py @@ -0,0 +1,66 @@ +from collections import OrderedDict + +import pandas as pd + +from .utils import sanitize_df + + +def parse_data(backend, rows, headers=True, columns=None): + df_dict = OrderedDict() + col_index = {} + for i, row in enumerate(rows): + # row is a list of cells + if headers and i == 0 and not columns: + repeat_until = -1 + repeat_value = None + # columns as lists in a dictionary + columns = [] + # parse the first row as column names + for k, cell in enumerate(row): + value, n_repeated = backend.get_value(cell) + if n_repeated > 0: + repeat_value = value + repeat_until = n_repeated + k + if not value and k <= repeat_until: + value = repeat_value + if k == repeat_until: + # reset to allow for more than one repeated column + repeat_until = -1 + if value and value not in columns: + columns.append(value) + else: + column_name = value if value else "unnamed" + # add count to column name + idx = 1 + while f"{column_name}.{idx}" in columns: + idx += 1 + columns.append(f"{column_name}.{idx}") + elif i == 0: + # without headers, assign generic numbered column names + columns = columns if columns else [f"column.{j}" for j in range(len(row))] + if i == 0: + df_dict = OrderedDict((column, []) for column in columns) + # create index for the column headers + col_index = {j: column for j, column in enumerate(columns)} + if headers: + continue + for j, cell in enumerate(row): + if j < len(col_index): + value, _ = backend.get_value(cell, parsed=True) + # use header instead of column index + df_dict[col_index[j]].append(value) + # make sure all columns are of the same length + max_col_length = max(len(df_dict[col]) for col in df_dict) + for col in df_dict: + col_length = len(df_dict[col]) + if col_length < max_col_length: + df_dict[col] += [None] * (max_col_length - col_length) + df = pd.DataFrame(df_dict) + return df + + +def read_data(backend, file_or_path, sheet_id, headers=True, columns=None): + doc = backend.get_doc(file_or_path) + rows = backend.get_rows(doc, sheet_id) + df = parse_data(backend, rows, headers=headers, columns=columns) + return sanitize_df(df) diff --git a/pandas_ods_reader/parser.py b/pandas_ods_reader/main.py similarity index 82% rename from pandas_ods_reader/parser.py rename to pandas_ods_reader/main.py index 84c757f..702f505 100644 --- a/pandas_ods_reader/parser.py +++ b/pandas_ods_reader/main.py @@ -2,6 +2,7 @@ from pathlib import Path from .parsers import fods, ods +from . import algo EXT_MAP = {".ods": ods, ".fods": fods} @@ -28,7 +29,9 @@ def read_ods(file_or_path, sheet=1, headers=True, columns=None): pandas.DataFrame The content of the specified sheet as a DataFrame. """ - loader = EXT_MAP.get(Path(file_or_path).suffix) - if not loader: + backend = EXT_MAP.get(Path(file_or_path).suffix) + if not backend: raise ValueError("Unknown filetype.") - return loader.read(file_or_path, sheet, headers=headers, columns=columns) + return algo.read_data( + backend, file_or_path, sheet, headers=headers, columns=columns + ) diff --git a/pandas_ods_reader/parsers/fods.py b/pandas_ods_reader/parsers/fods.py index 3f747b9..d950bc8 100644 --- a/pandas_ods_reader/parsers/fods.py +++ b/pandas_ods_reader/parsers/fods.py @@ -1,9 +1,4 @@ -from collections import defaultdict - from lxml import etree -import pandas as pd - -from ..tools import sanitize_df BODY_TAG = "office:body" @@ -18,6 +13,10 @@ TABLE_CELL_REPEATED_ATTRIB = "number-columns-repeated" VALUE_TYPE_ATTRIB = "value-type" +def get_doc(file_or_path): + return etree.parse(str(file_or_path)) + + def get_sheet(spreadsheet, sheet_id): namespaces = spreadsheet.nsmap if isinstance(sheet_id, str): @@ -33,52 +32,7 @@ def get_sheet(spreadsheet, sheet_id): return tables[sheet_id - 1] -def parse_columns(cells, headers=True, columns=None): - orig_columns = cells.pop(0) if headers else None - if columns is None: - if orig_columns: - repeated_val = None - columns = [] - repeated_dict = defaultdict(lambda: 0) - for i, col in enumerate(orig_columns): - text = col.find(TABLE_CELL_TEXT_TAG, namespaces=col.nsmap) - if text is not None: - value = text.text - elif text is None and repeated_val: - value = repeated_val - else: - value = "unnamed" - idx = 1 - while "{}.{}".format(value, idx) in columns: - idx += 1 - value = f"{value}.{idx}" - repeated = col.attrib.get( - f"{{{col.nsmap[TABLE_KEY]}}}{TABLE_CELL_REPEATED_ATTRIB}" - ) - if repeated: - repeated_dict[value] += 1 - repeated_val = f"{value}.{repeated_dict[value]}" - column = value if value not in columns else f"{value}.{i}" - columns.append(column) - else: - columns = [f"column.{i}" for i in range(len(cells[0]))] - return columns, cells - - -def parse_value(cell): - text = cell.find(TABLE_CELL_TEXT_TAG, namespaces=cell.nsmap) - is_float = ( - cell.attrib.get(f"{{{cell.nsmap[OFFICE_KEY]}}}{VALUE_TYPE_ATTRIB}") == "float" - ) - if text is None: - return None - value = text.text - if is_float: - return float(value) - return value - - -def load_fods(doc, sheet_id, headers=True, columns=None): +def get_rows(doc, sheet_id): if not isinstance(sheet_id, (str, int)): raise ValueError("Sheet id has to be either `str` or `int`") root = doc.getroot() @@ -88,28 +42,24 @@ def load_fods(doc, sheet_id, headers=True, columns=None): ) sheet = get_sheet(spreadsheet, sheet_id) rows = sheet.findall(TABLE_ROW_TAG, namespaces=namespaces) - allcells = [] - for row in rows: - cells = row.findall(TABLE_CELL_TAG, namespaces=namespaces) - allcells.append(cells) - columns, values = parse_columns(allcells, headers, columns) - data = [] - for row in values: - rowvalues = [parse_value(cell) for cell in row] - data.append(rowvalues) - final_rows = [] - for row in data: - final_row = [] - for i in range(len(columns)): - if i < len(row): - final_row.append(row[i]) - else: - final_row.append(None) - final_rows.append(final_row) - return pd.DataFrame(final_rows, columns=columns) + return rows -def read(file_or_path, sheet=1, headers=True, columns=None): - doc = etree.parse(str(file_or_path)) - df = load_fods(doc, sheet, headers=headers, columns=columns) - return sanitize_df(df) +def is_float(cell): + return ( + cell.attrib.get(f"{{{cell.nsmap[OFFICE_KEY]}}}{VALUE_TYPE_ATTRIB}") == "float" + ) + + +def get_value(cell, parsed=False): + text = cell.find(TABLE_CELL_TEXT_TAG, namespaces=cell.nsmap) + if text is None: + return None, 0 + value = text.text + if parsed and is_float(cell): + value = float(value) + n_repeated = cell.attrib.get( + f"{{{cell.nsmap[TABLE_KEY]}}}{TABLE_CELL_REPEATED_ATTRIB}" + ) + n_repeated = int(n_repeated) if n_repeated is not None else 0 + return value, n_repeated diff --git a/pandas_ods_reader/parsers/ods.py b/pandas_ods_reader/parsers/ods.py index 783ff1e..2150d02 100644 --- a/pandas_ods_reader/parsers/ods.py +++ b/pandas_ods_reader/parsers/ods.py @@ -1,13 +1,11 @@ -from collections import OrderedDict - import ezodf -import pandas as pd - -from ..tools import sanitize_df -def load_ods(doc, sheet_id, headers=True, columns=None): - # convert the sheet to a pandas.DataFrame +def get_doc(file_or_path): + return ezodf.opendoc(file_or_path) + + +def get_rows(doc, sheet_id): if not isinstance(sheet_id, (int, str)): raise ValueError("Sheet id has to be either `str` or `int`") if isinstance(sheet_id, str): @@ -16,47 +14,8 @@ def load_ods(doc, sheet_id, headers=True, columns=None): raise KeyError("There is no sheet named {}".format(sheet_id)) sheet_id = sheets.index(sheet_id) + 1 sheet = doc.sheets[sheet_id - 1] - df_dict = OrderedDict() - col_index = {} - for i, row in enumerate(sheet.rows()): - # row is a list of cells - if headers and i == 0 and not columns: - # columns as lists in a dictionary - columns = [] - for cell in row: - if cell.value and cell.value not in columns: - columns.append(cell.value) - else: - column_name = cell.value if cell.value else "unnamed" - # add count to column name - idx = 1 - while "{}.{}".format(column_name, idx) in columns: - idx += 1 - columns.append("{}.{}".format(column_name, idx)) - - df_dict = OrderedDict((column, []) for column in columns) - # create index for the column headers - col_index = {j: column for j, column in enumerate(columns)} - continue - elif i == 0: - columns = columns if columns else ([f"column.{j}" for j in range(len(row))]) - # columns as lists in a dictionary - df_dict = OrderedDict((column, []) for column in columns) - # create index for the column headers - col_index = {j: column for j, column in enumerate(columns)} - if headers: - continue - for j, cell in enumerate(row): - if j < len(col_index): - # use header instead of column index - df_dict[col_index[j]].append(cell.value) - else: - continue - df = pd.DataFrame(df_dict) - return df + return sheet.rows() -def read(file_or_path, sheet=1, headers=True, columns=None): - doc = ezodf.opendoc(file_or_path) - df = load_ods(doc, sheet, headers, columns) - return sanitize_df(df) +def get_value(cell, parsed=False): + return cell.value, 0 diff --git a/pandas_ods_reader/tools.py b/pandas_ods_reader/utils.py similarity index 100% rename from pandas_ods_reader/tools.py rename to pandas_ods_reader/utils.py