From e9e4b3b2728ceb179c04559f2fa0d58b872597f0 Mon Sep 17 00:00:00 2001 From: ljnsn <82611987+ljnsn@users.noreply.github.com> Date: Wed, 16 Nov 2022 01:41:44 +0100 Subject: [PATCH] Add more type hints. --- pandas_ods_reader/algo.py | 32 ++++++++++++++++++++++++------- pandas_ods_reader/parsers/fods.py | 7 +++++-- pandas_ods_reader/parsers/ods.py | 17 +++++++++++----- pandas_ods_reader/utils.py | 5 +++-- 4 files changed, 45 insertions(+), 16 deletions(-) diff --git a/pandas_ods_reader/algo.py b/pandas_ods_reader/algo.py index 26d2a87..1d4b3fd 100644 --- a/pandas_ods_reader/algo.py +++ b/pandas_ods_reader/algo.py @@ -1,11 +1,14 @@ from collections import OrderedDict +from pathlib import Path +from types import ModuleType +from typing import Any, Iterator, Union import pandas as pd from .utils import sanitize_df -def get_columns_from_headers(backend, row): +def get_columns_from_headers(backend: ModuleType, row: Any) -> list[str]: repeat_until = -1 repeat_value = None # columns as lists in a dictionary @@ -33,21 +36,29 @@ def get_columns_from_headers(backend, row): return columns -def get_generic_columns(row): +def get_generic_columns(row: Any) -> list[str]: return [f"column.{j}" for j in range(len(row))] -def get_columns(backend, row, headers): +def get_columns(backend: ModuleType, row: Any, headers: bool) -> list[str]: if headers: return get_columns_from_headers(backend, row) return get_generic_columns(row) -def parse_data(backend, rows, headers=True, columns=None, skiprows=0): - df_dict = OrderedDict() - col_index = {} +def parse_data( + backend: ModuleType, + rows: Iterator[list[Any]], + headers: bool, + columns: list[str], + skiprows: int, +) -> pd.DataFrame: + df_dict: OrderedDict[str, Any] = OrderedDict() + col_index: dict[int, str] = {} + for _ in range(skiprows): next(rows) + for i, row in enumerate(rows): # row is a list of cells if i == 0: @@ -74,7 +85,14 @@ def parse_data(backend, rows, headers=True, columns=None, skiprows=0): return pd.DataFrame(df_dict) -def read_data(backend, file_or_path, sheet_id, headers=True, columns=None, skiprows=0): +def read_data( + backend: ModuleType, + file_or_path: Path, + sheet_id: Union[str, int], + headers: bool, + columns: list[str], + skiprows: int, +) -> pd.DataFrame: doc = backend.get_doc(file_or_path) rows = backend.get_rows(doc, sheet_id) df = parse_data(backend, rows, headers=headers, columns=columns, skiprows=skiprows) diff --git a/pandas_ods_reader/parsers/fods.py b/pandas_ods_reader/parsers/fods.py index a5e2c05..b1b0a7e 100644 --- a/pandas_ods_reader/parsers/fods.py +++ b/pandas_ods_reader/parsers/fods.py @@ -1,3 +1,5 @@ +from pathlib import Path + from lxml import etree @@ -13,7 +15,7 @@ TABLE_CELL_REPEATED_ATTRIB = "number-columns-repeated" VALUE_TYPE_ATTRIB = "value-type" -def get_doc(file_or_path): +def get_doc(file_or_path: Path): return etree.parse(str(file_or_path)) @@ -21,7 +23,8 @@ def get_sheet(spreadsheet, sheet_id): namespaces = spreadsheet.nsmap if isinstance(sheet_id, str): sheet = spreadsheet.find( - f"{TABLE_TAG}[@table:name='{sheet_id}']", namespaces=namespaces + f"{TABLE_TAG}[@table:name='{sheet_id}']", + namespaces=namespaces, ) if sheet is None: raise KeyError(f"There is no sheet named {sheet_id}.") diff --git a/pandas_ods_reader/parsers/ods.py b/pandas_ods_reader/parsers/ods.py index 2150d02..1c435c5 100644 --- a/pandas_ods_reader/parsers/ods.py +++ b/pandas_ods_reader/parsers/ods.py @@ -1,21 +1,28 @@ +from pathlib import Path +from typing import Any, Iterator, Union + import ezodf +from ezodf.document import FlatXMLDocument, PackagedDocument -def get_doc(file_or_path): +def get_doc(file_or_path: Path) -> Union[FlatXMLDocument, PackagedDocument]: return ezodf.opendoc(file_or_path) -def get_rows(doc, sheet_id): +def get_rows( + doc: Union[FlatXMLDocument, PackagedDocument], + sheet_id: Union[str, int], +) -> Iterator[list[ezodf.Cell]]: if not isinstance(sheet_id, (int, str)): raise ValueError("Sheet id has to be either `str` or `int`") if isinstance(sheet_id, str): - sheets = [sheet.name for sheet in doc.sheets] + sheets: list[str] = [sheet.name for sheet in doc.sheets] if sheet_id not in sheets: raise KeyError("There is no sheet named {}".format(sheet_id)) sheet_id = sheets.index(sheet_id) + 1 - sheet = doc.sheets[sheet_id - 1] + sheet: ezodf.Sheet = doc.sheets[sheet_id - 1] return sheet.rows() -def get_value(cell, parsed=False): +def get_value(cell: ezodf.Cell, parsed: bool = False) -> tuple[Any, int]: return cell.value, 0 diff --git a/pandas_ods_reader/utils.py b/pandas_ods_reader/utils.py index 36bcf5f..a3a5d60 100644 --- a/pandas_ods_reader/utils.py +++ b/pandas_ods_reader/utils.py @@ -1,4 +1,5 @@ """Provides utility functions for the parser""" +import pandas as pd def ods_info(doc): @@ -14,8 +15,8 @@ def ods_info(doc): ) -def sanitize_df(df): - """Drop empty rows and columns from the DataFrame and returns it""" +def sanitize_df(df: pd.DataFrame) -> pd.DataFrame: + """Drop empty rows and columns from the DataFrame and return it.""" # Delete empty rows for i in df.index.tolist()[-1::-1]: if df.iloc[i].isna().all():