Add more type hints.

This commit is contained in:
ljnsn 2022-11-16 01:41:44 +01:00
parent 3e96dae284
commit e9e4b3b272
4 changed files with 45 additions and 16 deletions

View File

@ -1,11 +1,14 @@
from collections import OrderedDict
from pathlib import Path
from types import ModuleType
from typing import Any, Iterator, Union
import pandas as pd
from .utils import sanitize_df
def get_columns_from_headers(backend, row):
def get_columns_from_headers(backend: ModuleType, row: Any) -> list[str]:
repeat_until = -1
repeat_value = None
# columns as lists in a dictionary
@ -33,21 +36,29 @@ def get_columns_from_headers(backend, row):
return columns
def get_generic_columns(row: Any) -> list[str]:
    """Build placeholder column names for a row that has no header.

    Args:
        row: Any sized object (``len(row)`` must work); only its length
            is used, the contents are ignored.

    Returns:
        One name per position in ``row``, formatted ``"column.<index>"``
        with a zero-based index.
    """
    return [f"column.{j}" for j in range(len(row))]
def get_columns(backend: ModuleType, row: Any, headers: bool) -> list[str]:
    """Return the column names to use for ``row``.

    Args:
        backend: Backend module, forwarded unchanged to
            ``get_columns_from_headers``.
        row: The row from which the column names are derived.
        headers: When truthy, the names come from
            ``get_columns_from_headers(backend, row)``; otherwise generic
            names are produced by ``get_generic_columns(row)``.

    Returns:
        The list of column names.
    """
    if headers:
        return get_columns_from_headers(backend, row)
    return get_generic_columns(row)
def parse_data(backend, rows, headers=True, columns=None, skiprows=0):
df_dict = OrderedDict()
col_index = {}
def parse_data(
backend: ModuleType,
rows: Iterator[list[Any]],
headers: bool,
columns: list[str],
skiprows: int,
) -> pd.DataFrame:
df_dict: OrderedDict[str, Any] = OrderedDict()
col_index: dict[int, str] = {}
for _ in range(skiprows):
next(rows)
for i, row in enumerate(rows):
# row is a list of cells
if i == 0:
@ -74,7 +85,14 @@ def parse_data(backend, rows, headers=True, columns=None, skiprows=0):
return pd.DataFrame(df_dict)
def read_data(backend, file_or_path, sheet_id, headers=True, columns=None, skiprows=0):
def read_data(
backend: ModuleType,
file_or_path: Path,
sheet_id: Union[str, int],
headers: bool,
columns: list[str],
skiprows: int,
) -> pd.DataFrame:
doc = backend.get_doc(file_or_path)
rows = backend.get_rows(doc, sheet_id)
df = parse_data(backend, rows, headers=headers, columns=columns, skiprows=skiprows)

View File

@ -1,3 +1,5 @@
from pathlib import Path
from lxml import etree
@ -13,7 +15,7 @@ TABLE_CELL_REPEATED_ATTRIB = "number-columns-repeated"
VALUE_TYPE_ATTRIB = "value-type"
def get_doc(file_or_path: Path):
    """Parse the document at ``file_or_path`` with ``lxml.etree``.

    Args:
        file_or_path: Location of the document; it is converted with
            ``str()`` before being handed to ``etree.parse``.

    Returns:
        Whatever ``etree.parse`` returns for that path.
    """
    return etree.parse(str(file_or_path))
@ -21,7 +23,8 @@ def get_sheet(spreadsheet, sheet_id):
namespaces = spreadsheet.nsmap
if isinstance(sheet_id, str):
sheet = spreadsheet.find(
f"{TABLE_TAG}[@table:name='{sheet_id}']", namespaces=namespaces
f"{TABLE_TAG}[@table:name='{sheet_id}']",
namespaces=namespaces,
)
if sheet is None:
raise KeyError(f"There is no sheet named {sheet_id}.")

View File

@ -1,21 +1,28 @@
from pathlib import Path
from typing import Any, Iterator, Union
import ezodf
from ezodf.document import FlatXMLDocument, PackagedDocument
def get_doc(file_or_path: Path) -> Union[FlatXMLDocument, PackagedDocument]:
    """Open the document at ``file_or_path`` with ``ezodf``.

    Args:
        file_or_path: Location of the document, passed unchanged to
            ``ezodf.opendoc``.

    Returns:
        The document object returned by ``ezodf.opendoc``.
    """
    return ezodf.opendoc(file_or_path)
def get_rows(
    doc: Union[FlatXMLDocument, PackagedDocument],
    sheet_id: Union[str, int],
) -> Iterator[list[ezodf.Cell]]:
    """Return the rows of one sheet of ``doc``.

    Args:
        doc: An opened document exposing a ``sheets`` collection.
        sheet_id: Either the sheet's name, or its 1-based position in
            ``doc.sheets``.

    Returns:
        The result of calling ``rows()`` on the selected sheet.

    Raises:
        ValueError: If ``sheet_id`` is neither ``int`` nor ``str``.
        KeyError: If ``sheet_id`` is a name that matches no sheet.
    """
    if not isinstance(sheet_id, (int, str)):
        raise ValueError("Sheet id has to be either `str` or `int`")
    if isinstance(sheet_id, str):
        # Translate the name into the same 1-based position an int caller
        # would have supplied, so both paths share the lookup below.
        sheets: list[str] = [sheet.name for sheet in doc.sheets]
        if sheet_id not in sheets:
            raise KeyError(f"There is no sheet named {sheet_id}")
        sheet_id = sheets.index(sheet_id) + 1
    # NOTE(review): sheet_id == 0 yields index -1 here; whether that selects
    # the last sheet or fails depends on how doc.sheets handles negative
    # indices - confirm and consider rejecting ids below 1.
    sheet: ezodf.Sheet = doc.sheets[sheet_id - 1]
    return sheet.rows()
def get_value(cell: ezodf.Cell, parsed: bool = False) -> tuple[Any, int]:
    """Return the value of ``cell`` together with a constant ``0``.

    Args:
        cell: Object whose ``value`` attribute is read.
        parsed: Accepted but not used by this implementation.

    Returns:
        A ``(cell.value, 0)`` tuple. The second element is always ``0``.
        NOTE(review): presumably a repeat count kept for parity with another
        backend's ``get_value`` - confirm against the caller.
    """
    return cell.value, 0

View File

@ -1,4 +1,5 @@
"""Provides utility functions for the parser"""
import pandas as pd
def ods_info(doc):
@ -14,8 +15,8 @@ def ods_info(doc):
)
def sanitize_df(df):
"""Drop empty rows and columns from the DataFrame and returns it"""
def sanitize_df(df: pd.DataFrame) -> pd.DataFrame:
"""Drop empty rows and columns from the DataFrame and return it."""
# Delete empty rows
for i in df.index.tolist()[-1::-1]:
if df.iloc[i].isna().all():