rename modules and factor out common functionality
This commit is contained in:
parent
dbd8dd8bbd
commit
c379503b1c
|
|
@ -1,6 +1,6 @@
|
|||
import pkg_resources
|
||||
|
||||
from .parser import read_ods
|
||||
from .main import read_ods
|
||||
|
||||
|
||||
__version__ = pkg_resources.get_distribution("pandas_ods_reader").version
|
||||
|
|
|
|||
|
|
@ -0,0 +1,66 @@
|
|||
from collections import OrderedDict
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from .utils import sanitize_df
|
||||
|
||||
|
||||
def parse_data(backend, rows, headers=True, columns=None):
    """Build a ``pandas.DataFrame`` from raw sheet rows.

    Parameters
    ----------
    backend : module
        Parser backend exposing ``get_value(cell, parsed=False)`` which
        returns a ``(value, n_repeated)`` tuple for a cell.
    rows : iterable
        Iterable of rows, each row being a list of backend-specific cells.
    headers : bool
        When True, the first row is parsed as column names.
    columns : list or None
        Explicit column names; takes precedence over header detection.

    Returns
    -------
    pandas.DataFrame
        The sheet content; empty DataFrame when *rows* is empty.
    """
    df_dict = OrderedDict()
    col_index = {}
    for i, row in enumerate(rows):
        # row is a list of cells
        if headers and i == 0 and not columns:
            repeat_until = -1
            repeat_value = None
            # columns as lists in a dictionary
            columns = []
            # parse the first row as column names
            for k, cell in enumerate(row):
                value, n_repeated = backend.get_value(cell)
                if n_repeated > 0:
                    repeat_value = value
                    repeat_until = n_repeated + k
                if not value and k <= repeat_until:
                    # empty cell inside a repeated span inherits the value
                    value = repeat_value
                if k == repeat_until:
                    # reset to allow for more than one repeated column
                    repeat_until = -1
                if value and value not in columns:
                    columns.append(value)
                else:
                    # duplicate or empty header: derive a unique name
                    column_name = value if value else "unnamed"
                    # add count to column name
                    idx = 1
                    while f"{column_name}.{idx}" in columns:
                        idx += 1
                    columns.append(f"{column_name}.{idx}")
        elif i == 0:
            # without headers, assign generic numbered column names
            columns = columns if columns else [f"column.{j}" for j in range(len(row))]
        if i == 0:
            df_dict = OrderedDict((column, []) for column in columns)
            # create index for the column headers
            col_index = {j: column for j, column in enumerate(columns)}
            if headers:
                continue
        for j, cell in enumerate(row):
            if j < len(col_index):
                value, _ = backend.get_value(cell, parsed=True)
                # use header instead of column index
                df_dict[col_index[j]].append(value)
    # make sure all columns are of the same length; default=0 keeps an
    # empty sheet from raising ValueError (max() of an empty sequence)
    max_col_length = max((len(df_dict[col]) for col in df_dict), default=0)
    for col in df_dict:
        col_length = len(df_dict[col])
        if col_length < max_col_length:
            df_dict[col] += [None] * (max_col_length - col_length)
    return pd.DataFrame(df_dict)
|
||||
|
||||
|
||||
def read_data(backend, file_or_path, sheet_id, headers=True, columns=None):
    """Load *sheet_id* from *file_or_path* via *backend* and return a
    sanitized ``pandas.DataFrame``."""
    document = backend.get_doc(file_or_path)
    frame = parse_data(
        backend,
        backend.get_rows(document, sheet_id),
        headers=headers,
        columns=columns,
    )
    return sanitize_df(frame)
|
||||
|
|
@ -2,6 +2,7 @@
|
|||
from pathlib import Path
|
||||
|
||||
from .parsers import fods, ods
|
||||
from . import algo
|
||||
|
||||
|
||||
EXT_MAP = {".ods": ods, ".fods": fods}
|
||||
|
|
@ -28,7 +29,9 @@ def read_ods(file_or_path, sheet=1, headers=True, columns=None):
|
|||
pandas.DataFrame
|
||||
The content of the specified sheet as a DataFrame.
|
||||
"""
|
||||
loader = EXT_MAP.get(Path(file_or_path).suffix)
|
||||
if not loader:
|
||||
backend = EXT_MAP.get(Path(file_or_path).suffix)
|
||||
if not backend:
|
||||
raise ValueError("Unknown filetype.")
|
||||
return loader.read(file_or_path, sheet, headers=headers, columns=columns)
|
||||
return algo.read_data(
|
||||
backend, file_or_path, sheet, headers=headers, columns=columns
|
||||
)
|
||||
|
|
@ -1,9 +1,4 @@
|
|||
from collections import defaultdict
|
||||
|
||||
from lxml import etree
|
||||
import pandas as pd
|
||||
|
||||
from ..tools import sanitize_df
|
||||
|
||||
|
||||
BODY_TAG = "office:body"
|
||||
|
|
@ -18,6 +13,10 @@ TABLE_CELL_REPEATED_ATTRIB = "number-columns-repeated"
|
|||
VALUE_TYPE_ATTRIB = "value-type"
|
||||
|
||||
|
||||
def get_doc(file_or_path):
    """Parse the flat-ODS XML file at *file_or_path* into an lxml tree."""
    source = str(file_or_path)
    return etree.parse(source)
|
||||
|
||||
|
||||
def get_sheet(spreadsheet, sheet_id):
|
||||
namespaces = spreadsheet.nsmap
|
||||
if isinstance(sheet_id, str):
|
||||
|
|
@ -33,52 +32,7 @@ def get_sheet(spreadsheet, sheet_id):
|
|||
return tables[sheet_id - 1]
|
||||
|
||||
|
||||
def parse_columns(cells, headers=True, columns=None):
    """Derive column names from the first row of *cells* (legacy fODS path).

    Returns a ``(columns, cells)`` tuple; when *headers* is True the header
    row has been popped off *cells* so the remainder is pure data.
    """
    # Header row, or None when the sheet has no header row.
    orig_columns = cells.pop(0) if headers else None
    if columns is None:
        if orig_columns:
            repeated_val = None
            columns = []
            # Tracks how many times each repeated header value was seen,
            # to generate distinct suffixed names for the repeats.
            repeated_dict = defaultdict(lambda: 0)
            for i, col in enumerate(orig_columns):
                text = col.find(TABLE_CELL_TEXT_TAG, namespaces=col.nsmap)
                if text is not None:
                    value = text.text
                elif text is None and repeated_val:
                    # Empty cell following a repeated header: reuse it.
                    value = repeated_val
                else:
                    # Empty, non-repeated header cell: synthesize a unique
                    # "unnamed.N" placeholder.
                    value = "unnamed"
                    idx = 1
                    while "{}.{}".format(value, idx) in columns:
                        idx += 1
                    value = f"{value}.{idx}"
                # `number-columns-repeated` attribute marks a cell that
                # stands for several consecutive columns.
                repeated = col.attrib.get(
                    f"{{{col.nsmap[TABLE_KEY]}}}{TABLE_CELL_REPEATED_ATTRIB}"
                )
                if repeated:
                    repeated_dict[value] += 1
                    repeated_val = f"{value}.{repeated_dict[value]}"
                # De-duplicate against already-collected names by suffixing
                # the positional index.
                column = value if value not in columns else f"{value}.{i}"
                columns.append(column)
        else:
            # No header row: generic numbered column names, sized from the
            # first data row.
            columns = [f"column.{i}" for i in range(len(cells[0]))]
    return columns, cells
|
||||
|
||||
|
||||
def parse_value(cell):
    """Extract the text value of *cell*, cast to float when the cell's
    `value-type` attribute says so; None for an empty cell."""
    ns = cell.nsmap
    text_node = cell.find(TABLE_CELL_TEXT_TAG, namespaces=ns)
    cell_is_float = (
        cell.attrib.get(f"{{{ns[OFFICE_KEY]}}}{VALUE_TYPE_ATTRIB}") == "float"
    )
    if text_node is None:
        return None
    return float(text_node.text) if cell_is_float else text_node.text
|
||||
|
||||
|
||||
def load_fods(doc, sheet_id, headers=True, columns=None):
|
||||
def get_rows(doc, sheet_id):
|
||||
if not isinstance(sheet_id, (str, int)):
|
||||
raise ValueError("Sheet id has to be either `str` or `int`")
|
||||
root = doc.getroot()
|
||||
|
|
@ -88,28 +42,24 @@ def load_fods(doc, sheet_id, headers=True, columns=None):
|
|||
)
|
||||
sheet = get_sheet(spreadsheet, sheet_id)
|
||||
rows = sheet.findall(TABLE_ROW_TAG, namespaces=namespaces)
|
||||
allcells = []
|
||||
for row in rows:
|
||||
cells = row.findall(TABLE_CELL_TAG, namespaces=namespaces)
|
||||
allcells.append(cells)
|
||||
columns, values = parse_columns(allcells, headers, columns)
|
||||
data = []
|
||||
for row in values:
|
||||
rowvalues = [parse_value(cell) for cell in row]
|
||||
data.append(rowvalues)
|
||||
final_rows = []
|
||||
for row in data:
|
||||
final_row = []
|
||||
for i in range(len(columns)):
|
||||
if i < len(row):
|
||||
final_row.append(row[i])
|
||||
else:
|
||||
final_row.append(None)
|
||||
final_rows.append(final_row)
|
||||
return pd.DataFrame(final_rows, columns=columns)
|
||||
return rows
|
||||
|
||||
|
||||
def read(file_or_path, sheet=1, headers=True, columns=None):
|
||||
doc = etree.parse(str(file_or_path))
|
||||
df = load_fods(doc, sheet, headers=headers, columns=columns)
|
||||
return sanitize_df(df)
|
||||
def is_float(cell):
    """Return True when *cell* is declared float-typed in the ODS XML."""
    type_attr = f"{{{cell.nsmap[OFFICE_KEY]}}}{VALUE_TYPE_ATTRIB}"
    return cell.attrib.get(type_attr) == "float"
|
||||
|
||||
|
||||
def get_value(cell, parsed=False):
    """Return ``(value, n_repeated)`` for a flat-ODS cell.

    *value* is the cell text — cast to float when *parsed* is True and the
    cell is float-typed — or None for an empty cell.  *n_repeated* is the
    cell's `number-columns-repeated` count, 0 when the attribute is absent
    (always 0 for empty cells, which return early).
    """
    text_node = cell.find(TABLE_CELL_TEXT_TAG, namespaces=cell.nsmap)
    if text_node is None:
        return None, 0
    value = text_node.text
    if parsed and is_float(cell):
        value = float(value)
    repeat_attr = cell.attrib.get(
        f"{{{cell.nsmap[TABLE_KEY]}}}{TABLE_CELL_REPEATED_ATTRIB}"
    )
    return value, (int(repeat_attr) if repeat_attr is not None else 0)
|
||||
|
|
|
|||
|
|
@ -1,13 +1,11 @@
|
|||
from collections import OrderedDict
|
||||
|
||||
import ezodf
|
||||
import pandas as pd
|
||||
|
||||
from ..tools import sanitize_df
|
||||
|
||||
|
||||
def load_ods(doc, sheet_id, headers=True, columns=None):
|
||||
# convert the sheet to a pandas.DataFrame
|
||||
def get_doc(file_or_path):
    """Open the ODS document at *file_or_path* with ezodf."""
    document = ezodf.opendoc(file_or_path)
    return document
|
||||
|
||||
|
||||
def get_rows(doc, sheet_id):
|
||||
if not isinstance(sheet_id, (int, str)):
|
||||
raise ValueError("Sheet id has to be either `str` or `int`")
|
||||
if isinstance(sheet_id, str):
|
||||
|
|
@ -16,47 +14,8 @@ def load_ods(doc, sheet_id, headers=True, columns=None):
|
|||
raise KeyError("There is no sheet named {}".format(sheet_id))
|
||||
sheet_id = sheets.index(sheet_id) + 1
|
||||
sheet = doc.sheets[sheet_id - 1]
|
||||
df_dict = OrderedDict()
|
||||
col_index = {}
|
||||
for i, row in enumerate(sheet.rows()):
|
||||
# row is a list of cells
|
||||
if headers and i == 0 and not columns:
|
||||
# columns as lists in a dictionary
|
||||
columns = []
|
||||
for cell in row:
|
||||
if cell.value and cell.value not in columns:
|
||||
columns.append(cell.value)
|
||||
else:
|
||||
column_name = cell.value if cell.value else "unnamed"
|
||||
# add count to column name
|
||||
idx = 1
|
||||
while "{}.{}".format(column_name, idx) in columns:
|
||||
idx += 1
|
||||
columns.append("{}.{}".format(column_name, idx))
|
||||
|
||||
df_dict = OrderedDict((column, []) for column in columns)
|
||||
# create index for the column headers
|
||||
col_index = {j: column for j, column in enumerate(columns)}
|
||||
continue
|
||||
elif i == 0:
|
||||
columns = columns if columns else ([f"column.{j}" for j in range(len(row))])
|
||||
# columns as lists in a dictionary
|
||||
df_dict = OrderedDict((column, []) for column in columns)
|
||||
# create index for the column headers
|
||||
col_index = {j: column for j, column in enumerate(columns)}
|
||||
if headers:
|
||||
continue
|
||||
for j, cell in enumerate(row):
|
||||
if j < len(col_index):
|
||||
# use header instead of column index
|
||||
df_dict[col_index[j]].append(cell.value)
|
||||
else:
|
||||
continue
|
||||
df = pd.DataFrame(df_dict)
|
||||
return df
|
||||
return sheet.rows()
|
||||
|
||||
|
||||
def read(file_or_path, sheet=1, headers=True, columns=None):
|
||||
doc = ezodf.opendoc(file_or_path)
|
||||
df = load_ods(doc, sheet, headers, columns)
|
||||
return sanitize_df(df)
|
||||
def get_value(cell, parsed=False):
    """Return ``(cell.value, 0)`` for an ezodf cell.

    The repeat count is always 0 here, and *parsed* is accepted only for
    symmetry with the fODS backend interface — it is unused because the
    value is taken from the cell as-is.
    """
    return (cell.value, 0)
|
||||
|
|
|
|||
Loading…
Reference in New Issue