add fods support
This commit is contained in:
parent
a31ac7671b
commit
db5c14409f
|
|
@ -1,28 +1,14 @@
|
||||||
"""Imports an ods file into a DataFrame object"""
|
"""Imports an ods or fods file into a DataFrame object"""
|
||||||
import ezodf
|
from pathlib import Path
|
||||||
|
|
||||||
from .parsers import ods
|
from .parsers import fods, ods
|
||||||
from .tools import sanitize_df
|
|
||||||
|
|
||||||
|
EXT_MAP = {".ods": ods, ".fods": fods}
|
||||||
|
|
||||||
|
|
||||||
def read_ods(file_or_path, sheet=1, headers=True, columns=None):
    """
    Read an ods or fods file into a pandas DataFrame.

    The parser module is looked up in ``EXT_MAP`` by the file extension and
    its ``read`` function is called with the remaining arguments.

    :param file_or_path: str or pathlib.Path
        the path to the ODS or FODS file
    :param sheet: int or str, default 1
        if int, the 1 based index of the sheet to be read in. If str, the name
        of the sheet to be read in
    :param headers: bool, default True
        if True, the first row is read in as headers
    :param columns: list, default None
        a list of column names to be used as headers
    :returns: pandas.DataFrame
        the content of the selected sheet
    :raises ValueError:
        if the file extension is not a key of ``EXT_MAP``
    """
    # Compare the suffix case-insensitively so that e.g. "REPORT.ODS" is
    # dispatched to the same parser as "report.ods".
    loader = EXT_MAP.get(Path(file_or_path).suffix.lower())
    if loader is None:
        raise ValueError("Unknown filetype.")
    return loader.read(file_or_path, sheet, headers=headers, columns=columns)
|
|
||||||
|
|
|
||||||
|
|
@ -1,15 +1,21 @@
|
||||||
from collections import OrderedDict
|
from collections import defaultdict
|
||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
from ..tools import sanitize_df
|
||||||
|
|
||||||
|
|
||||||
BODY_TAG = "office:body"
|
BODY_TAG = "office:body"
|
||||||
SPREADSHEET_TAG = "office:spreadsheet"
|
SPREADSHEET_TAG = "office:spreadsheet"
|
||||||
|
OFFICE_KEY = "office"
|
||||||
|
TABLE_KEY = "table"
|
||||||
TABLE_TAG = "table:table"
|
TABLE_TAG = "table:table"
|
||||||
TABLE_ROW_TAG = "table:table-row"
|
TABLE_ROW_TAG = "table:table-row"
|
||||||
TABLE_CELL_TAG = "table:table-cell"
|
TABLE_CELL_TAG = "table:table-cell"
|
||||||
TABLE_CELL_TEXT_TAG = "text:p"
|
TABLE_CELL_TEXT_TAG = "text:p"
|
||||||
|
TABLE_CELL_REPEATED_ATTRIB = "number-columns-repeated"
|
||||||
|
VALUE_TYPE_ATTRIB = "value-type"
|
||||||
|
|
||||||
|
|
||||||
def get_sheet(spreadsheet, sheet_id):
|
def get_sheet(spreadsheet, sheet_id):
|
||||||
|
|
@ -19,7 +25,7 @@ def get_sheet(spreadsheet, sheet_id):
|
||||||
f"{TABLE_TAG}[@table:name='{sheet_id}']", namespaces=namespaces
|
f"{TABLE_TAG}[@table:name='{sheet_id}']", namespaces=namespaces
|
||||||
)
|
)
|
||||||
if sheet is None:
|
if sheet is None:
|
||||||
raise KeyError(f"There is no sheet named {sheet_id}")
|
raise KeyError(f"There is no sheet named {sheet_id}.")
|
||||||
return sheet
|
return sheet
|
||||||
tables = spreadsheet.findall(TABLE_TAG, namespaces=namespaces)
|
tables = spreadsheet.findall(TABLE_TAG, namespaces=namespaces)
|
||||||
if sheet_id == 0 or sheet_id > len(tables):
|
if sheet_id == 0 or sheet_id > len(tables):
|
||||||
|
|
@ -27,6 +33,51 @@ def get_sheet(spreadsheet, sheet_id):
|
||||||
return tables[sheet_id - 1]
|
return tables[sheet_id - 1]
|
||||||
|
|
||||||
|
|
||||||
|
def parse_columns(cells, headers=True, columns=None):
    """
    Split the rows of a sheet into column names and data rows.

    :param cells: list
        the sheet rows, each one a list of table cell elements
    :param headers: bool, default True
        if True, the first row is taken off and treated as the header row
    :param columns: list, default None
        explicit column names; when given they are returned unchanged
    :returns: tuple
        ``(columns, rows)`` where ``rows`` are the rows left after the
        optional header row. The ``cells`` argument itself is not modified.
    """
    # Work on a shallow copy so the caller's list keeps its header row.
    rows = list(cells)
    header_cells = rows.pop(0) if headers and rows else None

    if columns is not None:
        return columns, rows

    if not header_cells:
        # No usable header row: number the columns after the first data row.
        # An empty sheet simply yields no columns.
        width = len(rows[0]) if rows else 0
        return [f"column.{i}" for i in range(width)], rows

    columns = []
    taken = set()  # names already handed out, for fast uniqueness checks
    repeated_val = None
    repeated_counts = defaultdict(int)
    for i, col in enumerate(header_cells):
        text = col.find(TABLE_CELL_TEXT_TAG, namespaces=col.nsmap)
        # A missing paragraph and an empty paragraph both mean "no name".
        value = text.text if text is not None else None
        if value is None:
            if repeated_val:
                value = repeated_val
            else:
                idx = 1
                while f"unnamed.{idx}" in taken:
                    idx += 1
                value = f"unnamed.{idx}"

        # The repeat attribute lives in the "table" namespace; tolerate
        # elements whose nsmap does not declare that prefix.
        table_ns = col.nsmap.get(TABLE_KEY)
        repeated = None
        if table_ns is not None:
            repeated = col.attrib.get(
                f"{{{table_ns}}}{TABLE_CELL_REPEATED_ATTRIB}"
            )
        if repeated:
            repeated_counts[value] += 1
            repeated_val = f"{value}.{repeated_counts[value]}"

        # First try "<name>.<position>" for a duplicate (as before), then
        # keep counting upwards until the name really is unique.
        column = value
        suffix = i
        while column in taken:
            column = f"{value}.{suffix}"
            suffix += 1
        taken.add(column)
        columns.append(column)
    return columns, rows
|
||||||
|
|
||||||
|
|
||||||
|
def parse_value(cell):
    """
    Return the Python value stored in a single table cell element.

    :param cell: element
        a table cell element exposing ``find``, ``nsmap`` and ``attrib``
    :returns: float, str or None
        a float for cells whose value type is ``float``, the paragraph text
        for other cells, and None when the cell holds no content
    :raises ValueError:
        if a float cell has no ``office:value`` attribute and its displayed
        text cannot be converted to a float
    """
    office_ns = cell.nsmap.get(OFFICE_KEY)
    is_float = False
    stored_number = None
    if office_ns is not None:
        is_float = (
            cell.attrib.get(f"{{{office_ns}}}{VALUE_TYPE_ATTRIB}") == "float"
        )
        # office:value carries the number in a locale-independent form,
        # whereas the paragraph text is the formatted display string
        # (e.g. "1,5" or "50 %") and may not be parseable by float().
        stored_number = cell.attrib.get(f"{{{office_ns}}}value")

    if is_float and stored_number is not None:
        return float(stored_number)

    text = cell.find(TABLE_CELL_TEXT_TAG, namespaces=cell.nsmap)
    if text is None:
        return None
    # NOTE(review): ``.text`` only covers the text before the first child
    # element, so content inside nested spans is not included here.
    value = text.text
    if is_float:
        # Fallback for float cells without office:value; an empty paragraph
        # holds no number to convert.
        return float(value) if value is not None else None
    return value
|
||||||
|
|
||||||
|
|
||||||
def load_fods(doc, sheet_id, headers=True, columns=None):
|
def load_fods(doc, sheet_id, headers=True, columns=None):
|
||||||
if not isinstance(sheet_id, (str, int)):
|
if not isinstance(sheet_id, (str, int)):
|
||||||
raise ValueError("Sheet id has to be either `str` or `int`")
|
raise ValueError("Sheet id has to be either `str` or `int`")
|
||||||
|
|
@ -37,27 +88,28 @@ def load_fods(doc, sheet_id, headers=True, columns=None):
|
||||||
)
|
)
|
||||||
sheet = get_sheet(spreadsheet, sheet_id)
|
sheet = get_sheet(spreadsheet, sheet_id)
|
||||||
rows = sheet.findall(TABLE_ROW_TAG, namespaces=namespaces)
|
rows = sheet.findall(TABLE_ROW_TAG, namespaces=namespaces)
|
||||||
data = []
|
allcells = []
|
||||||
for row in rows:
|
for row in rows:
|
||||||
cells = row.findall(TABLE_CELL_TAG, namespaces=namespaces)
|
cells = row.findall(TABLE_CELL_TAG, namespaces=namespaces)
|
||||||
data.append(
|
allcells.append(cells)
|
||||||
[
|
columns, values = parse_columns(allcells, headers, columns)
|
||||||
cell.find(TABLE_CELL_TEXT_TAG, namespaces=namespaces).text
|
data = []
|
||||||
for cell in cells
|
for row in values:
|
||||||
]
|
rowvalues = [parse_value(cell) for cell in row]
|
||||||
)
|
data.append(rowvalues)
|
||||||
orig_columns = data.pop(0) if headers else None
|
final_rows = []
|
||||||
if columns is None:
|
for row in data:
|
||||||
if orig_columns:
|
final_row = []
|
||||||
columns = orig_columns
|
for i in range(len(columns)):
|
||||||
else:
|
if i < len(row):
|
||||||
columns = [f"column.{i}" for i in range(len(data[0]))]
|
final_row.append(row[i])
|
||||||
return pd.DataFrame(
|
else:
|
||||||
OrderedDict({column: datarow for column, datarow in zip(columns, data)})
|
final_row.append(None)
|
||||||
)
|
final_rows.append(final_row)
|
||||||
|
return pd.DataFrame(final_rows, columns=columns)
|
||||||
|
|
||||||
|
|
||||||
def read(file_or_path, sheet=1, headers=True, columns=None):
    """
    Read a flat ods (fods) file into a pandas DataFrame.

    :param file_or_path: str, path-like or file object
        the FODS document to parse
    :param sheet: int or str, default 1
        if int, the 1 based index of the sheet to be read in. If str, the name
        of the sheet to be read in
    :param headers: bool, default True
        if True, the first row is read in as headers
    :param columns: list, default None
        a list of column names to be used as headers
    :returns: pandas.DataFrame
        the result of ``load_fods`` passed through ``sanitize_df``
    """
    import os  # local import keeps this function self-contained

    # Only path-like objects are turned into strings. Calling str() on an
    # open file object would hand its repr to the parser instead of the data.
    source = file_or_path
    if isinstance(source, os.PathLike):
        source = os.fspath(source)
    doc = etree.parse(source)
    df = load_fods(doc, sheet, headers=headers, columns=columns)
    return sanitize_df(df)
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,10 @@
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
|
||||||
|
import ezodf
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
from ..tools import sanitize_df
|
||||||
|
|
||||||
|
|
||||||
def load_ods(doc, sheet_id, headers=True, columns=None):
|
def load_ods(doc, sheet_id, headers=True, columns=None):
|
||||||
# convert the sheet to a pandas.DataFrame
|
# convert the sheet to a pandas.DataFrame
|
||||||
|
|
@ -51,3 +54,26 @@ def load_ods(doc, sheet_id, headers=True, columns=None):
|
||||||
continue
|
continue
|
||||||
df = pd.DataFrame(df_dict)
|
df = pd.DataFrame(df_dict)
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
def read(file_or_path, sheet=1, headers=True, columns=None):
    """
    Load one sheet of an ods file and return it as a pandas DataFrame.

    The document is opened with ``ezodf``, converted by ``load_ods`` and the
    result is passed through ``sanitize_df``, which is described as dropping
    trailing empty rows and columns.

    :param file_or_path: str
        the path to the ODS file
    :param sheet: int or str, default 1
        if int, the 1 based index of the sheet to be read in. If str, the name
        of the sheet to be read in
    :param headers: bool, default True
        if True, the first row is read in as headers
    :param columns: list, default None
        a list of column names to be used as headers
    :returns: pandas.DataFrame
        the ODS file as a pandas DataFrame
    """
    return sanitize_df(
        load_ods(ezodf.opendoc(file_or_path), sheet, headers, columns)
    )
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue