This commit is contained in:
fatz 2019-06-08 14:21:34 +02:00
commit d8d71f610d
3 changed files with 24 additions and 12 deletions

View File

@ -6,16 +6,16 @@ from collections import OrderedDict
from .tools import sanitize_df from .tools import sanitize_df
def load_ods(doc, sheet, headers=True, columns=None): def load_ods(doc, sheet_id, headers=True, columns=None):
# convert the sheet to a pandas.DataFrame # convert the sheet to a pandas.DataFrame
if isinstance(sheet, int): if not isinstance(sheet_id, (int, str)):
sheet = doc.sheets[sheet - 1] raise ValueError("Sheet id has to be either `str` or `int`")
elif isinstance(sheet, str): if isinstance(sheet_id, str):
sheets = [sheet.name for sheet in doc.sheets] sheets = [sheet.name for sheet in doc.sheets]
if sheet not in sheets: if sheet_id not in sheets:
raise ValueError("There is no sheet named {}".format(sheet)) raise ValueError("There is no sheet named {}".format(sheet_id))
sheet_idx = sheets.index(sheet) sheet_id = sheets.index(sheet_id) + 1
sheet = doc.sheets[sheet_idx] sheet = doc.sheets[sheet_id - 1]
df_dict = OrderedDict() df_dict = OrderedDict()
col_index = {} col_index = {}
for i, row in enumerate(sheet.rows()): for i, row in enumerate(sheet.rows()):
@ -42,7 +42,7 @@ def load_ods(doc, sheet, headers=True, columns=None):
continue continue
elif i == 0: elif i == 0:
columns = columns if columns else ( columns = columns if columns else (
["column_%s" % j for j in range(len(row))]) [f"column_{j}" for j in range(len(row))])
# columns as lists in a dictionary # columns as lists in a dictionary
df_dict = OrderedDict((column, []) for column in columns) df_dict = OrderedDict((column, []) for column in columns)
# create index for the column headers # create index for the column headers
@ -62,8 +62,8 @@ def load_ods(doc, sheet, headers=True, columns=None):
def read_ods(file_or_path, sheet, headers=True, columns=None): def read_ods(file_or_path, sheet, headers=True, columns=None):
""" """
This function reads in the provided ods file and converts it to a This function reads in the provided ods file and converts it to a
dictionary. The dictionary is converted to a DataFrame. Empty rows and dictionary. The dictionary is converted to a DataFrame. Trailing empty rows
columns are dropped from the DataFrame, before it is returned. and columns are dropped from the DataFrame, before it is returned.
:param file_or_path: str :param file_or_path: str
the path to the ODS file the path to the ODS file

Binary file not shown.

View File

@ -11,6 +11,7 @@ rsc = os.path.join(root, "rsc")
header_file = "example_headers.ods" header_file = "example_headers.ods"
no_header_file = "example_no_headers.ods" no_header_file = "example_no_headers.ods"
duplicated_column_names_file = "example_duplicated_column_names.ods" duplicated_column_names_file = "example_duplicated_column_names.ods"
col_len_file = "example_col_lengths.ods"
class TestOdsReader(object): class TestOdsReader(object):
@ -19,12 +20,14 @@ class TestOdsReader(object):
df = read_ods(path, 1) df = read_ods(path, 1)
assert isinstance(df, pd.DataFrame) assert isinstance(df, pd.DataFrame)
assert len(df) == 10 assert len(df) == 10
assert (len(df.columns) == 5)
def test_header_file_with_str(self): def test_header_file_with_str(self):
path = os.path.join(rsc, header_file) path = os.path.join(rsc, header_file)
df = read_ods(path, "Sheet1") df = read_ods(path, "Sheet1")
assert isinstance(df, pd.DataFrame) assert isinstance(df, pd.DataFrame)
assert len(df) == 10 assert len(df) == 10
assert (len(df.columns) == 5)
def test_header_file_with_cols(self): def test_header_file_with_cols(self):
path = os.path.join(rsc, header_file) path = os.path.join(rsc, header_file)
@ -32,13 +35,15 @@ class TestOdsReader(object):
df = read_ods(path, "Sheet1", columns=columns) df = read_ods(path, "Sheet1", columns=columns)
assert list(df.columns) == columns assert list(df.columns) == columns
assert len(df) == 10 assert len(df) == 10
assert (len(df.columns) == 5)
def test_no_header_file_no_cols(self): def test_no_header_file_no_cols(self):
path = os.path.join(rsc, no_header_file) path = os.path.join(rsc, no_header_file)
df = read_ods(path, 1, headers=False) df = read_ods(path, 1, headers=False)
assert list(df.columns) == [ assert list(df.columns) == [
"column_%s" % i for i in range(len(df.columns))] f"column_{i}" for i in range(len(df.columns))]
assert len(df) == 10 assert len(df) == 10
assert (len(df.columns) == 5)
def test_no_header_file_with_cols(self): def test_no_header_file_with_cols(self):
path = os.path.join(rsc, no_header_file) path = os.path.join(rsc, no_header_file)
@ -53,3 +58,10 @@ class TestOdsReader(object):
assert isinstance(df, pd.DataFrame) assert isinstance(df, pd.DataFrame)
assert len(df.columns) == 4 assert len(df.columns) == 4
assert "website.1" in df.columns assert "website.1" in df.columns
def test_header_file_col_len(self):
path = os.path.join(rsc, col_len_file)
df = read_ods(path, 1)
assert isinstance(df, pd.DataFrame)
assert len(df) == 10
assert (len(df.columns) == 5)