Merge pull request #1 from fatzh/master

Fixed dataframe generation with duplicated columns
This commit is contained in:
iuvbio 2019-06-08 14:36:37 +02:00 committed by GitHub
commit e7a0cbf245
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 26 additions and 6 deletions

View File

@ -1,6 +1,7 @@
"""Imports an ods file into a DataFrame object"""
import ezodf
import pandas as pd
from collections import OrderedDict
from .tools import sanitize_df
@ -15,22 +16,35 @@ def load_ods(doc, sheet_id, headers=True, columns=None):
raise ValueError("There is no sheet named {}".format(sheet_id))
sheet_id = sheets.index(sheet_id) + 1
sheet = doc.sheets[sheet_id - 1]
df_dict = {}
df_dict = OrderedDict()
col_index = {}
for i, row in enumerate(sheet.rows()):
# row is a list of cells
if headers and i == 0 and not columns:
# columns as lists in a dictionary
df_dict = {cell.value: [] for cell in row if cell.value}
columns = []
for cell in row:
if cell.value:
if cell.value not in columns:
columns.append(cell.value)
else:
# add count to column name
idx = 1
while "{}.{}".format(cell.value, idx) in columns:
idx +=1
columns.append("{}.{}".format(cell.value, idx))
df_dict = OrderedDict((column, []) for column in columns)
# create index for the column headers
col_index = {
j: cell.value for j, cell in enumerate(row) if cell.value}
j: column for j, column in enumerate(columns)
}
continue
elif i == 0:
columns = columns if columns else (
[f"column_{j}" for j in range(len(row))])
# columns as lists in a dictionary
df_dict = {column: [] for column in columns}
df_dict = OrderedDict((column, []) for column in columns)
# create index for the column headers
col_index = {j: column for j, column in enumerate(columns)}
if headers:
@ -41,7 +55,6 @@ def load_ods(doc, sheet_id, headers=True, columns=None):
df_dict[col_index[j]].append(cell.value)
else:
continue
# and convert to a DataFrame
df = pd.DataFrame(df_dict)
return df

View File

@ -11,6 +11,7 @@ rsc = os.path.join(root, "rsc")
# File names of the example .ods spreadsheets used by the tests; the tests
# visible here build the full path with os.path.join(rsc, <name>).
header_file = "example_headers.ods"
no_header_file = "example_no_headers.ods"
duplicated_column_names_file = "example_duplicated_column_names.ods"
col_len_file = "example_col_lengths.ods"
@ -51,7 +52,13 @@ class TestOdsReader(object):
df = read_ods(path, 1, headers=False, columns=columns)
assert list(df.columns) == columns
assert len(df) == 10
assert (len(df.columns) == 5)
def test_duplicated_column_names(self):
    """Check that a sheet with repeated header names loads into a DataFrame.

    Reads sheet 1 of the duplicated-column-names fixture and expects a
    DataFrame with four columns, one of which is named "website.1".
    """
    ods_path = os.path.join(rsc, duplicated_column_names_file)
    frame = read_ods(ods_path, 1)
    assert isinstance(frame, pd.DataFrame)
    column_names = frame.columns
    # Four columns in total, including the one named with a ".1" suffix.
    assert len(column_names) == 4
    assert "website.1" in column_names
def test_header_file_col_len(self):
path = os.path.join(rsc, col_len_file)