Replaced the column dict with an OrderedDict; support multiple columns
with the same name by automatically appending a numbered suffix. Column order is preserved, which fixes test_header_file_with_cols that was failing. Added a test case for duplicated column names. - fixed test_header_file_with_cols, the order was wrong
This commit is contained in:
parent
580c6951da
commit
9240a1b3ae
|
|
@ -1,6 +1,7 @@
|
|||
"""Imports an ods file into a DataFrame object"""
|
||||
import ezodf
|
||||
import pandas as pd
|
||||
from collections import OrderedDict
|
||||
|
||||
from .tools import sanitize_df
|
||||
|
||||
|
|
@ -15,22 +16,35 @@ def load_ods(doc, sheet, headers=True, columns=None):
|
|||
raise ValueError("There is no sheet named {}".format(sheet))
|
||||
sheet_idx = sheets.index(sheet)
|
||||
sheet = doc.sheets[sheet_idx]
|
||||
df_dict = {}
|
||||
df_dict = OrderedDict()
|
||||
col_index = {}
|
||||
for i, row in enumerate(sheet.rows()):
|
||||
# row is a list of cells
|
||||
if headers and i == 0 and not columns:
|
||||
# columns as lists in a dictionary
|
||||
df_dict = {cell.value: [] for cell in row if cell.value}
|
||||
columns = []
|
||||
for cell in row:
|
||||
if cell.value:
|
||||
if cell.value not in columns:
|
||||
columns.append(cell.value)
|
||||
else:
|
||||
# add count to column name
|
||||
idx = 1
|
||||
while "{}.{}".format(cell.value, idx) in columns:
|
||||
idx +=1
|
||||
columns.append("{}.{}".format(cell.value, idx))
|
||||
|
||||
df_dict = OrderedDict((column, []) for column in columns)
|
||||
# create index for the column headers
|
||||
col_index = {
|
||||
j: cell.value for j, cell in enumerate(row) if cell.value}
|
||||
j: column for j, column in enumerate(columns)
|
||||
}
|
||||
continue
|
||||
elif i == 0:
|
||||
columns = columns if columns else (
|
||||
["column_%s" % j for j in range(len(row))])
|
||||
# columns as lists in a dictionary
|
||||
df_dict = {column: [] for column in columns}
|
||||
df_dict = OrderedDict((column, []) for column in columns)
|
||||
# create index for the column headers
|
||||
col_index = {j: column for j, column in enumerate(columns)}
|
||||
if headers:
|
||||
|
|
@ -41,12 +55,7 @@ def load_ods(doc, sheet, headers=True, columns=None):
|
|||
df_dict[col_index[j]].append(cell.value)
|
||||
else:
|
||||
continue
|
||||
# convert lists to pd.Series
|
||||
df_series = {}
|
||||
for col in df_dict.keys():
|
||||
df_series[col] = pd.Series(df_dict[col])
|
||||
# and convert to a DataFrame
|
||||
df = pd.DataFrame(df_series)
|
||||
df = pd.DataFrame(df_dict)
|
||||
return df
|
||||
|
||||
|
||||
|
|
|
|||
Binary file not shown.
|
|
@ -10,6 +10,7 @@ rsc = os.path.join(root, "rsc")
|
|||
|
||||
header_file = "example_headers.ods"
|
||||
no_header_file = "example_no_headers.ods"
|
||||
duplicated_column_names_file = "example_duplicated_column_names.ods"
|
||||
|
||||
|
||||
class TestOdsReader(object):
|
||||
|
|
@ -45,3 +46,10 @@ class TestOdsReader(object):
|
|||
df = read_ods(path, 1, headers=False, columns=columns)
|
||||
assert list(df.columns) == columns
|
||||
assert len(df) == 10
|
||||
|
||||
def test_duplicated_column_names(self):
    """Check that a sheet with repeated header names still loads.

    The fixture is expected to produce four columns, one of which is
    named "website.1" (a repeated header with a numeric suffix).
    """
    ods_path = os.path.join(rsc, duplicated_column_names_file)
    frame = read_ods(ods_path, 1)
    assert isinstance(frame, pd.DataFrame)
    column_names = frame.columns
    assert len(column_names) == 4
    assert "website.1" in column_names
|
||||
|
|
|
|||
Loading…
Reference in New Issue