diff --git a/pandas_ods_reader/parser.py b/pandas_ods_reader/parser.py index c533eb5..83a3dee 100644 --- a/pandas_ods_reader/parser.py +++ b/pandas_ods_reader/parser.py @@ -1,6 +1,7 @@ """Imports an ods file into a DataFrame object""" import ezodf import pandas as pd +from collections import OrderedDict from .tools import sanitize_df @@ -15,22 +16,35 @@ def load_ods(doc, sheet, headers=True, columns=None): raise ValueError("There is no sheet named {}".format(sheet)) sheet_idx = sheets.index(sheet) sheet = doc.sheets[sheet_idx] - df_dict = {} + df_dict = OrderedDict() col_index = {} for i, row in enumerate(sheet.rows()): # row is a list of cells if headers and i == 0 and not columns: # columns as lists in a dictionary - df_dict = {cell.value: [] for cell in row if cell.value} + columns = [] + for cell in row: + if cell.value: + if cell.value not in columns: + columns.append(cell.value) + else: + # add count to column name + idx = 1 + while "{}.{}".format(cell.value, idx) in columns: + idx +=1 + columns.append("{}.{}".format(cell.value, idx)) + + df_dict = OrderedDict((column, []) for column in columns) # create index for the column headers col_index = { - j: cell.value for j, cell in enumerate(row) if cell.value} + j: column for j, column in enumerate(columns) + } continue elif i == 0: columns = columns if columns else ( ["column_%s" % j for j in range(len(row))]) # columns as lists in a dictionary - df_dict = {column: [] for column in columns} + df_dict = OrderedDict((column, []) for column in columns) # create index for the column headers col_index = {j: column for j, column in enumerate(columns)} if headers: @@ -41,12 +55,7 @@ def load_ods(doc, sheet, headers=True, columns=None): df_dict[col_index[j]].append(cell.value) else: continue - # convert lists to pd.Series - df_series = {} - for col in df_dict.keys(): - df_series[col] = pd.Series(df_dict[col]) - # and convert to a DataFrame - df = pd.DataFrame(df_series) + df = pd.DataFrame(df_dict) return df diff --git a/pandas_ods_reader/tests/rsc/example_duplicated_column_names.ods b/pandas_ods_reader/tests/rsc/example_duplicated_column_names.ods new file mode 100644 index 0000000..01c48d8 Binary files /dev/null and b/pandas_ods_reader/tests/rsc/example_duplicated_column_names.ods differ diff --git a/pandas_ods_reader/tests/test_read_ods.py b/pandas_ods_reader/tests/test_read_ods.py index 4b7e843..321e423 100644 --- a/pandas_ods_reader/tests/test_read_ods.py +++ b/pandas_ods_reader/tests/test_read_ods.py @@ -10,6 +10,7 @@ rsc = os.path.join(root, "rsc") header_file = "example_headers.ods" no_header_file = "example_no_headers.ods" +duplicated_column_names_file = "example_duplicated_column_names.ods" class TestOdsReader(object): @@ -45,3 +46,10 @@ class TestOdsReader(object): df = read_ods(path, 1, headers=False, columns=columns) assert list(df.columns) == columns assert len(df) == 10 + + def test_duplicated_column_names(self): + path = os.path.join(rsc, duplicated_column_names_file) + df = read_ods(path, 1) + assert isinstance(df, pd.DataFrame) + assert len(df.columns) == 4 + assert "website.1" in df.columns