diff --git a/pandas_ods_reader/parser.py b/pandas_ods_reader/parser.py index 8b0ffbb..f04063f 100644 --- a/pandas_ods_reader/parser.py +++ b/pandas_ods_reader/parser.py @@ -1,6 +1,7 @@ """Imports an ods file into a DataFrame object""" import ezodf import pandas as pd +from collections import OrderedDict from .tools import sanitize_df @@ -15,22 +16,35 @@ def load_ods(doc, sheet_id, headers=True, columns=None): raise ValueError("There is no sheet named {}".format(sheet_id)) sheet_id = sheets.index(sheet_id) + 1 sheet = doc.sheets[sheet_id - 1] - df_dict = {} + df_dict = OrderedDict() col_index = {} for i, row in enumerate(sheet.rows()): # row is a list of cells if headers and i == 0 and not columns: # columns as lists in a dictionary - df_dict = {cell.value: [] for cell in row if cell.value} + columns = [] + for cell in row: + if cell.value: + if cell.value not in columns: + columns.append(cell.value) + else: + # add count to column name + idx = 1 + while "{}.{}".format(cell.value, idx) in columns: + idx +=1 + columns.append("{}.{}".format(cell.value, idx)) + + df_dict = OrderedDict((column, []) for column in columns) # create index for the column headers col_index = { - j: cell.value for j, cell in enumerate(row) if cell.value} + j: column for j, column in enumerate(columns) + } continue elif i == 0: columns = columns if columns else ( [f"column_{j}" for j in range(len(row))]) # columns as lists in a dictionary - df_dict = {column: [] for column in columns} + df_dict = OrderedDict((column, []) for column in columns) # create index for the column headers col_index = {j: column for j, column in enumerate(columns)} if headers: @@ -41,7 +55,6 @@ def load_ods(doc, sheet_id, headers=True, columns=None): df_dict[col_index[j]].append(cell.value) else: continue - # and convert to a DataFrame df = pd.DataFrame(df_dict) return df diff --git a/pandas_ods_reader/tests/rsc/example_duplicated_column_names.ods b/pandas_ods_reader/tests/rsc/example_duplicated_column_names.ods new file mode 100644 index 0000000..01c48d8 Binary files /dev/null and b/pandas_ods_reader/tests/rsc/example_duplicated_column_names.ods differ diff --git a/pandas_ods_reader/tests/test_read_ods.py b/pandas_ods_reader/tests/test_read_ods.py index dd4de2e..01bc1f8 100644 --- a/pandas_ods_reader/tests/test_read_ods.py +++ b/pandas_ods_reader/tests/test_read_ods.py @@ -11,6 +11,7 @@ rsc = os.path.join(root, "rsc") header_file = "example_headers.ods" no_header_file = "example_no_headers.ods" +duplicated_column_names_file = "example_duplicated_column_names.ods" col_len_file = "example_col_lengths.ods" @@ -51,7 +52,13 @@ class TestOdsReader(object): df = read_ods(path, 1, headers=False, columns=columns) assert list(df.columns) == columns assert len(df) == 10 - assert (len(df.columns) == 5) + + def test_duplicated_column_names(self): + path = os.path.join(rsc, duplicated_column_names_file) + df = read_ods(path, 1) + assert isinstance(df, pd.DataFrame) + assert len(df.columns) == 4 + assert "website.1" in df.columns def test_header_file_col_len(self): path = os.path.join(rsc, col_len_file)