Replaced the column dict with an OrderedDict; added support for multiple columns

with the same name by appending a numbered suffix automatically. Column order is preserved, which fixes test_header_file_with_cols that was failing. Added a test case for duplicated column names. Also fixed test_header_file_with_cols, where the expected order was wrong.
2019-06-06 15:27:11 +02:00 · 2019-06-06 15:27:11 +02:00 · 9240a1b3ae
parent 580c6951da
commit 9240a1b3ae
3 changed files with 27 additions and 10 deletions
--- a/pandas_ods_reader/parser.py
+++ b/pandas_ods_reader/parser.py
@ -1,6 +1,7 @@
 """Imports an ods file into a DataFrame object"""
 import ezodf
 import pandas as pd
+from collections import OrderedDict

 from .tools import sanitize_df

@ -15,22 +16,35 @@ def load_ods(doc, sheet, headers=True, columns=None):
            raise ValueError("There is no sheet named {}".format(sheet))
        sheet_idx = sheets.index(sheet)
        sheet = doc.sheets[sheet_idx]
-    df_dict = {}
+    df_dict = OrderedDict()
    col_index = {}
    for i, row in enumerate(sheet.rows()):
        # row is a list of cells
        if headers and i == 0 and not columns:
            # columns as lists in a dictionary
-            df_dict = {cell.value: [] for cell in row if cell.value}
+            columns = []
+            for cell in row:
+                if cell.value:
+                    if cell.value not in columns:
+                        columns.append(cell.value)
+                    else:
+                        # add count to column name
+                        idx = 1
+                        while "{}.{}".format(cell.value, idx) in columns:
+                            idx +=1
+                        columns.append("{}.{}".format(cell.value, idx))
+
+            df_dict = OrderedDict((column, []) for column in columns)
            # create index for the column headers
            col_index = {
-                j: cell.value for j, cell in enumerate(row) if cell.value}
+                j: column for j, column in enumerate(columns)
+            }
            continue
        elif i == 0:
            columns = columns if columns else (
                ["column_%s" % j for j in range(len(row))])
            # columns as lists in a dictionary
-            df_dict = {column: [] for column in columns}
+            df_dict = OrderedDict((column, []) for column in columns)
            # create index for the column headers
            col_index = {j: column for j, column in enumerate(columns)}
            if headers:
@ -41,12 +55,7 @@ def load_ods(doc, sheet, headers=True, columns=None):
                df_dict[col_index[j]].append(cell.value)
            else:
                continue
-    # convert lists to pd.Series
-    df_series = {}
-    for col  in df_dict.keys():
-        df_series[col] = pd.Series(df_dict[col])
-    # and convert to a DataFrame
-    df = pd.DataFrame(df_series)
+    df = pd.DataFrame(df_dict)
    return df


--- a/pandas_ods_reader/tests/rsc/example_duplicated_column_names.ods
+++ b/pandas_ods_reader/tests/rsc/example_duplicated_column_names.ods
--- a/pandas_ods_reader/tests/test_read_ods.py
+++ b/pandas_ods_reader/tests/test_read_ods.py
@ -10,6 +10,7 @@ rsc = os.path.join(root, "rsc")

 header_file = "example_headers.ods"
 no_header_file = "example_no_headers.ods"
+duplicated_column_names_file = "example_duplicated_column_names.ods"


 class TestOdsReader(object):
@ -45,3 +46,10 @@ class TestOdsReader(object):
        df = read_ods(path, 1, headers=False, columns=columns)
        assert list(df.columns) == columns
        assert len(df) == 10
+
+    def test_duplicated_column_names(self):
+        path = os.path.join(rsc, duplicated_column_names_file)
+        df = read_ods(path, 1)
+        assert isinstance(df, pd.DataFrame)
+        assert len(df.columns) == 4
+        assert "website.1" in df.columns