Factor out logic to get columns into separate functions.
This commit is contained in:
parent
9bf4415a9f
commit
bd3caab7f3
|
|
@ -5,6 +5,44 @@ import pandas as pd
|
|||
from .utils import sanitize_df
|
||||
|
||||
|
||||
def get_columns_from_headers(backend, row):
    """Build the list of column names by parsing *row* as a header row.

    ``backend.get_value(cell)`` must return ``(value, n_repeated)``; a
    positive ``n_repeated`` marks a value that spans the following cells
    (merged/repeated header cells).  Empty or duplicate names are
    disambiguated with a ``.<count>`` suffix; empty cells outside a
    repeated span are suffixed off the base name ``unnamed``.
    """
    names = []
    span_end = -1      # index of the last cell covered by a repeated value
    span_value = None  # the value being carried across that span

    for pos, cell in enumerate(row):
        value, n_repeated = backend.get_value(cell)

        # A positive repeat count opens a repeated span ending at span_end.
        if n_repeated > 0:
            span_value = value
            span_end = n_repeated + pos

        # Inside an open span, empty cells inherit the repeated value.
        if not value and pos <= span_end:
            value = span_value

        # Closing the span lets a later repeated column start fresh.
        if pos == span_end:
            span_end = -1

        if value and value not in names:
            names.append(value)
            continue

        # Duplicate or empty name: take the first free numbered variant.
        base = value if value else "unnamed"
        suffix = 1
        while f"{base}.{suffix}" in names:
            suffix += 1
        names.append(f"{base}.{suffix}")

    return names
|
||||
|
||||
|
||||
def get_generic_columns(row):
    """Return generic positional column names, one per cell in *row*."""
    return [f"column.{index}" for index, _cell in enumerate(row)]
|
||||
|
||||
|
||||
def get_columns(backend, row, headers):
    """Resolve column names for the first row of a sheet.

    When *headers* is truthy, *row* is parsed as a header row through the
    backend; otherwise positional placeholder names are generated.
    """
    if not headers:
        return get_generic_columns(row)
    return get_columns_from_headers(backend, row)
|
||||
|
||||
|
||||
def parse_data(backend, rows, headers=True, columns=None, skiprows=0):
|
||||
df_dict = OrderedDict()
|
||||
col_index = {}
|
||||
|
|
@ -13,40 +51,10 @@ def parse_data(backend, rows, headers=True, columns=None, skiprows=0):
|
|||
for i, row in enumerate(rows):
|
||||
# row is a list of cells
|
||||
if i == 0:
|
||||
if not columns:
|
||||
if headers:
|
||||
repeat_until = -1
|
||||
repeat_value = None
|
||||
# columns as lists in a dictionary
|
||||
columns = []
|
||||
# parse the first row as column names
|
||||
for k, cell in enumerate(row):
|
||||
value, n_repeated = backend.get_value(cell)
|
||||
if n_repeated > 0:
|
||||
repeat_value = value
|
||||
repeat_until = n_repeated + k
|
||||
if not value and k <= repeat_until:
|
||||
value = repeat_value
|
||||
if k == repeat_until:
|
||||
# reset to allow for more than one repeated column
|
||||
repeat_until = -1
|
||||
if value and value not in columns:
|
||||
columns.append(value)
|
||||
else:
|
||||
column_name = value if value else "unnamed"
|
||||
# add count to column name
|
||||
idx = 1
|
||||
while f"{column_name}.{idx}" in columns:
|
||||
idx += 1
|
||||
columns.append(f"{column_name}.{idx}")
|
||||
else:
|
||||
# without headers, assign generic numbered column names
|
||||
columns = [f"column.{j}" for j in range(len(row))]
|
||||
|
||||
columns = columns or get_columns(backend, row, headers)
|
||||
df_dict = OrderedDict((column, []) for column in columns)
|
||||
# create index for the column headers
|
||||
col_index = {j: column for j, column in enumerate(columns)}
|
||||
|
||||
if headers:
|
||||
continue
|
||||
|
||||
|
|
@ -55,14 +63,15 @@ def parse_data(backend, rows, headers=True, columns=None, skiprows=0):
|
|||
value, _ = backend.get_value(cell, parsed=True)
|
||||
# use header instead of column index
|
||||
df_dict[col_index[j]].append(value)
|
||||
|
||||
# make sure all columns are of the same length
|
||||
max_col_length = max(len(df_dict[col]) for col in df_dict)
|
||||
for col in df_dict:
|
||||
col_length = len(df_dict[col])
|
||||
if col_length < max_col_length:
|
||||
df_dict[col] += [None] * (max_col_length - col_length)
|
||||
df = pd.DataFrame(df_dict)
|
||||
return df
|
||||
|
||||
return pd.DataFrame(df_dict)
|
||||
|
||||
|
||||
def read_data(backend, file_or_path, sheet_id, headers=True, columns=None, skiprows=0):
|
||||
|
|
|
|||
Loading…
Reference in New Issue