Factor out logic to get columns into separate functions.
This commit is contained in:
parent
9bf4415a9f
commit
bd3caab7f3
|
|
@ -5,16 +5,7 @@ import pandas as pd
|
||||||
from .utils import sanitize_df
|
from .utils import sanitize_df
|
||||||
|
|
||||||
|
|
||||||
def parse_data(backend, rows, headers=True, columns=None, skiprows=0):
|
def get_columns_from_headers(backend, row):
|
||||||
df_dict = OrderedDict()
|
|
||||||
col_index = {}
|
|
||||||
for _ in range(skiprows):
|
|
||||||
next(rows)
|
|
||||||
for i, row in enumerate(rows):
|
|
||||||
# row is a list of cells
|
|
||||||
if i == 0:
|
|
||||||
if not columns:
|
|
||||||
if headers:
|
|
||||||
repeat_until = -1
|
repeat_until = -1
|
||||||
repeat_value = None
|
repeat_value = None
|
||||||
# columns as lists in a dictionary
|
# columns as lists in a dictionary
|
||||||
|
|
@ -39,14 +30,31 @@ def parse_data(backend, rows, headers=True, columns=None, skiprows=0):
|
||||||
while f"{column_name}.{idx}" in columns:
|
while f"{column_name}.{idx}" in columns:
|
||||||
idx += 1
|
idx += 1
|
||||||
columns.append(f"{column_name}.{idx}")
|
columns.append(f"{column_name}.{idx}")
|
||||||
else:
|
return columns
|
||||||
# without headers, assign generic numbered column names
|
|
||||||
columns = [f"column.{j}" for j in range(len(row))]
|
|
||||||
|
|
||||||
|
|
||||||
|
def get_generic_columns(row):
|
||||||
|
return [f"column.{j}" for j in range(len(row))]
|
||||||
|
|
||||||
|
|
||||||
|
def get_columns(backend, row, headers):
|
||||||
|
if headers:
|
||||||
|
return get_columns_from_headers(backend, row)
|
||||||
|
return get_generic_columns(row)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_data(backend, rows, headers=True, columns=None, skiprows=0):
|
||||||
|
df_dict = OrderedDict()
|
||||||
|
col_index = {}
|
||||||
|
for _ in range(skiprows):
|
||||||
|
next(rows)
|
||||||
|
for i, row in enumerate(rows):
|
||||||
|
# row is a list of cells
|
||||||
|
if i == 0:
|
||||||
|
columns = columns or get_columns(backend, row, headers)
|
||||||
df_dict = OrderedDict((column, []) for column in columns)
|
df_dict = OrderedDict((column, []) for column in columns)
|
||||||
# create index for the column headers
|
# create index for the column headers
|
||||||
col_index = {j: column for j, column in enumerate(columns)}
|
col_index = {j: column for j, column in enumerate(columns)}
|
||||||
|
|
||||||
if headers:
|
if headers:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
@ -55,14 +63,15 @@ def parse_data(backend, rows, headers=True, columns=None, skiprows=0):
|
||||||
value, _ = backend.get_value(cell, parsed=True)
|
value, _ = backend.get_value(cell, parsed=True)
|
||||||
# use header instead of column index
|
# use header instead of column index
|
||||||
df_dict[col_index[j]].append(value)
|
df_dict[col_index[j]].append(value)
|
||||||
|
|
||||||
# make sure all columns are of the same length
|
# make sure all columns are of the same length
|
||||||
max_col_length = max(len(df_dict[col]) for col in df_dict)
|
max_col_length = max(len(df_dict[col]) for col in df_dict)
|
||||||
for col in df_dict:
|
for col in df_dict:
|
||||||
col_length = len(df_dict[col])
|
col_length = len(df_dict[col])
|
||||||
if col_length < max_col_length:
|
if col_length < max_col_length:
|
||||||
df_dict[col] += [None] * (max_col_length - col_length)
|
df_dict[col] += [None] * (max_col_length - col_length)
|
||||||
df = pd.DataFrame(df_dict)
|
|
||||||
return df
|
return pd.DataFrame(df_dict)
|
||||||
|
|
||||||
|
|
||||||
def read_data(backend, file_or_path, sheet_id, headers=True, columns=None, skiprows=0):
|
def read_data(backend, file_or_path, sheet_id, headers=True, columns=None, skiprows=0):
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue