first commit

2019-01-27 03:17:34 +01:00 · 2019-01-27 03:17:34 +01:00 · 495fa7bd0c
commit 495fa7bd0c
5 changed files with 145 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,8 @@
+# Compiled python modules.
+*.pyc
+
+# Setuptools distribution folder.
+/dist/
+
+# Python egg metadata, regenerated from source files by setuptools.
+/*.egg-info
--- a/README.md
+++ b/README.md
@ -0,0 +1,28 @@
+pandas_ods_reader
+===
+
+Provides a function that reads an ODS file and returns it as a pandas DataFrame.
+
+Dependencies
+---
+
+- `ezodf`
+- `pandas`
+
+Installation
+---
+
+`pip install pandas_ods_reader`
+
+Usage
+---
+
+```Python
+from pandas_ods_reader.read_ods import read_ods
+
+path = "path/to/file.ods"
+sheet_idx = 1
+df1 = read_ods(path, sheet_idx)
+sheet_name = "sheet"
+df2 = read_ods(path, sheet_name)
+```
--- a/pandas_ods_reader/__init__.py
+++ b/pandas_ods_reader/__init__.py
--- a/pandas_ods_reader/read_ods.py
+++ b/pandas_ods_reader/read_ods.py
@ -0,0 +1,94 @@
+"""Imports an ods file into a DataFrame object"""
+import ezodf
+import pandas as pd
+
+
+def ods_info(doc):
+    """Print a short overview of an opened ODS document.
+
+    :param doc: document object exposing ``sheets``
+    the number of sheets is printed first, followed by the name and the
+    row/column counts of every sheet
+    """
+    print("Spreadsheet contains %d sheet(s)." % len(doc.sheets))
+    separator = "-" * 40
+    for current in doc.sheets:
+        print(separator)
+        print("   Sheet name : '%s'" % current.name)
+        dimensions = (current.nrows(), current.ncols())
+        print("Size of Sheet : (rows=%d, cols=%d)" % dimensions)
+
+
+def load_ods(doc, sheet, header=True, columns=None):
+    """Convert one sheet of an opened ODS document into a pandas.DataFrame.
+
+    :param doc: document object exposing ``sheets``
+    the sheets must be iterable and indexable by position, and each sheet
+    must provide a ``name`` attribute and a ``rows()`` method
+    :param sheet: int or str
+    if int, the 1 based index of the sheet to be read in. If str, the name of
+    the sheet to be read in. Any other object is used as the sheet itself
+    :param header: bool, default True
+    if True, the first row supplies the column names and is not part of the
+    data; header cells with an empty value exclude their column
+    :param columns: list, default None
+    column names to use when ``header`` is False; when omitted the columns
+    are named ``Column_0``, ``Column_1``, ... It is ignored when ``header``
+    is True
+    :returns: pandas.DataFrame
+    the cell values of the sheet, one DataFrame column per kept sheet column
+    :raises ValueError: if ``sheet`` is an int smaller than 1 or a name that
+    does not match any sheet in the document
+    """
+    if isinstance(sheet, int):
+        if sheet < 1:
+            # sheet - 1 would become a negative index and silently select a
+            # sheet counted from the end of the document.
+            raise ValueError(
+                "Sheet index must be 1 or greater, got {}".format(sheet))
+        sheet = doc.sheets[sheet - 1]
+    elif isinstance(sheet, str):
+        # Compare against the sheet names, not the sheet objects themselves.
+        sheet_names = [candidate.name for candidate in doc.sheets]
+        if sheet not in sheet_names:
+            raise ValueError("There is no sheet named {}".format(sheet))
+        sheet = doc.sheets[sheet_names.index(sheet)]
+    df_dict = {}
+    col_index = {}
+    for i, row in enumerate(sheet.rows()):
+        # row is a list of cells
+        if i == 0:
+            if header:
+                # map the position of every non-empty header cell to its name
+                col_index = {
+                    j: cell.value for j, cell in enumerate(row) if cell.value}
+            else:
+                labels = columns if columns else (
+                    ["Column_%s" % j for j in range(len(row))])
+                col_index = dict(enumerate(labels))
+            # columns as lists in a dictionary, in sheet order
+            df_dict = {label: [] for label in col_index.values()}
+            if header:
+                # the header row only names the columns, it holds no data
+                continue
+        for j, cell in enumerate(row):
+            # Look the position up directly: header positions may have gaps
+            # where a header cell was empty, so a length check is not enough.
+            if j in col_index:
+                df_dict[col_index[j]].append(cell.value)
+    # and convert to a DataFrame
+    return pd.DataFrame(df_dict)
+
+
+def sanitize_df(df):
+    """Drop trailing empty rows and entirely empty columns from a DataFrame.
+
+    :param df: pandas.DataFrame
+    the frame to clean; it is not modified in place
+    :returns: pandas.DataFrame
+    the frame without the all-null rows at its end and without columns that
+    contain only null values. Empty rows between rows that hold data are kept
+    """
+    # One flag per row: True when the row holds at least one non-null value.
+    row_has_data = df.notnull().any(axis=1).values
+    # Walk back from the end and stop at the last row that holds data, so
+    # exactly the empty tail is removed and nothing above it.
+    keep = len(row_has_data)
+    while keep > 0 and not row_has_data[keep - 1]:
+        keep -= 1
+    df = df.iloc[:keep]
+    # Delete empty columns
+    cols = [column for column in df if not df[column].isnull().all()]
+    return df[cols]
+
+
+def read_ods(file_or_path, sheet, header=True, columns=None):
+    """
+    Open an ODS file, load one of its sheets into a DataFrame and return that
+    DataFrame after it has been passed through ``sanitize_df``.
+
+    :param file_or_path: str
+    location of the ODS file, handed to ``ezodf.opendoc``
+    :param sheet: int or str
+    the sheet to load: an int is taken as a 1 based position, a str as the
+    name of the sheet
+    :param header: bool, default True
+    whether the first row holds the column headers
+    :param columns: list, default None
+    column names to use when ``header`` is False
+    :returns: pandas.DataFrame
+    the sanitized contents of the selected sheet
+    """
+    document = ezodf.opendoc(file_or_path)
+    return sanitize_df(
+        load_ods(document, sheet, header=header, columns=columns))
--- a/setup.py
+++ b/setup.py
@ -0,0 +1,15 @@
+from setuptools import setup, find_packages
+
+
+VERSION = "0.0.1"
+
+# find_packages() takes the directory to search as its first argument, not a
+# list of package names; called without arguments it searches the directory
+# containing this file and picks up the pandas_ods_reader package.
+setup(name="pandas_ods_reader",
+      version=VERSION,
+      description="Read in an ODS file and return it as a pandas.DataFrame",
+      url="http://github.com/iuvbio/pandas_ods_reader",
+      author="iuvbio",
+      author_email="",
+      license="MIT",
+      packages=find_packages(),
+      zip_safe=False,
+      install_requires=["ezodf", "pandas"])