# Reconstructed from commit 495fa7bd0c845dcb7cfa6f7f818dd7a21a5789b2
# Author: iuvbio
# Date:   Sun Jan 27 03:17:34 2019 +0100
# Message: first commit
#
# The commit added .gitignore, README.md, pandas_ods_reader/__init__.py
# (empty), pandas_ods_reader/read_ods.py (this module) and setup.py.  The
# non-Python files are preserved as comments at the end of this module.
"""Imports an ods file into a DataFrame object

pandas_ods_reader
=================

Provides a function to read in an ODS file and return a pandas DataFrame.

Dependencies
------------

- ``ezodf``
- ``pandas``

Installation
------------

``pip install pandas_ods_reader``

Usage
-----

::

    from pandas_ods_reader import read_ods

    path = "path/to/file.ods"
    sheet_idx = 1
    df1 = read_ods(path, sheet_idx)
    sheet_name = "sheet"
    df2 = read_ods(path, sheet_name)
"""
import pandas as pd


def ods_info(doc):
    """Print a short summary of every sheet in an opened document.

    :param doc: document object exposing ``sheets``
        each sheet must provide ``name``, ``nrows()`` and ``ncols()``
    """
    print("Spreadsheet contains %d sheet(s)." % len(doc.sheets))
    for sheet in doc.sheets:
        print("-" * 40)
        print(" Sheet name : '%s'" % sheet.name)
        print("Size of Sheet : (rows=%d, cols=%d)" % (
            sheet.nrows(), sheet.ncols()))


def load_ods(doc, sheet, header=True, columns=None):
    """Convert one sheet of an opened document to a pandas.DataFrame.

    :param doc: document object exposing ``sheets``
        the opened spreadsheet
    :param sheet: int or str
        if int, the 1 based index of the sheet. If str, the name of the sheet
    :param header: bool, default True
        if True, the first row supplies the column names and is not returned
        as data; header cells with a falsy value are skipped together with
        their whole column
    :param columns: list, default None
        column names to use when ``header`` is False; defaults to
        ``Column_0``, ``Column_1``, ... It is ignored when ``header`` is True
    :returns: pandas.DataFrame
        the cell values of the sheet
    :raises ValueError: if ``sheet`` is a str and no sheet has that name
    """
    if isinstance(sheet, int):
        sheet = doc.sheets[sheet - 1]
    elif isinstance(sheet, str):
        # Compare against the sheet *names*; comparing the requested name
        # with the sheet objects themselves can never match.
        sheet_names = [candidate.name for candidate in doc.sheets]
        if sheet not in sheet_names:
            raise ValueError("There is no sheet named {}".format(sheet))
        sheet = doc.sheets[sheet_names.index(sheet)]

    df_dict = {}
    # Maps the position of a cell within its row to the column it feeds.
    col_index = {}
    for i, row in enumerate(sheet.rows()):
        # row is a list of cells
        if i == 0:
            if header:
                col_index = {
                    j: cell.value for j, cell in enumerate(row) if cell.value}
            else:
                names = columns if columns else (
                    ["Column_%s" % j for j in range(len(row))])
                col_index = dict(enumerate(names))
            # columns as lists in a dictionary
            df_dict = {name: [] for name in col_index.values()}
            if header:
                # Only a real header row is consumed; without a header the
                # first row is data and falls through to the loop below.
                continue
        for j, cell in enumerate(row):
            # Membership test rather than ``j < len(col_index)``: skipped
            # header cells leave gaps in the positions that are mapped.
            if j in col_index:
                df_dict[col_index[j]].append(cell.value)
    return pd.DataFrame(df_dict)


def sanitize_df(df):
    """Drop trailing empty rows and all empty columns from a DataFrame.

    Only the run of completely null rows at the end of the frame is removed;
    empty rows between populated rows are kept. Afterwards every column that
    contains nothing but nulls is removed.

    :param df: pandas.DataFrame
        the frame to clean; it is not modified in place
    :returns: pandas.DataFrame
        the cleaned frame
    """
    # Delete empty rows at the end, exactly one row per empty row found.
    row_is_empty = df.isnull().all(axis=1).tolist()
    keep = len(row_is_empty)
    while keep and row_is_empty[keep - 1]:
        keep -= 1
    df = df.iloc[:keep]
    # Delete empty columns
    cols = [column for column in df if not df[column].isnull().all()]
    return df[cols]


def read_ods(file_or_path, sheet, header=True, columns=None):
    """
    This function reads in the provided ods file and converts the requested
    sheet to a DataFrame. Trailing empty rows and empty columns are dropped
    from the DataFrame before it is returned.

    :param file_or_path: str
        the path to the ODS file
    :param sheet: int or str
        if int, the 1 based index of the sheet to be read in. If str, the name
        of the sheet to be read in
    :param header: bool, default True
        if True, the first row is read in as headers
    :param columns: list, default None
        a list of column names to be used as headers when ``header`` is False
    :returns: pandas.DataFrame
        the ODS file as a pandas DataFrame
    """
    # Imported here so the helpers above stay importable without ezodf.
    import ezodf

    doc = ezodf.opendoc(file_or_path)
    df = load_ods(doc, sheet, header, columns)
    return sanitize_df(df)


# ---------------------------------------------------------------------------
# Companion files added by the same commit, preserved verbatim.
#
# .gitignore
# ----------
# # Compiled python modules.
# *.pyc
#
# # Setuptools distribution folder.
# /dist/
#
# # Python egg metadata, regenerated from source files by setuptools.
# /*.egg-info
#
# setup.py
# --------
# NOTE(review): the first positional parameter of find_packages is ``where``
# (a directory), so passing the package list there looks unintended; a plain
# ``find_packages()`` is presumably what was meant -- confirm before changing.
#
# from setuptools import setup, find_packages
#
#
# VERSION = "0.0.1"
#
# setup(name="pandas_ods_reader",
#       version=VERSION,
#       description="Read in an ODS file and return it as a pandas.DataFrame",
#       url="http://github.com/iuvbio/pandas_ods_reader",
#       author="iuvbio",
#       author_email="",
#       license="MIT",
#       packages=find_packages(["pandas_ods_reader"]),
#       zip_safe=False,
#       install_requires=["ezodf", "pandas"])
# ---------------------------------------------------------------------------