first commit

This commit is contained in:
iuvbio 2019-01-27 03:17:34 +01:00
commit 495fa7bd0c
5 changed files with 145 additions and 0 deletions

8
.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
# Compiled python modules.
*.pyc
# Setuptools distribution folder.
/dist/
# Python egg metadata, regenerated from source files by setuptools.
/*.egg-info

28
README.md Normal file
View File

@ -0,0 +1,28 @@
pandas_ods_reader
===
Provides a function to read in an ODS file and return a pandas DataFrame
Dependencies
---
- `ezodf`
- `pandas`
Installation
---
`pip install pandas_ods_reader`
Usage
---
```Python
from pandas_ods_reader import read_ods
path = "path/to/file.ods"
sheet_idx = 1
df1 = read_ods(path, sheet_idx)
sheet_name = "sheet"
df2 = read_ods(path, sheet_name)
```

View File

View File

@ -0,0 +1,94 @@
"""Imports an ods file into a DataFrame object"""
import ezodf
import pandas as pd
def ods_info(doc):
    """Print a short overview of every sheet in an opened spreadsheet.

    :param doc: spreadsheet document
        any object exposing a ``sheets`` collection whose items provide
        ``name``, ``nrows()`` and ``ncols()``
    :returns: None
        the overview is written to standard output
    """
    divider = "-" * 40
    sheet_count = len(doc.sheets)
    print("Spreadsheet contains %d sheet(s)." % sheet_count)
    for current in doc.sheets:
        print(divider)
        print(" Sheet name : '%s'" % current.name)
        # Query the dimensions only after the name line has been printed,
        # so output appears in the same order even if a query fails.
        dimensions = (current.nrows(), current.ncols())
        print("Size of Sheet : (rows=%d, cols=%d)" % dimensions)
def load_ods(doc, sheet, header=True, columns=None):
    """Convert one sheet of an opened spreadsheet into a pandas DataFrame.

    :param doc: spreadsheet document
        object exposing a ``sheets`` collection that can be iterated and
        indexed by position; each sheet provides ``name`` and ``rows()``
    :param sheet: int or str
        if int, the 1 based index of the sheet; if str, the sheet name
    :param header: bool, default True
        if True, the first row supplies the column names and is not part of
        the data; header cells with an empty value are skipped together
        with everything below them
    :param columns: list, default None
        column names used when ``header`` is False; when omitted the
        columns are named ``Column_0``, ``Column_1``, ...
    :returns: pandas.DataFrame
        the cell values of the sheet, one DataFrame column per named column
    :raises ValueError: if ``sheet`` is a str and no sheet has that name
    """
    if isinstance(sheet, int):
        # Callers count sheets from 1, the collection counts from 0.
        sheet = doc.sheets[sheet - 1]
    elif isinstance(sheet, str):
        # Compare against the sheet *names*; comparing the string with the
        # sheet objects themselves can never match.
        sheet_names = [candidate.name for candidate in doc.sheets]
        if sheet not in sheet_names:
            raise ValueError("There is no sheet named {}".format(sheet))
        sheet = doc.sheets[sheet_names.index(sheet)]

    df_dict = {}
    # Maps the position of a cell within its row to the column name.
    col_index = {}
    for i, row in enumerate(sheet.rows()):
        # row is a list of cells
        if i == 0:
            if header:
                col_index = {
                    j: cell.value for j, cell in enumerate(row) if cell.value}
                df_dict = {name: [] for name in col_index.values()}
                # The header row carries no data.
                continue
            columns = columns if columns else (
                ["Column_%s" % j for j in range(len(row))])
            df_dict = {column: [] for column in columns}
            col_index = {j: column for j, column in enumerate(columns)}
            # No ``continue`` here: without a header the first row is data.
        for j, cell in enumerate(row):
            # Membership test instead of a length comparison, because the
            # header may leave gaps (an unnamed cell between named ones).
            if j in col_index:
                df_dict[col_index[j]].append(cell.value)
    return pd.DataFrame(df_dict)
def sanitize_df(df):
    """Strip empty trailing rows and entirely empty columns.

    Only rows at the *end* of the frame are removed; empty rows followed by
    rows containing data are kept.

    :param df: pandas.DataFrame
        the frame to clean up; it is not modified
    :returns: pandas.DataFrame
        the frame without empty trailing rows and without columns whose
        values are all null
    """
    # One flag per row: True when at least one cell in the row holds a value.
    row_has_data = df.notnull().any(axis=1).values
    keep = len(df)
    # Walk back from the end one row at a time until a row with data is
    # found, so no populated row is ever cut off.
    while keep > 0 and not row_has_data[keep - 1]:
        keep -= 1
    df = df.iloc[:keep]
    # Keep only the columns that still contain at least one value.
    cols = [column for column in df if not df[column].isnull().all()]
    return df[cols]
def read_ods(file_or_path, sheet, header=True, columns=None):
    """
    Read one sheet of an ODS file into a pandas DataFrame.

    The document is opened with ezodf, the requested sheet is converted by
    ``load_ods`` and the result is cleaned up by ``sanitize_df`` before it
    is returned.

    :param file_or_path: str
        the path to the ODS file
    :param sheet: int or str
        if int, the 1 based index of the sheet to be read in. If str, the
        name of the sheet to be read in
    :param header: bool, default True
        if True, the first row is read in as headers
    :param columns: list, default None
        a list of column names to be used as headers
    :returns: pandas.DataFrame
        the ODS file as a pandas DataFrame
    """
    return sanitize_df(
        load_ods(ezodf.opendoc(file_or_path), sheet, header, columns))

15
setup.py Normal file
View File

@ -0,0 +1,15 @@
from setuptools import setup, find_packages

# Release number of the distribution.
VERSION = "0.0.1"

setup(
    name="pandas_ods_reader",
    version=VERSION,
    description="Read in an ODS file and return it as a pandas.DataFrame",
    url="http://github.com/iuvbio/pandas_ods_reader",
    author="iuvbio",
    author_email="",
    license="MIT",
    # The first positional argument of find_packages() is the directory to
    # search, not the packages to pick, so the package filter has to be
    # passed through ``include``.
    packages=find_packages(
        include=["pandas_ods_reader", "pandas_ods_reader.*"]),
    zip_safe=False,
    install_requires=["ezodf", "pandas"],
)