Implement optional 'skiprows' parameter for read_ods.

This commit is contained in:
Vagner Bessa 2022-10-23 12:19:52 -03:00 committed by ljnsn
parent 5b4dfb7413
commit 9dd3950bb9
4 changed files with 200 additions and 5 deletions

3
.gitignore vendored
View File

@ -27,3 +27,6 @@ venv/
# vim config # vim config
.vim/ .vim/
*.ods
.py39/

View File

@ -1,13 +1,21 @@
from collections import OrderedDict from collections import OrderedDict
from unittest import skip
import pandas as pd import pandas as pd
from .utils import sanitize_df from .utils import sanitize_df
def parse_data(backend, rows, headers=True, columns=None): def parse_data(backend, rows, headers=True, columns=None, skiprows=None):
df_dict = OrderedDict() df_dict = OrderedDict()
col_index = {} col_index = {}
if skiprows is not None:
if isinstance(skiprows, int):
for _ in range(skiprows):
next(rows)
else:
message = f"'skiprows' must be int. {type(skiprows)} was given."
raise ValueError(message)
for i, row in enumerate(rows): for i, row in enumerate(rows):
# row is a list of cells # row is a list of cells
if headers and i == 0 and not columns: if headers and i == 0 and not columns:
@ -59,8 +67,13 @@ def parse_data(backend, rows, headers=True, columns=None):
return df return df
def read_data(backend, file_or_path, sheet_id, headers=True, columns=None): def read_data(
backend, file_or_path, sheet_id,
headers=True, columns=None, skiprows=0
):
doc = backend.get_doc(file_or_path) doc = backend.get_doc(file_or_path)
rows = backend.get_rows(doc, sheet_id) rows = backend.get_rows(doc, sheet_id)
df = parse_data(backend, rows, headers=headers, columns=columns) df = parse_data(
backend, rows, headers=headers, columns=columns, skiprows=skiprows
)
return sanitize_df(df) return sanitize_df(df)

View File

@ -8,7 +8,7 @@ from . import algo
EXT_MAP = {".ods": ods, ".fods": fods} EXT_MAP = {".ods": ods, ".fods": fods}
def read_ods(file_or_path, sheet=1, headers=True, columns=None): def read_ods(file_or_path, sheet=1, headers=True, columns=None, skiprows=0):
""" """
Read in the provided ods or .ods file and convert it to `pandas.DataFrame`. Read in the provided ods or .ods file and convert it to `pandas.DataFrame`.
Will detect the filetype based on the file's extension or fall back to Will detect the filetype based on the file's extension or fall back to
@ -33,5 +33,8 @@ def read_ods(file_or_path, sheet=1, headers=True, columns=None):
""" """
backend = EXT_MAP.get(Path(file_or_path).suffix, ods) backend = EXT_MAP.get(Path(file_or_path).suffix, ods)
return algo.read_data( return algo.read_data(
backend, file_or_path, sheet, headers=headers, columns=columns backend,
file_or_path,
sheet,
headers=headers, columns=columns, skiprows=skiprows
) )

176
test.ipynb Normal file
View File

@ -0,0 +1,176 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"from pandas_ods_reader import read_ods"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"file_path = \"Dicionário_Microdados_Enem_2021.ods\""
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>column.0</th>\n",
" <th>column.1</th>\n",
" <th>column.2</th>\n",
" <th>column.3</th>\n",
" <th>column.4</th>\n",
" <th>column.5</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>NU_INSCRICAO</td>\n",
" <td>Número de inscrição1</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>12.0</td>\n",
" <td>Numérica</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>NU_ANO</td>\n",
" <td>Ano do Enem</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>4.0</td>\n",
" <td>Numérica</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>TP_FAIXA_ETARIA</td>\n",
" <td>Faixa etária2</td>\n",
" <td>1</td>\n",
" <td>Menor de 17 anos</td>\n",
" <td>2.0</td>\n",
" <td>Numérica</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>2</td>\n",
" <td>17 anos</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>3</td>\n",
" <td>18 anos</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" column.0 column.1 column.2 column.3 \\\n",
"0 NU_INSCRICAO Número de inscrição1 None None \n",
"1 NU_ANO Ano do Enem None None \n",
"2 TP_FAIXA_ETARIA Faixa etária2 1 Menor de 17 anos \n",
"3 None None 2 17 anos \n",
"4 None None 3 18 anos \n",
"\n",
" column.4 column.5 \n",
"0 12.0 Numérica \n",
"1 4.0 Numérica \n",
"2 2.0 Numérica \n",
"3 NaN None \n",
"4 NaN None "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = read_ods(file_path, headers=False, skiprows=5)\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.14 ('.py39': venv)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.14"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "6ea8410887842cf63ab95c5e43eca8b07627ccf142be6aa1d4d3a20bcd58cd50"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}