Implement optional 'skiprows' parameter for read_ods.
This commit is contained in:
parent
5b4dfb7413
commit
9dd3950bb9
|
|
@ -27,3 +27,6 @@ venv/
|
|||
|
||||
# vim config
|
||||
.vim/
|
||||
|
||||
*.ods
|
||||
.py39/
|
||||
|
|
|
|||
|
|
@ -1,13 +1,21 @@
|
|||
from collections import OrderedDict
|
||||
from unittest import skip
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from .utils import sanitize_df
|
||||
|
||||
|
||||
def parse_data(backend, rows, headers=True, columns=None):
|
||||
def parse_data(backend, rows, headers=True, columns=None, skiprows=None):
|
||||
df_dict = OrderedDict()
|
||||
col_index = {}
|
||||
if skiprows is not None:
|
||||
if isinstance(skiprows, int):
|
||||
for _ in range(skiprows):
|
||||
next(rows)
|
||||
else:
|
||||
message = f"'skiprows' must be int. {type(skiprows)} was given."
|
||||
raise ValueError(message)
|
||||
for i, row in enumerate(rows):
|
||||
# row is a list of cells
|
||||
if headers and i == 0 and not columns:
|
||||
|
|
@ -59,8 +67,13 @@ def parse_data(backend, rows, headers=True, columns=None):
|
|||
return df
|
||||
|
||||
|
||||
def read_data(backend, file_or_path, sheet_id, headers=True, columns=None):
|
||||
def read_data(
|
||||
backend, file_or_path, sheet_id,
|
||||
headers=True, columns=None, skiprows=0
|
||||
):
|
||||
doc = backend.get_doc(file_or_path)
|
||||
rows = backend.get_rows(doc, sheet_id)
|
||||
df = parse_data(backend, rows, headers=headers, columns=columns)
|
||||
df = parse_data(
|
||||
backend, rows, headers=headers, columns=columns, skiprows=skiprows
|
||||
)
|
||||
return sanitize_df(df)
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ from . import algo
|
|||
EXT_MAP = {".ods": ods, ".fods": fods}
|
||||
|
||||
|
||||
def read_ods(file_or_path, sheet=1, headers=True, columns=None):
|
||||
def read_ods(file_or_path, sheet=1, headers=True, columns=None, skiprows=0):
|
||||
"""
|
||||
Read in the provided .ods or .fods file and convert it to `pandas.DataFrame`.
|
||||
Will detect the filetype based on the file's extension or fall back to
|
||||
|
|
@ -33,5 +33,8 @@ def read_ods(file_or_path, sheet=1, headers=True, columns=None):
|
|||
"""
|
||||
backend = EXT_MAP.get(Path(file_or_path).suffix, ods)
|
||||
return algo.read_data(
|
||||
backend, file_or_path, sheet, headers=headers, columns=columns
|
||||
backend,
|
||||
file_or_path,
|
||||
sheet,
|
||||
headers=headers, columns=columns, skiprows=skiprows
|
||||
)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,176 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%load_ext autoreload\n",
|
||||
"%autoreload 2"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pandas_ods_reader import read_ods"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"file_path = \"Dicionário_Microdados_Enem_2021.ods\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>column.0</th>\n",
|
||||
" <th>column.1</th>\n",
|
||||
" <th>column.2</th>\n",
|
||||
" <th>column.3</th>\n",
|
||||
" <th>column.4</th>\n",
|
||||
" <th>column.5</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>NU_INSCRICAO</td>\n",
|
||||
" <td>Número de inscrição1</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>12.0</td>\n",
|
||||
" <td>Numérica</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>NU_ANO</td>\n",
|
||||
" <td>Ano do Enem</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>4.0</td>\n",
|
||||
" <td>Numérica</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>TP_FAIXA_ETARIA</td>\n",
|
||||
" <td>Faixa etária2</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>Menor de 17 anos</td>\n",
|
||||
" <td>2.0</td>\n",
|
||||
" <td>Numérica</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>17 anos</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>18 anos</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>None</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" column.0 column.1 column.2 column.3 \\\n",
|
||||
"0 NU_INSCRICAO Número de inscrição1 None None \n",
|
||||
"1 NU_ANO Ano do Enem None None \n",
|
||||
"2 TP_FAIXA_ETARIA Faixa etária2 1 Menor de 17 anos \n",
|
||||
"3 None None 2 17 anos \n",
|
||||
"4 None None 3 18 anos \n",
|
||||
"\n",
|
||||
" column.4 column.5 \n",
|
||||
"0 12.0 Numérica \n",
|
||||
"1 4.0 Numérica \n",
|
||||
"2 2.0 Numérica \n",
|
||||
"3 NaN None \n",
|
||||
"4 NaN None "
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data = read_ods(file_path, headers=False, skiprows=5)\n",
|
||||
"data.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.9.14 ('.py39': venv)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.14"
|
||||
},
|
||||
"orig_nbformat": 4,
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "6ea8410887842cf63ab95c5e43eca8b07627ccf142be6aa1d4d3a20bcd58cd50"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
Loading…
Reference in New Issue