From 9dd3950bb958e8b57f0c19a2bddd959d007a147e Mon Sep 17 00:00:00 2001 From: Vagner Bessa Date: Sun, 23 Oct 2022 12:19:52 -0300 Subject: [PATCH] implement optional 'skiprows' to read_ods. --- .gitignore | 3 + pandas_ods_reader/algo.py | 19 +++- pandas_ods_reader/main.py | 7 +- test.ipynb | 176 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 200 insertions(+), 5 deletions(-) create mode 100644 test.ipynb diff --git a/.gitignore b/.gitignore index 167d464..e342915 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,6 @@ venv/ # vim config .vim/ + +*.ods +.py39/ diff --git a/pandas_ods_reader/algo.py b/pandas_ods_reader/algo.py index 8c872cf..6d391a9 100644 --- a/pandas_ods_reader/algo.py +++ b/pandas_ods_reader/algo.py @@ -1,13 +1,21 @@ from collections import OrderedDict +from unittest import skip import pandas as pd from .utils import sanitize_df -def parse_data(backend, rows, headers=True, columns=None): +def parse_data(backend, rows, headers=True, columns=None, skiprows=None): df_dict = OrderedDict() col_index = {} + if skiprows is not None: + if isinstance(skiprows, int): + for _ in range(skiprows): + next(rows) + else: + message = f"'skiprows' must be int. {type(skiprows)} was given." 
+ raise ValueError(message) for i, row in enumerate(rows): # row is a list of cells if headers and i == 0 and not columns: @@ -59,8 +67,13 @@ def parse_data(backend, rows, headers=True, columns=None): return df -def read_data(backend, file_or_path, sheet_id, headers=True, columns=None): +def read_data( + backend, file_or_path, sheet_id, + headers=True, columns=None, skiprows=0 +): doc = backend.get_doc(file_or_path) rows = backend.get_rows(doc, sheet_id) - df = parse_data(backend, rows, headers=headers, columns=columns) + df = parse_data( + backend, rows, headers=headers, columns=columns, skiprows=skiprows + ) return sanitize_df(df) diff --git a/pandas_ods_reader/main.py b/pandas_ods_reader/main.py index bd8caa0..8de5548 100644 --- a/pandas_ods_reader/main.py +++ b/pandas_ods_reader/main.py @@ -8,7 +8,7 @@ from . import algo EXT_MAP = {".ods": ods, ".fods": fods} -def read_ods(file_or_path, sheet=1, headers=True, columns=None): +def read_ods(file_or_path, sheet=1, headers=True, columns=None, skiprows=0): """ Read in the provided ods or .ods file and convert it to `pandas.DataFrame`. 
Will detect the filetype based on the file's extension or fall back to @@ -33,5 +33,8 @@ def read_ods(file_or_path, sheet=1, headers=True, columns=None): """ backend = EXT_MAP.get(Path(file_or_path).suffix, ods) return algo.read_data( - backend, file_or_path, sheet, headers=headers, columns=columns + backend, + file_or_path, + sheet, + headers=headers, columns=columns, skiprows=skiprows ) diff --git a/test.ipynb b/test.ipynb new file mode 100644 index 0000000..0c02ddb --- /dev/null +++ b/test.ipynb @@ -0,0 +1,176 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from pandas_ods_reader import read_ods" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "file_path = \"Dicionário_Microdados_Enem_2021.ods\"" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
column.0column.1column.2column.3column.4column.5
0NU_INSCRICAONúmero de inscrição1NoneNone12.0Numérica
1NU_ANOAno do EnemNoneNone4.0Numérica
2TP_FAIXA_ETARIAFaixa etária21Menor de 17 anos2.0Numérica
3NoneNone217 anosNaNNone
4NoneNone318 anosNaNNone
\n", + "
" + ], + "text/plain": [ + " column.0 column.1 column.2 column.3 \\\n", + "0 NU_INSCRICAO Número de inscrição1 None None \n", + "1 NU_ANO Ano do Enem None None \n", + "2 TP_FAIXA_ETARIA Faixa etária2 1 Menor de 17 anos \n", + "3 None None 2 17 anos \n", + "4 None None 3 18 anos \n", + "\n", + " column.4 column.5 \n", + "0 12.0 Numérica \n", + "1 4.0 Numérica \n", + "2 2.0 Numérica \n", + "3 NaN None \n", + "4 NaN None " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = read_ods(file_path, headers=False, skiprows=5)\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.14 ('.py39': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.14" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "6ea8410887842cf63ab95c5e43eca8b07627ccf142be6aa1d4d3a20bcd58cd50" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}