diff --git a/.gitignore b/.gitignore
index 167d464..e342915 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,6 @@ venv/
 
 # vim config
 .vim/
+
+*.ods
+.py39/
diff --git a/pandas_ods_reader/algo.py b/pandas_ods_reader/algo.py
index 8c872cf..6d391a9 100644
--- a/pandas_ods_reader/algo.py
+++ b/pandas_ods_reader/algo.py
@@ -1,13 +1,21 @@
 from collections import OrderedDict
 
 import pandas as pd
 
 from .utils import sanitize_df
 
 
-def parse_data(backend, rows, headers=True, columns=None):
+def parse_data(backend, rows, headers=True, columns=None, skiprows=0):
     df_dict = OrderedDict()
     col_index = {}
+    if skiprows is not None:
+        if not isinstance(skiprows, int):
+            message = f"'skiprows' must be int. {type(skiprows)} was given."
+            raise ValueError(message)
+        if skiprows > 0:
+            rows = iter(rows)  # accept any iterable, not only iterators
+            for _ in range(skiprows):
+                next(rows, None)  # a short sheet must not raise StopIteration
     for i, row in enumerate(rows):
         # row is a list of cells
         if headers and i == 0 and not columns:
@@ -59,8 +67,13 @@ def parse_data(backend, rows, headers=True, columns=None):
     return df
 
 
-def read_data(backend, file_or_path, sheet_id, headers=True, columns=None):
+def read_data(
+    backend, file_or_path, sheet_id,
+    headers=True, columns=None, skiprows=0
+):
     doc = backend.get_doc(file_or_path)
     rows = backend.get_rows(doc, sheet_id)
-    df = parse_data(backend, rows, headers=headers, columns=columns)
+    df = parse_data(
+        backend, rows, headers=headers, columns=columns, skiprows=skiprows
+    )
     return sanitize_df(df)
diff --git a/pandas_ods_reader/main.py b/pandas_ods_reader/main.py
index bd8caa0..8de5548 100644
--- a/pandas_ods_reader/main.py
+++ b/pandas_ods_reader/main.py
@@ -8,7 +8,7 @@ from . import algo
 EXT_MAP = {".ods": ods, ".fods": fods}
 
 
-def read_ods(file_or_path, sheet=1, headers=True, columns=None):
+def read_ods(file_or_path, sheet=1, headers=True, columns=None, skiprows=0):
     """
     Read in the provided ods or .ods file and convert it to `pandas.DataFrame`.
Will detect the filetype based on the file's extension or fall back to @@ -33,5 +33,8 @@ def read_ods(file_or_path, sheet=1, headers=True, columns=None): """ backend = EXT_MAP.get(Path(file_or_path).suffix, ods) return algo.read_data( - backend, file_or_path, sheet, headers=headers, columns=columns + backend, + file_or_path, + sheet, + headers=headers, columns=columns, skiprows=skiprows ) diff --git a/test.ipynb b/test.ipynb new file mode 100644 index 0000000..0c02ddb --- /dev/null +++ b/test.ipynb @@ -0,0 +1,176 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from pandas_ods_reader import read_ods" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "file_path = \"Dicionário_Microdados_Enem_2021.ods\"" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | column.0 | \n", + "column.1 | \n", + "column.2 | \n", + "column.3 | \n", + "column.4 | \n", + "column.5 | \n", + "
|---|---|---|---|---|---|---|
| 0 | \n", + "NU_INSCRICAO | \n", + "Número de inscrição1 | \n", + "None | \n", + "None | \n", + "12.0 | \n", + "Numérica | \n", + "
| 1 | \n", + "NU_ANO | \n", + "Ano do Enem | \n", + "None | \n", + "None | \n", + "4.0 | \n", + "Numérica | \n", + "
| 2 | \n", + "TP_FAIXA_ETARIA | \n", + "Faixa etária2 | \n", + "1 | \n", + "Menor de 17 anos | \n", + "2.0 | \n", + "Numérica | \n", + "
| 3 | \n", + "None | \n", + "None | \n", + "2 | \n", + "17 anos | \n", + "NaN | \n", + "None | \n", + "
| 4 | \n", + "None | \n", + "None | \n", + "3 | \n", + "18 anos | \n", + "NaN | \n", + "None | \n", + "