diff --git a/.gitignore b/.gitignore index e342915..167d464 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,3 @@ venv/ # vim config .vim/ - -*.ods -.py39/ diff --git a/example_skiprows.ods b/example_skiprows.ods new file mode 100644 index 0000000..ef5b079 Binary files /dev/null and b/example_skiprows.ods differ diff --git a/pandas_ods_reader/algo.py b/pandas_ods_reader/algo.py index 6d391a9..cbf2c31 100644 --- a/pandas_ods_reader/algo.py +++ b/pandas_ods_reader/algo.py @@ -1,5 +1,4 @@ from collections import OrderedDict -from unittest import skip import pandas as pd @@ -67,13 +66,8 @@ def parse_data(backend, rows, headers=True, columns=None, skiprows=None): return df -def read_data( - backend, file_or_path, sheet_id, - headers=True, columns=None, skiprows=0 -): +def read_data(backend, file_or_path, sheet_id, headers=True, columns=None, skiprows=0): doc = backend.get_doc(file_or_path) rows = backend.get_rows(doc, sheet_id) - df = parse_data( - backend, rows, headers=headers, columns=columns, skiprows=skiprows - ) + df = parse_data(backend, rows, headers=headers, columns=columns, skiprows=skiprows) return sanitize_df(df) diff --git a/pandas_ods_reader/main.py b/pandas_ods_reader/main.py index 8de5548..63f0370 100644 --- a/pandas_ods_reader/main.py +++ b/pandas_ods_reader/main.py @@ -36,5 +36,7 @@ def read_ods(file_or_path, sheet=1, headers=True, columns=None, skiprows=0): backend, file_or_path, sheet, - headers=headers, columns=columns, skiprows=skiprows + headers=headers, + columns=columns, + skiprows=skiprows, ) diff --git a/pandas_ods_reader/parsers/fods.py b/pandas_ods_reader/parsers/fods.py index d950bc8..a5e2c05 100644 --- a/pandas_ods_reader/parsers/fods.py +++ b/pandas_ods_reader/parsers/fods.py @@ -42,7 +42,8 @@ def get_rows(doc, sheet_id): ) sheet = get_sheet(spreadsheet, sheet_id) rows = sheet.findall(TABLE_ROW_TAG, namespaces=namespaces) - return rows + for row in rows: + yield row def is_float(cell): diff --git a/poetry.lock b/poetry.lock index 029afd8..a15505a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -15,10 +15,10 @@ optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [package.extras] -dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit"] -docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] -tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface"] -tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins"] +dev = ["coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"] +docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] +tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"] +tests-no-zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] [[package]] name = "black" @@ -38,7 +38,7 @@ tomli = ">=0.2.6,<2.0.0" typed-ast = {version = ">=1.4.2", markers = "python_version < \"3.8\""} typing-extensions = [ {version = ">=3.10.0.0", markers = "python_version < \"3.10\""}, - {version = "!=3.10.0.1", markers = "python_version >= \"3.10\""}, + {version = ">=3.10.0.0,<3.10.0.1 || >3.10.0.1", markers = "python_version >= \"3.10\""}, ] [package.extras] @@ -89,9 +89,9 @@ typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} zipp = ">=0.5" [package.extras] -docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"] +docs = ["jaraco.packaging (>=8.2)", "rst.linker (>=1.9)", "sphinx"] perf = ["ipython"] -testing = ["pytest (>=4.6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "packaging", "pep517", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy", "importlib-resources (>=1.3)"] +testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pep517", "pyfakefs", "pytest (>=4.6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.0.1)", "pytest-flake8", "pytest-mypy", "pytest-perf (>=0.9.2)"] [[package]] name = "iniconfig" @@ -112,7 +112,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" [package.extras] cssselect = ["cssselect (>=0.7)"] html5 = ["html5lib"] -htmlsoup = ["beautifulsoup4"] +htmlsoup = ["BeautifulSoup4"] source = ["Cython (>=0.29.7)"] [[package]] @@ -156,7 +156,7 @@ python-dateutil = ">=2.7.3" pytz = ">=2017.2" [package.extras] -test = ["pytest (>=4.0.2)", "pytest-xdist", "hypothesis (>=3.58)"] +test = ["hypothesis (>=3.58)", "pytest (>=4.0.2)", "pytest-xdist"] [[package]] name = "pathspec" @@ -307,13 +307,13 @@ optional = false python-versions = ">=3.6" [package.extras] -docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"] -testing = ["pytest (>=4.6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy"] +docs = ["jaraco.packaging (>=8.2)", "rst.linker (>=1.9)", "sphinx"] +testing = ["func-timeout", "jaraco.itertools", "pytest (>=4.6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.0.1)", "pytest-flake8", "pytest-mypy"] [metadata] lock-version = "1.1" python-versions = "^3.7" -content-hash = "d9c435fd7f0ded3ef3a28aa6a93b4b47ea1ccbd9cda9c0133bd33c405fe53706" +content-hash = "f0f7573338f20f81f960b8c0f670e525b77b081975a7f8918b11d3e7f65cec57" [metadata.files] atomicwrites = [ diff --git a/test.ipynb b/test.ipynb deleted file mode 100644 index 0c02ddb..0000000 --- a/test.ipynb +++ /dev/null @@ -1,176 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from pandas_ods_reader import read_ods" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "file_path = \"Dicionário_Microdados_Enem_2021.ods\"" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
column.0column.1column.2column.3column.4column.5
0NU_INSCRICAONúmero de inscrição1NoneNone12.0Numérica
1NU_ANOAno do EnemNoneNone4.0Numérica
2TP_FAIXA_ETARIAFaixa etária21Menor de 17 anos2.0Numérica
3NoneNone217 anosNaNNone
4NoneNone318 anosNaNNone
\n", - "
" - ], - "text/plain": [ - " column.0 column.1 column.2 column.3 \\\n", - "0 NU_INSCRICAO Número de inscrição1 None None \n", - "1 NU_ANO Ano do Enem None None \n", - "2 TP_FAIXA_ETARIA Faixa etária2 1 Menor de 17 anos \n", - "3 None None 2 17 anos \n", - "4 None None 3 18 anos \n", - "\n", - " column.4 column.5 \n", - "0 12.0 Numérica \n", - "1 4.0 Numérica \n", - "2 2.0 Numérica \n", - "3 NaN None \n", - "4 NaN None " - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data = read_ods(file_path, headers=False, skiprows=5)\n", - "data.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.9.14 ('.py39': venv)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.14" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "6ea8410887842cf63ab95c5e43eca8b07627ccf142be6aa1d4d3a20bcd58cd50" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/tests/rsc/example_col_lengths.ods b/tests/rsc/example_col_lengths.ods index 9dd3fa7..eb781ca 100644 Binary files a/tests/rsc/example_col_lengths.ods and b/tests/rsc/example_col_lengths.ods differ diff --git a/tests/rsc/example_skiprows.fods b/tests/rsc/example_skiprows.fods new file mode 100644 index 0000000..5af778a --- /dev/null +++ b/tests/rsc/example_skiprows.fods @@ -0,0 +1,423 @@ + + + + Lukas Jansen2019-01-27T03:31:08.9314826322022-10-25T08:45:33.990049580PT2M33S2LibreOffice/6.4.7.2$Linux_X86_64 LibreOffice_project/40$Build-2 + + + 0 + 0 + 11288 + 4967 + + + view1 + + + 5 + 2 + 0 + 0 + 0 + 0 + 2 + 0 + 0 + 0 + 0 + 0 + 100 + 60 + true + false + + + Sheet1 + 1861 + 0 + 100 + 60 + false + true + true + true + 12632256 + true + true + true + true + false + false + false + 1000 + 1000 + 1 + 1 + true + false + + + + + true + true + true + false + 1000 + true + 1 + 12632256 + true + true + true + kwH+/0dlbmVyaWMgUHJpbnRlcgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAU0dFTlBSVAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAWAAMAtAAAAAAAAAAEAAhSAAAEdAAASm9iRGF0YSAxCnByaW50ZXI9R2VuZXJpYyBQcmludGVyCm9yaWVudGF0aW9uPVBvcnRyYWl0CmNvcGllcz0xCmNvbGxhdGU9ZmFsc2UKbWFyZ2luZGFqdXN0bWVudD0wLDAsMCwwCmNvbG9yZGVwdGg9MjQKcHNsZXZlbD0wCnBkZmRldmljZT0xCmNvbG9yZGV2aWNlPTAKUFBEQ29udGV4RGF0YQpQYWdlU2l6ZTpBNAAAEgBDT01QQVRfRFVQTEVYX01PREUPAER1cGxleE1vZGU6Ok9mZg== + 1000 + 7 + false + true + true + 1 + true + false + true + false + true + true + Generic Printer + false + 0 + 3 + true + false + false + false + true + false + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ??? + + + + Page 1 + + + + + + + ???(???) + + + 00/00/0000, 00:00:00 + + + + + Page 1/ 99 + + + + + + + + + + + + A + + + B + + + C + + + D + + + E + + + + + skip this + + + + + a + + + b + + + c + + + d + + + e + + + + + 10 + + + 0 + + + 58 + + + 29 + + + 77 + + + + + 5 + + + 47 + + + 50 + + + 99 + + + 79 + + + + + 75 + + + 25 + + + 86 + + + 47 + + + 65 + + + + + 82 + + + 45 + + + 88 + + + 48 + + + 74 + + + + + 72 + + + 47 + + + 57 + + + 82 + + + 46 + + + + + 40 + + + 54 + + + 26 + + + 97 + + + 1 + + + + + 54 + + + 26 + + + 99 + + + 63 + + + 49 + + + + + 87 + + + 24 + + + 47 + + + 87 + + + 15 + + + + + + + \ No newline at end of file diff --git a/tests/rsc/example_skiprows.ods b/tests/rsc/example_skiprows.ods new file mode 100644 index 0000000..ef5b079 Binary files /dev/null and b/tests/rsc/example_skiprows.ods differ diff --git a/tests/test_read_ods.py b/tests/test_read_ods.py index 772d650..3c0b4b4 100644 --- a/tests/test_read_ods.py +++ b/tests/test_read_ods.py @@ -16,6 +16,7 @@ duplicated_column_names_file = "example_duplicated_column_names.ods" col_len_file = "example_col_lengths.ods" missing_header_file = "example_missing_header.ods" mixed_dtypes_file = "mixed_dtypes.ods" +skiprows_file = "example_skiprows.ods" class TestOdsReader: @@ -145,3 +146,13 @@ class TestOdsReader: assert df.dtypes.tolist() == type_list col_b_types = [type(v) for v in df.B.values] assert str in col_b_types and float in col_b_types + + @pytest.mark.parametrize("suffix", [".ods", ".fods"]) + def test_skiprows(self, suffix): + + path = rsc / skiprows_file + df = read_ods(path.with_suffix(suffix), skiprows=2) + assert isinstance(df, pd.DataFrame) + assert len(df) == 8 + assert len(df.columns) == 5 + assert all(df.columns == 'a b c d e'.split(' '))