"""Imports an ods or fods file into a DataFrame object"""
from pathlib import Path

from .parsers import fods, ods


# Maps a lower-cased file suffix to the parser module that handles it.
EXT_MAP = {".ods": ods, ".fods": fods}


def read_ods(file_or_path, sheet=1, headers=True, columns=None):
    """
    Read a sheet of an ods or fods file into a DataFrame.

    The parser is selected from the file suffix; the lookup is
    case-insensitive, so ``data.ODS`` is handled like ``data.ods``. The
    actual reading is delegated to the ``read`` function of the selected
    parser module.

    :param file_or_path: str or path-like
        the path to the ODS or FODS file
    :param sheet: int or str, default 1
        if int, the 1 based index of the sheet to be read in. If str, the name
        of the sheet to be read in
    :param headers: bool, default True
        if True, the first row is read in as headers
    :param columns: list, default None
        a list of column names to be used as headers
    :returns: pandas.DataFrame
        the content of the requested sheet
    :raises ValueError:
        if the file suffix is neither ``.ods`` nor ``.fods``
    """
    # Normalise the suffix so upper/mixed-case extensions are recognised.
    suffix = Path(file_or_path).suffix.lower()
    loader = EXT_MAP.get(suffix)
    if loader is None:
        raise ValueError("Unknown filetype.")
    return loader.read(file_or_path, sheet, headers=headers, columns=columns)
BODY_TAG = "office:body"
SPREADSHEET_TAG = "office:spreadsheet"
OFFICE_KEY = "office"
TABLE_KEY = "table"
TABLE_TAG = "table:table"
TABLE_ROW_TAG = "table:table-row"
TABLE_CELL_TAG = "table:table-cell"
TABLE_CELL_TEXT_TAG = "text:p"
TABLE_CELL_REPEATED_ATTRIB = "number-columns-repeated"
TABLE_NAME_ATTRIB = "name"
VALUE_TYPE_ATTRIB = "value-type"
VALUE_ATTRIB = "value"


def _paragraph_text(paragraph):
    """Return the complete text of a ``text:p`` element, or None if it is empty.

    ``paragraph.text`` alone stops at the first child element, which drops the
    content of cells whose text is wrapped in e.g. ``text:a`` or ``text:span``.
    """
    return "".join(paragraph.itertext()) or None


def get_sheet(spreadsheet, sheet_id):
    """
    Look up a sheet of the spreadsheet by name or by position.

    :param spreadsheet: the ``office:spreadsheet`` element
    :param sheet_id: int or str
        if int, the 1 based index of the sheet. If str, the name of the sheet
    :returns: the matching ``table:table`` element
    :raises KeyError: if there is no sheet with the given name
    :raises IndexError: if the index is smaller than 1 or larger than the
        number of sheets
    """
    namespaces = spreadsheet.nsmap
    tables = spreadsheet.findall(TABLE_TAG, namespaces=namespaces)
    if isinstance(sheet_id, str):
        # Compare the attribute directly instead of interpolating the name
        # into a path expression, which breaks on names containing quotes.
        name_key = f"{{{namespaces[TABLE_KEY]}}}{TABLE_NAME_ATTRIB}"
        for table in tables:
            if table.get(name_key) == sheet_id:
                return table
        raise KeyError(f"There is no sheet named {sheet_id}.")
    # Reject zero *and* negative indices; a negative index would otherwise
    # silently wrap around and select a sheet counted from the end.
    if sheet_id < 1 or sheet_id > len(tables):
        raise IndexError(f"There is no sheet at index {sheet_id}.")
    return tables[sheet_id - 1]


def parse_columns(cells, headers=True, columns=None):
    """
    Split the sheet rows into column names and data rows.

    :param cells: list of rows, each row being a list of cell elements.
        The list is not modified.
    :param headers: bool, default True
        if True, the first row is consumed as header row
    :param columns: list, default None
        explicit column names; when given they are returned unchanged
    :returns: tuple ``(columns, rows)`` with the column names and the
        remaining data rows. An empty sheet yields ``([], [])`` when no
        column names are supplied.
    """
    if headers and cells:
        header_cells, rows = cells[0], cells[1:]
    else:
        header_cells, rows = None, cells

    if columns is not None:
        return columns, rows

    if not header_cells:
        # No usable header row: number the columns after the first data row.
        first_row = rows[0] if rows else []
        return [f"column.{i}" for i in range(len(first_row))], rows

    # NOTE(review): a header cell carrying ``number-columns-repeated`` is not
    # expanded here; instead its name (with a counter suffix) is reused for
    # the following text-less header cells. Data cells are not expanded
    # either, so this presumably relies on the layout LibreOffice writes --
    # confirm against the ODF spec before changing either side.
    names = []
    repeated_val = None
    repeated_dict = defaultdict(int)
    for i, col in enumerate(header_cells):
        paragraph = col.find(TABLE_CELL_TEXT_TAG, namespaces=col.nsmap)
        if paragraph is not None:
            value = _paragraph_text(paragraph)
        elif repeated_val:
            value = repeated_val
        else:
            # Text-less header cell: pick the first free "unnamed.N".
            idx = 1
            while f"unnamed.{idx}" in names:
                idx += 1
            value = f"unnamed.{idx}"
        repeated = col.attrib.get(
            f"{{{col.nsmap[TABLE_KEY]}}}{TABLE_CELL_REPEATED_ATTRIB}"
        )
        if repeated:
            repeated_dict[value] += 1
            repeated_val = f"{value}.{repeated_dict[value]}"
        # Disambiguate a name that is already taken with the cell position.
        names.append(value if value not in names else f"{value}.{i}")
    return names, rows


def parse_value(cell):
    """
    Convert a cell element into a Python value.

    :param cell: a ``table:table-cell`` element
    :returns: None for a cell without a ``text:p`` child, a float for cells of
        value type ``float``, otherwise the cell text
    :raises ValueError: if a float cell has no ``office:value`` attribute and
        its displayed text is not a valid float literal
    """
    paragraph = cell.find(TABLE_CELL_TEXT_TAG, namespaces=cell.nsmap)
    if paragraph is None:
        return None
    office_ns = cell.nsmap[OFFICE_KEY]
    if cell.get(f"{{{office_ns}}}{VALUE_TYPE_ATTRIB}") == "float":
        # Prefer the machine-readable value: the paragraph only holds the
        # formatted display text (locale separators, rounding), which may not
        # be parseable or precise. Fall back to the text if it is missing.
        raw = cell.get(f"{{{office_ns}}}{VALUE_ATTRIB}")
        if raw is None:
            raw = _paragraph_text(paragraph)
        return float(raw)
    return _paragraph_text(paragraph)


def load_fods(doc, sheet_id, headers=True, columns=None):
    """
    Convert one sheet of a parsed flat ODS document into a DataFrame.

    :param doc: the parsed document tree (must provide ``getroot()``)
    :param sheet_id: int or str
        if int, the 1 based index of the sheet. If str, the name of the sheet
    :param headers: bool, default True
        if True, the first row is read in as headers
    :param columns: list, default None
        a list of column names to be used as headers
    :returns: pandas.DataFrame
        one row per sheet row; rows are truncated or padded with None to the
        number of columns
    :raises ValueError: if ``sheet_id`` is neither str nor int, or if the
        document contains no spreadsheet
    :raises KeyError: if there is no sheet with the given name
    :raises IndexError: if there is no sheet at the given index
    """
    if not isinstance(sheet_id, (str, int)):
        raise ValueError("Sheet id has to be either `str` or `int`")
    root = doc.getroot()
    namespaces = root.nsmap
    body = root.find(BODY_TAG, namespaces=namespaces)
    spreadsheet = (
        None if body is None else body.find(SPREADSHEET_TAG, namespaces=namespaces)
    )
    if spreadsheet is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("The document does not contain a spreadsheet.")
    sheet = get_sheet(spreadsheet, sheet_id)
    all_cells = [
        row.findall(TABLE_CELL_TAG, namespaces=namespaces)
        for row in sheet.findall(TABLE_ROW_TAG, namespaces=namespaces)
    ]
    columns, value_rows = parse_columns(all_cells, headers, columns)
    width = len(columns)
    data = []
    for row in value_rows:
        # Only the cells that map onto a column are parsed; short rows are
        # padded so every row has exactly ``width`` entries.
        values = [parse_value(cell) for cell in row[:width]]
        values.extend([None] * (width - len(values)))
        data.append(values)
    return pd.DataFrame(data, columns=columns)


def read(file_or_path, sheet=1, headers=True, columns=None):
    """
    Read a sheet of a flat ODS (fods) file into a DataFrame.

    :param file_or_path: str, path-like or file object
        the FODS file to read
    :param sheet: int or str, default 1
        if int, the 1 based index of the sheet to be read in. If str, the name
        of the sheet to be read in
    :param headers: bool, default True
        if True, the first row is read in as headers
    :param columns: list, default None
        a list of column names to be used as headers
    :returns: pandas.DataFrame
        the result of ``load_fods`` passed through ``sanitize_df``
    """
    # Open file objects are handed to the parser as they are; str() would
    # turn them into their repr. Everything else is treated as a path.
    source = file_or_path if hasattr(file_or_path, "read") else str(file_or_path)
    doc = etree.parse(source)
    df = load_fods(doc, sheet, headers=headers, columns=columns)
    return sanitize_df(df)
If str, the name of + the sheet to be read in + :param header: bool, default True + if True, the first row is read in as headers + :param columns: list, default None + a list of column names to be used as headers + :returns: pandas.DataFrame + the ODS file as a pandas DataFrame + """ + doc = ezodf.opendoc(file_or_path) + df = load_ods(doc, sheet, headers, columns) + return sanitize_df(df) diff --git a/tests/rsc/example_col_lengths.fods b/tests/rsc/example_col_lengths.fods new file mode 100644 index 0000000..b5084f5 --- /dev/null +++ b/tests/rsc/example_col_lengths.fods @@ -0,0 +1,451 @@ + + + + Lukas Jansen2019-01-27T03:31:08.9314826322019-06-06T11:51:47.467971713Lukas JansenPT2M31S2LibreOffice/7.1.5.2$Linux_X86_64 LibreOffice_project/10$Build-2 + + + 0 + 0 + 15185 + 4967 + + + view1 + + + 7 + 14 + 0 + 0 + 0 + 0 + 2 + 0 + 0 + 0 + 0 + 0 + 100 + 60 + true + false + + + Sheet1 + 1849 + 0 + 100 + 60 + false + true + true + true + 12632256 + true + true + true + true + false + false + false + 1000 + 1000 + 1 + 1 + true + false + + + + + true + true + true + false + 1000 + true + 1 + 12632256 + true + true + true + jQH+/01GQ0o0OTFEVwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQ1VQUzpNRkNKNDkxRFcAAAAAAAAAAAAAAAAAAAAAAAAWAAMArgAAAAAAAAAEAAhSAAAEdAAASm9iRGF0YSAxCnByaW50ZXI9TUZDSjQ5MURXCm9yaWVudGF0aW9uPVBvcnRyYWl0CmNvcGllcz0xCmNvbGxhdGU9ZmFsc2UKbWFyZ2luZGFqdXN0bWVudD0wLDAsMCwwCmNvbG9yZGVwdGg9MjQKcHNsZXZlbD0wCnBkZmRldmljZT0xCmNvbG9yZGV2aWNlPTAKUFBEQ29udGV4RGF0YQpQYWdlU2l6ZTpBNAAAEgBDT01QQVRfRFVQTEVYX01PREUPAER1cGxleE1vZGU6Ok9mZg== + 1000 + 7 + false + true + true + 1 + true + false + true + false + true + true + MFCJ491DW + false + 0 + 3 + true + false + false + false + true + false + true + + + Sheet1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ??? + + + + Page 1 + + + + + + + ???(???) + + + 00/00/0000, 00:00:00 + + + + + Page 1/ 99 + + + + + + + + + + + + + + + + A + + + B + + + C + + + D + + + E + + + + + 2 + + + 28 + + + 76 + + + 89 + + + 60 + + + + + 69 + + + 6 + + + 33 + + + 7 + + + 85 + + + + + 48 + + + 14 + + + 48 + + + 14 + + + 61 + + + + + 25 + + + 9 + + + 49 + + + 91 + + + 39 + + + + + 62 + + + 57 + + + 96 + + + 100 + + + 28 + + + + + 0 + + + 85 + + + 83 + + + 50 + + + 58 + + + + + 33 + + + 10 + + + 56 + + + 46 + + + 30 + + + + + 29 + + + 99 + + + 100 + + + 45 + + + 96 + + + + + 62 + + + 37 + + + 16 + + + 37 + + + 51 + + + + + 13 + + + 48 + + + 71 + + + 5 + + + 34 + + + + + + + \ No newline at end of file diff --git a/tests/rsc/example_duplicated_column_names.fods b/tests/rsc/example_duplicated_column_names.fods new file mode 100644 index 0000000..a8155a0 --- /dev/null +++ b/tests/rsc/example_duplicated_column_names.fods @@ -0,0 +1,485 @@ + + + + 2019-05-31T10:36:15.9187991642019-06-06T14:41:16.030513765PT16H2M57S244LibreOffice/7.1.5.2$Linux_X86_64 LibreOffice_project/10$Build-2 + + + 0 + 0 + 9133 + 1806 + + + view1 + + + 2 + 6 + 0 + 0 + 0 + 0 + 2 + 0 + 1 + 0 + 0 + 0 + 100 + 60 + true + false + + + Sheet1 + 1849 + 0 + 100 + 60 + false + true + true + true + 12632256 + true + true + true + true + false + false + false + 1000 + 1000 + 1 + 1 + true + false + + + + + true + true + true + false + 1000 + true + 1 + 12632256 + true + true + true + 
jQH+/01GQ0o0OTFEVwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQ1VQUzpNRkNKNDkxRFcAAAAAAAAAAAAAAAAAAAAAAAAWAAMArgAAAAAAAAAEAAhSAAAEdAAASm9iRGF0YSAxCnByaW50ZXI9TUZDSjQ5MURXCm9yaWVudGF0aW9uPVBvcnRyYWl0CmNvcGllcz0xCmNvbGxhdGU9ZmFsc2UKbWFyZ2luZGFqdXN0bWVudD0wLDAsMCwwCmNvbG9yZGVwdGg9MjQKcHNsZXZlbD0wCnBkZmRldmljZT0xCmNvbG9yZGV2aWNlPTAKUFBEQ29udGV4RGF0YQpQYWdlU2l6ZTpBNAAAEgBDT01QQVRfRFVQTEVYX01PREUPAER1cGxleE1vZGU6Ok9mZg== + 1000 + 7 + false + true + true + 1 + true + + + en + GB + + + + + + false + true + false + true + true + MFCJ491DW + false + 0 + 3 + true + false + false + false + true + false + true + + + Sheet1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + + + + + + + + + - + + + + + + + + - + + + + + + + + + - + + + + + + + + + + + - + + + + + + + + - + + + + + + + + + + + + + + + + + - + + + + + + + + - + + + + + + + + + + + + + + : + + + + + : + + : + + + + + : + + + + + + + + / + + / + + + + + - + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ??? + + + + Page 1 + + + + + + + ???(???) 
+ + + 00/00/0000, 00:00:00 + + + + + Page 1/ 99 + + + + + + + + + + + + + + + + ID + + + name + + + website + + + + + + Acto_1 + + + W + + + sitea + + + + + + Acto_2 + + + D + + + + siteb + + + + + + Acto_3 + + + S + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/rsc/example_headers.fods b/tests/rsc/example_headers.fods new file mode 100644 index 0000000..250985d --- /dev/null +++ b/tests/rsc/example_headers.fods @@ -0,0 +1,441 @@ + + + + Lukas Jansen2019-01-27T03:31:08.9314826322019-01-27T03:33:20.620959045Lukas JansenPT2M14S1LibreOffice/7.1.4.2$Linux_X86_64 LibreOffice_project/10$Build-2 + + + 0 + 0 + 11288 + 4967 + + + view1 + + + 6 + 20 + 0 + 0 + 0 + 0 + 2 + 0 + 0 + 0 + 0 + 0 + 100 + 60 + true + false + + + Sheet1 + 1849 + 0 + 100 + 60 + false + true + true + true + 12632256 + true + true + true + true + false + false + false + 1000 + 1000 + 1 + 1 + true + false + + + + + true + true + true + false + 1000 + true + 1 + 12632256 + true + true + true + jQH+/01GQ0o0OTFEVwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQ1VQUzpNRkNKNDkxRFcAAAAAAAAAAAAAAAAAAAAAAAAWAAMArgAAAAAAAAAEAAhSAAAEdAAASm9iRGF0YSAxCnByaW50ZXI9TUZDSjQ5MURXCm9yaWVudGF0aW9uPVBvcnRyYWl0CmNvcGllcz0xCmNvbGxhdGU9ZmFsc2UKbWFyZ2luZGFqdXN0bWVudD0wLDAsMCwwCmNvbG9yZGVwdGg9MjQKcHNsZXZlbD0wCnBkZmRldmljZT0xCmNvbG9yZGV2aWNlPTAKUFBEQ29udGV4RGF0YQpQYWdlU2l6ZTpBNAAAEgBDT01QQVRfRFVQTEVYX01PREUPAER1cGxleE1vZGU6Ok9mZg== + 1000 + 7 + false + true + true + 1 + true + false + true + false + true + true + MFCJ491DW + false + 0 + 3 + true + false + false + false + true + false + true + + + Sheet1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ??? + + + + Page 1 + + + + + + + ???(???) 
+ + + 00/00/0000, 00:00:00 + + + + + Page 1/ 99 + + + + + + + + + + + + A + + + B + + + C + + + D + + + E + + + + + 62 + + + 22 + + + 83 + + + 35 + + + 35 + + + + + 100 + + + 56 + + + 91 + + + 29 + + + 57 + + + + + 11 + + + 71 + + + 68 + + + 53 + + + 32 + + + + + 84 + + + 26 + + + 3 + + + 21 + + + 17 + + + + + 11 + + + 56 + + + 26 + + + 25 + + + 30 + + + + + 61 + + + 3 + + + 35 + + + 98 + + + 62 + + + + + 22 + + + 96 + + + 10 + + + 53 + + + 34 + + + + + 25 + + + 33 + + + 86 + + + 38 + + + 89 + + + + + 25 + + + 93 + + + 31 + + + 72 + + + 60 + + + + + 19 + + + 64 + + + 42 + + + 38 + + + 28 + + + + + + + \ No newline at end of file diff --git a/tests/rsc/example_missing_header.fods b/tests/rsc/example_missing_header.fods new file mode 100644 index 0000000..feb88b9 --- /dev/null +++ b/tests/rsc/example_missing_header.fods @@ -0,0 +1,439 @@ + + + + Lukas Jansen2019-01-27T03:31:08.9314826322019-06-08T15:24:55.731863115Lukas JansenPT4M44S2LibreOffice/7.1.5.2$Linux_X86_64 LibreOffice_project/10$Build-2 + + + 0 + 0 + 11288 + 4967 + + + view1 + + + 7 + 9 + 0 + 0 + 0 + 0 + 2 + 0 + 0 + 0 + 0 + 0 + 100 + 60 + true + false + + + Sheet1 + 1849 + 0 + 100 + 60 + false + true + true + true + 12632256 + true + true + true + true + false + false + false + 1000 + 1000 + 1 + 1 + true + false + + + + + true + true + true + false + 1000 + true + 1 + 12632256 + true + true + true + jQH+/01GQ0o0OTFEVwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQ1VQUzpNRkNKNDkxRFcAAAAAAAAAAAAAAAAAAAAAAAAWAAMArgAAAAAAAAAEAAhSAAAEdAAASm9iRGF0YSAxCnByaW50ZXI9TUZDSjQ5MURXCm9yaWVudGF0aW9uPVBvcnRyYWl0CmNvcGllcz0xCmNvbGxhdGU9ZmFsc2UKbWFyZ2luZGFqdXN0bWVudD0wLDAsMCwwCmNvbG9yZGVwdGg9MjQKcHNsZXZlbD0wCnBkZmRldmljZT0xCmNvbG9yZGV2aWNlPTAKUFBEQ29udGV4RGF0YQpQYWdlU2l6ZTpBNAAAEgBDT01QQVRfRFVQTEVYX01PREUPAER1cGxleE1vZGU6Ok9mZg== + 1000 + 7 + false + true + true + 1 + true + false + true + false + true + true + 
MFCJ491DW + false + 0 + 3 + true + false + false + false + true + false + true + + + Sheet1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ??? + + + + Page 1 + + + + + + + ???(???) + + + 00/00/0000, 00:00:00 + + + + + Page 1/ 99 + + + + + + + + + + + + A + + + B + + + + D + + + E + + + + + 71 + + + 19 + + + 21 + + + 73 + + + 47 + + + + + 19 + + + 67 + + + 7 + + + 51 + + + 26 + + + + + 20 + + + 57 + + + 29 + + + 69 + + + 27 + + + + + 17 + + + 91 + + + 73 + + + 3 + + + 45 + + + + + 35 + + + 40 + + + 41 + + + 66 + + + 35 + + + + + 17 + + + 21 + + + 14 + + + 0 + + + 0 + + + + + 45 + + + 72 + + + 32 + + + 21 + + + 47 + + + + + 29 + + + 90 + + + 21 + + + 82 + + + 5 + + + + + 94 + + + 49 + + + 5 + + + 22 + + + 54 + + + + + 59 + + + 8 + + + 11 + + + 56 + + + 81 + + + + + + + \ No newline at end of file diff --git a/tests/rsc/example_no_headers.fods b/tests/rsc/example_no_headers.fods new file mode 100644 index 0000000..a25b140 --- /dev/null +++ b/tests/rsc/example_no_headers.fods @@ -0,0 +1,424 @@ + + + + Lukas Jansen2019-01-27T03:31:08.9314826322019-01-27T03:33:44.899304723Lukas JansenPT2M38S2LibreOffice/7.1.5.2$Linux_X86_64 LibreOffice_project/10$Build-2 + + + 0 + 0 + 11288 + 4515 + + + view1 + + + 2 + 10 + 0 + 0 + 0 + 0 + 2 + 0 + 0 + 0 + 0 + 0 + 100 + 60 + true + false + + + Sheet1 + 1849 + 0 + 100 + 60 + false + true + true + true + 12632256 + true + true + true + true + false + false + false + 1000 + 1000 + 1 + 1 + true + false + + + + + true + true + true + false + 1000 + true + 1 + 12632256 + true + true + true + 
jQH+/01GQ0o0OTFEVwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQ1VQUzpNRkNKNDkxRFcAAAAAAAAAAAAAAAAAAAAAAAAWAAMArgAAAAAAAAAEAAhSAAAEdAAASm9iRGF0YSAxCnByaW50ZXI9TUZDSjQ5MURXCm9yaWVudGF0aW9uPVBvcnRyYWl0CmNvcGllcz0xCmNvbGxhdGU9ZmFsc2UKbWFyZ2luZGFqdXN0bWVudD0wLDAsMCwwCmNvbG9yZGVwdGg9MjQKcHNsZXZlbD0wCnBkZmRldmljZT0xCmNvbG9yZGV2aWNlPTAKUFBEQ29udGV4RGF0YQpQYWdlU2l6ZTpBNAAAEgBDT01QQVRfRFVQTEVYX01PREUPAER1cGxleE1vZGU6Ok9mZg== + 1000 + 7 + false + true + true + 1 + true + false + true + false + true + true + MFCJ491DW + false + 0 + 3 + true + false + false + false + true + false + true + + + Sheet1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ??? + + + + Page 1 + + + + + + + ???(???) + + + 00/00/0000, 00:00:00 + + + + + Page 1/ 99 + + + + + + + + + + + + 37 + + + 94 + + + 39 + + + 85 + + + 32 + + + + + 66 + + + 11 + + + 99 + + + 27 + + + 41 + + + + + 92 + + + 80 + + + 57 + + + 57 + + + 90 + + + + + 47 + + + 16 + + + 58 + + + 10 + + + 40 + + + + + 76 + + + 4 + + + 95 + + + 58 + + + 9 + + + + + 18 + + + 17 + + + 53 + + + 58 + + + 57 + + + + + 39 + + + 31 + + + 37 + + + 90 + + + 91 + + + + + 40 + + + 62 + + + 10 + + + 69 + + + 14 + + + + + 69 + + + 15 + + + 7 + + + 80 + + + 73 + + + + + 99 + + + 15 + + + 78 + + + 53 + + + 79 + + + + + + + \ No newline at end of file diff --git a/tests/rsc/mixed_dtypes.fods b/tests/rsc/mixed_dtypes.fods new file mode 100644 index 0000000..3789653 --- /dev/null +++ b/tests/rsc/mixed_dtypes.fods @@ -0,0 +1,441 @@ + + + + Lukas Jansen2019-01-27T03:31:08.9314826322020-02-23T16:02:58.759849276Lukas JansenPT7M9S3LibreOffice/7.1.5.2$Linux_X86_64 LibreOffice_project/10$Build-2 + + + 0 + 0 + 11288 + 4967 + + + view1 + + + 6 + 15 + 0 + 0 + 0 + 0 + 
2 + 0 + 0 + 0 + 0 + 0 + 100 + 60 + true + false + + + Sheet1 + 1849 + 0 + 100 + 60 + false + true + true + true + 12632256 + true + true + true + true + false + false + false + 1000 + 1000 + 1 + 1 + true + false + + + + + true + true + true + false + 1000 + true + 1 + 12632256 + true + true + true + jQH+/01GQ0o0OTFEVwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQ1VQUzpNRkNKNDkxRFcAAAAAAAAAAAAAAAAAAAAAAAAWAAMArgAAAAAAAAAEAAhSAAAEdAAASm9iRGF0YSAxCnByaW50ZXI9TUZDSjQ5MURXCm9yaWVudGF0aW9uPVBvcnRyYWl0CmNvcGllcz0xCmNvbGxhdGU9ZmFsc2UKbWFyZ2luZGFqdXN0bWVudD0wLDAsMCwwCmNvbG9yZGVwdGg9MjQKcHNsZXZlbD0wCnBkZmRldmljZT0xCmNvbG9yZGV2aWNlPTAKUFBEQ29udGV4RGF0YQpQYWdlU2l6ZTpBNAAAEgBDT01QQVRfRFVQTEVYX01PREUPAER1cGxleE1vZGU6Ok9mZg== + 1000 + 7 + false + true + true + 1 + true + false + true + false + true + true + MFCJ491DW + false + 0 + 3 + true + false + false + false + true + false + true + + + Sheet1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ??? + + + + Page 1 + + + + + + + ???(???) 
@pytest.mark.parametrize("suffix", [".ods", ".fods"])
class TestOdsReader:
    """Reader tests; every test runs once per supported file suffix."""

    @staticmethod
    def _read(resource, suffix, *args, **kwargs):
        """Read the resource file with the given suffix through read_ods."""
        return read_ods((rsc / resource).with_suffix(suffix), *args, **kwargs)

    def test_header_file_simple(self, suffix):
        df = self._read(header_file, suffix)

        assert isinstance(df, pd.DataFrame)
        assert len(df) == 10
        assert len(df.columns) == 5

    def test_header_file_with_int(self, suffix):
        df = self._read(header_file, suffix, 1)

        assert isinstance(df, pd.DataFrame)
        assert len(df) == 10
        assert len(df.columns) == 5

    def test_header_file_with_str(self, suffix):
        df = self._read(header_file, suffix, "Sheet1")

        assert isinstance(df, pd.DataFrame)
        assert len(df) == 10
        assert len(df.columns) == 5

    def test_header_file_with_cols(self, suffix):
        expected = ["One", "Two", "Three", "Four", "Five"]
        df = self._read(header_file, suffix, "Sheet1", columns=expected)

        assert list(df.columns) == expected
        assert len(df) == 10
        assert len(df.columns) == 5

    def test_no_header_file_no_cols(self, suffix):
        df = self._read(no_header_file, suffix, 1, headers=False)

        generated = [f"column.{i}" for i in range(len(df.columns))]
        assert list(df.columns) == generated
        assert len(df) == 10
        assert len(df.columns) == 5

    def test_no_header_file_with_cols(self, suffix):
        expected = ["A", "B", "C", "D", "E"]
        df = self._read(no_header_file, suffix, 1, headers=False, columns=expected)

        assert list(df.columns) == expected
        assert len(df) == 10

    def test_duplicated_column_names(self, suffix):
        df = self._read(duplicated_column_names_file, suffix, 1)

        assert isinstance(df, pd.DataFrame)
        assert len(df.columns) == 4
        assert "website.1" in df.columns

    def test_header_file_col_len(self, suffix):
        df = self._read(col_len_file, suffix, 1)

        assert isinstance(df, pd.DataFrame)
        assert len(df) == 10
        assert len(df.columns) == 5

    def test_wrong_id_type(self, suffix):
        with pytest.raises(ValueError) as e_info:
            self._read(header_file, suffix, 1.0)
        assert e_info.match("Sheet id has to be either `str` or `int`")

    def test_non_existent_sheet(self, suffix):
        sheet_name = "No_Sheet"

        with pytest.raises(KeyError) as e_info:
            self._read(header_file, suffix, sheet_name)
        assert e_info.match(f"There is no sheet named {sheet_name}")

    def test_missing_header(self, suffix):
        df = self._read(missing_header_file, suffix, 1)

        assert isinstance(df, pd.DataFrame)
        assert len(df) == 10
        assert len(df.columns) == 5

        assert df.columns[2] == "unnamed.1"

    def test_mixed_dtypes(self, suffix):
        df = self._read(mixed_dtypes_file, suffix, 1)

        assert isinstance(df, pd.DataFrame)
        assert len(df) == 10
        assert len(df.columns) == 5

        expected_dtypes = [float, object, float, float, object]
        assert df.dtypes.tolist() == expected_dtypes