|
1 | | -import xlrd |
2 | | -import zipfile |
| 1 | +from io import BytesIO |
| 2 | +from openpyxl import load_workbook |
| 3 | +import xlrd, zipfile |
3 | 4 | from collections import OrderedDict |
4 | 5 | from ..exceptions import TableTooBigError, MissingRequirementsError |
5 | | - |
6 | | -from ..utilities import header_population |
| 6 | +from ..utilities import ( |
| 7 | + header_population, |
| 8 | + to_bytes, |
| 9 | + parse_xls, |
| 10 | + parse_xlsx |
| 11 | +) |
7 | 12 | from mfr.extensions.tabular.compat import range, basestring |
8 | 13 |
|
9 | 14 |
|
10 | 15 | def xlsx_xlrd(fp): |
11 | | - """Read and convert a xlsx file to JSON format using the xlrd library |
12 | | - :param fp: File pointer object |
13 | | - :return: tuple of table headers and data |
14 | 16 | """ |
15 | | - MAX_SIZE = 10000 |
16 | | - |
17 | | - try: |
18 | | - wb = xlrd.open_workbook(fp.name) |
19 | | - using_xlrd = True |
20 | | - except xlrd.biffh.XLRDError: |
21 | | - using_xlrd = False |
22 | | - try: |
23 | | - from openpyxl import load_workbook |
24 | | - except ImportError: |
25 | | - raise MissingRequirementsError( |
26 | | - 'openpyxl is required to read .xlsx files', |
27 | | - function_preference='openpyxl' |
28 | | - ) |
29 | | - try: |
30 | | - wb = load_workbook(fp.name, data_only=True) |
31 | | - except zipfile.BadZipFile: |
32 | | - raise xlrd.biffh.XLRDError("Excel xlsx file; not supported") |
| 17 | + • .xls → xlrd |
| 18 | + • .xlsx → openpyxl (xlrd ≥2.0 dropped xlsx support) |
33 | 19 |
|
| 20 | + `fp` is the stream returned by WaterButler/MFR. It may already have been |
| 21 | + read, so we always rewind it and copy its contents into an in-memory buffer |
| 22 | + that openpyxl (and ZipFile) can seek within safely. |
| 23 | + """ |
34 | 24 | sheets = OrderedDict() |
35 | 25 |
|
36 | | - if using_xlrd: |
37 | | - for sheet in wb.sheets(): |
38 | | - if sheet.ncols > MAX_SIZE or sheet.nrows > MAX_SIZE: |
39 | | - raise TableTooBigError('Table is too large to render.', '.xlsx', |
40 | | - nbr_cols=sheet.ncols, nbr_rows=sheet.nrows) |
41 | | - |
42 | | - if sheet.ncols < 1 or sheet.nrows < 1: |
43 | | - sheets[sheet.name] = ([], []) |
44 | | - continue |
45 | | - |
46 | | - fields = sheet.row_values(0) if sheet.nrows else [] |
47 | | - |
48 | | - fields = [ |
49 | | - str(value) |
50 | | - if not isinstance(value, basestring) and value is not None |
51 | | - else value or f'Unnamed: {index + 1}' |
52 | | - for index, value in enumerate(fields) |
53 | | - ] |
54 | | - |
55 | | - data = [] |
56 | | - for i in range(1, sheet.nrows): |
57 | | - row = [] |
58 | | - for cell in sheet.row(i): |
59 | | - if cell.ctype == xlrd.XL_CELL_DATE: |
60 | | - value = xlrd.xldate.xldate_as_datetime(cell.value, wb.datemode).isoformat() |
61 | | - else: |
62 | | - value = cell.value |
63 | | - row.append(value) |
64 | | - data.append(dict(zip(fields, row))) |
65 | | - |
66 | | - header = header_population(fields) |
67 | | - sheets[sheet.name] = (header, data) |
68 | | - |
69 | | - else: |
70 | | - for name in wb.sheetnames: |
71 | | - ws = wb[name] |
72 | | - nrows = ws.max_row |
73 | | - ncols = ws.max_column |
74 | | - if ncols > MAX_SIZE or nrows > MAX_SIZE: |
75 | | - raise TableTooBigError('Table is too large to render.', '.xlsx', |
76 | | - nbr_cols=ncols, nbr_rows=nrows) |
77 | | - |
78 | | - if nrows < 1 or ncols < 1: |
79 | | - sheets[name] = ([], []) |
80 | | - continue |
81 | | - |
82 | | - header_row = next(ws.iter_rows(min_row=1, max_row=1, values_only=True)) |
83 | | - fields = [ |
84 | | - str(val) if val is not None else f'Unnamed: {i + 1}' |
85 | | - for i, val in enumerate(header_row) |
86 | | - ] |
87 | | - |
88 | | - data = [] |
89 | | - for row in ws.iter_rows(min_row=2, max_row=nrows, max_col=ncols, values_only=True): |
90 | | - data.append(dict(zip(fields, row))) |
| 26 | + try: |
| 27 | + wb = xlrd.open_workbook(file_contents=to_bytes(fp)) |
| 28 | + return parse_xls(wb, sheets) |
| 29 | + except xlrd.biffh.XLRDError: |
| 30 | + pass |
91 | 31 |
|
92 | | - header = header_population(fields) |
93 | | - sheets[name] = (header, data) |
| 32 | + try: |
| 33 | + wb = load_workbook(BytesIO(to_bytes(fp)), data_only=True, read_only=True) |
| 34 | + except zipfile.BadZipFile as exc: |
| 35 | + raise xlrd.biffh.XLRDError( |
| 36 | + "Invalid xlsx file or corrupted ZIP structure" |
| 37 | + ) from exc |
94 | 38 |
|
95 | | - return sheets |
| 39 | + return parse_xlsx(wb, sheets) |
0 commit comments