Skip to content

Commit b554cc1

Browse files
committed
ENG-8475 | fix unable to render spreadsheet files (.xls and .xlsx)
1 parent 8aa8251 commit b554cc1

File tree

2 files changed

+120
-83
lines changed

2 files changed

+120
-83
lines changed
Lines changed: 27 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -1,95 +1,39 @@
1-
import xlrd
2-
import zipfile
1+
from io import BytesIO
2+
from openpyxl import load_workbook
3+
import xlrd, zipfile
34
from collections import OrderedDict
45
from ..exceptions import TableTooBigError, MissingRequirementsError
5-
6-
from ..utilities import header_population
6+
from ..utilities import (
7+
header_population,
8+
to_bytes,
9+
parse_xls,
10+
parse_xlsx
11+
)
712
from mfr.extensions.tabular.compat import range, basestring
813

914

1015
def xlsx_xlrd(fp):
    """Read an Excel workbook and convert every sheet to ``(header, rows)``.

    * ``.xls``  -> parsed with xlrd
    * ``.xlsx`` -> parsed with openpyxl (xlrd >= 2.0 dropped xlsx support)

    ``fp`` may already have been read, so its content is copied once into
    memory (``to_bytes`` rewinds it) and both parsers work on that copy;
    openpyxl gets a ``BytesIO`` it can seek inside freely.

    :param fp: file-like object holding the spreadsheet
    :return: ``OrderedDict`` mapping each sheet name to ``(header, rows)``
    :raises xlrd.biffh.XLRDError: when xlrd rejects the content and it is not
        a readable ZIP container either
    """
    sheets = OrderedDict()
    # Read the stream a single time and hand the same bytes to both parsers.
    content = to_bytes(fp)

    # Keep the ``try`` limited to opening the workbook: only a failure to
    # recognise the format should trigger the openpyxl fallback.
    try:
        wb = xlrd.open_workbook(file_contents=content)
    except xlrd.biffh.XLRDError:
        pass
    else:
        return parse_xls(wb, sheets)

    try:
        wb = load_workbook(BytesIO(content), data_only=True, read_only=True)
    except zipfile.BadZipFile as exc:
        raise xlrd.biffh.XLRDError(
            "Invalid xlsx file or corrupted ZIP structure"
        ) from exc

    # Read-only workbooks load lazily and must be closed explicitly.
    try:
        return parse_xlsx(wb, sheets)
    finally:
        wb.close()

mfr/extensions/tabular/utilities.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import re
2+
import xlrd
3+
24
from http import HTTPStatus
35
from subprocess import (check_call,
46
TimeoutExpired,
@@ -11,6 +13,8 @@
1113
PSPP_CONVERT_TIMEOUT)
1214

1315

16+
# Largest row count and largest column count a sheet may have; verify_size()
# raises TableTooBigError for anything above it.
MAX_SIZE = 10_000
17+
1418
def header_population(headers):
1519
"""make column headers from a list
1620
:param headers: list of column headers
@@ -83,3 +87,92 @@ def sav_to_csv(fp):
8387
exporter_class='tabular'
8488
)
8589
return csv_file
90+
91+
92+
def to_bytes(fp):
    """Return the full content of *fp* as bytes and leave *fp* rewound.

    Works for binary streams, for text wrappers that expose their binary
    stream as ``.buffer``, and for pure text streams.

    :param fp: file-like object with ``read()``; ``seek()`` is optional
    :return: ``bytes`` with the complete content of the stream
    """
    _rewind(fp)

    # Prefer the underlying binary stream of a text wrapper. Decoding Excel
    # data as text first is wasted work and raises UnicodeDecodeError when the
    # wrapper uses strict error handling.
    binary = getattr(fp, "buffer", None)
    if binary is not None:
        _rewind(binary)
        data = binary.read()
        _rewind(binary)
        if isinstance(data, bytes):
            _rewind(fp)
            return data

    data = fp.read()
    _rewind(fp)
    if isinstance(data, bytes):
        return data

    # Pure text stream with no binary layer underneath: re-encode, mapping
    # surrogate escapes back to the bytes they stand for.
    return data.encode("utf-8", "surrogateescape")


def _rewind(stream):
    """Seek *stream* back to its start, ignoring streams that cannot seek."""
    try:
        stream.seek(0)
    except (AttributeError, NotImplementedError, OSError, ValueError):
        # No seek() at all, seeking unsupported, or the stream is closed:
        # rewinding is best-effort, so carry on with the current position.
        pass
129+
130+
131+
def parse_xls(wb, sheets):
    """Convert every sheet of an xlrd workbook into ``(header, rows)``.

    The first row of a sheet supplies the column names; every following row
    becomes a dict keyed by those names.

    :param wb: workbook object providing ``sheets()`` and ``datemode``
    :param sheets: mapping filled in place, keyed by sheet name
    :return: the same ``sheets`` mapping
    :raises TableTooBigError: from ``verify_size`` when a sheet is too large
    """
    for sheet in wb.sheets():
        # An empty sheet has no row 0, so ``row_values(0)`` would raise
        # IndexError; report it as an empty table instead.
        if sheet.nrows < 1 or sheet.ncols < 1:
            sheets[sheet.name] = ([], [])
            continue

        verify_size(sheet.nrows, sheet.ncols, '.xls')
        fields = fix_headers(sheet.row_values(0))
        rows = [
            dict(zip(fields, row_vals(sheet.row(r), wb.datemode)))
            for r in range(1, sheet.nrows)
        ]
        sheets[sheet.name] = (header_population(fields), rows)
    return sheets
141+
142+
143+
def parse_xlsx(wb, sheets):
    """Convert every sheet of an openpyxl workbook into ``(header, rows)``.

    The first row of a sheet supplies the column names; every following row
    becomes a dict keyed by those names.

    :param wb: workbook object providing ``sheetnames`` and lookup by name
    :param sheets: mapping filled in place, keyed by sheet name
    :return: the same ``sheets`` mapping
    :raises TableTooBigError: from ``verify_size`` when a sheet is too large
    """
    for name in wb.sheetnames:
        ws = wb[name]
        nrows, ncols = ws.max_row, ws.max_column

        # Read-only worksheets report ``None`` when the file carries no
        # dimension record; comparing ``None`` with MAX_SIZE would raise
        # TypeError, so the size is then checked while streaming the rows.
        sized = nrows is not None and ncols is not None
        if sized:
            if nrows < 1 or ncols < 1:
                sheets[name] = ([], [])
                continue
            verify_size(nrows, ncols, '.xlsx')

        # One pass over the sheet: first row is the header, the rest is data.
        row_iter = ws.iter_rows(min_row=1,
                                max_row=nrows,
                                max_col=ncols,
                                values_only=True)
        header_row = next(row_iter, None)
        if header_row is None:
            # Sheet without any rows: nothing to render.
            sheets[name] = ([], [])
            continue

        fields = fix_headers(header_row)
        if not sized:
            verify_size(1, len(fields), '.xlsx')

        rows = []
        for row_number, row in enumerate(row_iter, start=2):
            if not sized:
                verify_size(row_number, len(row), '.xlsx')
            rows.append(dict(zip(fields, row)))

        sheets[name] = (header_population(fields), rows)
    return sheets
158+
159+
160+
def verify_size(rows, cols, ext):
    """Refuse sheets with more than ``MAX_SIZE`` rows or columns.

    :param rows: number of rows in the sheet
    :param cols: number of columns in the sheet
    :param ext: file extension passed on to the error (e.g. ``'.xlsx'``)
    :raises TableTooBigError: when either dimension exceeds ``MAX_SIZE``
    """
    if rows > MAX_SIZE or cols > MAX_SIZE:
        # NOTE(review): no module-level import of TableTooBigError is visible
        # in this file, so the raise below would fail with NameError. The
        # path mirrors the `..exceptions` import used by the xlrd reader;
        # confirm and hoist to the top of the file if preferred.
        from mfr.extensions.tabular.exceptions import TableTooBigError

        raise TableTooBigError('Table is too large to render.', ext,
                               nbr_cols=cols, nbr_rows=rows)
164+
165+
166+
def fix_headers(raw):
    """Build the list of column names from a raw header row.

    Every value is converted with ``str``; a value that is ``None`` or the
    empty string is replaced by ``'Unnamed: <n>'``, where ``<n>`` is the
    1-based position of the column.

    :param raw: iterable with the cells of the header row
    :return: list of column names, one per input cell
    """
    names = []
    for position, value in enumerate(raw, start=1):
        if value in (None, ''):
            names.append(f'Unnamed: {position}')
        else:
            names.append(str(value))
    return names
169+
170+
171+
def row_vals(row, datemode):
    """Return the values of the cells in an xlrd row.

    Cells whose type is ``xlrd.XL_CELL_DATE`` are converted to a datetime with
    the workbook's ``datemode`` and returned in ``isoformat()``; every other
    cell contributes its ``value`` unchanged.

    :param row: iterable of cell objects with ``ctype`` and ``value``
    :param datemode: date mode handed to ``xldate_as_datetime``
    :return: list with one value per cell
    """
    return [
        xlrd.xldate.xldate_as_datetime(cell.value, datemode).isoformat()
        if cell.ctype == xlrd.XL_CELL_DATE
        else cell.value
        for cell in row
    ]

0 commit comments

Comments
 (0)