Skip to content

Commit 770848c

Browse files
committed
Spreadsheet.from_file: use load_data from pyexcel_io instead of iget_array
it's unclear to what extent this is a private/public api, but it allows us to reuse a "reader" object across multiple calls, which for some plugins should be able to save some setup cost, while still allowing us to read the header row and data rows with different column_limit values. also re-enable column_limit_from_header support for xlsx/xlsm
1 parent 3e7254b commit 770848c

File tree

1 file changed

+23
-11
lines changed

1 file changed

+23
-11
lines changed

app/models/spreadsheet.py

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from time import sleep
55
from typing import Final, Literal, Self
66

7-
import pyexcel
7+
from pyexcel_io.io import load_data as pyexcel_load_data
88

99

1010
class Spreadsheet:
@@ -108,27 +108,39 @@ def from_file(
108108
if extension == "tsv":
109109
file_content = StringIO(Spreadsheet.normalise_newlines(file_content))
110110

111-
column_limit = -1
112-
if column_limit_from_header and extension not in ("xlsx", "xlsm"):
111+
# why not just use pyexcel.iget_array? this allows us to reuse the same reader
112+
# from the initial header read for the data reading, which should save some
113+
# startup cost
114+
row_iter_dict, reader = pyexcel_load_data(
115+
file_stream=file_content,
116+
file_type=extension,
117+
streaming=True,
118+
row_limit=(1 if column_limit_from_header else -1),
119+
)
120+
row_iter = next(iter(row_iter_dict.values()), iter(()))
121+
122+
if column_limit_from_header:
113123
original_offset = file_content.tell()
114-
header = next(pyexcel.iget_array(file_type=extension, file_stream=file_content, row_limit=1), ())
124+
last_nonempty_column = next(
125+
(i for i, x in reversed(tuple(enumerate(next(row_iter, ())))) if str(x).strip()), None
126+
)
127+
128+
# some plugins just read things directly from the file each time, so we need to reset
129+
# the offset for them to have them read from the beginning again
115130
file_content.seek(original_offset)
116131

117-
last_nonempty_column = next((i for i, x in reversed(tuple(enumerate(header))) if str(x).strip()), None)
118132
if last_nonempty_column is not None:
119133
if last_nonempty_column >= absolute_column_limit:
120134
raise cls.TooManyColumnsError(
121135
f"Last non-empty header column ({last_nonempty_column}) "
122136
f"is beyond absolute limit of {absolute_column_limit}"
123137
)
124138

125-
column_limit = max(last_nonempty_column + 1, min_column_limit)
139+
reader.keywords["column_limit"] = max(last_nonempty_column + 1, min_column_limit)
140+
reader.keywords["row_limit"] = -1
141+
row_iter = next(iter(reader.read_sheet_by_index(0).values()))
126142

127-
return cls.from_rows(
128-
pyexcel.iget_array(file_type=extension, file_stream=file_content, column_limit=column_limit),
129-
filename,
130-
row_limit=row_limit,
131-
)
143+
return cls.from_rows(row_iter, filename, row_limit=row_limit)
132144

133145
@classmethod
134146
def from_file_form(

0 commit comments

Comments
 (0)