@@ -201,10 +201,26 @@ async def download(
201
201
save_dir : Optional [str ] = None ,
202
202
file_name : Optional [str ] = None ,
203
203
extension : str = "jpg" ,
204
+ client_kwargs : Optional [Dict ] = None ,
205
+ stream_kwargs : Optional [Dict ] = None ,
206
+ chunk_size : int = 8192 ,
204
207
) -> str :
205
- """返回下载后的绝对(完整)路径
208
+ """
209
+ 使用httpx,异步流式下载
210
+
211
+ httpx默认timeout仅为5s,如果下载大文件,需要设置timeout
212
+ 可以通过client_kwargs、stream_kwargs传入参数,分别对应client和stream的参数
213
+
214
+ Args:
215
+ full_path : 如果提供了full_path(需要提供文件的扩展类型),则直接下载到该路径,其他参数无效
216
+ client_kwargs : https://www.python-httpx.org/api/#asyncclient
217
+ stream_kwargs : https://www.python-httpx.org/api/#request
218
+
219
+ Raises:
220
+ EventHandleError: _description_
206
221
207
- 如果提供了full_path(需要提供文件的扩展类型),则直接下载到该路径,其他参数无效
222
+ Returns:
223
+ 返回图片下载后的绝对(完整)路径
208
224
"""
209
225
210
226
if full_path :
@@ -218,9 +234,21 @@ async def download(
218
234
save_path = os .path .abspath (save_path )
219
235
220
236
if self .temporary_file_path .startswith ("http" ):
221
- await download_file (self .file_path , save_path )
237
+ await download_file (
238
+ url = self .file_path ,
239
+ save_path = save_path ,
240
+ client_kwargs = client_kwargs ,
241
+ stream_kwargs = stream_kwargs ,
242
+ chunk_size = chunk_size ,
243
+ )
222
244
return save_path
223
245
246
+ # 这里只处理了host为空的情况(3个斜杠)
247
+ # https://en.wikipedia.org/wiki/File_URI_scheme
248
+ elif self .temporary_file_path .startswith ("file:///" ):
249
+ return self .file_path [8 :]
250
+
251
+ # TODO 不太确定带host的情况下,path是否正确
224
252
elif self .temporary_file_path .startswith ("file://" ):
225
253
return self .file_path [7 :]
226
254
@@ -263,8 +291,23 @@ def hash_string(string: str):
263
291
return h .hexdigest ()
264
292
265
293
266
async def download_file(
    url: str,
    save_path: str,
    client_kwargs: Optional[Dict] = None,
    stream_kwargs: Optional[Dict] = None,
    chunk_size: int = 8192,
):
    """Stream a remote resource to a local file using httpx.

    The request defaults to ``GET`` on ``url``. A different HTTP method can
    be selected by putting ``method`` into ``stream_kwargs``. Avoid
    overriding ``url`` through ``stream_kwargs``: the value there wins, so
    the ``url`` argument and the URL actually requested would disagree.

    Timeouts: the client is created with a 60 second timeout unless
    ``client_kwargs`` supplies its own. No request-level timeout is set by
    default, so the client-level value is the one in effect; pass
    ``timeout`` in ``stream_kwargs`` only to override it for this request.

    Args:
        url: Address of the resource to download.
        save_path: Destination file path. It is converted to an absolute
            path, and missing parent directories are created.
        client_kwargs: Extra keyword arguments for ``httpx.AsyncClient``,
            see https://www.python-httpx.org/api/#asyncclient
        stream_kwargs: Extra keyword arguments for ``AsyncClient.stream``,
            see https://www.python-httpx.org/api/#request
        chunk_size: Number of bytes requested per chunk while iterating
            over the response body.
    """
    # Callers normally pass an absolute path already; normalise once more so
    # the directory handling below never depends on the working directory.
    absolute_path = os.path.abspath(save_path)
    directory = os.path.dirname(absolute_path)

    # exist_ok avoids the check-then-create race when several downloads
    # target the same, not yet existing, directory concurrently.
    os.makedirs(directory, exist_ok=True)

    # Caller-supplied values take precedence over the defaults. ``None`` and
    # an empty dict both mean "use the defaults".
    merged_client_kwargs: Dict = {"timeout": 60, **(client_kwargs or {})}

    # Deliberately no default ``timeout`` here: a request-level timeout
    # overrides the client-level one in httpx, which would silently discard
    # a timeout configured through ``client_kwargs``.
    merged_stream_kwargs: Dict = {
        "method": "GET",
        "url": url,
        **(stream_kwargs or {}),
    }

    # NOTE(review): the HTTP status is not checked, so the body of an error
    # response is written to disk like any other payload. Consider calling
    # ``response.raise_for_status()`` if callers should see such failures.
    async with httpx.AsyncClient(**merged_client_kwargs) as client:
        async with client.stream(**merged_stream_kwargs) as response:
            async with aiofiles.open(absolute_path, "wb") as f:
                async for chunk in response.aiter_bytes(chunk_size=chunk_size):
                    await f.write(chunk)
281
346
282
347
0 commit comments