From 9a7fc4b45dc21a3124a287425001d08bbb9bbf9f Mon Sep 17 00:00:00 2001 From: CatchZeng <891793848@qq.com> Date: Tue, 23 Feb 2021 17:48:22 +0800 Subject: [PATCH] feat: use thread pool --- .gitignore | 4 +-- {bing_image => bing_images}/__init__.py | 0 {bing_image => bing_images}/bing.py | 35 ++++++++++++++++++++++--- {bing_image => bing_images}/util.py | 5 +++- 4 files changed, 37 insertions(+), 7 deletions(-) rename {bing_image => bing_images}/__init__.py (100%) rename {bing_image => bing_images}/bing.py (74%) rename {bing_image => bing_images}/util.py (95%) diff --git a/.gitignore b/.gitignore index 495c019..b051375 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ build dist -bing_image/__pycache__ -bing_image.egg-info +bing_images/__pycache__ +bing_images.egg-info diff --git a/bing_image/__init__.py b/bing_images/__init__.py similarity index 100% rename from bing_image/__init__.py rename to bing_images/__init__.py diff --git a/bing_image/bing.py b/bing_images/bing.py similarity index 74% rename from bing_image/bing.py rename to bing_images/bing.py index 05b04ba..962ab87 100644 --- a/bing_image/bing.py +++ b/bing_images/bing.py @@ -1,5 +1,10 @@ -from .util import file_name, make_image_dir, download_image +try: + from util import file_name, make_image_dir, download_image +except ImportError: # Python 3 + from .util import file_name, make_image_dir, download_image from typing import List +from multiprocessing.pool import ThreadPool +from time import time as timer import requests import re import os @@ -11,6 +16,7 @@ class Bing: def fetch_image_urls( + self, query: str, first: int = 0, count: int = 20, @@ -70,22 +76,43 @@ def download_images( query: str, limit: int = 20, output_dir='dataset', + processes: int = 20, adult: bool = False, file_type: str = "jpg", filters: str = '', force_replace=False ): image_dir = make_image_dir(output_dir, query, force_replace) + urls = fetch_image_urls(query, limit, adult, file_type, filters) counter = 1 - print("save path: {}".format(image_dir)) + print("Save path: {}".format(image_dir)) + entries = [] for url in urls: name = file_name(url, counter, query) - print("downloading {} {}".format(name, url)) path = os.path.join(image_dir, name) - download_image(url, path) + entries.append((url, path)) counter += 1 + start = timer() + + tp = processes + if limit < processes: + tp = limit + results = ThreadPool(tp).imap_unordered(download_image_with_thread, entries) + for path in results: + print("Downloaded", path) + + print("Done") + print(f"Elapsed Time: {timer() - start}") + + +def download_image_with_thread(entry): + url, path = entry + print("Downloading {} from {}".format(path, url)) + download_image(url, path) + return path + if __name__ == '__main__': # urls = fetch_image_urls("cat", limit=100, file_type='png', diff --git a/bing_image/util.py b/bing_images/util.py similarity index 95% rename from bing_image/util.py rename to bing_images/util.py index f88f7b1..25b3631 100644 --- a/bing_image/util.py +++ b/bing_images/util.py @@ -38,4 +38,7 @@ def make_image_dir(output_dir, sub_dir, force_replace=False) -> str: if not os.path.isdir("{}/{}/{}".format(cwd, output_dir, sub_dir)): os.makedirs("{}/{}/{}".format(cwd, output_dir, sub_dir)) - return image_dir \ No newline at end of file + return image_dir + +if __name__ == '__main__': + print("util") \ No newline at end of file