Skip to content

Commit

Permalink
feat: use thread pool
Browse files Browse the repository at this point in the history
  • Loading branch information
CatchZeng committed Feb 23, 2021
1 parent 9830181 commit 9a7fc4b
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 7 deletions.
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
build
dist
bing_image/__pycache__
bing_image.egg-info
bing_images/__pycache__
bing_images.egg-info
File renamed without changes.
35 changes: 31 additions & 4 deletions bing_image/bing.py → bing_images/bing.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
from .util import file_name, make_image_dir, download_image
try:
from util import file_name, make_image_dir, download_image
except ImportError: # Python 3
from .util import file_name, make_image_dir, download_image
from typing import List
from multiprocessing.pool import ThreadPool
from time import time as timer
import requests
import re
import os
Expand All @@ -11,6 +16,7 @@

class Bing:
def fetch_image_urls(
self,
query: str,
first: int = 0,
count: int = 20,
Expand Down Expand Up @@ -70,22 +76,43 @@ def download_images(
query: str,
limit: int = 20,
output_dir='dataset',
processes: int = 20,
adult: bool = False,
file_type: str = "jpg",
filters: str = '',
force_replace=False
):
image_dir = make_image_dir(output_dir, query, force_replace)

urls = fetch_image_urls(query, limit, adult, file_type, filters)
counter = 1
print("save path: {}".format(image_dir))
print("Save path: {}".format(image_dir))
entries = []
for url in urls:
name = file_name(url, counter, query)
print("downloading {} {}".format(name, url))
path = os.path.join(image_dir, name)
download_image(url, path)
entries.append((url, path))
counter += 1

start = timer()

tp = processes
if limit < processes:
tp = limit
results = ThreadPool(tp).imap_unordered(download_image_with_thread, entries)
for path in results:
print("Downloaded", path)

print("Done")
print(f"Elapsed Time: {timer() - start}")


def download_image_with_thread(entry):
    """Download a single image described by a ``(url, path)`` pair.

    The function takes one packed argument so it can be handed directly to
    a pool's ``imap_unordered`` as the worker callable.

    Args:
        entry: Two-item sequence ``(url, path)``; both items are forwarded
            to ``download_image`` in that order.

    Returns:
        The ``path`` item of ``entry``, unchanged, so the caller can report
        which download has finished.
    """
    source_url, destination = entry
    # Announce the download before starting it.
    message = "Downloading {} from {}".format(destination, source_url)
    print(message)
    download_image(source_url, destination)
    return destination


if __name__ == '__main__':
# urls = fetch_image_urls("cat", limit=100, file_type='png',
Expand Down
5 changes: 4 additions & 1 deletion bing_image/util.py → bing_images/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,7 @@ def make_image_dir(output_dir, sub_dir, force_replace=False) -> str:
if not os.path.isdir("{}/{}/{}".format(cwd, output_dir, sub_dir)):
os.makedirs("{}/{}/{}".format(cwd, output_dir, sub_dir))

return image_dir
return image_dir

if __name__ == '__main__':
    # Running this module directly only prints its name.
    print("util")

0 comments on commit 9a7fc4b

Please sign in to comment.