From 9830181a4566e540ea89708b52450ea2eb207153 Mon Sep 17 00:00:00 2001
From: CatchZeng <891793848@qq.com>
Date: Tue, 23 Feb 2021 16:27:57 +0800
Subject: [PATCH] feat: init project

---
 .github/workflows/python-publish.yml | 31 +++++++++
 .gitignore                           |  4 ++
 LICENSE                              | 21 ++++++
 README.md                            |  3 +
 bing_image/__init__.py               |  1 +
 bing_image/bing.py                   | 98 ++++++++++++++++++++++++++++
 bing_image/util.py                   | 41 ++++++++++++
 pyproject.toml                       |  6 ++
 setup.cfg                            | 17 +++++
 9 files changed, 222 insertions(+)
 create mode 100644 .github/workflows/python-publish.yml
 create mode 100644 .gitignore
 create mode 100644 LICENSE
 create mode 100644 README.md
 create mode 100644 bing_image/__init__.py
 create mode 100644 bing_image/bing.py
 create mode 100644 bing_image/util.py
 create mode 100644 pyproject.toml
 create mode 100644 setup.cfg

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
new file mode 100644
index 0000000..f743767
--- /dev/null
+++ b/.github/workflows/python-publish.yml
@@ -0,0 +1,31 @@
+# This workflow will upload a Python Package using Twine when a release is created
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+name: Upload Python Package
+
+on:
+  release:
+    types: [created]
+
+jobs:
+  deploy:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.8'
+    - name: Install dependencies
+      run: |
+        python3 -m pip install --upgrade build
+        python3 -m pip install --user --upgrade twine
+    - name: Build and publish
+      env:
+        TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+        TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+      run: |
+        python3 -m build
+        python3 -m twine upload dist/*
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..495c019
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+build
+dist
+bing_image/__pycache__
+bing_image.egg-info
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..42ccda7
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 CatchZeng
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..2164ccc
--- /dev/null
+++ b/README.md
@@ -0,0 +1,3 @@
+# bing_images
+
+Python library to fetch image urls based on keywords and download from Bing.com.
\ No newline at end of file
diff --git a/bing_image/__init__.py b/bing_image/__init__.py
new file mode 100644
index 0000000..9e52310
--- /dev/null
+++ b/bing_image/__init__.py
@@ -0,0 +1 @@
+name = "bing_images"
\ No newline at end of file
diff --git a/bing_image/bing.py b/bing_image/bing.py
new file mode 100644
index 0000000..05b04ba
--- /dev/null
+++ b/bing_image/bing.py
@@ -0,0 +1,152 @@
+from .util import file_name, make_image_dir, download_image
+from typing import List
+import requests
+import re
+import os
+
+FETCH_IMAGE_URL = "https://www.bing.com/images/async"
+HEADERS = {
+    'User-Agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0'}
+# Seconds to wait for Bing to answer; requests never times out on its own.
+REQUEST_TIMEOUT = 30
+# The results page carries each image's metadata as HTML-escaped JSON, so the
+# quotes around the "murl" key and its value show up as &quot; entities.
+MURL_PATTERN = re.compile(r"murl&quot;:&quot;(.*?)&quot;")
+
+
+class Bing:
+    """Thin client for Bing's asynchronous image-search endpoint."""
+
+    @staticmethod
+    def fetch_image_urls(
+        query: str,
+        first: int = 0,
+        count: int = 20,
+        adult: bool = False,
+        filters: str = "",
+    ) -> List[str]:
+        """Fetch one page of search results and return the image URLs in it.
+
+        This is a static method so that it works both on an instance
+        (``Bing().fetch_image_urls("cat")``) and on the class itself; as a
+        plain method without ``self`` the instance was bound to ``query``.
+
+        Args:
+            query: Search keywords, sent as ``q``.
+            first: Sent as ``first`` (presumably the result offset).
+            count: Sent as ``count`` (presumably results per page).
+            adult: Sent as ``adlt``.
+            filters: Sent as ``qft``, e.g. ``+filterui:aspect-square``.
+
+        Returns:
+            Every ``murl`` value found in the response, in document order.
+
+        Raises:
+            requests.RequestException: On connection errors, timeouts or a
+                non-2xx HTTP status.
+        """
+        params = {
+            "q": query,
+            "first": first,
+            "count": count,
+            # NOTE(review): a bool is sent as "True"/"False"; confirm that
+            # Bing understands these values for adlt.
+            "adlt": adult,
+            "qft": filters,
+        }
+        try:
+            r = requests.get(FETCH_IMAGE_URL, headers=HEADERS,
+                             params=params, timeout=REQUEST_TIMEOUT)
+            r.raise_for_status()
+        except requests.RequestException as exc:
+            print(exc)
+            raise
+
+        return MURL_PATTERN.findall(r.text)
+
+
+def fetch_image_urls(
+    query: str,
+    limit: int = 20,
+    adult: bool = False,
+    file_type: str = "jpg",
+    filters: str = '',
+    max_page_counter: int = 10
+) -> List[str]:
+    """Collect up to ``limit`` distinct image URLs ending in ``file_type``.
+
+    Pages are requested from Bing until enough URLs were collected or
+    ``max_page_counter`` pages have been fetched.
+
+    Args:
+        query: Search keywords.
+        limit: Maximum number of URLs to return; also used as the page size.
+        adult: Forwarded to ``Bing.fetch_image_urls``.
+        file_type: Required URL suffix; it is also prepended to ``filters``.
+        filters: Extra Bing filter string appended after ``file_type``.
+        max_page_counter: Upper bound on the number of pages requested.
+
+    Returns:
+        The matching URLs in the order they were found, without duplicates.
+    """
+    result = []
+    seen = set()  # consecutive pages may repeat results
+    page_counter = 0
+
+    bing = Bing()
+    while len(result) < limit:
+        urls = bing.fetch_image_urls(
+            query, page_counter, limit, adult, file_type + filters)
+        for url in urls:
+            if url in seen or not url.endswith(file_type):
+                continue
+            seen.add(url)
+            result.append(url)
+            if len(result) >= limit:
+                break
+        page_counter += 1
+        if page_counter >= max_page_counter:
+            break
+    return result
+
+
+def download_images(
+    query: str,
+    limit: int = 20,
+    output_dir='dataset',
+    adult: bool = False,
+    file_type: str = "jpg",
+    filters: str = '',
+    force_replace=False
+):
+    """Search Bing for ``query`` and save the images that were found.
+
+    Files are written to ``output_dir/query/`` under the current working
+    directory and named ``{query}_{n}.{ext}`` with ``n`` starting at 1.
+
+    Args:
+        query: Search keywords; also the sub-directory and file-name prefix.
+        limit: Maximum number of images to download.
+        output_dir: Directory that receives the ``query`` sub-directory.
+        adult: Forwarded to ``fetch_image_urls``.
+        file_type: Forwarded to ``fetch_image_urls``.
+        filters: Forwarded to ``fetch_image_urls``.
+        force_replace: Delete an existing image directory before downloading.
+    """
+    image_dir = make_image_dir(output_dir, query, force_replace)
+    urls = fetch_image_urls(query, limit, adult, file_type, filters)
+    print("save path: {}".format(image_dir))
+    for counter, url in enumerate(urls, start=1):
+        name = file_name(url, counter, query)
+        print("downloading {} {}".format(name, url))
+        path = os.path.join(image_dir, name)
+        try:
+            download_image(url, path)
+        except requests.RequestException as exc:
+            # Scraped links are often dead; one failure must not abort the
+            # remaining downloads.
+            print("[!] Failed to download {}: {}".format(url, exc))
+
+
+if __name__ == '__main__':
+    download_images("cat", 20, file_type="png", force_replace=True)
diff --git a/bing_image/util.py b/bing_image/util.py
new file mode 100644
index 0000000..f88f7b1
--- /dev/null
+++ b/bing_image/util.py
@@ -0,0 +1,64 @@
+import requests
+import shutil
+import posixpath
+import urllib.parse
+import os
+
+# Extensions kept as-is by file_name(); anything else is saved as ".jpg".
+IMAGE_EXTENSIONS = ("jpe", "jpeg", "jfif", "exif", "tiff",
+                    "gif", "bmp", "png", "webp", "jpg")
+# Seconds to wait for the image host; requests never times out on its own.
+DOWNLOAD_TIMEOUT = 30
+
+
+def download_image(url, path):
+    """Stream the image at ``url`` into the file ``path``.
+
+    Nothing is written when the server answers with a status other than 200.
+
+    Raises:
+        requests.RequestException: If the request fails or times out.
+    """
+    # The context manager releases the streamed connection even on errors.
+    with requests.get(url, stream=True, timeout=DOWNLOAD_TIMEOUT) as r:
+        if r.status_code != 200:
+            print("[!] Skipping {}: HTTP {}".format(url, r.status_code))
+            return
+        with open(path, 'wb') as f:
+            # Have urllib3 undo gzip/deflate so the raw image bytes are stored.
+            r.raw.decode_content = True
+            shutil.copyfileobj(r.raw, f)
+
+
+def file_name(url, index, prefix='image') -> str:
+    """Build the local file name ``{prefix}_{index}.{ext}`` for ``url``.
+
+    The extension is taken from the URL path; unknown or missing extensions,
+    and URLs that cannot be parsed, fall back to ``jpg``.
+    """
+    file_type = "jpg"
+    try:
+        # urlsplit() already strips the query string from the path.
+        path = urllib.parse.urlsplit(url).path
+        extension = posixpath.basename(path).split(".")[-1]
+        if extension.lower() in IMAGE_EXTENSIONS:
+            file_type = extension
+    except (AttributeError, TypeError, ValueError) as e:
+        print("[!] Issue getting: {}\n[!] Error:: {}".format(url, e))
+    return "{}_{}.{}".format(prefix, index, file_type)
+
+
+def make_image_dir(output_dir, sub_dir, force_replace=False) -> str:
+    """Create ``cwd/output_dir/sub_dir`` if needed and return its path.
+
+    Args:
+        output_dir: Parent directory, resolved against the working directory.
+        sub_dir: Name of the directory that will hold the images.
+        force_replace: Remove an existing directory (and its content) first.
+    """
+    image_dir = os.path.join(os.getcwd(), output_dir, sub_dir)
+    if force_replace and os.path.isdir(image_dir):
+        shutil.rmtree(image_dir)
+    # exist_ok replaces the isdir() checks and the bare except they needed.
+    os.makedirs(image_dir, exist_ok=True)
+    return image_dir
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..b5a3c46
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,6 @@
+[build-system]
+requires = [
+    "setuptools>=42",
+    "wheel"
+]
+build-backend = "setuptools.build_meta"
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..fbe1ec5
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,19 @@
+[metadata]
+name = bing_images
+version = 0.0.1
+author = CatchZeng
+author_email = catchzenghh@gmail.com
+description = A small example package
+long_description = file: README.md
+long_description_content_type = text/markdown
+url = https://github.com/catchzeng/bing_images
+classifiers =
+    Programming Language :: Python :: 3
+    License :: OSI Approved :: MIT License
+    Operating System :: OS Independent
+
+[options]
+packages = find:
+python_requires = >=3.6
+install_requires =
+    requests
\ No newline at end of file