Instagram Pipelines [DRAFT] #76

Status: Draft. Wants to merge 17 commits into base: master.

Commits (17)
9298d53
feat(insta-pipelines): added base structure for scrapping api
arifluthfi16 Jan 19, 2022
a71c40f
feat(insta-pipeline): added scrapping functionalities for posts, loca…
arifluthfi16 Jan 19, 2022
99cd565
feat(insta-scrapper): added docker file and compose commands
arifluthfi16 Jan 21, 2022
5e5c448
feat(insta-scrapper): changed hashtag from urls to search queries
arifluthfi16 Jan 22, 2022
4a3c109
feat(insta-scrapper): changed profiles from urls to search queries
arifluthfi16 Jan 22, 2022
ba504a9
feat(insta-scrapper): added routes for IGTV and Reels
arifluthfi16 Jan 22, 2022
381beb6
feat(insta-scrapper): added proxy support
arifluthfi16 Jan 22, 2022
445fde8
feat(insta-scrapper): added utility for getting sessionid
arifluthfi16 Jan 23, 2022
4b55334
feat(insta-scrapper): modified errors
arifluthfi16 Jan 23, 2022
a9848ec
feat(insta-scrapper): added location scrapping & data enrichment
arifluthfi16 Jan 30, 2022
2073c97
feature(insta-pipelines): major update, moved from API based to CLI
arifluthfi16 Feb 1, 2022
ca3a3d3
feat(insta-pipelines): added location country and city name
arifluthfi16 Feb 1, 2022
668a749
features(insta-pipelines): refactor files and new attributes
arifluthfi16 Feb 1, 2022
c39aa4b
feat(insta-pipelines): reconfigured continue from last cursor added C…
arifluthfi16 Feb 1, 2022
b68f1ec
feat(insta-scrapper): configured safe file into tmp/kuwala/insta-pipe…
arifluthfi16 Feb 1, 2022
26ba607
tea(insta-pipelines): updated requirements.txt
arifluthfi16 Feb 1, 2022
85feeb2
feat(insta-pipelines): added readme and updated docker-compose file
arifluthfi16 Feb 1, 2022
15 changes: 15 additions & 0 deletions kuwala/docker-compose.yml
@@ -195,3 +195,18 @@ services:
      - ./tmp/kuwala/population_files:/opt/app/tmp/kuwala/population_files
    profiles:
      - network

  # docker-compose run insta-scrapper
  # Example options: docker-compose run insta-scrapper --location_id=1313809508724705 --type=locations
  insta-scrapper:
    container_name: insta-scrapper
    build:
      context: .
      dockerfile: ./pipelines/insta-scrapper/dockerfile
    restart: always
    volumes:
      - ./tmp/kuwala/insta_pipelines:/opt/app/tmp/kuwala/insta_pipelines
    ports:
      - '3016:3016'
    profiles:
      - network
1 change: 1 addition & 0 deletions kuwala/pipelines/insta-scrapper/.dockerignore
@@ -0,0 +1 @@
venv
1 change: 1 addition & 0 deletions kuwala/pipelines/insta-scrapper/.gitignore
@@ -0,0 +1 @@
venv
9 changes: 9 additions & 0 deletions kuwala/pipelines/insta-scrapper/dockerfile
@@ -0,0 +1,9 @@
FROM python:3.9.5

COPY ./pipelines/insta-scrapper /opt/app/pipelines/insta-scrapper

WORKDIR /opt/app/pipelines/insta-scrapper
RUN pip install --no-cache-dir -r requirements.txt

WORKDIR /opt/app/pipelines/insta-scrapper/src
ENTRYPOINT [ "python", "main.py"]
103 changes: 103 additions & 0 deletions kuwala/pipelines/insta-scrapper/readme.md
@@ -0,0 +1,103 @@
This pipeline is for issue:
https://github.com/kuwala-io/kuwala/issues/60



# Instagram Pipelines

This pipeline lets you scrape **public** information from Instagram only; scraping private information that requires authorization or a login is not supported.



## Sources & Features

1. Locations

   Each scraped post record carries the following attributes (an illustrative record follows this section):

   - comments_disabled
   - id
   - shortcode
   - taken_at_timestamp
   - display_url
   - is_video
   - caption
   - comment_count
   - liked_count
   - preview_liked_count
   - owner_id
   - post_height
   - post_width
   - location_name
   - lat
   - long
   - h3
   - country
   - country_id
   - city
   - city_id
   - accessibility_caption
   - video_view_count

2. Hashtag (Coming Soon)

3. Profiles (Coming Soon)
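
For illustration, a single scraped post could be represented like this. The field names come from the list above; all values are made up:

```python
# Illustrative record only: every value below is invented.
example_post = {
    'comments_disabled': False,
    'id': '2748391045731122334',
    'shortcode': 'CYxAmPleAbc',
    'taken_at_timestamp': 1643673600,
    'display_url': 'https://instagram.com/p/CYxAmPleAbc/media',
    'is_video': False,
    'caption': 'Sunset at the pier',
    'comment_count': 12,
    'liked_count': 348,
    'preview_liked_count': 340,
    'owner_id': '9876543210',
    'post_height': 1080,
    'post_width': 1080,
    'location_name': 'Brooklyn Bridge',
    'lat': 40.7061,
    'long': -73.9969,
    'h3': '8a2a1072b59ffff',
    'country': 'United States',
    'country_id': 'US',
    'city': 'New York',
    'city_id': 'c2728325',
    'accessibility_caption': 'Photo of a bridge at sunset',
    'video_view_count': None,
}
```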



## Usage

Since this PR is not merged yet, clone my fork and check out the branch:

```bash
git clone -b features/insta-scrapper https://github.com/arifluthfi16/kuwala.git
```

Navigate to `kuwala/kuwala`, where the Docker Compose file is located, and build the image:

```bash
docker-compose build insta-scrapper
```

### Scrape a Location by ID

You can supply any location ID here. To find one, browse https://www.instagram.com/explore/locations/. A location ID looks something like `1313809508724705`. Once you have one, run:

```bash
docker-compose run insta-scrapper --location_id=<your_loc_id> --type=locations
```

This command writes the CSV results to `/kuwala/tmp/insta_pipelines/locations/`.
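
After a run, the output can be inspected with pandas. A minimal sketch, assuming the columns match the attribute list under Sources & Features; the exact file names inside the output directory are an implementation detail of the scraper:

```python
import glob

import pandas as pd

# Point this at wherever your checkout writes the results.
files = glob.glob('tmp/insta_pipelines/locations/*.csv')

if files:
    posts = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)
    # Example: list the most-liked posts first.
    top = posts.sort_values('liked_count', ascending=False)
    print(top[['shortcode', 'liked_count', 'comment_count']].head())
```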



### Available Arguments

```bash
-h, --help            show this help message and exit
--location_id LOCATION_ID
                      Location id to scrape
--type TYPE           Available type = locations | hashtag | profile
--max_sessions MAX_SESSIONS
                      Maximum tor sessions created
--max_posts MAX_POSTS
                      Maximum posts scraped
--max_session_time MAX_SESSION_TIME
                      Maximum tor session up time
--max_request_per_session MAX_REQUEST_PER_SESSION
                      Maximum number of requests from a single tor session
--continue_last_cursor CONTINUE_LAST_CURSOR
                      Continue from last saved cursor
--log_error LOG_ERROR
                      Log errors to /errors/
--request_sleep_time REQUEST_SLEEP_TIME
                      Sleep time after every session request
--headers HEADERS     Request headers
```

The only required arguments are `--location_id` and `--type`.
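
The session options roughly interact as nested limits: a run stops once `max_posts` posts are collected or `max_sessions` tor sessions have been used, and an individual session is recycled when it hits `max_request_per_session` or `max_session_time`. A minimal Python sketch of that control flow, illustrative only (the actual logic lives in `src/wrapper/locations.py`):

```python
import time

def scrape_with_rotation(max_sessions, max_posts, max_session_time,
                         max_request_per_session, request_sleep_time):
    """Illustrative control flow only: how the session limits bound a run."""
    posts_scraped = 0
    for _ in range(max_sessions):
        # A fresh tor circuit would be opened here.
        session_start = time.time()
        for _ in range(max_request_per_session):
            if posts_scraped >= max_posts:
                return posts_scraped
            if time.time() - session_start > max_session_time:
                break  # recycle the session once it has been up too long
            # ... fetch and parse one page of posts here ...
            posts_scraped += 1  # placeholder; real runs count posts per page
            time.sleep(request_sleep_time)
    return posts_scraped
```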



## License

We take no responsibility for legal claims that might arise. **Use at your own risk!**
74 changes: 74 additions & 0 deletions kuwala/pipelines/insta-scrapper/requirements.txt
@@ -0,0 +1,74 @@
aiofiles==0.8.0
appnope==0.1.2
asttokens==2.0.5
attrs==21.4.0
backcall==0.2.0
beautifulsoup4==4.10.0
black==21.12b0
blinker==1.4
certifi==2020.12.5
cffi==1.14.4
chardet==3.0.4
charset-normalizer==2.0.10
click==8.0.3
click-plugins==1.1.1
cligj==0.7.2
colorama==0.4.4
cryptography==3.2.1
decorator==4.4.2
executing==0.8.2
Fiona==1.8.20
func-timeout==4.3.5
geopandas==0.10.2
h11==0.12.0
h2==4.1.0
h3==3.7.3
hpack==4.0.0
hypercorn==0.13.2
hyperframe==6.0.1
idna==2.10
insta-scrape==2.1.2
ipython==7.19.0
ipython-genutils==0.2.0
itsdangerous==2.0.1
jedi==0.17.2
Jinja2==3.0.3
MarkupSafe==2.0.1
matplotlib-inline==0.1.3
munch==2.5.0
mypy-extensions==0.4.3
numpy==1.19.4
pandas==1.1.5
parso==0.7.1
pathspec==0.9.0
pexpect==4.8.0
pickleshare==0.7.5
platformdirs==2.4.1
priority==2.0.0
prompt-toolkit==3.0.8
ptyprocess==0.7.0
pure-eval==0.2.2
pycparser==2.20
Pygments==2.7.3
pyproj==3.3.0
PySocks==1.7.1
python-dateutil==2.8.1
python-dotenv==0.19.2
pytz==2020.4
quart==0.16.2
requests==2.25.0
Shapely==1.8.0
six==1.15.0
soupsieve==2.3.1
stack-data==0.1.4
toml==0.10.2
tomli==1.2.3
torpy==1.1.3
tqdm==4.54.1
traitlets==5.0.5
typing_extensions==4.0.1
urllib3==1.26.2
wcwidth==0.2.5
Werkzeug==2.0.2
wincertstore==0.2
wsproto==1.0.0
Empty file.
3 changes: 3 additions & 0 deletions kuwala/pipelines/insta-scrapper/src/cursors/.gitignore
@@ -0,0 +1,3 @@
*
*/
!.gitignore
62 changes: 62 additions & 0 deletions kuwala/pipelines/insta-scrapper/src/main.py
@@ -0,0 +1,62 @@
import argparse

from wrapper.locations import location_wrapper

if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    default_header = {
        'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
    }

    def str2bool(value):
        # argparse passes CLI values as strings, so booleans need explicit parsing
        return str(value).lower() in ('true', '1', 'yes')

    ## Necessary arguments
    parser.add_argument('--location_id', help='Location id to scrape', default='')
    parser.add_argument('--type', help='Available type = locations | hashtag | profile', default='')

    ## Optional arguments (numeric values also arrive as strings, hence the explicit types)
    parser.add_argument('--max_sessions', help='Maximum tor sessions created', type=int, default=1000)
    parser.add_argument('--max_posts', help='Maximum posts scraped', type=int, default=100000)
    parser.add_argument('--max_session_time', help='Maximum tor session up time', type=int, default=3600 * 3)  # 3 hours per session
    parser.add_argument('--max_request_per_session', help='Maximum number of requests from a single tor session', type=int, default=1000)
    parser.add_argument('--continue_last_cursor', help='Continue from last saved cursor', type=str2bool, default=True)
    parser.add_argument('--log_error', help='Log errors to /errors/', type=str2bool, default=False)
    parser.add_argument('--request_sleep_time', help='Sleep time after every session request', type=int, default=15)
    parser.add_argument('--headers', help='Request headers', default=default_header)

    args = parser.parse_args()

    ## Config definition
    max_sessions = args.max_sessions
    max_posts = args.max_posts
    max_session_time = args.max_session_time
    max_request_per_session = args.max_request_per_session
    continue_last_cursor = args.continue_last_cursor
    log_error = args.log_error
    location_id = args.location_id
    request_sleep_time = args.request_sleep_time
    headers = args.headers
    process_type = args.type

    if process_type == 'locations':
        print('\nScraping locations')
        location_wrapper(
            max_sessions=max_sessions,
            max_posts=max_posts,
            max_session_time=max_session_time,
            max_request_per_session=max_request_per_session,
            continue_last_cursor=continue_last_cursor,
            log_error=log_error,
            location_id=location_id,
            request_sleep_time=request_sleep_time,
            headers=headers,
        )
    elif process_type == 'hashtag':
        print('\nHashtag processor is currently unavailable\n')
    elif process_type == 'profile':
        print('\nProfile processor is currently unavailable\n')
    else:
        print('\nUnknown process type\n')

    print('=@= Process Completed! =@=')