Skip to content

Commit

Permalink
Download public Google Drive Sheets using PyCurl instead of Requests
Browse files Browse the repository at this point in the history
  • Loading branch information
rnebot committed Apr 10, 2021
1 parent 4efc32d commit 96eaae8
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 6 deletions.
8 changes: 5 additions & 3 deletions nexinfosys/command_generators/parser_spreadsheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from nexinfosys.command_executors import create_command
from nexinfosys.command_generators.parser_spreadsheet_utils import binary_mask_from_worksheet, \
obtain_rectangular_submatrices
from nexinfosys.common.helper import create_dictionary, first
from nexinfosys.common.helper import create_dictionary, first, download_file
from nexinfosys.command_definitions import valid_v2_command_names, commands
from nexinfosys.command_generators.spreadsheet_command_parsers_v2 import parse_command_in_worksheet
from nexinfosys.command_generators import IType
Expand All @@ -30,15 +30,17 @@ def load_file(location: str = None):

if location:
# Try to load the Dataset from the specified location
data = urllib.request.urlopen(location).read()
data = download_file(location).getvalue()
# data = urllib.request.urlopen(location).read()
# data = io.BytesIO(data)
# Then, try to read it
t = mimetypes.guess_type(location, strict=True)
if t[0] == "text/python":
f_type = "python"
elif t[0] == "text/json":
f_type = "json"
elif t[0] == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
else:
# elif t[0] == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
f_type = "spreadsheet"

return f_type, data
Expand Down
64 changes: 61 additions & 3 deletions nexinfosys/common/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
import jsonpickle
import numpy as np
import pandas as pd
import pycurl
import requests
import webdav.client as wc
from flask import after_this_request, request
from multidict import MultiDict, CIMultiDict
Expand Down Expand Up @@ -1165,6 +1167,51 @@ def wv_download_file(location, wv_user=None, wv_password=None, wv_host_name=None
return data


def download_with_pycurl(location):
    """Fetch the resource at *location* over HTTP(S) using pycurl.

    Redirects are followed automatically. Returns a 3-tuple
    ``(status, headers, data)``:

    * ``status`` -- the HTTP response code of the final response (int).
    * ``headers`` -- dict mapping lower-cased header names to values.
      Duplicate headers keep only the last value seen, and when redirects
      occur the headers of every intermediate response are merged into
      the same dict (later responses overwrite earlier ones).
    * ``data`` -- an ``io.BytesIO`` holding the response body.
    """
    response_headers = {}

    def _collect_header(raw_line):
        # libcurl delivers each header line as bytes; the HTTP standard
        # specifies iso-8859-1 encoding for headers, so decode with that.
        line = raw_line.decode('iso-8859-1')

        # Ignore the status line (HTTP/1.x ...) and any other line without
        # a colon. NOTE: this also mishandles headers folded across
        # multiple lines, exactly as the simple one-dict approach implies.
        if ':' not in line:
            return

        # Split into name/value at the first colon, trim the surrounding
        # whitespace (including the trailing newline on the value).
        name, _, value = line.partition(':')

        # Header names are case-insensitive; normalize to lowercase so
        # callers can look up e.g. headers["content-type"] reliably.
        response_headers[name.strip().lower()] = value.strip()

    body = io.BytesIO()
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, location)
    curl.setopt(pycurl.FOLLOWLOCATION, True)
    curl.setopt(pycurl.HEADERFUNCTION, _collect_header)
    curl.setopt(pycurl.WRITEDATA, body)
    curl.perform()
    final_status = curl.getinfo(pycurl.RESPONSE_CODE)
    curl.close()

    return final_status, response_headers, body


def download_file(location, wv_user=None, wv_password=None, wv_host_name=None):
"""
Download a file from the specified URL location.
Expand Down Expand Up @@ -1196,9 +1243,20 @@ def download_file(location, wv_user=None, wv_password=None, wv_host_name=None):
import re
m = re.match(r".*[^-\w]([-\w]{33,})[^-\w]?.*", location)
file_id = m.groups()[0]
credentials_file = get_global_configuration_variable("GAPI_CREDENTIALS_FILE")
token_file = get_global_configuration_variable("GAPI_TOKEN_FILE")
data = download_xlsx_file_id(credentials_file, token_file, file_id)
url = f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx" # &id={file_id}"
# resp = requests.get(url, headers={'Cache-Control': 'no-cache', 'Pragma': 'no-cache'}, allow_redirects=True) # headers={'Cache-Control': 'no-cache', 'Pragma': 'no-cache'}
status_code, headers, data = download_with_pycurl(url)
print(f'curl -L "{url}" >> out.xlsx')
if status_code == 200 and "text/html" not in headers["content-type"]:
# if resp.status_code == 200 and "text/html" not in resp.headers["Content-Type"]:
# data = io.BytesIO(resp.content)
pass
else:
credentials_file = get_global_configuration_variable("GAPI_CREDENTIALS_FILE")
token_file = get_global_configuration_variable("GAPI_TOKEN_FILE")
data = download_xlsx_file_id(credentials_file, token_file, file_id)
# with open("/home/rnebot/Downloads/out2.xlsx", "wb") as nf:
# nf.write(data.getvalue())
else:
data = urllib.request.urlopen(location).read()
data = io.BytesIO(data)
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Flask_Cors==3.0.3
Werkzeug==0.15.5 # >=
requests==2.21.0 # >=
requests_cache==0.4.13
pycurl>=7.43.0.6
pymonetdb>=1.1.1
SQLAlchemy>=1.3.3
sqlalchemy_monetdb==0.9.3
Expand Down

0 comments on commit 96eaae8

Please sign in to comment.