Skip to content

Commit

Permalink
Download public Google Drive Sheets using PyCurl instead of Requests
Browse files Browse the repository at this point in the history
  • Loading branch information
rnebot committed Apr 10, 2021
1 parent 4efc32d commit 96eaae8
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 6 deletions.
8 changes: 5 additions & 3 deletions nexinfosys/command_generators/parser_spreadsheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from nexinfosys.command_executors import create_command
from nexinfosys.command_generators.parser_spreadsheet_utils import binary_mask_from_worksheet, \
obtain_rectangular_submatrices
from nexinfosys.common.helper import create_dictionary, first
from nexinfosys.common.helper import create_dictionary, first, download_file
from nexinfosys.command_definitions import valid_v2_command_names, commands
from nexinfosys.command_generators.spreadsheet_command_parsers_v2 import parse_command_in_worksheet
from nexinfosys.command_generators import IType
Expand All @@ -30,15 +30,17 @@ def load_file(location: str = None):

if location:
# Try to load the Dataset from the specified location
data = urllib.request.urlopen(location).read()
data = download_file(location).getvalue()
# data = urllib.request.urlopen(location).read()
# data = io.BytesIO(data)
# Then, try to read it
t = mimetypes.guess_type(location, strict=True)
if t[0] == "text/python":
f_type = "python"
elif t[0] == "text/json":
f_type = "json"
elif t[0] == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
else:
# elif t[0] == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
f_type = "spreadsheet"

return f_type, data
Expand Down
64 changes: 61 additions & 3 deletions nexinfosys/common/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
import jsonpickle
import numpy as np
import pandas as pd
import pycurl
import requests
import webdav.client as wc
from flask import after_this_request, request
from multidict import MultiDict, CIMultiDict
Expand Down Expand Up @@ -1165,6 +1167,51 @@ def wv_download_file(location, wv_user=None, wv_password=None, wv_host_name=None
return data


def download_with_pycurl(location):
    """Fetch the resource at *location* over HTTP(S) using pycurl.

    Redirects are followed automatically. Returns a 3-tuple
    ``(status, headers, data)``:

    * ``status`` -- the HTTP response code of the final response (int).
    * ``headers`` -- dict mapping lower-cased header names to values.
      Duplicate headers keep only the last value seen, and when redirects
      occur the headers of every intermediate response are merged into
      the same dict (later responses overwrite earlier ones).
    * ``data`` -- an ``io.BytesIO`` holding the response body.
    """
    response_headers = {}

    def _collect_header(raw_line):
        # libcurl delivers each header line as bytes; the HTTP standard
        # specifies iso-8859-1 encoding for headers, so decode with that.
        line = raw_line.decode('iso-8859-1')

        # Ignore the status line (HTTP/1.x ...) and any other line without
        # a colon. NOTE: this also mishandles headers folded across
        # multiple lines, exactly as the simple one-dict approach implies.
        if ':' not in line:
            return

        # Split into name/value at the first colon, trim the surrounding
        # whitespace (including the trailing newline on the value).
        name, _, value = line.partition(':')

        # Header names are case-insensitive; normalize to lowercase so
        # callers can look up e.g. headers["content-type"] reliably.
        response_headers[name.strip().lower()] = value.strip()

    body = io.BytesIO()
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, location)
    curl.setopt(pycurl.FOLLOWLOCATION, True)
    curl.setopt(pycurl.HEADERFUNCTION, _collect_header)
    curl.setopt(pycurl.WRITEDATA, body)
    curl.perform()
    final_status = curl.getinfo(pycurl.RESPONSE_CODE)
    curl.close()

    return final_status, response_headers, body


def download_file(location, wv_user=None, wv_password=None, wv_host_name=None):
"""
Download a file from the specified URL location.
Expand Down Expand Up @@ -1196,9 +1243,20 @@ def download_file(location, wv_user=None, wv_password=None, wv_host_name=None):
import re
m = re.match(r".*[^-\w]([-\w]{33,})[^-\w]?.*", location)
file_id = m.groups()[0]
credentials_file = get_global_configuration_variable("GAPI_CREDENTIALS_FILE")
token_file = get_global_configuration_variable("GAPI_TOKEN_FILE")
data = download_xlsx_file_id(credentials_file, token_file, file_id)
url = f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=xlsx" # &id={file_id}"
# resp = requests.get(url, headers={'Cache-Control': 'no-cache', 'Pragma': 'no-cache'}, allow_redirects=True) # headers={'Cache-Control': 'no-cache', 'Pragma': 'no-cache'}
status_code, headers, data = download_with_pycurl(url)
print(f'curl -L "{url}" >> out.xlsx')
if status_code == 200 and "text/html" not in headers["content-type"]:
# if resp.status_code == 200 and "text/html" not in resp.headers["Content-Type"]:
# data = io.BytesIO(resp.content)
pass
else:
credentials_file = get_global_configuration_variable("GAPI_CREDENTIALS_FILE")
token_file = get_global_configuration_variable("GAPI_TOKEN_FILE")
data = download_xlsx_file_id(credentials_file, token_file, file_id)
# with open("/home/rnebot/Downloads/out2.xlsx", "wb") as nf:
# nf.write(data.getvalue())
else:
data = urllib.request.urlopen(location).read()
data = io.BytesIO(data)
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Flask_Cors==3.0.3
Werkzeug==0.15.5 # >=
requests==2.21.0 # >=
requests_cache==0.4.13
pycurl>=7.43.0.6
pymonetdb>=1.1.1
SQLAlchemy>=1.3.3
sqlalchemy_monetdb==0.9.3
Expand Down

0 comments on commit 96eaae8

Please sign in to comment.