Skip to content

Commit d2f8dae

Browse files
feiniks and 杨赫然 authored
S3V2 with path-style request using list-objects API to list bucket (#93)
* S3V2 with path-style request using list-objects API to list bucket
* Update python version for CI
* Add lxml dep

---------

Co-authored-by: 杨赫然 <[email protected]>
1 parent 2b48848 commit d2f8dae

File tree

3 files changed

+81
-2
lines changed

3 files changed

+81
-2
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ jobs:
2121
fetch-depth: 1
2222
- uses: actions/setup-python@v1
2323
with:
24-
python-version: "3.8"
24+
python-version: "3.10"
2525
- name: install dependencies and test
2626
run: |
2727
cd $GITHUB_WORKSPACE

ci/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@ oss2==2.18.4
77
sqlalchemy==2.0.18
88
pylibmc==1.6.3
99
redis==5.0.8
10+
lxml==5.3.1

objwrapper/s3.py

Lines changed: 79 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
import boto3
22
from botocore.exceptions import ClientError
33
from objwrapper.exceptions import InvalidConfigError
4+
import requests
5+
from datetime import datetime
6+
import hmac
7+
import hashlib
8+
import base64
9+
from lxml import etree
410

511
class S3Conf(object):
612
def __init__(self, key_id, key, bucket_name, host, port, use_v4_sig, aws_region, use_https, path_style_request, sse_c_key):
@@ -24,6 +30,7 @@ def __init__(self, conf):
2430
self.conf = conf
2531
self.client = None
2632
self.bucket = None
33+
self.enpoint_url = None;
2734
self.do_connect()
2835

2936
def do_connect(self):
@@ -37,6 +44,10 @@ def do_connect(self):
3744
config = boto3.session.Config(signature_version='s3',s3={'addressing_style':addressing_style})
3845

3946
if self.conf.host is None:
47+
if self.conf.use_https:
48+
self.endpoint_url = f'https://s3.{self.conf.aws_region}.amazonaws.com'
49+
else:
50+
self.endpoint_url = f'http://s3.{self.conf.aws_region}.amazonaws.com'
4051
self.client = boto3.client('s3',
4152
region_name=self.conf.aws_region,
4253
aws_access_key_id=self.conf.key_id,
@@ -48,6 +59,7 @@ def do_connect(self):
4859
endpoint_url = 'https://%s' % self.conf.host if self.conf.use_https else 'http://%s' % self.conf.host
4960
if self.conf.port:
5061
endpoint_url = '%s:%s' % (endpoint_url, self.conf.port)
62+
self.endpoint_url = endpoint_url
5163
self.client = boto3.client('s3',
5264
aws_access_key_id=self.conf.key_id,
5365
aws_secret_access_key=self.conf.key,
@@ -67,6 +79,11 @@ def get_name(self):
6779
return 'S3 storage backend'
6880

6981
def list_objs(self, prefix=None):
82+
if not self.conf.use_v4_sig and self.conf.path_style_request:
83+
# When using the S3 v2 protocol and path-style requests, boto3 is unable to list objects properly.
84+
# We manually sign the requests and then use the list-objects API to list the objects in the bucket.
85+
yield from self.list_objs_v2(prefix)
86+
return
7087
paginator = self.client.get_paginator('list_objects_v2')
7188
if prefix:
7289
iterator = paginator.paginate(Bucket=self.bucket, Prefix=prefix)
@@ -79,7 +96,6 @@ def list_objs(self, prefix=None):
7996
obj = [tokens, content.get('Size', 0)]
8097
yield obj
8198

82-
8399
def obj_exists(self, key):
84100
bucket = self.bucket
85101
try:
@@ -128,3 +144,65 @@ def get_ctime(self, key):
128144
return float(ctime)
129145
except:
130146
return 0
147+
148+
def get_signature(self, date):
    """Return the S3 signature-v2 value for a GET on the bucket root.

    The string-to-sign is the verb ``GET``, two empty header slots, the
    supplied ``date`` and the path-style resource ``/<bucket>/``. It is
    signed with HMAC-SHA1 using ``self.conf.key`` and base64-encoded.

    Args:
        date: Value that will also be sent in the request's ``Date`` header.

    Returns:
        The base64-encoded signature as a ``str``.
    """
    secret = self.conf.key.encode('utf-8')
    string_to_sign = f"GET\n\n\n{date}\n/{self.bucket}/"
    digest = hmac.new(secret, string_to_sign.encode('utf-8'), hashlib.sha1).digest()
    return base64.b64encode(digest).decode('utf-8')
155+
156+
def list_bucket_v2(self, marker, prefix):
    """Fetch one page of the bucket listing with a manually signed request.

    Issues a path-style ``GET <endpoint_url>/<bucket>/`` request that is
    authenticated with an ``Authorization: AWS <key_id>:<signature>`` header
    built from ``self.get_signature``.

    Args:
        marker: Key after which the listing should resume; falsy to start
            from the beginning.
        prefix: When truthy, only keys under ``<prefix>/`` are requested
            (a trailing slash is appended to the value).

    Returns:
        The response body as text when the server answers 200, otherwise
        ``None``.
    """
    # Function-scope imports so this method does not depend on module-level
    # imports beyond the ones it already used.
    from email.utils import formatdate
    from urllib.parse import quote

    # HTTP-date in GMT. Unlike strftime('%a ... %b ...'), formatdate() always
    # emits English day/month names, so the header stays valid under any locale.
    date = formatdate(usegmt=True)
    signature = self.get_signature(date)

    headers = {
        'Date': date,
        'Authorization': f"AWS {self.conf.key_id}:{signature}",
    }

    # Percent-encode the values: object keys may contain '&', '#', '+',
    # spaces or non-ASCII characters that would otherwise corrupt the query.
    query = []
    if marker:
        query.append('marker=' + quote(marker, safe='/'))
    if prefix:
        query.append('prefix=' + quote(f'{prefix}/', safe='/'))

    url = f'{self.endpoint_url}/{self.bucket}/'
    if query:
        url = url + '?' + '&'.join(query)

    response = requests.get(url, headers=headers, timeout=300)
    if response.status_code != 200:
        return None
    return response.text
181+
182+
def list_objs_v2(self, prefix):
    """Yield ``[key, size]`` for each object returned by the manual listing.

    Repeatedly calls ``self.list_bucket_v2`` and parses each XML page,
    resuming from the last key seen while the server reports
    ``IsTruncated`` as ``true``.

    Iteration stops when a request yields no body, when the document root
    is not a ``ListBucketResult``, when the page is not truncated, or when
    a truncated page contained no key to continue from.

    Args:
        prefix: Passed through to ``list_bucket_v2``; falsy lists everything.

    Yields:
        ``[key, size]`` lists; ``size`` is 0 when the entry has no ``Size``
        element, matching the default used by ``list_objs``.
    """
    def local_name(node):
        # Tags arrive namespace-qualified ("{uri}Key"); comments and
        # processing instructions have a non-string tag and are ignored.
        tag = node.tag
        if not isinstance(tag, str):
            return None
        return tag.rsplit('}', 1)[-1]

    marker = None
    while True:
        rsp = self.list_bucket_v2(marker, prefix)
        if not rsp:
            break

        root = etree.fromstring(rsp.encode('utf-8'))
        if local_name(root) != "ListBucketResult":
            break

        # Reset per page so a page without IsTruncated ends the loop instead
        # of inheriting the previous page's value.
        is_truncated = False
        last_key = None
        for child in root:
            name = local_name(child)
            if name == "IsTruncated":
                is_truncated = child.text == "true"
            elif name == "Contents":
                key = None
                size = 0
                # Read fields by name so the result never depends on the
                # order in which the server emits Key and Size.
                for field in child:
                    field_name = local_name(field)
                    if field_name == "Key":
                        key = field.text
                    elif field_name == "Size" and field.text:
                        size = int(field.text)
                if key is None:
                    continue
                last_key = key
                yield [key, size]

        # Without a new key the next request would repeat this one forever.
        if not is_truncated or last_key is None:
            break
        marker = last_key

0 commit comments

Comments
 (0)