Skip to content
This repository was archived by the owner on Mar 17, 2025. It is now read-only.

Commit b6d82a0

Browse files
committed
remove arcgis python library dependency; just use requests
1 parent 79c5039 commit b6d82a0

File tree

3 files changed

+78
-35
lines changed

3 files changed

+78
-35
lines changed

README.md

+5-5
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,16 @@ This containers is part of a data pipeline to automatically retrieve data from t
66
Container images are built by Github actions, and pushed to Github's container registry. You can find up-to-date built images [here](https://github.com/orgs/WDGPH/packages?repo_name=workflow-WSI).
77

88
## Retrieval Container
9-
This container utilizes the [ArcGIS API Python Package](https://developers.arcgis.com/python/guide/install-and-set-up/) to authenticate to ArcGIS online, which is then used to download resources by item ID.
9+
This container downloads ArcGIS Online items from a specified URL.
1010

1111
To use, the `ARCGIS_USER` and `ARCGIS_PASSWORD` environment variables must be set for the container (credentials for the WSI Data and Visualization Hub). It is strongly suggested that a secure key vault is used for this process and that credentials are rotated frequently. Additionally, the following arguments are required:
1212

13-
**1. `item_id`**
14-
ArcGIS Online item id. Changes with addition/removal of features to dataset requiring occasional updates.
15-
**Example**: `1a111aa1a1aa1a1aaaa1a111aa1a1aa1`
13+
**1. `--url`**
14+
ArcGIS Online item URL. The URL changes when features are added to or removed from the dataset, so it requires occasional updates.
15+
**Example**: `https://services6.arcgis.com/ghjer345tert/arcgis/rest/services/PROD_PHU_Base_Aggregated/FeatureServer/0/query`
1616

1717
**2. `--output`**
18-
The filename where the output will be written.
18+
The filename where the output will be written.
1919
**Example**: `wsi.csv`
2020

2121
## Pipeline Orchestration

retrieval/dockerfile

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
FROM python:3.9-bullseye

# Install the runtime dependencies of getdata.py. Both pip invocations go
# through `python3 -m pip` so they target the same interpreter that the
# ENTRYPOINT uses, and both skip the download cache to keep the layer small.
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir requests==2.* pandas==2.*

# Copy the retrieval script into the image's working directory.
COPY getdata.py ./
RUN chmod a+x /getdata.py

# Arguments given to `docker run` are passed straight to getdata.py.
ENTRYPOINT ["python3", "./getdata.py"]

retrieval/getdata.py

+71-28
Original file line numberDiff line numberDiff line change
@@ -1,60 +1,103 @@
11
import argparse
2-
import arcgis
2+
import requests
3+
import pandas as pd
34
import os
45
import logging
56

67
# Argument parser
78
def parse_args(argv=None):
    """Parse the command-line arguments of the retrieval script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with two string attributes:
        ``url`` (ArcGIS Online url for the item) and
        ``output`` (filename to write output to).

    Raises:
        SystemExit: raised by argparse when a required argument is missing
            or an unknown argument is supplied.
    """
    parser = argparse.ArgumentParser(
        description='Extract a feature set from an ArcGIS Online URL')

    parser.add_argument(
        '--url',
        help="ArcGIS Online url for item",
        required=True,
        type=str)

    parser.add_argument(
        '--output',
        help="Filename to write output to",
        required=True,
        type=str)

    return parser.parse_args(argv)
2225

# Main function to extract and output data from PHO WTISEN
def main(features_url, output):
    """Download all features behind an ArcGIS query URL and write them as CSV.

    Reads the ``ARCGIS_USER`` and ``ARCGIS_PASSWORD`` environment variables
    (removing each from the process environment once read), exchanges them
    for an ArcGIS Online token, pages through ``features_url`` and writes the
    ``attributes`` of every returned feature to ``output``.

    Args:
        features_url: Query endpoint of the feature layer to download.
        output: Filename the CSV is written to.

    Raises:
        ValueError: if either credential environment variable is not set.
        RuntimeError: if ArcGIS reports an error while generating the token
            or while querying features.
        requests.HTTPError: if either HTTP request returns an error status.
    """
    # Load credentials and remove environment variables
    username = os.environ.pop('ARCGIS_USER', None)
    if username is None:
        raise ValueError("ARCGIS_USER environment variable not found.")
    logging.info("ARCGIS_USER environment variable found")

    password = os.environ.pop('ARCGIS_PASSWORD', None)
    if password is None:
        raise ValueError("ARCGIS_PASSWORD environment variable not found.")
    logging.info("ARCGIS_PASSWORD environment variable found")

    # Without a timeout a stalled connection would hang the job forever.
    request_timeout = 60  # seconds

    logging.info("Generating ArcGIS API token")
    token_response = requests.post(
        url='https://www.arcgis.com/sharing/rest/generateToken',
        data={
            'f': 'json',
            'username': username,
            'password': password,
            'referer': 'https://www.arcgis.com',
            'expiration': 60,  # minutes
        },
        timeout=request_timeout)
    token_response.raise_for_status()
    token_payload = token_response.json()

    # A failed login can still come back as a JSON body with no 'token' key;
    # report it explicitly instead of failing with a bare KeyError.
    if 'token' not in token_payload:
        raise RuntimeError(
            f"ArcGIS token generation failed: {token_payload.get('error', token_payload)}")
    token = token_payload['token']

    # Set up pagination
    batch_size = 1000
    offset = 0
    batch_number = 0
    all_records = []

    logging.info(f"Retrieving data in batch sizes of {batch_size} from {features_url} in JSON format")

    while True:
        batch_number += 1
        logging.info(f"Retrieving data batch {batch_number}")

        # Fetch batch of records
        response = requests.get(
            url=features_url,
            params={
                'f': 'json',
                'where': '1=1',
                'outFields': '*',
                'resultOffset': offset,
                'resultRecordCount': batch_size,
                'token': token,
            },
            timeout=request_timeout)
        response.raise_for_status()
        payload = response.json()

        # An error body has no 'features' and no 'exceededTransferLimit', so
        # it would otherwise look like "end of data" and silently produce a
        # partial or empty CSV.
        if 'error' in payload:
            raise RuntimeError(
                f"ArcGIS query failed at offset {offset}: {payload['error']}")

        # Add records to all_records list
        batch = payload.get('features', [])
        all_records.extend(batch)

        # Pagination continues only while the server reports more records.
        # An empty batch also stops the loop so it can never spin forever.
        if not batch or not payload.get('exceededTransferLimit', False):
            break

        # Advance by the number of records actually returned: the server may
        # return fewer than batch_size per page, and stepping by batch_size
        # would then skip records.
        offset += len(batch)

    logging.info("All data retrieved")
    logging.info("Converting JSON to tabular format")
    features = pd.DataFrame([record['attributes'] for record in all_records])

    rows, columns = features.shape
    logging.info(f"Data contains {rows} rows and {columns} columns")

    logging.info(f"Exporting data as {output}")
    features.to_csv(output, index=False)
5296

5397

5498
if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO)

    # Map the parsed arguments onto main()'s parameters explicitly. The CLI
    # flag is `--url` while main() names its parameter `features_url`, so
    # unpacking vars(parse_args()) as keyword arguments raises TypeError.
    args = parse_args()
    main(features_url=args.url, output=args.output)
    logging.info("Done")

0 commit comments

Comments
 (0)