Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ESRI Dump Internal JSON #55

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Copyright (c) 2021 OpenAddresses
Copyright (c) 2023 OpenAddresses

Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
Expand Down
25 changes: 3 additions & 22 deletions openaddr/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging; _L = logging.getLogger('openaddr.cache')

import os
import json
import errno
import math
import mimetypes
Expand Down Expand Up @@ -303,7 +304,7 @@ def get_file_path(self, url, dir_path):
''' Return a local file path in a directory for a URL.
'''
_, host, path, _, _, _ = urlparse(url)
hash, path_ext = sha1((host + path).encode('utf-8')), '.csv'
hash, path_ext = sha1((host + path).encode('utf-8')), '.geojson'

# With no source prefix like "us-ca-oakland" use the host as a hint.
name_base = '{}-{}'.format(self.source_prefix or host, hash.hexdigest()[:8])
Expand Down Expand Up @@ -376,16 +377,6 @@ def download(self, source_urls, workdir, source_config):

metadata = downloader.get_metadata()

if query_fields is None:
field_names = [f['name'] for f in metadata['fields']]
else:
field_names = query_fields[:]

if GEOM_FIELDNAME not in field_names:
field_names.append(GEOM_FIELDNAME)

field_names = list(map(lambda x: x.upper(), field_names))

# Get the count of rows in the layer
try:
row_count = downloader.get_feature_count()
Expand All @@ -394,9 +385,6 @@ def download(self, source_urls, workdir, source_config):
_L.info("Source doesn't support count")

with open(file_path, 'w', encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=field_names)
writer.writeheader()

for feature in downloader:
try:
geom = feature.get('geometry') or {}
Expand All @@ -407,15 +395,8 @@ def download(self, source_urls, workdir, source_config):
if any((isinstance(g, float) and math.isnan(g)) for g in traverse(geom)):
raise TypeError("Geometry has NaN coordinates")

shp = shape(feature['geometry'])
row[GEOM_FIELDNAME] = shp.wkt

r = dict()
for k,v in row.items():
r[k.upper()] = v
row = r
f.write(json.dumps(feature) + '\n')

writer.writerow({fn: row.get(fn) for fn in field_names})
size += 1
except TypeError:
_L.debug("Skipping a geometry", exc_info=True)
Expand Down
22 changes: 8 additions & 14 deletions openaddr/conform.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ def find_source_path(data_source, source_paths):
return c
_L.warning("Source names file %s but could not find it", source_file_name)
return None
elif format_string == "geojson" and protocol_string != "ESRI":
elif format_string == "geojson" or protocol_string == "ESRI":
candidates = []
for fn in source_paths:
basename, ext = os.path.splitext(fn)
Expand All @@ -306,9 +306,6 @@ def find_source_path(data_source, source_paths):
_L.warning("Found more than one JSON file in source, can't pick one")
# geojson spec currently doesn't include a file attribute. Maybe it should?
return None
elif format_string == "geojson" and protocol_string == "ESRI":
# Old style ESRI conform: ESRI downloader should only give us a single cache.csv file
return source_paths[0]
elif format_string == "csv":
# Return file if it's specified, else return the first file we find
if "file" in conform:
Expand Down Expand Up @@ -1126,20 +1123,17 @@ def extract_to_source_csv(source_config, source_path, extract_path):
format_string = source_config.data_source["conform"]['format']
protocol_string = source_config.data_source['protocol']

if format_string in ("shapefile", "xml", "gdb"):
# TODO (2023): Check whether the 2017 note below still applies.
# 2017: GeoJSON sources have some awkward legacy with ESRI; see issue #34.
if format_string == "geojson" or protocol_string == "ESRI":
_L.info("Non-ESRI GeoJSON source found; converting as a stream.")
geojson_source_path = normalize_ogr_filename_case(source_path)
geojson_source_to_csv(source_config, geojson_source_path, extract_path)
elif format_string in ("shapefile", "xml", "gdb"):
ogr_source_path = normalize_ogr_filename_case(source_path)
ogr_source_to_csv(source_config, ogr_source_path, extract_path)
elif format_string == "csv":
csv_source_to_csv(source_config, source_path, extract_path)
elif format_string == "geojson":
# GeoJSON sources have some awkward legacy with ESRI, see issue #34
if protocol_string == "ESRI":
_L.info("ESRI GeoJSON source found; treating it as CSV")
csv_source_to_csv(source_config, source_path, extract_path)
else:
_L.info("Non-ESRI GeoJSON source found; converting as a stream.")
geojson_source_path = normalize_ogr_filename_case(source_path)
geojson_source_to_csv(source_config, geojson_source_path, extract_path)
else:
raise Exception("Unsupported source format %s" % format_string)

Expand Down
4 changes: 2 additions & 2 deletions openaddr/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,7 +557,7 @@ def test_single_car_cached(self):
state = dict(zip(*json.load(file)))

self.assertIsNotNone(state['cache'])
self.assertEqual(state['fingerprint'], '1821b2e50a61ed04ac2213fbc7a1984d')
self.assertEqual(state['fingerprint'], '49708e631ccc1eb06c0d10b5966fd1f8')
self.assertIsNotNone(state['processed'])
self.assertIsNone(state['preview'])

Expand All @@ -576,7 +576,7 @@ def test_single_car_old_cached(self):
state = dict(zip(*json.load(file)))

self.assertIsNotNone(state['cache'])
self.assertEqual(state['fingerprint'], '1821b2e50a61ed04ac2213fbc7a1984d')
self.assertEqual(state['fingerprint'], '49708e631ccc1eb06c0d10b5966fd1f8')
self.assertIsNotNone(state['processed'])
self.assertIsNone(state['preview'])

Expand Down
10 changes: 6 additions & 4 deletions openaddr/tests/conform.py
Original file line number Diff line number Diff line change
Expand Up @@ -2033,10 +2033,12 @@ def test_find_geojson_source_path(self):
def test_find_esri_source_path(self):
# test that the legacy ESRI/GeoJSON style works
old_conform = {"protocol": "ESRI", "conform": {"format": "geojson"}}
self.assertEqual("foo.csv", find_source_path(old_conform, ["foo.csv"]))
# test that the new ESRI/CSV style works
new_conform = {"protocol": "ESRI", "conform": {"format": "csv"}}
self.assertEqual("foo.csv", find_source_path(new_conform, ["foo.csv"]))
self.assertEqual("foo.geojson", find_source_path(old_conform, ["foo.geojson"]))

# test that a legacy ESRI/CSV cache file is ignored and only a .geojson file is picked up
old_conform = {"protocol": "ESRI", "conform": {"format": "csv"}}
self.assertEqual(None, find_source_path(old_conform, ["foo.csv"]))
self.assertEqual("foo.geojson", find_source_path(old_conform, ["foo.geojson"]))

def test_find_csv_source_path(self):
csv_conform = {"conform": {"format": "csv"}}
Expand Down
101 changes: 101 additions & 0 deletions openaddr/tests/conforms/lake-man-gml.gfs
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
<GMLFeatureClassList>
<GMLFeatureClass>
<Name>lake_man</Name>
<ElementPath>lake_man</ElementPath>
<GeometryName>geometryProperty</GeometryName>
<GeometryElementPath>geometryProperty</GeometryElementPath>
<!--POINT-->
<GeometryType>1</GeometryType>
<DatasetSpecificInfo>
<FeatureCount>6</FeatureCount>
<ExtentXMin>-122.25925</ExtentXMin>
<ExtentXMax>-122.25672</ExtentXMax>
<ExtentYMin>37.80071</ExtentYMin>
<ExtentYMax>37.80436</ExtentYMax>
</DatasetSpecificInfo>
<PropertyDefn>
<Name>MSTRID</Name>
<ElementPath>MSTRID</ElementPath>
<Type>Integer</Type>
</PropertyDefn>
<PropertyDefn>
<Name>CATEGORY</Name>
<ElementPath>CATEGORY</ElementPath>
<Type>String</Type>
<Width>7</Width>
</PropertyDefn>
<PropertyDefn>
<Name>ADDRESSID</Name>
<ElementPath>ADDRESSID</ElementPath>
<Type>Integer</Type>
</PropertyDefn>
<PropertyDefn>
<Name>BASEID</Name>
<ElementPath>BASEID</ElementPath>
<Type>Integer</Type>
</PropertyDefn>
<PropertyDefn>
<Name>REVDATE</Name>
<ElementPath>REVDATE</ElementPath>
<Type>String</Type>
<Width>10</Width>
</PropertyDefn>
<PropertyDefn>
<Name>FLOOR</Name>
<ElementPath>FLOOR</ElementPath>
<Type>Integer</Type>
</PropertyDefn>
<PropertyDefn>
<Name>NUMBER</Name>
<ElementPath>NUMBER</ElementPath>
<Type>Integer</Type>
</PropertyDefn>
<PropertyDefn>
<Name>ZIP</Name>
<ElementPath>ZIP</ElementPath>
<Type>Integer</Type>
</PropertyDefn>
<PropertyDefn>
<Name>ACTIVE</Name>
<ElementPath>ACTIVE</ElementPath>
<Type>String</Type>
<Width>3</Width>
</PropertyDefn>
<PropertyDefn>
<Name>MAILING</Name>
<ElementPath>MAILING</ElementPath>
<Type>String</Type>
<Width>3</Width>
</PropertyDefn>
<PropertyDefn>
<Name>FENAME</Name>
<ElementPath>FENAME</ElementPath>
<Type>String</Type>
<Width>14</Width>
</PropertyDefn>
<PropertyDefn>
<Name>FNAME</Name>
<ElementPath>FNAME</ElementPath>
<Type>String</Type>
<Width>14</Width>
</PropertyDefn>
<PropertyDefn>
<Name>FTYPE</Name>
<ElementPath>FTYPE</ElementPath>
<Type>String</Type>
<Width>2</Width>
</PropertyDefn>
<PropertyDefn>
<Name>STRNAME</Name>
<ElementPath>STRNAME</ElementPath>
<Type>String</Type>
<Width>17</Width>
</PropertyDefn>
<PropertyDefn>
<Name>id</Name>
<ElementPath>id</ElementPath>
<Type>String</Type>
<Width>9</Width>
</PropertyDefn>
</GMLFeatureClass>
</GMLFeatureClassList>
7 changes: 0 additions & 7 deletions openaddr/tests/conforms/lake-man-split2.csv

This file was deleted.

6 changes: 6 additions & 0 deletions openaddr/tests/conforms/lake-man-split2.geojson
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{"type":"Feature","properties":{"OWNER1":"Serrano Heights Community","ApLot":"34","OWNER2":"Assn","SITECITYST":" ","SITESTREET":" ","PREFIX":"","STTYPE":"","OWNERLAST":"","MAILZIP4":"2288","id":"left arm","OBJECTID":"2807","APTUNIT":"","PHONE":"","APNUMBER":"085-531-34","ApPage":"531","MAILZIP":"92630","ZIP4":"","ZIP":"","STNAME":"","OWNERFIRST":"","HNUM":"","LEGAL":"T 4 R 9 Sec 11 Por S1/2 And T 4 R 9 Sec 14 Por Ne1/4 (also Being Por. Par 1pm. 264-29.)","MAILCITYST":"Lake Forest Ca","ApBook":"085","CARRIER":"","MAILNAME":"Serrano Heights Community","CITY":"","MAILSTREET":"1 Spectrum Pointe Dr #320","NUNITS":"","OA:x":-122.259249687,"OA:y":37.8026126376,"OA:geom":"POINT (-122.259250 37.802613 0)"},"geometry":{"type":"Point","coordinates":[-122.259249687,37.8026126376]}}
{"type":"Feature","properties":{"OWNER1":"","ApLot":"36","OWNER2":"","SITECITYST":" ","SITESTREET":" ","PREFIX":"","STTYPE":"","OWNERLAST":"","MAILZIP4":"","id":"right arm","OBJECTID":"2899","APTUNIT":"","PHONE":"","APNUMBER":"085-531-36","ApPage":"531","MAILZIP":"","ZIP4":"","ZIP":"","STNAME":"","OWNERFIRST":"","HNUM":"","LEGAL":"","MAILCITYST":"","ApBook":"085","CARRIER":"","MAILNAME":"","CITY":"","MAILSTREET":"","NUNITS":"","OA:x":-122.256717682,"OA:y":37.8025278661,"OA:geom":"POINT (-122.256718 37.802528 0)"},"geometry":{"type":"Point","coordinates":[-122.256717682,37.8025278661]}}
{"type":"Feature","properties":{"OWNER1":"City Of Orange","ApLot":"32","OWNER2":"","SITECITYST":" ","SITESTREET":" ","PREFIX":"","STTYPE":"","OWNERLAST":"","MAILZIP4":"1508","id":"torso","OBJECTID":"3419","APTUNIT":"","PHONE":"","APNUMBER":"085-531-32","ApPage":"531","MAILZIP":"92866","ZIP4":"","ZIP":"","STNAME":"","OWNERFIRST":"","HNUM":"","LEGAL":"T 4 R 9 Sec 11 Por S1/2","MAILCITYST":"Orange Ca","ApBook":"085","CARRIER":"","MAILNAME":"City Of Orange","CITY":"","MAILSTREET":"300 E Chapman Ave","NUNITS":"","OA:x":-122.257940769,"OA:y":37.8029686768,"OA:geom":"POINT (-122.257941 37.802969 0)"},"geometry":{"type":"Point","coordinates":[-122.257940769,37.8029686768]}}
{"type":"Feature","properties":{"OWNER1":"Serrano Heights Community","ApLot":"41","OWNER2":"Assn","SITECITYST":" ","SITESTREET":" ","PREFIX":"","STTYPE":"","OWNERLAST":"","MAILZIP4":"2288","id":"left leg","OBJECTID":"3711","APTUNIT":"","PHONE":"","APNUMBER":"085-531-41","ApPage":"531","MAILZIP":"92630","ZIP4":"","ZIP":"","STNAME":"","OWNERFIRST":"","HNUM":"","LEGAL":"T 4 R 9 Sec 12 Por Sw1/4 Also Being Por. Par 1 Pm.264-29.","MAILCITYST":"Lake Forest Ca","ApBook":"085","CARRIER":"","MAILNAME":"Serrano Heights Community","CITY":"","MAILSTREET":"1 Spectrum Pointe Dr #320","NUNITS":"","OA:x":-122.258970737,"OA:y":37.8007476424,"OA:geom":"POINT (-122.258971 37.800748 0)"},"geometry":{"type":"Point","coordinates":[-122.258970737,37.8007476424]}}
{"type":"Feature","properties":{"OWNER1":"Serrano Heights Community","ApLot":"42","OWNER2":"Assn","SITECITYST":" ","SITESTREET":" ","PREFIX":"","STTYPE":"","OWNERLAST":"","MAILZIP4":"2288","id":"right leg","OBJECTID":"3925","APTUNIT":"","PHONE":"","APNUMBER":"085-531-42","ApPage":"531","MAILZIP":"92630","ZIP4":"","ZIP":"","STNAME":"","OWNERFIRST":"","HNUM":"","LEGAL":"Pm 264-29 Par 1 Por Ofpar","MAILCITYST":"Lake Forest Ca","ApBook":"085","CARRIER":"","MAILNAME":"Serrano Heights Community","CITY":"","MAILSTREET":"1 Spectrum Pointe Dr #320","NUNITS":"","OA:x":-122.256953716,"OA:y":37.800713733,"OA:geom":"POINT (-122.256954 37.800714 0)"},"geometry":{"type":"Point","coordinates":[-122.256953716,37.800713733]}}
{"type":"Feature","properties":{"OWNER1":"Serrano Heights Community","ApLot":"43","OWNER2":"Assn","SITECITYST":" ","SITESTREET":" ","PREFIX":"","STTYPE":"","OWNERLAST":"","MAILZIP4":"2288","id":"head","OBJECTID":"4143","APTUNIT":"","PHONE":"","APNUMBER":"085-531-43","ApPage":"531","MAILZIP":"92630","ZIP4":"","ZIP":"","STNAME":"","OWNERFIRST":"","HNUM":"","LEGAL":"P.m. 264-29 Por. Of Par 1.","MAILCITYST":"Lake Forest Ca","ApBook":"085","CARRIER":"","MAILNAME":"Serrano Heights Community","CITY":"","MAILSTREET":"1 Spectrum Pointe Dr #320","NUNITS":"","OA:x":-122.257640362,"OA:y":37.8043589086,"OA:geom":"POINT (-122.257640 37.804359 0)"},"geometry":{"type":"Point","coordinates":[-122.257640362,37.8043589086]}}
Loading