Skip to content

Commit

Permalink
🗺 Include GeoNames to verify admin boundaries
Browse files Browse the repository at this point in the history
Download cities with +15,000 inhabitants from GeoNames
Import GeoNames cities into Postgres
Disable automatic builds for draft PRs
Match GeoName city names with OSM admin boundaries
Add city name to POI list based on newly created `cities` table
Additional Google category tags from scraping Finland
Materialize some sub-queries as tables to improve building time of POI model
  • Loading branch information
Matti Grotheer authored and mattigrthr committed Jul 29, 2022
1 parent 44d73c2 commit 99f3f21
Show file tree
Hide file tree
Showing 18 changed files with 1,479 additions and 42 deletions.
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ services:
- POSTGRES_USER=kuwala
- POSTGRES_PASSWORD=password
- POSTGRES_DB=kuwala
- POSTGRES_MULTIPLE_EXTENSIONS=postgis,hstore,postgis_topology,postgis_raster,pgrouting,h3
- POSTGRES_MULTIPLE_EXTENSIONS=postgis,hstore,postgis_topology,postgis_raster,pgrouting,h3,unaccent,fuzzystrmatch
ports:
- '5432:5432'
volumes:
Expand Down
8 changes: 8 additions & 0 deletions kuwala/common/python_utils/src/file_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import pandas


def txt_to_csv(file_path):
read_file = pandas.read_csv(
file_path, delimiter="\t", header=None, low_memory=False
)
read_file.to_csv(file_path.replace(".txt", ".csv"), index=None, header=False)
24 changes: 24 additions & 0 deletions kuwala/core/database/importer/sql/create_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,30 @@ CREATE TABLE IF NOT EXISTS admin_boundary (
geometry geometry
);

-- Creation of admin_boundary_geonames_cities table

CREATE TABLE IF NOT EXISTS admin_boundary_geonames_cities (
geoname_id text NOT NULL PRIMARY KEY,
name text NOT NULL,
ascii_name text,
alternate_names text[],
latitude decimal NOT NULL,
longitude decimal NOT NULL,
feature_class text NOT NULL,
feature_code text,
country_code text,
alternate_country_codes text[],
admin_1_code text,
admin_2_code text,
admin_3_code text,
admin_4_code text,
population integer NOT NULL,
elevation integer,
digital_elevation_model integer NOT NULL,
timezone text NOT NULL,
modification_date date NOT NULL
);

-- Creation of population_density table

CREATE TABLE IF NOT EXISTS population_density (
Expand Down
82 changes: 64 additions & 18 deletions kuwala/core/database/importer/src/admin_boundary_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,33 +6,18 @@
from pyspark.sql.functions import lit


def import_admin_boundaries(
def import_osm_admin_boundaries(
spark,
database_host,
database_port,
database_name,
database_url,
database_properties,
continent,
country,
country_region,
file_path,
):
start_time = time.time()

logging.info(
f"Starting import of admin boundaries for "
f'{f"{country_region}, " if country_region else ""}{country}, {continent}'
)

script_dir = os.path.dirname(__file__)
file_path = os.path.join(
script_dir,
f"../../../../tmp/kuwala/admin_boundary_files/{continent}/{country}"
f'{f"/{country_region}" if country_region else ""}/admin_boundaries.parquet',
)

if not os.path.exists(file_path):
logging.warning("No admin boundaries file available. Skipping import.")
logging.warning("No OSM admin boundaries file available. Skipping import.")
return

data = (
Expand All @@ -58,6 +43,67 @@ def import_admin_boundaries(
path_to_query_file="../sql/create_admin_boundary_geometries.sql",
)


def import_geonames_cities(spark, database_url, database_properties, file_path):
if not os.path.exists(file_path):
logging.warning("No GeoNames city names available. Skipping import.")
return

data = spark.read.parquet(file_path)

data.write.option("truncate", True).jdbc(
url=database_url,
table="admin_boundary_geonames_cities",
mode="overwrite",
properties=database_properties,
)


def import_admin_boundaries(
spark,
database_host,
database_port,
database_name,
database_url,
database_properties,
continent,
country,
country_region,
):
start_time = time.time()

logging.info(
f"Starting import of admin boundaries for "
f'{f"{country_region}, " if country_region else ""}{country}, {continent}'
)

script_dir = os.path.dirname(__file__)
file_path_geonames_cities = os.path.join(
script_dir, "../../../../tmp/kuwala/admin_boundary_files/cities_500.parquet"
)
file_path_osm_admin_boundaries = os.path.join(
script_dir,
f"../../../../tmp/kuwala/admin_boundary_files/{continent}/{country}"
f'{f"/{country_region}" if country_region else ""}/admin_boundaries.parquet',
)

import_geonames_cities(
spark=spark,
database_url=database_url,
database_properties=database_properties,
file_path=file_path_geonames_cities,
)
import_osm_admin_boundaries(
spark=spark,
database_host=database_host,
database_port=database_port,
database_name=database_name,
database_url=database_url,
database_properties=database_properties,
country=country,
file_path=file_path_osm_admin_boundaries,
)

logging.info(
f"Successfully imported admin boundaries for "
f'{f"{country_region}, " if country_region else ""}{country}, {continent} after '
Expand Down
11 changes: 11 additions & 0 deletions kuwala/core/database/transformer/dbt/dbt_project.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ clean-targets: # directories to be removed by `dbt clean`
- "dbt_modules"

vars:
# Country to match cities
country: 'MT'
# POI brand
focus_brand: 'focus_brand_id_prefix'
# Aggregation grid
Expand All @@ -43,10 +45,19 @@ vars:

models:
kuwala_core_transformer:
admin_boundaries:
cities:
+materialized: table
poi:
poi:
+materialized: table
poi_address_city:
+materialized: table
poi_address_country:
+materialized: table
poi_aggregated:
+materialized: table
poi_matched:
+materialized: table
poi_popularity_time_series:
+materialized: table
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{% macro get_all_versions_of_city_names(country_code) %}
{% set query %}
SELECT DISTINCT geoname_id, unnest(ascii_name || alternate_names) AS name
FROM admin_boundary_geonames_cities
WHERE country_code = '{{ country_code }}'
{% endset %}

{{ return(query) }}
{% endmacro %}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{% macro match_admin_boundaries_with_city_names(country_code) %}
{% set all_versions_of_city_names = get_all_versions_of_city_names(country_code) %}

{% set query %}
SELECT id, geoname_id, levenshtein(ab.name, avocn.name) AS levenshtein_distance
FROM admin_boundary AS ab LEFT JOIN ({{ all_versions_of_city_names }}) AS avocn ON
ab.name = avocn.name OR
unaccent(ab.name) = avocn.name OR
ab.name LIKE avocn.name || '%' OR
unaccent(ab.name) LIKE avocn.name || '%'
{% endset %}

{{ return(query) }}
{% endmacro %}
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
SELECT city_name, geometry
FROM {{ ref('city_candidates') }}
INNER JOIN (
SELECT city_name AS city_name_best_match, MIN(min_levenshtein_distance) AS min_levenshtein_distance_best_match
FROM {{ ref('city_candidates') }}
GROUP BY city_name
) AS best_city_candidates ON
city_candidates.city_name = best_city_candidates.city_name_best_match AND
city_candidates.min_levenshtein_distance = best_city_candidates.min_levenshtein_distance_best_match
WHERE st_contains(geometry, st_setsrid(st_makepoint(candidate_longitude, candidate_latitude), 4326))
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
SELECT
abgc.ascii_name AS city_name,
ab.name AS admin_boundary_name,
min_levenshtein_distance,
ab.geometry AS geometry,
abgc.latitude AS candidate_latitude,
abgc.longitude AS candidate_longitude
FROM (
SELECT id, geoname_id, levenshtein_distance
FROM ({{ match_admin_boundaries_with_city_names(var('country')) }}) AS mabwcn
WHERE geoname_id IS NOT NULL AND levenshtein_distance < 10
) AS matched_cities
INNER JOIN (
SELECT id AS id_best_match, MIN(levenshtein_distance) AS min_levenshtein_distance
FROM ({{ match_admin_boundaries_with_city_names(var('country')) }}) AS mabwcn
WHERE geoname_id IS NOT NULL AND levenshtein_distance < 10
GROUP BY id
) AS matched_cities_min_levenshtein_distances ON
matched_cities.id = matched_cities_min_levenshtein_distances.id_best_match AND
matched_cities.levenshtein_distance = matched_cities_min_levenshtein_distances.min_levenshtein_distance
LEFT JOIN admin_boundary_geonames_cities AS abgc USING (geoname_id)
LEFT JOIN admin_boundary AS ab USING (id)
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@

version: 2

models:
# Cities
- name: cities
description: 'List of matched city names from GeoNames against OSM admin boundaries'
columns:
- name: city_name
description: 'City name from GeoNames'
tests:
- not_null
- name: geometry
description: 'Geometry based on matched admin boundary'
tests:
- not_null
# City candidates
- name: city_candidates
description: 'List of matched city names from GeoNames against OSM admin boundaries'
columns:
- name: city_name
description: 'City name from GeoNames'
tests:
- not_null
- name: admin_boundary_name
description: 'Name of matched admin boundary'
tests:
- not_null
- name: min_levenshtein_distance
description: 'Levenshtein distance of GeoNames city name and admin boundary name'
tests:
- not_null
- name: geometry
description: 'Geometry of matched admin boundary'
tests:
- not_null
2 changes: 1 addition & 1 deletion kuwala/core/database/transformer/dbt/models/poi/poi.sql
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
SELECT
poi_matched.*,
poi_address_city.name AS poi_address_city,
poi_address_city.city_name AS poi_address_city,
poi_address_country.name AS poi_address_country,
h3_to_parent(poi_h3_index::h3index, 8) AS poi_h3_index_res_8,
h3_to_parent(poi_h3_index::h3index, 9) AS poi_h3_index_res_9,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
SELECT poi_id, name, id
FROM admin_boundary AS ab, {{ ref('poi_matched') }} AS poi
WHERE
kuwala_admin_level = (SELECT max(kuwala_admin_level) FROM admin_boundary) AND
st_contains(ab.geometry, st_setsrid(st_makepoint(poi.longitude, poi.latitude), 4326))
SELECT DISTINCT poi_id, city_name
FROM {{ ref('cities') }} AS cities, {{ ref('poi_matched') }} AS poi
WHERE st_contains(cities.geometry, st_setsrid(st_makepoint(poi.longitude, poi.latitude), 4326))
12 changes: 4 additions & 8 deletions kuwala/core/database/transformer/dbt/models/poi/schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,25 +98,21 @@ models:
- accepted_values:
values: [ 15 ]
- name: poi_address_city
description: 'Name of the admin boundary with the highest kuwala_admin_level containing POI coordinates'
description: 'Name of the city containing POI coordinates based on matched GeoNames and OSM data'
- name: poi_address_country
description: 'Name of admin boundary having kuwala_admin_level 1 containing POI coordinates'

# POI city based on admin boundary
- name: poi_address_city
description: 'Admin boundary name with highest admin level containing POI coordinates'
description: 'City containing POI coordinates based on matched GeoNames and OSM data'
columns:
- name: poi_id
description: 'Unique ID referring to a POI'
tests:
- unique
- not_null
- name: name
description: 'Name of matching admin boundary'
tests:
- not_null
- name: id
description: 'ID of matching admin boundary'
- name: city_name
description: 'Name of matching city'
tests:
- not_null

Expand Down
1 change: 1 addition & 0 deletions kuwala/pipelines/admin-boundaries/dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
FROM kuwala/base-images:python-java-0.2.0-alpha

COPY ./pipelines/admin-boundaries /opt/pipelines/app
COPY ./common/python_utils /opt/common/python_utils

WORKDIR /opt/pipelines/app
RUN pip install --no-cache-dir -r requirements.txt
Expand Down
4 changes: 3 additions & 1 deletion kuwala/pipelines/admin-boundaries/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
pandas==1.4.1
pyspark==3.2.1
Shapely==1.8.0
requests==2.28.1
Shapely==1.8.0
tqdm==4.64.0
Loading

0 comments on commit 99f3f21

Please sign in to comment.