Skip to content

Commit

Permalink
🗺 Include GeoNames to verify admin boundaries
Browse files Browse the repository at this point in the history
  • Loading branch information
ephe-meral committed Aug 10, 2022
1 parent 44d73c2 commit 8a68bb5
Show file tree
Hide file tree
Showing 20 changed files with 1,481 additions and 53 deletions.
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ services:
- POSTGRES_USER=kuwala
- POSTGRES_PASSWORD=password
- POSTGRES_DB=kuwala
- POSTGRES_MULTIPLE_EXTENSIONS=postgis,hstore,postgis_topology,postgis_raster,pgrouting,h3
- POSTGRES_MULTIPLE_EXTENSIONS=postgis,hstore,postgis_topology,postgis_raster,pgrouting,h3,unaccent,fuzzystrmatch
ports:
- '5432:5432'
volumes:
Expand Down
8 changes: 8 additions & 0 deletions kuwala/common/python_utils/src/file_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import pandas


def txt_to_csv(file_path):
read_file = pandas.read_csv(
file_path, delimiter="\t", header=None, low_memory=False
)
read_file.to_csv(file_path.replace(".txt", ".csv"), index=None, header=False)
24 changes: 24 additions & 0 deletions kuwala/core/database/importer/sql/create_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,30 @@ CREATE TABLE IF NOT EXISTS admin_boundary (
geometry geometry
);

-- Creation of admin_boundary_geonames_cities table

CREATE TABLE IF NOT EXISTS admin_boundary_geonames_cities (
geoname_id text NOT NULL PRIMARY KEY,
name text NOT NULL,
ascii_name text,
alternate_names text[],
latitude decimal NOT NULL,
longitude decimal NOT NULL,
feature_class text NOT NULL,
feature_code text,
country_code text,
alternate_country_codes text[],
admin_1_code text,
admin_2_code text,
admin_3_code text,
admin_4_code text,
population integer NOT NULL,
elevation integer,
digital_elevation_model integer NOT NULL,
timezone text NOT NULL,
modification_date date NOT NULL
);

-- Creation of population_density table

CREATE TABLE IF NOT EXISTS population_density (
Expand Down
82 changes: 64 additions & 18 deletions kuwala/core/database/importer/src/admin_boundary_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,33 +6,18 @@
from pyspark.sql.functions import lit


def import_admin_boundaries(
def import_osm_admin_boundaries(
spark,
database_host,
database_port,
database_name,
database_url,
database_properties,
continent,
country,
country_region,
file_path,
):
start_time = time.time()

logging.info(
f"Starting import of admin boundaries for "
f'{f"{country_region}, " if country_region else ""}{country}, {continent}'
)

script_dir = os.path.dirname(__file__)
file_path = os.path.join(
script_dir,
f"../../../../tmp/kuwala/admin_boundary_files/{continent}/{country}"
f'{f"/{country_region}" if country_region else ""}/admin_boundaries.parquet',
)

if not os.path.exists(file_path):
logging.warning("No admin boundaries file available. Skipping import.")
logging.warning("No OSM admin boundaries file available. Skipping import.")
return

data = (
Expand All @@ -58,6 +43,67 @@ def import_admin_boundaries(
path_to_query_file="../sql/create_admin_boundary_geometries.sql",
)


def import_geonames_cities(spark, database_url, database_properties, file_path):
if not os.path.exists(file_path):
logging.warning("No GeoNames city names available. Skipping import.")
return

data = spark.read.parquet(file_path)

data.write.option("truncate", True).jdbc(
url=database_url,
table="admin_boundary_geonames_cities",
mode="overwrite",
properties=database_properties,
)


def import_admin_boundaries(
spark,
database_host,
database_port,
database_name,
database_url,
database_properties,
continent,
country,
country_region,
):
start_time = time.time()

logging.info(
f"Starting import of admin boundaries for "
f'{f"{country_region}, " if country_region else ""}{country}, {continent}'
)

script_dir = os.path.dirname(__file__)
file_path_geonames_cities = os.path.join(
script_dir, "../../../../tmp/kuwala/admin_boundary_files/cities_500.parquet"
)
file_path_osm_admin_boundaries = os.path.join(
script_dir,
f"../../../../tmp/kuwala/admin_boundary_files/{continent}/{country}"
f'{f"/{country_region}" if country_region else ""}/admin_boundaries.parquet',
)

import_geonames_cities(
spark=spark,
database_url=database_url,
database_properties=database_properties,
file_path=file_path_geonames_cities,
)
import_osm_admin_boundaries(
spark=spark,
database_host=database_host,
database_port=database_port,
database_name=database_name,
database_url=database_url,
database_properties=database_properties,
country=country,
file_path=file_path_osm_admin_boundaries,
)

logging.info(
f"Successfully imported admin boundaries for "
f'{f"{country_region}, " if country_region else ""}{country}, {continent} after '
Expand Down
14 changes: 12 additions & 2 deletions kuwala/core/database/transformer/dbt/dbt_project.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

# Name your project! Project names should contain only lowercase characters
# and underscores. A good package name should reflect your organization's
# name or the intended use of these models
Expand All @@ -25,6 +24,8 @@ clean-targets: # directories to be removed by `dbt clean`
- "dbt_modules"

vars:
# Country to match cities
country: 'MT'
# POI brand
focus_brand: 'focus_brand_id_prefix'
# Aggregation grid
Expand All @@ -43,10 +44,19 @@ vars:

models:
kuwala_core_transformer:
admin_boundaries:
cities:
+materialized: table
poi:
poi:
+materialized: table
poi_address_city:
+materialized: table
poi_address_country:
+materialized: table
poi_aggregated:
+materialized: table
poi_matched:
+materialized: table
poi_popularity_time_series:
+materialized: table
+materialized: table
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{% macro get_all_versions_of_city_names(country_code) %}
{% set query %}
SELECT DISTINCT geoname_id, unnest(ascii_name || alternate_names) AS name
FROM admin_boundary_geonames_cities
WHERE country_code = '{{ country_code }}'
{% endset %}

{{ return(query) }}
{% endmacro %}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{% macro match_admin_boundaries_with_city_names(country_code) %}
{% set all_versions_of_city_names = get_all_versions_of_city_names(country_code) %}

{% set query %}
SELECT id, geoname_id, levenshtein(ab.name, avocn.name) AS levenshtein_distance
FROM admin_boundary AS ab LEFT JOIN ({{ all_versions_of_city_names }}) AS avocn ON
ab.name = avocn.name OR
unaccent(ab.name) = avocn.name OR
ab.name LIKE avocn.name || '%' OR
unaccent(ab.name) LIKE avocn.name || '%'
{% endset %}

{{ return(query) }}
{% endmacro %}
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
SELECT city_name, geometry
FROM {{ ref('city_candidates') }}
INNER JOIN (
SELECT city_name AS city_name_best_match, MIN(min_levenshtein_distance) AS min_levenshtein_distance_best_match
FROM {{ ref('city_candidates') }}
GROUP BY city_name
) AS best_city_candidates ON
city_candidates.city_name = best_city_candidates.city_name_best_match AND
city_candidates.min_levenshtein_distance = best_city_candidates.min_levenshtein_distance_best_match
WHERE st_contains(geometry, st_setsrid(st_makepoint(candidate_longitude, candidate_latitude), 4326))
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
SELECT
abgc.ascii_name AS city_name,
ab.name AS admin_boundary_name,
min_levenshtein_distance,
ab.geometry AS geometry,
abgc.latitude AS candidate_latitude,
abgc.longitude AS candidate_longitude
FROM (
SELECT id, geoname_id, levenshtein_distance
FROM ({{ match_admin_boundaries_with_city_names(var('country')) }}) AS mabwcn
WHERE geoname_id IS NOT NULL AND levenshtein_distance < 10
) AS matched_cities
INNER JOIN (
SELECT id AS id_best_match, MIN(levenshtein_distance) AS min_levenshtein_distance
FROM ({{ match_admin_boundaries_with_city_names(var('country')) }}) AS mabwcn
WHERE geoname_id IS NOT NULL AND levenshtein_distance < 10
GROUP BY id
) AS matched_cities_min_levenshtein_distances ON
matched_cities.id = matched_cities_min_levenshtein_distances.id_best_match AND
matched_cities.levenshtein_distance = matched_cities_min_levenshtein_distances.min_levenshtein_distance
LEFT JOIN admin_boundary_geonames_cities AS abgc USING (geoname_id)
LEFT JOIN admin_boundary AS ab USING (id)
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@

version: 2

models:
# Cities
- name: cities
description: 'List of matched city names from GeoNames against OSM admin boundaries'
columns:
- name: city_name
description: 'City name from GeoNames'
tests:
- not_null
- name: geometry
description: 'Geometry based on matched admin boundary'
tests:
- not_null
# City candidates
- name: city_candidates
description: 'List of matched city names from GeoNames against OSM admin boundaries'
columns:
- name: city_name
description: 'City name from GeoNames'
tests:
- not_null
- name: admin_boundary_name
description: 'Name of matched admin boundary'
tests:
- not_null
- name: min_levenshtein_distance
description: 'Levenshtein distance of GeoNames city name and admin boundary name'
tests:
- not_null
- name: geometry
description: 'Geometry of matched admin boundary'
tests:
- not_null
2 changes: 1 addition & 1 deletion kuwala/core/database/transformer/dbt/models/poi/poi.sql
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
SELECT
poi_matched.*,
poi_address_city.name AS poi_address_city,
poi_address_city.city_name AS poi_address_city,
poi_address_country.name AS poi_address_country,
h3_to_parent(poi_h3_index::h3index, 8) AS poi_h3_index_res_8,
h3_to_parent(poi_h3_index::h3index, 9) AS poi_h3_index_res_9,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
SELECT poi_id, name, id
FROM admin_boundary AS ab, {{ ref('poi_matched') }} AS poi
WHERE
kuwala_admin_level = (SELECT max(kuwala_admin_level) FROM admin_boundary) AND
st_contains(ab.geometry, st_setsrid(st_makepoint(poi.longitude, poi.latitude), 4326))
SELECT DISTINCT poi_id, city_name
FROM {{ ref('cities') }} AS cities, {{ ref('poi_matched') }} AS poi
WHERE st_contains(cities.geometry, st_setsrid(st_makepoint(poi.longitude, poi.latitude), 4326))
12 changes: 4 additions & 8 deletions kuwala/core/database/transformer/dbt/models/poi/schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,25 +98,21 @@ models:
- accepted_values:
values: [ 15 ]
- name: poi_address_city
description: 'Name of the admin boundary with the highest kuwala_admin_level containing POI coordinates'
description: 'Name of the city containing POI coordinates based on matched GeoNames and OSM data'
- name: poi_address_country
description: 'Name of admin boundary having kuwala_admin_level 1 containing POI coordinates'

# POI city based on admin boundary
- name: poi_address_city
description: 'Admin boundary name with highest admin level containing POI coordinates'
description: 'City containing POI coordinates based on matched GeoNames and OSM data'
columns:
- name: poi_id
description: 'Unique ID referring to a POI'
tests:
- unique
- not_null
- name: name
description: 'Name of matching admin boundary'
tests:
- not_null
- name: id
description: 'ID of matching admin boundary'
- name: city_name
description: 'Name of matching city'
tests:
- not_null

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -391,4 +391,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}
Loading

0 comments on commit 8a68bb5

Please sign in to comment.