thesis_data_analysis.py
# -*- coding: utf-8 -*-
"""
Created on Fri May 10 00:07:36 2019

@author: Sampo Vesanen

Sampo Vesanen's MSc thesis survey results data processing
"Parking of private cars and spatial accessibility in Helsinki Capital Region"

This script was created using the Anaconda Python Distribution 2020.02. In
addition to all packages delivered with Anaconda, GeoPandas 0.5.0 and its
dependencies were installed for this work. Unfortunately I can't guarantee
smooth execution of this code on newer package versions.

Please note that this script uses "respondents" and "IP addresses"
interchangeably. For the most part a unique IP address can be considered to be
a unique person, but this can't be verified, as households and public venues
share a Wi-Fi connection and therefore an IP address. For simplicity this
script assumes a unique IP address is a unique person.
"""
import os
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from datetime import timedelta
from shapely.geometry import Polygon, MultiPolygon, Point, LinearRing
from rtree import index
import random
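
# Added sanity check (not part of the original workflow): the module docstring
# pins GeoPandas 0.5.0, so print a warning if a different version is in use.
# This only warns and does not stop execution.
if gpd.__version__ != "0.5.0":
    print("Warning: this script was written against GeoPandas 0.5.0, but "
          f"version {gpd.__version__} is installed. Results may differ.")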
# Working directories
wd = r"C:\Sampon\Maantiede\Master of the Universe\python"
# Survey data file paths
records_data = r"thesis_data_python\records.csv"
visitors_data = r"thesis_data_python\visitors.csv"
# Spatial data paths
grid_path = r"thesis_data_python\MetropAccess_YKR_grid_EurefFIN.shp"
resarea_path = r"thesis_data_python\pno_dissolve.shp"
postal_path = r"thesis_data_python\pno_tilasto_2019.shp"
ykr_path = r"thesis_data_python\YKRVyohykkeet2017.shp"
artificial_path = r"thesis_data_python\clc2018_level1_dissolve.shp"
# Import functions and lists of zipcodes. This has to be done after changing
# the working directory with os.chdir()
os.chdir(wd)
from thesis_data_analysis_funcs import *
from thesis_data_zipcodes import *
### 1 IMPORT DATA -------------------------------------------------------------
# Survey data. Lambda x to preserve leading zeros
records = pd.read_csv(records_data,
                      converters={"zipcode": lambda x: str(x)},
                      index_col=[0])
visitors = pd.read_csv(visitors_data, index_col=[0])
# Shapefiles
grid = gpd.read_file(grid_path, encoding="utf-8")
postal = gpd.read_file(postal_path, encoding="utf-8")
# CORINE land cover 2018, 25ha, use only level 1, artificial surfaces. This
# script uses a shapefile preprocessed in QGIS.
artificial = gpd.read_file(artificial_path, encoding="utf-8")
# Import urban zones shapefile and select only relevant area for this study
resarea = gpd.read_file(resarea_path, encoding="utf-8").buffer(500).unary_union
ykr_vyoh = gpd.read_file(ykr_path, encoding="utf-8")
ykr_vyoh = gpd.overlay(ykr_vyoh,
                       gpd.GeoDataFrame(geometry=[resarea], crs=ykr_vyoh.crs),
                       how="intersection")
### 2 PREPROCESSING -----------------------------------------------------------
### A) Survey data
# Timestamps to datetime format
records["timestamp"] = pd.to_datetime(
records.timestamp, format="%d-%m-%Y %H:%M:%S")
visitors["ts_first"] = pd.to_datetime(
visitors.ts_first, format="%Y-%m-%d %H:%M:%S")
visitors["ts_latest"] = pd.to_datetime(
visitors.ts_latest, format="%Y-%m-%d %H:%M:%S")
# visitors, records: remove three responses entered by the author and two of
# the author's home IP addresses. The author's visits from shared IP addresses
# can't be identified in the data.
records = records.drop([0, 5, 6])
visitors = visitors.drop([0, 3753])
# I was contacted on 24.5.2019 at 22.41 with the message that a respondent had
# erroneously submitted three responses: for Kallio, Käpylä and one other
# (probably Pukinmäki-Savela). Remove these records.
records = records.drop([1277, 1278, 1279])
# We have now removed all responses confirmed to be erroneous. Reset the index
# of both DataFrames.
records = records.reset_index().drop(columns=["index"])
visitors = visitors.reset_index().drop(columns=["index"])
# Alert if records contains IP codes which do not exist in visitors
diff = list(set(records.ip) - set(visitors.ip))

if len(diff) != 0:
    print(f"\nNB! The following IP codes exist only in records: {diff}")
    print("These unique IP codes sent in total "
          f"{len(records[records.ip.isin(diff)])} records. This affects"
          " statistics only minimally but acknowledge this discrepancy"
          " anyway.")
### B) Paavo postal area code data
# This process is unnecessarily complicated. In hindsight I would just create
# the layer in QGIS, but as the script is already finished, we shall use it.
# Process Helsinki Capital Region GeoDataFrame from PAAVO data
muns = ["091", "049", "092", "235"]
postal = postal[postal.kunta.isin(muns)]
postal = postal.reset_index().drop(columns=["index", "euref_x", "euref_y"])
postal = postal.rename(columns={"posti_alue": "zipcode"})
# Remove islands unreachable by car from the mainland. This includes islands
# connected to the mainland but closed to the general public or privately
# owned. These exceptions include
# - Staffan in 02360 Soukka
# - Mustasaari in 00340 Kuusisaari-Lehtisaari
# - Seurasaari and Pukkisaaret in 00250 Taka-Töölö
# - various islands reachable only by ferry
# I decided to preserve Suomenlinna and Korkeasaari in the data.
# The lists below state how many of the largest Polygons to preserve in each
# island-containing postal code area
preserveLargest = ["00210", "00860", "00840", "00850", "00870", "00590"]
preserveTwoLargest = ["00340", "00200", "00890"]
preserveAll = ["00330", "00570", "02100"]
# Suvisaaristo needs special attention. Define a Polygon to help detect
# areas in Suvisaaristo reachable by car
suvisaar = Polygon([(371829.79632436, 6668010.26942102),
                    (372652.78908073, 6667612.58164613),
                    (371343.73348838, 6666204.10411005),
                    (372923.43770531, 6665381.11135368),
                    (374635.70451388, 6667452.40184791),
                    (372078.35118367, 6668800.12152949),
                    (371829.79632436, 6668010.26942102)])
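
# For reference, a minimal sketch of what the imported helper preserve_nlargest
# (from thesis_data_analysis_funcs) is assumed to do: keep the n largest
# Polygons of a row's geometry. This is a hypothetical reimplementation kept
# under a different name so it does not shadow the imported function, and it
# is not used below.
def _preserve_nlargest_sketch(row, n):
    geoms = (list(row.geometry) if row.geometry.geom_type == "MultiPolygon"
             else [row.geometry])
    geoms = sorted(geoms, key=lambda g: g.area, reverse=True)
    return MultiPolygon(geoms[:n])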
# Get mainland
mainland = max(postal.unary_union, key=lambda a: a.area)
# Process islands unreachable by car
for idx, geom in enumerate(postal.geometry):
    # Preserve the two largest Polygons
    if postal.loc[idx, "zipcode"] in preserveTwoLargest:
        thisGeom = preserve_nlargest(postal.loc[idx], 2)
        mainland = mainland.union(thisGeom)

    # Preserve only the largest island Polygon
    if postal.loc[idx, "zipcode"] in preserveLargest:
        if geom.geom_type == "MultiPolygon":
            largest = max(geom, key=lambda a: a.area)
        else:
            largest = geom
        mainland = mainland.union(largest)

    # Case Munkkiniemi 00330, Tapiola 02100 and Kulosaari 00570, preserve the
    # entire MultiPolygon
    if postal.loc[idx, "zipcode"] in preserveAll:
        thisGeom = postal.loc[idx, "geometry"]
        mainland = mainland.union(thisGeom)

    # Case Suvisaaristo 02380, preserve the parts that intersect the helper
    # Polygon defined above
    if postal.loc[idx, "zipcode"] == "02380":
        match = postal.loc[idx, "geometry"]
        preserve = MultiPolygon(
            [pol for pol in match if pol.intersects(suvisaar)])
        mainland = mainland.union(preserve)

    # Case Taka-Töölö 00250, preserve the largest feature and Rajasaari (the
    # third largest feature)
    if postal.loc[idx, "zipcode"] == "00250":
        thisGeom = postal.loc[idx, "geometry"]
        thisGeom = sorted(thisGeom, key=lambda a: a.area, reverse=True)
        thisGeom = MultiPolygon([thisGeom[0], thisGeom[2]])
        mainland = mainland.union(thisGeom)

    # Case Kuusisaari-Lehtisaari 00340, preserve the three largest Polygons.
    # This removes Mustasaari and Leppäluoto
    if postal.loc[idx, "zipcode"] == "00340":
        thisGeom = preserve_nlargest(postal.loc[idx], 3)
        mainland = mainland.union(thisGeom)

    # Case Suomenlinna 00190, preserve the four largest Polygons (remove Lonna)
    if postal.loc[idx, "zipcode"] == "00190":
        thisGeom = preserve_nlargest(postal.loc[idx], 4)
        mainland = mainland.union(thisGeom)

    # Case Tammisalo 00830, preserve only the second largest Polygon
    if postal.loc[idx, "zipcode"] == "00830":
        thisGeom = postal.loc[idx, "geometry"]
        thisGeom = sorted(thisGeom, key=lambda a: a.area, reverse=True)
        thisGeom = Polygon(thisGeom[1])
        mainland = mainland.union(thisGeom)
# Replace postal geometries with geometries without islands
# NB! postal.at[idx, "geometry"] = thisGeom seems to raise a ValueError in
# GeoPandas 0.6.0, complaining "Must have equal len keys and value when
# setting with an iterable". I will remain on GeoPandas 0.5.0 for this thesis.
for idx, geom in enumerate(postal.geometry):
    if geom.geom_type == "MultiPolygon":
        thisGeom = MultiPolygon(
            [pol for pol in geom if pol.intersects(mainland.buffer(-10))])
        if not thisGeom.is_empty:
            postal.at[idx, "geometry"] = thisGeom
### C) Insert zipcode data into DataFrame grid
# My survey data and Travel Time Matrix 2018 operate in different spatial
# units. TTM utilises MetropAccess-YKR-grid with each cell 250 x 250 meters,
# while my data was gathered with PAAVO zipcode delineations as the base unit.
# In order to compare TTM and my findings I need to add postal area code data
# to the GeoDataFrame "grid". This enables me to assign each postal code area
# with mean data about "parktime" and "walktime".
# The for loop below assigns to each cell the zipcode that covers the largest
# share of the cell's area. The Helsinki Capital Region has a different shape
# in grid and in postal, so the loop will not reach cells outside of postal;
# those cells are added to grid_areas after the loop.
# Establish dataframe to collect zipcode area data
grid_areas = gpd.GeoDataFrame(columns=["YKR_ID", "zipcode", "largest_area",
                                       "total_area"])
crs = grid.crs
grid_sindex = grid.sindex # GeoPandas spatial index
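
# For reference, a minimal sketch of what the imported helper spatialIndexFunc
# (from thesis_data_analysis_funcs) is assumed to do: use the spatial index to
# prefilter candidate cells by bounding box, then keep only those that truly
# intersect the geometry. Hypothetical reimplementation, not used below.
def _spatial_index_func_sketch(sindex, gdf, geometry):
    candidate_idx = list(sindex.intersection(geometry.bounds))
    candidates = gdf.iloc[candidate_idx]
    return candidates[candidates.intersects(geometry)]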
# Use i to create new rows in GeoDataFrame grid_areas
i = 0
for postalarea in postal.itertuples():
    precise_matches = spatialIndexFunc(grid_sindex, grid, postalarea.geometry)

    # Turn the current zipcode into a GeoDataFrame
    thisZip = gpd.overlay(precise_matches,
                          gpd.GeoDataFrame(geometry=[postalarea.geometry],
                                           crs=crs),
                          how="intersection")
    thisZip["area1"] = thisZip.area

    # Iterate through all rows in the current zipcode and see whether each
    # YKR_ID already exists in GeoDataFrame "grid_areas"
    for row in thisZip.itertuples():

        # This YKR_ID does not yet exist in "grid_areas"
        if grid_areas[grid_areas.YKR_ID.isin([row.YKR_ID])].empty:
            grid_areas.loc[i, "YKR_ID"] = row.YKR_ID
            grid_areas.loc[i, "zipcode"] = postalarea.zipcode
            grid_areas.loc[i, "largest_area"] = row.area1
            grid_areas.loc[i, "total_area"] = row.area1
            i += 1

        # The current YKR_ID was found in "grid_areas"
        else:
            # This is the location of the pre-existing YKR_ID
            found_idx = grid_areas.index[grid_areas.YKR_ID == row.YKR_ID][0]

            # The YKR_ID was already seen in a previous postal area iteration.
            # Update largest_area and zipcode only if the current area is
            # larger than before, and always add to total_area
            if row.area1 > grid_areas.loc[found_idx, "largest_area"]:
                grid_areas.loc[found_idx, "largest_area"] = row.area1
                grid_areas.loc[found_idx, "zipcode"] = postalarea.zipcode
            grid_areas.loc[found_idx, "total_area"] += row.area1
# Find out which YKR_IDs in GeoDataFrame "grid" do not occur in GeoDataFrame
# "grid_areas". These cells are outside of the PAAVO postal code area boundaries
notPresent = list(np.setdiff1d(list(grid.YKR_ID), list(grid_areas.YKR_ID)))

# Add these cells to GeoDataFrame grid_areas as zipcode 99999
for cell_id in notPresent:
    grid_areas = grid_areas.append(
        {"YKR_ID": np.int64(cell_id),
         "zipcode": "99999"},
        ignore_index=True)
# Incorporate "grid_areas" to the original GeoDataFrame "grid"
grid = pd.concat(
    [grid.set_index("YKR_ID"), grid_areas.set_index("YKR_ID")],
    axis=1,
    join="inner").reset_index()
### 3 DETECTION OF ILLEGAL DATA -----------------------------------------------
### A) Respondent behaviour
# Create groupby aggregations of the records. One can use this DataFrame to
# see how each respondent (or, more precisely, each IP address) answered the
# survey
visitors_grp = records.groupby("ip").agg({
    "id": "count",
    "timestamp": lambda x: x.tolist(),
    "zipcode": lambda x: x.tolist(),
    "likert": lambda x: x.tolist(),
    "parkspot": lambda x: x.tolist(),
    "parktime": lambda x: x.tolist(),
    "walktime": lambda x: x.tolist(),
    "timeofday": lambda x: x.tolist()})
visitors_grp = visitors_grp.rename(columns={"id": "amount"})
# Export respondent behaviour to xlsx
visitors_grp.to_excel("visitors_grouped.xlsx")
### B) Detect duplicate data
# Erase the contents of duplicates.txt before writing anything (wipe old data).
# Wrap the following for loop in a "with" statement. This lets us write the
# duplicates report as a text file in the working directory for easy viewing.
# Set pandas max_colwidth to show even the lengthiest rows; the pandas default
# is 50.
pd.options.display.max_colwidth = 100
open("duplicates.txt", "w").close()
# Detect if a respondent has answered the same area multiple times. Please note
# that this only locates duplicates and does nothing about them. I will leave
# the discussion about this to the thesis.
with open(os.path.join(wd, "duplicates.txt"), "a+") as f:
    for visitor in records.ip.unique():
        theseRecords = records.loc[records.ip == visitor]

        if not theseRecords.zipcode.is_unique:
            f.write("\nDuplicates detected for ip code "
                    f"'{theseRecords.ip.unique()[0]}'\n\n")
            dupl = theseRecords[theseRecords.zipcode.duplicated(keep=False)]
            dupl = dupl.groupby("zipcode").agg(
                {"id": lambda x: x.tolist(),
                 "timestamp": lambda x: x.tolist(),
                 "ip": "first",
                 "zipcode": "first",
                 "likert": lambda x: identicaltest(x),
                 "parkspot": lambda x: identicaltest(x),
                 "parktime": lambda x: identicaltest(x),
                 "walktime": lambda x: identicaltest(x),
                 "timeofday": lambda x: identicaltest(x)})

            # Write the current rows to the text file
            for idx, row in dupl.drop(["ip"], axis=1).iterrows():
                f.write(f"{row.to_string()}\n\n")
print("\nA report on duplicate answers saved to disk in path "
f"{os.path.join(wd, 'duplicates.txt')}\n")
### C) Remove illegal answers
# Any value equal to or above 60 minutes is deemed illegal in columns
# "parktime" and "walktime".
illegal_df = pd.DataFrame(columns=records.columns)

for row in records.itertuples():
    if row.parktime >= 60 or row.walktime >= 60:
        illegal_df = illegal_df.append(records.iloc[row.Index])
print("\nThe following illegal records were found (value >= 60):")
print("\n", illegal_df[["parktime", "walktime"]])
# At this point one may choose to keep the illegal answers by leaving the drop
# below commented out. This saves DataFrame records, illegal answers and all,
# for analysis in R: in the Analysis shinyapp users can control the threshold
# for illegal answers themselves, with 59 minutes as the default setting, which
# produces the same data as dropping them here would. Consequently, do not add
# the length of "illegal_df" to "invalid" and do not delete the illegal values
# from the DataFrame.
# The value "invalid" starts at 6 because at this point I have removed 3 of my
# own records and 3 others which were reported to me as erroneous. See the
# survey data preprocessing in part 2 A) above.
invalid = 6 + len(illegal_df)

# Use the indices of "illegal_df" to drop rows from "records"
#records = records.drop(illegal_df.index).reset_index(drop=True)
### 4 ADD VARIOUS DATA INTO POSTAL --------------------------------------------
### A) Prepare
# Reclassify "ykr_vyoh". This reclassification follows Finnish Environment
# Institute website (Only available in Finnish)
# https://www.ymparisto.fi/fi-FI/Elinymparisto_ja_kaavoitus/Yhdyskuntarakenne/Tietoa_yhdyskuntarakenteesta/Yhdyskuntarakenteen_vyohykkeet
ykr_vyoh["vyohselite"] = ykr_vyoh.vyohselite.replace(
{"keskustan reunavyöhyke/intensiivinen joukkoliikenne":
"keskustan reunavyöhyke",
"keskustan reunavyöhyke/joukkoliikenne":
"keskustan reunavyöhyke",
"alakeskuksen jalankulkuvyöhyke/intensiivinen joukkoliikenne":
"alakeskuksen jalankulkuvyöhyke",
"alakeskuksen jalankulkuvyöhyke/joukkoliikenne":
"alakeskuksen jalankulkuvyöhyke"})
# Add additional columns
postal["ykr_kesk_jalan"] = 0
postal["ykr_kesk_reuna"] = 0
postal["ykr_int_joukko"] = 0
postal["ykr_joukkoliik"] = 0
postal["ykr_alakesk_jalan"] = 0
postal["ykr_autovyoh"] = 0
postal["ykr_novalue"] = 0
postal["artificial"] = 0
### B) Add answer count
# Calculate column "answer_count". Answer count for each zipcode area
answ = records.zipcode.value_counts().rename("answer_count")
postal = postal.join(answ, on="zipcode")
### C) Add mean and median for parktime and walktime
# Calculate mean and median for columns "parktime" and "walktime". Join by
# column "zipcode"
roundmean = lambda x: round(x.mean(), 2)
calcs = records.groupby("zipcode").agg(
    parktime_mean=("parktime", roundmean),
    parktime_median=("parktime", "median"),
    walktime_mean=("walktime", roundmean),
    walktime_median=("walktime", "median"))
postal = postal.join(calcs, on="zipcode")
# Prepare the annotation of Polygon features
postal["coords"] = polygonCoordsToTuple(postal)
### D) Calculate percentage of YKR zones in each postal code area
# Overlay "yhdyskuntarakenteen vyöhykkeet" on DataFrame "postal" and see the
# percentage how much of each zone is included in the zipcode in question.
# Together with checking out how much artificial surface is there in research
# area we could find some interesting results in the statistical analysis.
# Dictionary key to help allocation of values
dictKey = {"keskustan jalankulkuvyöhyke": "ykr_kesk_jalan",
"keskustan reunavyöhyke": "ykr_kesk_reuna",
"intensiivinen joukkoliikennevyöhyke": "ykr_int_joukko",
"joukkoliikennevyöhyke": "ykr_joukkoliik",
"alakeskuksen jalankulkuvyöhyke": "ykr_alakesk_jalan",
"autovyöhyke": "ykr_autovyoh"}
# "geom_grouper" prevents multiple instances of the same zones in the dict
geom_grouper = lambda x: x.unary_union
for row in postal.itertuples():

    # Intersect the current Polygon with the urban zones shapefile
    thisPol = gpd.GeoDataFrame(geometry=[row.geometry], crs=ykr_vyoh.crs)
    thisIntersect = gpd.overlay(thisPol, ykr_vyoh, how="intersection")

    # Groupby ensures each zone appears only once
    group = thisIntersect.groupby("vyohselite", as_index=False)["geometry"].agg(
        {"geometry": geom_grouper})

    # Calculate Polygon areas and determine the index order that sorts the
    # zones from largest to smallest
    area_order = list(
        group["geometry"].map(lambda p: p.area).sort_values(ascending=False).index)
    group = group.reindex(area_order)
    group = group.reset_index()

    # Build a list of dicts for this zipcode: zone name -> share of the
    # zipcode area
    thisDict = [{
        thisRow.vyohselite:
            round(thisRow.geometry.area / thisPol.unary_union.area, 3)
        } for thisRow in group.itertuples()]

    # Iterate thisDict, a list of dictionaries, to insert the values into
    # "postal"
    for item in thisDict:
        for k, v in item.items():
            # In .loc the square brackets select the correct row, then dictKey
            # tells which column to write to. Finally assign the current
            # value v
            postal.loc[postal.zipcode == row.zipcode, dictKey[k]] = v
# Calculate column "ykr_novalue"
postal["ykr_novalue"] = postal.apply(
lambda row: 1 - (row.ykr_kesk_jalan + row.ykr_kesk_reuna +
row.ykr_int_joukko + row.ykr_joukkoliik +
row.ykr_alakesk_jalan + row.ykr_autovyoh), axis=1)
# Make sure rounding does not cause trouble in the calculation above
postal.loc[postal["ykr_novalue"] < 0, "ykr_novalue"] = 0
### E) Calculate percentage of CORINE land cover artificial surface in each
# zipcode area
# Reproject the geometries by replacing the values with projected ones
artificial = artificial.to_crs(epsg=3067)
crs = artificial.crs
# Employ the GeoPandas spatial index. Reducing the extent over which the
# intersections are processed greatly trims down the time needed for the
# calculation.
artificial_sindex = artificial.sindex
for row in postal.itertuples():
    thisPol = gpd.GeoDataFrame(geometry=[row.geometry], crs=crs)
    thisGeom = thisPol.loc[0, "geometry"]
    precise_matches = spatialIndexFunc(artificial_sindex, artificial, thisGeom)

    # precise_matches may be empty, so check for that
    if not precise_matches.empty:
        thisIntersect = gpd.overlay(thisPol, precise_matches, how="intersection")

        if not thisIntersect.empty:
            artific_percent = thisIntersect.unary_union.area / thisPol.area

            # Append artificial surface data to column "artificial"
            postal.loc[postal.zipcode == row.zipcode, "artificial"] = round(
                artific_percent[0], 3)
### 5 MERGE YKR ZONES, ARTIFICIAL AND SUBDIV INTO RECORDS -------------------------
# Add generalised values of "ykr_vyoh" and "artificial" into records for further
# statistical analysis (this helps ggplot2, the plotting library in R). Also
# add subdivision data.
### A) CORINE Land Cover 2018 artificial surface
# Merge artificial surface data from "postal", then rename to differentiate
# numerical data "artificial_vals" and Jenks classified "artificial"
records = pd.merge(records, postal[["zipcode", "artificial"]], on="zipcode")
records = records.rename(columns={"artificial": "artificial_vals"})
# Calculate Jenks breaks for "artificial". Use the breaks to reclassify values
# in records. We will use code created by GitHub user Drewda. The breaks are
# generated from "postal", so that repeated rows of "records.artificial" do
# not interfere with the class break calculations.
breaks = getJenksBreaks(postal.artificial.tolist(), 5)
# See the breaks on plot
plt.figure(figsize=(10, 8))
hist1 = plt.hist(postal.artificial, bins=100, align="left", color="g")
for b in breaks:
    plt.vlines(b, ymin=0, ymax=max(hist1[0]))
# new n=167: [0, 0.166, 0.443, 0.69, 0.896, 1.0]
# old, sheesh, n=5216: [0, 0.385, 0.647, 0.815, 0.927, 1.0]
# Reclassify using this clumsy nested np.where(). Use values calculated in
# getJenksBreaks()
records["artificial"] = np.where(
records.artificial_vals < breaks[1],
"Scarcely built",
(np.where(records.artificial_vals < breaks[2],
"Some built",
(np.where(records.artificial_vals < breaks[3],
"Moderately built",
(np.where(records.artificial_vals < breaks[4],
"Predominantly built",
(np.where(records.artificial_vals <= breaks[5],
"Fully built",
"novalue"))))))))).tolist()
# Get amount of artificial classes for thesis
#records.groupby("zipcode").first().groupby("artificial").size()
### B) YKR zones
# Use largest percentage for a class in each zipcode
ykrColumns = ["ykr_alakesk_jalan", "ykr_autovyoh", "ykr_int_joukko",
              "ykr_joukkoliik", "ykr_kesk_jalan", "ykr_kesk_reuna", "ykr_novalue"]
# Create DataFrame with postal codes and the largest value from postal columns
# stated in "ykrColumns". This value will tell us how much of the postal code
# area is of the largest YKR zone (zone name in "ykr_zone")
largestYkrVals = postal[["zipcode"]].join([postal[ykrColumns].max(axis=1)])
largestYkrVals = largestYkrVals.rename(columns={0: "ykr_zone_vals"})
# Create DataFrame with postal codes and reclassified YKR zones. idxmax()
# fetches the name of the column where the largest value of each row resides,
# so column "ykr_zone" tells which YKR zone covers the largest share of each
# postal code area (the share itself is in "ykr_zone_vals")
largestYkr = postal[["zipcode"]].join([postal[ykrColumns].idxmax(axis=1)])
largestYkr = largestYkr.rename(columns={0: "ykr_zone"})
# Merge will spread values of column "ykr_zone_vals" and "ykr_zone" over all
# occurrences of the specific zipcodes
records = pd.merge(records, largestYkrVals, on="zipcode")
records = pd.merge(records, largestYkr, on="zipcode")
# Reverse dictKey, add entry for "ykr_novalue". Then rename ykr zone records
# to human readable versions
dictKey = {v: k for k, v in dictKey.items()}
dictKey["ykr_novalue"] = "novalue"
records["ykr_zone"] = records.ykr_zone.map(dictKey)
### C) Municipality subdivisions
# Some postal code areas contain localities which are in conflict with the
# subdivisions. For example, zipcode Lippajärvi-Järvenperä 02940 contains the
# localities Högnäs (belongs to Vanha-Espoo) and Lippajärvi (belongs to
# Suur-Leppävaara). In these cases I use my own judgement for the
# classification. The main problem areas are just south and north of
# Kauniainen (for example Lippajärvi-Järvenperä and Sepänkylä-Kuurinniitty)
# and just south of Helsinki-Vantaa airport.
# View further information about subdivisions in the script
# "thesis_data_zipcodes.py".
# Create column for subdivision name in DataFrame "records"
records["subdiv"] = 0
# Using dictionary subdiv_dict and lists of zipcodes from
# "thesis_data_zipcodes.py", assign subdivision names to dataframe "records"
for varname, fullname in subdiv_dict.items():
    records.loc[records.zipcode.isin(eval(varname)), "subdiv"] = fullname
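
# Added note: eval() works here because the zipcode lists are module-level
# names brought in by the star import from thesis_data_zipcodes. Assuming that
# holds, a safer equivalent would be to look the names up via globals():
#for varname, fullname in subdiv_dict.items():
#    records.loc[records.zipcode.isin(globals()[varname]), "subdiv"] = fullname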
#### At this point the Python data processing for my thesis is complete. ####
### PRELIMINARY STATISTICS ----------------------------------------------------
statistics = Stats(records, postal, visitors, invalid)
statistics.calculateStats()
### EXPORT TO SHP -------------------------------------------------------------
# Postal data to QGIS for thesis visualisation. Exclude the last column,
# "coords": GeoPandas does not like the tuple format
#postal.iloc[:, :-1].to_file("postal_for_qgis.shp")
### EXPORT TO CSV -------------------------------------------------------------
# Data to csv for more processing in R.
#records.to_csv("records_for_r.csv", encoding="Windows-1252")
#postal.to_csv("postal_for_r.csv", encoding="Windows-1252")
#visitors.to_csv("visitors_for_r.csv", encoding="Windows-1252")
#grid.iloc[:,[0, 4]].to_csv("grid_for_r.csv", encoding="Windows-1252")
# Data export for shinyapps.io. There are huge problems with character encoding
# down the line if this ASCII conversion is not done here.
#import unicodedata
#records["subdiv"] = records.subdiv.apply(lambda val: unicodedata.normalize("NFKD", val).encode("ascii", "ignore").decode())
#records["ykr_zone"] = records.ykr_zone.apply(lambda val: unicodedata.normalize("NFKD", val).encode("ascii", "ignore").decode())
#records.to_csv("shinyrecords.csv", encoding="utf-8")
#postal["nimi"] = postal.nimi.apply(lambda val: unicodedata.normalize("NFKD", val).encode("ascii", "ignore").decode())
#postal["namn"] = postal.namn.apply(lambda val: unicodedata.normalize("NFKD", val).encode("ascii", "ignore").decode())
#postal.to_csv("shinypostal.csv", encoding="utf-8")