-
Notifications
You must be signed in to change notification settings - Fork 72
/
SQL - Data Cleaning.sql
204 lines (124 loc) · 4.76 KB
/
SQL - Data Cleaning.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
/*
Cleaning Data in SQL
Skills used : CREATE, UPDATE, SELECT, CTE, JOINS, OREDR BY, GROUP BY
*/
select *
from nashville_housing
--------------------------------------------------------------------------------------------------------------------------
/*Standardize Date Format*/
select `Sale Date`
from nashville_housing
select `Sale Date`, convert(`Sale Date`, date)
from nashville_housing
update nashville_housing
set `Sale Date`= convert(`Sale Date`, date)
--------------------------------------------------------------------------------------------------------------------------
/*Populate Property Address Data*/
select *
from nashville_housing
where `Property Address` is null
order by `Parcel ID`
select a.`Parcel ID` , b.`Property Address`, b.`Parcel ID` ,
b.`Property Address` ,
ifnull(a.`Property Address`, b.`Property Address`) as `Address To Be Filled`
from nashville_housing a
join nashville_housing b
on a.`Parcel ID` = b.`Parcel ID`
and a.Column1 != b.Column1
where a.`Property Address` is null
update a
set a.`Property Address` = ifnull(a.`Property Address`, b.`Property Address`)
from nashville_housing a
join nashville_housing b
on a.`Parcel ID` = b.`Parcel ID`
and a.Column1 != b.Column1
where a.`Property Address` is null
--------------------------------------------------------------------------------------------------------------------------
/*Breaking out Address into Individual Columns (Address, City, State)*/
select `Property Address`
from nashville_housing
select
substring(`Property Address` , 1, locate(',', `Property Address`)) - 1 as Address ,
substring(`Property Address` , locate(',', `Property Address`) + 1 , length(`Property Address`)) as Address,
from nashville_housing
alter table nashville_housing
set `Property Split Address` varchar(255);
update nashville_housing
set `Property Split Address` = substring(`Property Address` , 1, locate(',', `Property Address`)) - 1
alter table nashville_housing
set `Property City` varchar(255);
update nashville_housing
set `Property City` = substring(`Property Address` , locate(',', `Property Address`) + 1 , length(`Property Address`))
select `Owner Address`
from nashville_housing
--------------------------------------------------------------------------------------------------------------------------
/*CREATING A SPLIT STRING FUNCTION TO SPLIT THE OWNER ADDRESS*/
create function SPLIT_STR(
x varchar(255),
delim varchar(12),
pos int
)
return varchar(255)
return replace(substring(substring_index(x, delim, pos),
length(substring_index(x, delim, pos -1)) + 1),
delim, '');
select
SPLIT_STR(`Owner Address`, ',', 1),
SPLIT_STR(`Owner Address`, ',', 2),
SPLIT_STR(`Owner Address`, ',', 3)
from nashville_housing
alter table nashville_housing
set `Address` varchar(255);
update nashville_housing
set `Address` = SPLIT_STR(`Owner Address`, ',', 1)
alter table nashville_housing
set `City` varchar(255);
update nashville_housing
set `City` = SPLIT_STR(`Owner Address`, ',', 2)
alter table nashville_housing
set `State` varchar(255);
update nashville_housing
set `State` = SPLIT_STR(`Owner Address`, ',', 3)
--------------------------------------------------------------------------------------------------------------------------
/*Change Y and N to Yes and No in `Sold as Vacant` */
select distinct (`Sold As Vacant`), count(`Sold As Vacant`)
from nashville_housing
group by `Sold As Vacant`
order by `Sold As Vacant`
select `Sold As Vacant` ,
case
when `Sold As Vacant` = 'Y' then 'Yes'
when `Sold As Vacant` = 'N' then 'No'
end as `Sold As Vacant`
from nashville_housing
update nashville_housing
set `Sold As Vacant` = case
when `Sold As Vacant` = 'Y' then 'Yes'
when `Sold As Vacant` = 'N' then 'No'
end as `Sold As Vacant`
--------------------------------------------------------------------------------------------------------------------------
/*Remove Duplicate*/
with RowNumCTE
as
(
select *,
row_number() OVER(
partition by
`Parcel ID`,
`Property Address`,
`Sale Price`,
`Sale Date`,
`Legal Reference`
order by
Column1
) row_num
from nashville_housing
)
delete
from RowNumCTE
where row_num > 1;
--------------------------------------------------------------------------------------------------------------------------
/*Delete Unused Column*/
alter table nashville_housing
drop table `Owner Address`, `Tax District`, `Property Address`, `Sale Date`;
--------------------------------------------------------------------------------------------------------------------------