-
Notifications
You must be signed in to change notification settings - Fork 0
/
Data Wrangling Ex 1.Rmd
132 lines (109 loc) · 4.82 KB
/
Data Wrangling Ex 1.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
---
title: "DW Ex1 Basic Data Manipulation"
author: "Barry Davis"
date: "June 21, 2017"
output:
html_document:
code_folding: none
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE)
```
## Set up our environment
I'll be using some functions from dplyr and tidyr, so I will install the tidyverse package. This package includes dplyr and tidyr, along with other packages that can/may be helpful.
```{r}
library(tidyverse)
library(knitr)
library(kableExtra)
options(knitr.table.format = "html")
```
## Load the data into RStudio
Next, I load the data from the external Excel file that was provided into a data frame.
```{r}
prodPurchases <- read.csv("refine_original.csv", header = TRUE, sep = ",")
```
Let's take a peak and see what's inside.
```{r echo=FALSE}
kable(head(prodPurchases)) %>%
kable_styling(bootstrap_options = c("striped","hover", "condensed"))
```
## Time to clean it up
The names of the companies in this file were misspelled and had irregular capitalization. I will start by correcting those fields first. Regex can be a fickle beast, so I will first verify, then change.
```{r}
# Change to lowercase
prodPurchases$company <- tolower(prodPurchases$company)
# Test the regex, then change what matched for company names
test <- grep(pattern = ".*\\ps$", prodPurchases$company)
for(value in test) {print(prodPurchases[value,1])}
prodPurchases$company <- sub(pattern = ".*\\ps$", replacement = "Philips", x = prodPurchases$company)
test <- grep(pattern = "^ak.*", prodPurchases$company)
for(value in test) {print(prodPurchases[value,1])}
prodPurchases$company <- sub(pattern = "^ak.*", replacement = "Akzo", x = prodPurchases$company)
test <- grep(pattern = "^u.*", prodPurchases$company)
for(value in test) {print(prodPurchases[value,1])}
prodPurchases$company <- sub(pattern = "^u.*", replacement = "Unilever", x = prodPurchases$company)
test <- grep(pattern = "^v.*", prodPurchases$company)
for(value in test) {print(prodPurchases[value,1])}
prodPurchases$company <- sub(pattern = "^v.*", replacement = "Van Houten", x = prodPurchases$company)
```
```{r echo=FALSE}
prodPurchases$company
```
Just for good measure, let's make sure we don't have any left over.
```{r}
prodPurchases %>% count(company)
```
Next, the product codes and product numbers need to be separated to make viewing and working with this data easier.
```{r}
prodPurchases <- separate(prodPurchases, Product.code...number, c("product_code", "product_number"), sep = "-")
```
```{r echo=FALSE}
kable(head(select(prodPurchases, product_code, product_number))) %>%
kable_styling(bootstrap_options = c("striped","hover", "condensed"))
```
## What's missing?
Now I will add a column for product_category that's easier to read.
```{r}
prodPurchases$product_category <- sub(pattern = "^p$", replacement = "Smartphone", x = sub("^x$", "Laptop", sub("^v$", "TV", sub("^q$", "Tablet", prodPurchases$product_code))))
```
```{r echo=FALSE}
kable(head(select(prodPurchases, product_code, product_category))) %>%
kable_styling(bootstrap_options = c("striped","hover", "condensed"))
```
Now I'll add a column for geocoding called full_address
```{r}
prodPurchases <- prodPurchases %>%
mutate(full_address = paste(address, city, country, sep = ","))
```
```{r echo=FALSE}
kable(head(select(prodPurchases, address, city, country, full_address))) %>%
kable_styling(bootstrap_options = c("striped","hover", "condensed"))
```
I will also add 8 more columns, all binary representing what companies or products are referenced in that record.
```{r}
prodPurchases <- mutate(prodPurchases, company_philips = ifelse(company == "Philips", 1, 0))
prodPurchases <- mutate(prodPurchases, company_akzo = ifelse(company == "Akzo", 1, 0))
prodPurchases <- mutate(prodPurchases, company_van_houten = ifelse(company == "Van Houten", 1, 0))
prodPurchases <- mutate(prodPurchases, company_unilever = ifelse(company == "Unilever", 1, 0))
prodPurchases <- mutate(prodPurchases, product_smartphone = ifelse(product_category == "Smartphone", 1, 0))
prodPurchases <- mutate(prodPurchases, product_tv = ifelse(product_category == "TV", 1, 0))
prodPurchases <- mutate(prodPurchases, product_laptop = ifelse(product_category == "Laptop", 1, 0))
prodPurchases <- mutate(prodPurchases, product_tablet = ifelse(product_category == "Tablet", 1, 0))
```
```{r echo = FALSE}
kable(prodPurchases %>%
select(company, company_philips, company_akzo, company_unilever, company_van_houten) %>%
head()) %>%
kable_styling(bootstrap_options = c("striped","hover", "condensed"))
```
```{r echo=FALSE}
kable(prodPurchases %>%
select(product_category, product_smartphone, product_tv, product_laptop, product_tablet) %>%
head()) %>%
kable_styling(bootstrap_options = c("striped","hover", "condensed"))
```
## Finish up!
Finally, write a new .csv file for review
```{r}
write.csv(prodPurchases, "refine_clean.csv")
```