Skip to content

Commit

Permalink
tidymodels 복습좀 하자 & 데이콘은 정답 제출이 왜 안될까??
Browse files Browse the repository at this point in the history
  • Loading branch information
rhyeu committed Apr 23, 2021
1 parent a22e0c7 commit be8c8dd
Show file tree
Hide file tree
Showing 4 changed files with 2,249 additions and 798 deletions.
Binary file added .RDataTmp
Binary file not shown.
130 changes: 94 additions & 36 deletions study-presentation/sk-rhyeu-apr-24/solar_power_baseline.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ needs(lubridate)
needs(tidymodels)
needs(fpp2)
needs(parallel)
needs(ranger)
```

# 대회 : 동서발전 태양광 발전량 예측 AI 경진대회
Expand All @@ -53,16 +54,23 @@ here()

```{r}
## data read
dangjin_fcst = fread(here("competition/dacon_solar_power", file_list[1]))
dangjin_obs = fread(here("competition/dacon_solar_power", file_list[2])
, encoding = "UTF-8")
energy = fread(here("competition/dacon_solar_power", file_list[3]))
sample_submision = fread(here("competition/dacon_solar_power", file_list[4]))
site_info = fread(here("competition/dacon_solar_power", file_list[5])
, encoding = "UTF-8")
ulsan_fcst = fread(here("competition/dacon_solar_power", file_list[6]))
ulsan_obs = fread(here("competition/dacon_solar_power", file_list[7])
, encoding = "UTF-8")
# Load the seven competition tables from competition/dacon_solar_power.
# NOTE(review): `file_list` is defined outside this view; the index-to-table
# mapping below is assumed from the variable names -- confirm its order.
dangjin_fcst <- fread(here("competition/dacon_solar_power", file_list[1]))
# The three tables below are read with an explicit UTF-8 encoding.
dangjin_obs <- fread(
  here("competition/dacon_solar_power", file_list[2]),
  encoding = "UTF-8"
)
energy <- fread(here("competition/dacon_solar_power", file_list[3]))
sample_submision <- fread(here("competition/dacon_solar_power", file_list[4]))
site_info <- fread(
  here("competition/dacon_solar_power", file_list[5]),
  encoding = "UTF-8"
)
ulsan_fcst <- fread(here("competition/dacon_solar_power", file_list[6]))
ulsan_obs <- fread(
  here("competition/dacon_solar_power", file_list[7]),
  encoding = "UTF-8"
)
```

- site_info.csv - 발전소 정보
Expand Down Expand Up @@ -239,25 +247,12 @@ energy_data %>% group_by(site, hour) %>% count() %>%
```
```{r}
energy_data %>% group_by(site) %>% summarize(min = min(hour),
max = max(hour))
# Smallest and largest `hour` value present for each site.
summarize(
  group_by(energy_data, site),
  min = min(hour),
  max = max(hour)
)
```



```
energy %>%
# 예측 대상이 되는 값만 확인
filter(
(site == 'dangjin_floating' & power >= 100) |
(site == 'dangjin_warehouse' & power >= 70) |
(site == 'dangjin' & power >= 100) |
(site == 'ulsan' & power >= 50)
) %>%
ggplot(aes(x = month, y = power, fill = site)) +
geom_boxplot() #+
#facet_wrap(~site)
```
## base line modeling : group mean method - 39.58257 (52~72등)

1. 시간대별 group mean
Expand Down Expand Up @@ -329,14 +324,18 @@ write.csv(sample_submision,



#머신러닝 모델
- 2020년 2월~2021년 1월을 validation set으로 지정하려고 해서 initial_time_split이나
- 2. 머신러닝 모델
- 2020년 2월~2021년 1월을 validation set으로 지정하려고 해서 initial_time_split을 시도했으나 실패
- tidymodels에서 학습한 값으로 predict가 안됨
  (에러: ``Can't subset columns that don't exist. x Column `power` doesn't exist.``)
- ranger를 썼는데 데이터 타입 에러로 report 실패

```{r}
# Parse the timestamp column, then reshape from wide to long: every column
# other than `time` becomes a (site, power) pair.
energy_data <- energy %>%
  mutate(time = ymd_hms(time)) %>%
  pivot_longer(cols = !time, names_to = "site", values_to = "power")
#
dim(energy_data)
# energy_data %>% filter(
# '2020-02-01 01:00:00' > time) -> train # 67356 행
#
Expand All @@ -356,7 +355,7 @@ energy %>%
#
# validation_split<- test %>% validation_split(prop = .1^4)
validation_split<- energy_data %>% validation_split(prop = 2/3)
# validation_split<- energy_data %>% validation_split(prop = 2/3)
#
# validation_split
```
Expand All @@ -370,12 +369,14 @@ validation_split<- energy_data %>% validation_split(prop = 2/3)
solar_recipe <- energy_data %>%
recipe(power~.) %>%
update_role(time, new_role = "ID") %>%
step_mutate(week = week(time),
# step_date로 하면 시간 정보 산출이 안 됨.
step_mutate(week = week(time),
day = day(time),
hour = hour(time),
am = am(time)
) %>%
step_naomit(all_numeric())%>% prep()
) %>%
step_naomit(all_numeric()) %>%
prep()
```

```{r}
Expand Down Expand Up @@ -430,11 +431,68 @@ solar_rf_fit <- solar_wflow %>%
```

```{r}
juice(solar_recipe)
# juice(solar_recipe)
```

```{r}
juice(solar_recipe) %>% skim()
```

```{r}
predict(solar_rf_fit, juice(solar_recipe))
```


```{r}
# Fit a ranger model of `power` on every other column of the prepped training
# data; `time` is dropped first so it is not used as a predictor.
# NOTE(review): `cores` is defined outside this view -- presumably a core count.
model <- ranger(
  formula = power ~ .,
  data = select(juice(solar_recipe), -time),
  num.threads = cores,
  seed = 1234
)
```

```{r}
model
```


```{r}
# Bring the submission template into the same long layout as the training
# data: parsed `time`, plus one (site, power) row per remaining column.
submission_pivot <- sample_submision %>%
  mutate(time = ymd_hms(time)) %>%
  pivot_longer(cols = !time, names_to = "site", values_to = "power")
```


```{r}
# Apply the prepped recipe's steps to the long-format submission rows.
submission_baked <- bake(solar_recipe, submission_pivot)
```


```{r}
# Score the baked submission rows with the fitted ranger model; the values
# are read back later from the `$predictions` element of the result.
solar_ranger_predict <- predict(model, data = submission_baked)
```

```{r}
# Build the wide submission table from the ranger predictions.
#
# The rounded integer predictions are added under an explicit column name.
# The previous version bound an unnamed vector with bind_cols() and renamed
# the auto-generated `...8` column, which silently depended on the baked
# table having exactly seven columns.
sample_submision2 <- submission_baked %>%
  mutate(predict = as.integer(round(solar_ranger_predict$predictions))) %>%
  select(time, site, predict) %>%
  # One row per timestamp, one column per site, as the later write.csv expects.
  pivot_wider(names_from = site, values_from = predict)
```

```{r}
sample_submision2 %>% head()
```

```{r}
predict(solar_rf_fit, new_data = juice(solar_recipe) )
# Write the reshaped predictions to submission2.csv without row names.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned; the output path is passed by name for clarity.
write.csv(
  sample_submision2,
  file = here("study-presentation/sk-rhyeu-apr-24/submission2.csv"),
  row.names = FALSE,
  fileEncoding = "UTF-8"
)
```

Loading

0 comments on commit be8c8dd

Please sign in to comment.