DLL-evaluation_v3.Rmd

---
title: "DLL-ES IRT Analysis and Recommendations"
author: "George Kachergis"
date: "`r Sys.Date()`"
output: html_document
---

```{r setup, include=FALSE}
# Global knitr options and the default ggplot2 theme for every figure below.
knitr::opts_chunk$set(echo = TRUE)
# library() stops the render if ggplot2 is not installed; require() would only
# return FALSE and let theme_set() fail with a less helpful error.
library(ggplot2)
theme_set(theme_classic(base_size = 14)) # default is 11
```

```{r load-fitted-IRT-models, echo=FALSE, message=FALSE}
# Attach packages with library() so that a missing dependency stops the render
# here, rather than require() returning FALSE and a later call failing.
library(here)
library(mirt)
library(tidyverse)
library(gridExtra)
library(kableExtra)
library(ggpubr)

# Each model file is restored with load(), which puts coefs_2pl, fscores_2pl
# and mod_2pl into the workspace. They are collected into lists with one
# element per file: `en` (eng_* file) and `sp` (sp_* file).
load(here("data/eng_ws_wg_mod_2pl.Rds"))
coefs <- list(en = coefs_2pl)
fscores <- list(en = fscores_2pl)
mods <- list(en = mod_2pl)

load(here("data/sp_wg_ws_mod_2pl.Rds"))
coefs$sp <- coefs_2pl
fscores$sp <- fscores_2pl
mods$sp <- mod_2pl

# Remove the temporaries so only the combined lists remain.
rm(coefs_2pl, fscores_2pl, mod_2pl)
```

```{r load-cdi-short-forms, echo=FALSE, message=FALSE}
# English CDI short forms (trailing numbers are the item counts noted by the author)
wg_short_en <- here("DLL/eng-wg-short.csv") %>% read_csv() # 89
ws_short_enA <- here("DLL/eng-ws-shortA.csv") %>% read_csv() # 100
# Form B is not loaded -- note they did not use this one:
#ws_short_enB <- read_csv(here("DLL/eng-ws-shortB.csv")) # 100

# Checks that the short-form words match the Wordbank item definitions:
#length(intersect(wg_short_en$word, coefs$en$definition)) # all match
#length(intersect(ws_short_enA$word, coefs$en$definition)) # all match, but:
#   should "swing" (in Fenson2000) be (action) or (object) -- I made it object,
#   since surrounded by nouns
#length(intersect(ws_short_enB$word, coefs$en$definition)) # all match
#   replaced 'fish' with fish (animal) since surrounded by animals;
#   'chicken' -> chicken (food) given list context
#   replaced 'feet' with 'foot' ...
#   replaced 'drink' with drink (action)

# Spanish CDI short forms
wg_short_sp <- here("DLL/spanish-wg-short.csv") %>% read_csv()
ws_short_sp <- here("DLL/spanish-ws-short.csv") %>% read_csv()
#(intersect(wg_short_sp$word, coefs$sp$definition)) # 102 - tambien not in wordbank
# 102 - wordbank errors: comer(se), adios (accent not on i), ver(se) ?, vasos (why plural?)
#setdiff(wg_short_sp$word, coefs$sp$definition)
#length(intersect(ws_short_sp$word, coefs$sp$definition)) # 100
```


## Goals

The goals are 1) to create a word list that is informative about both English and Spanish vocabulary size and 2) to ensure that there are sufficient doublets to estimate lexical overlap.
From an IRT perspective, we can't perfectly assess goal 2 (at least not without better bilingual CDI data), but we can assess goal 1 - that is, we can look at whether the reduced word list is a good sub-test for the full CDI in each language.
Our original concern was that the current DLL-ES test might not perform well for older or high ability kids due to the lack of abstract words, and we can test this formally.

The DLL lists are meant to be used together with the original English and Spanish MCDI short forms.

```{r load-all-DLL-lists, echo=FALSE, message=FALSE}
# Read one DLL-ES sheet (path relative to the project root) and shorten the
# names of its two word columns to `english` and `spanish`.
read_dll_sheet <- function(path) {
  sheet <- read_csv(here(path))
  rename(sheet, english = `English Word`, spanish = `Spanish Word`)
}

# Matched (i.e. short) DLLs
dll1short_raw <- read_dll_sheet("DLL/Sandy/DLL-ES Supplement Words - DLL-ES Short Form Supplements (Level 1).csv")
dll2short_raw <- read_dll_sheet("DLL/Sandy/DLL-ES Supplement Words - DLL-ES Short Form Supplements (Level 2).csv")

# Level 1 exclusions: examine to see if we find some equal difficulty, informative matches?
dll1exclusions <- read_dll_sheet("DLL/Sandy/DLL-ES Supplement Words - DLL-ES Long Form Excluded Matches (Level 1).csv") # 94
dll2exclusions <- read_dll_sheet("DLL/Sandy/DLL-ES Supplement Words - DLL-ES Long Form Excluded Matches (Level 2).csv") # 210

# Extended (i.e. long) DLLs
dll1long_raw <- read_dll_sheet("DLL/Sandy/DLL-ES Supplement Words - DLL-ES Long Form Supplement (Level 1).csv")
dll2long_raw <- read_dll_sheet("DLL/Sandy/DLL-ES Supplement Words - DLL-ES Long Form Supplement (Level 2).csv")
```

Now we need to match the DLL words to the wordbank items for which we have IRT parameters.

### Matching DLL short Level 1

```{r match-english-words, message=FALSE, warning=FALSE}
# DLL short Level 1, English side -- use with EN WG short form (12-18 mos)
#dll1ENshort <- read_csv(here("DLL/DLL-ES1-short-English.csv")) # 79 items (81 after splitting defs)

# Lower-cased DLL spelling -> Wordbank item definition.
# Words without an entry are left unchanged.
dll1_en_renames <- c(
  "i" = "I",
  "tv (television)" = "TV",
  "water" = "water (beverage)",
  "grandma (or word used in your family)" = "grandma*",
  "mommy (or word used in your family)" = "mommy*",
  "choo choo (train sound)" = "choo choo",
  "patty cake" = "pattycake",
  "bye or bye bye" = "bye",
  "teddy bear" = "teddybear",
  "his/her" = "his", # hers ? (and should DLL say 'his/hers' ?)
  "to have" = "have",
  "shh" = "shh/shush/hush",
  "to sit" = "sit",
  "to be" = "be",
  "put, put on" = "put",
  "to write" = "write",
  "arms" = "arm",
  "church (or word used in your family)" = "church*",
  "want" = "wanna/want to",
  "in" = "inside/in" # into ?
)

dll1short <- dll1short_raw %>% # this is now 168 items...it has the WG short form included
  mutate(
    english = tolower(english),
    # look up the replacement; fall back to the word itself when there is none
    english = coalesce(unname(dll1_en_renames[english]), english)
  )

# Number of distinct DLL words that now match a Wordbank definition
dll1ENshort_num_matching <- length(intersect(dll1short$english, coefs$en$definition))
# 75/81 (GK version)
# 160/168 (Sandy)
#setdiff(dll1short$english, coefs$en$definition)

# are these items also on the short form CDIs?
#length(intersect(dll1short$english, wg_short_en$word)) # all of the WG
#length(intersect(dll1short$english, ws_short_enA$word)) # 36 of the WS
```

```{r match-spanish-words, message=FALSE, warning=FALSE}
# DLL short Level 1, Spanish side -- use with SP WG short form (12-18 mos)
#dll1SPshort <- read_csv(here("DLL/DLL-ES1-short-Spanish.csv")) # 67 items

# Lower-cased DLL spelling (after the text from " (" onward is removed) ->
# Wordbank item definition. Words without an entry are left unchanged.
dll1_sp_renames <- c(
  "mamá/mami" = "mamá",
  "calcetines" = "calcetín",
  "tomar baño / bañarse" = "baño", # verb -> noun.. tomar(se) ?
  "espera" = "esperar(se)", # close enough ?
  "acabar(se)" = "acabar",
  "no hay más" = "no hay", # or "más" ?
  "rapido" = "rápido (descriptive)", # or rápido (quantifiers)
  "lastimado" = "lastimar(se)", # close enough ?
  "otro" = "otro/otra vez", # close enough ?
  "quiquiriqui" = "quiquiriquí", # DLL missing accent
  "brazos" = "brazo",
  "manos" = "mano",
  "vaso" = "vasos",
  "llaves" = "llave",
  "adiós/byebye" = "adíos/byebye", # wordbank accent is incorrect
  "uno, dos, tres" = "uno dos tres...",
  "shh" = "shhh",
  "ver" = "ver(se)",
  "¿dónde está?" = "dónde" # close enough ?
)

dll1short <- dll1short %>% # 168 items; the WG short form is included
  mutate(spanish = tolower(spanish)) %>%
  # keep only the text before the first " ("
  separate(col = spanish, into = c("spanish", NA), sep=" \\(") %>%
  mutate(spanish = coalesce(unname(dll1_sp_renames[spanish]), spanish))

# Number of distinct DLL words that now match a Wordbank definition
dll1SPshort_num_matching <- length(intersect(dll1short$spanish, coefs$sp$definition)) # was 59 on GK version
# 158 / 168
#setdiff(dll1short$spanish, coefs$sp$definition)
# no match: tostada, alimentar, sonreír, algunos, también

# are these items also on the short form CDIs?
#length(intersect(dll1short$spanish, wg_short_sp$word)) # all of the WG
#length(intersect(dll1short$spanish, ws_short_sp$word)) # 63 of the WS
```

### Matching DLL short Level 2

```{r dll2-short, echo=FALSE, message=FALSE, warning=FALSE}
# DLL short Level 2 English / Spanish: rename DLL entries to Wordbank definitions.

# English: lower-cased DLL spelling -> Wordbank definition
dll2_en_renames <- c(
  "church (or word used in your family)" = "church*",
  "tv" = "TV",
  "to be" = "be",
  "to be, there is" = "there", # DLL typo?
  "mommy" = "mommy*",
  "in the morning" = "morning",
  "swing" = "swing (object)", # or swing (action) ?
  "dress" = "dress (object)"
)

dll2short <- dll2short_raw %>%
  mutate(english = tolower(english)) %>%
  filter(!is.na(english)) %>%
  mutate(english = coalesce(unname(dll2_en_renames[english]), english))
# not on CDI: snake, drum, rice, skirt, mustache, pot, matches, newspaper, bell, godmother,
# let's go (but "go"), know, turn on (but "on"), win

#length(intersect(dll2short$english, coefs$en$definition)) # 162 / 177
#setdiff(dll2short$english, coefs$en$definition)

# Spanish: lower-cased DLL spelling (text from " (" onward removed) -> Wordbank definition
dll2_sp_renames <- c(
  "zapatos" = "zapato",
  "adiós/bye bye" = "adíos/byebye", # wordbank typo
  "banco" = "banco (outside)", # or banco (places)
  "coca" = "soda/refresco", # close enough?
  "acabar/terminar" = "acabar", # or terminar
  "quiquiriqui" = "quiquiriquí", # DLL missing accent
  # "debajo" = "abajo", # down vs. below...close enough?
  "en la noche/esta noche" = "en la noche",
  "haber" = "haber (hay)",
  "ir de compras" = "comprar", # close enough? or 'ir(se)'
  "llevar" = "llevar(se)",
  # "no más" = "más", # close enough? or 'no'..or 'no hay'
  "puede" = "poder", # infinitive--close enough?
  "que" = "que (connection word)",
  "rapido" = "rápido (descriptive)" # or rápido (quantifiers)
)

dll2short <- dll2short %>%
  mutate(spanish = tolower(spanish)) %>%
  separate(col = spanish, into = c("spanish", NA), sep=" \\(") %>%
  mutate(spanish = coalesce(unname(dll2_sp_renames[spanish]), spanish))

# printed in the rendered output
length(intersect(dll2short$spanish, coefs$sp$definition)) # 153 / 177
#sort(setdiff(dll2short$spanish, coefs$sp$definition) )

#length(intersect(ws_short_enA$word, dll2short$english)) # 100
#length(intersect(ws_short_enB$word, dll2short$english)) # ..
```

### Matching DLL Extended Level 1

```{r dll1-long, message=FALSE, warning=FALSE}
# DLL Extended Level 1: rename DLL entries to Wordbank definitions.

# English: lower-cased DLL spelling -> Wordbank definition
dll1long_en_renames <- c(
  "daddy (or word used in your family)" = "daddy*",
  "toy" = "toy (object)",
  "swing" = "swing (object)", # or swing (action) ?
  "dress" = "dress (object)"
)

dll1long <- dll1long_raw %>%
  mutate(
    english = tolower(english),
    english = coalesce(unname(dll1long_en_renames[english]), english)
  )

dll1long_EN_num_matching <- length(intersect(dll1long$english, coefs$en$definition)) # 74 / 74 match

# Spanish: lower-cased DLL spelling (text from " (" onward removed) -> Wordbank definition
dll1long_sp_renames <- c(
  "pipi" = "pipí", # is this a match?
  "orejas" = "oreja",
  "dedos" = "dedo",
  "escalera" = "escaleras",
  "bolsa" = "bolsa (clothing)", # or bolsa (household) ?
  "papá/papi" = "papá",
  "cosquillita" = "cosquillitas",
  "hacer la meme" = "siesta" # close enough ?  or hacer?
)

dll1long <- dll1long %>%
  mutate(spanish = tolower(spanish)) %>%
  separate(col = spanish, into = c("spanish", NA), sep=" \\(") %>%
  mutate(spanish = coalesce(unname(dll1long_sp_renames[spanish]), spanish))

dll1long_SP_num_matching <- length(intersect(dll1long$spanish, coefs$sp$definition)) # 74 / 74 match
```

### Matching DLL Extended Level 2

```{r dll2-long, message=FALSE, warning=FALSE}
# DLL Extended Level 2: rename DLL entries to Wordbank definitions.

# English: lower-cased DLL spelling -> Wordbank definition
dll2long_en_renames <- c(
  "daddy (or name/word used in your family)" = "daddy*",
  "teddy bear" = "teddybear",
  "patty cake" = "pattycake",
  "dress" = "dress (object)",
  "i" = "I",
  "penis (or word used in your family)" = "penis*",
  "water" = "water (beverage)",
  "orange" = "orange (food)",
  "clock/watch" = "clock", # or watch (object)
  "drink" = "drink (action)",
  "feet" = "foot",
  "picture (\"or photo\")" = "picture",
  "buttocks/bottom (or word used in your family)" = "buttocks/bottom*"
)

dll2long <- dll2long_raw %>%
  mutate(
    english = tolower(english),
    english = coalesce(unname(dll2long_en_renames[english]), english)
  )

# Spanish: lower-cased DLL spelling (text from " (" onward removed) -> Wordbank definition
dll2long_sp_renames <- c(
  "pipi" = "pipí",
  "shh" = "shhh",
  "bolsa" = "bolsa (clothing)", # or bolsa (household) ?
  "qué" = "qué (question_words)"
)

dll2long <- dll2long %>%
  mutate(spanish = tolower(spanish)) %>%
  separate(col = spanish, into = c("spanish", NA), sep=" \\(") %>%
  mutate(spanish = coalesce(unname(dll2long_sp_renames[spanish]), spanish))

#length(intersect(dll2long$spanish, coefs$sp$definition))
#setdiff(dll2long$spanish, coefs$sp$definition)
```


```{r load-wordbank-data, echo=FALSE, message=FALSE}
# Restores d_demo and d_mat (both are used below) into the workspace.
load(here("data/wordbank_eng_ws_wg_webcdi31-36mos.Rds"))

too_young <- which(d_demo$age < 12) # 378 children can't be producing any words yet

# Drop those rows with a logical mask rather than d_demo[-too_young, ]:
# if no child were under 12 months, too_young would be integer(0) and the
# negative index would silently remove every row. Rows with a missing age are
# kept, exactly as before.
old_enough <- !(seq_len(nrow(d_demo)) %in% too_young)

d_demo_en <- d_demo[old_enough, ]
d_mat_en <- d_mat[old_enough, , drop = FALSE]

# DLL items not named the same
#dll1ENshort[which(!is.element(dll1ENshort$word, colnames(d_mat))),]
#dll1SPshort[which(!is.element(dll1SPshort$word, coefs$sp$definition)),] # tostada,
```


English DLL items not in our wordbank IRT model: *one, two, three*, *family*, *drum*, *good morning*, *also*, and *many*.
Spanish DLL items not in our wordbank IRT model: *tostada*, *algunos*, *alimentar*, *sonreír*, *no hay más* (although we have *no* and *no hay*, as well as *más*), and *lastimado* (but we have *lastimar(se)*).

## Does the DLL short form recover full form scores?

### English DLL Level 1: Production

Using data from `r nrow(subset(d_demo_en, age<19))` English-speaking children 12-18 months of age from Wordbank, we test how well sumscores from the DLL-ES1 matched form + CDI:WG short form predict children's English production scores from the long form (LF) CDI (WG/WS).
The left panel shows LF CDI scores vs. the DLL-ES1 Matched + CDI:WG short score, the middle panel shows LF CDI scores vs. the DLL-ES1 Extended score, and the right panel shows LF CDI scores vs. just the CDI:WG short form score.

```{r cdi-short-DLL1-vs-full-cdi-English-prod, echo=FALSE, message=FALSE, fig.width=10, fig.height=4}
# Column positions in the English response matrix for each reduced form.
# match() gives NA for words with no column; na.omit() drops those.
en_wg_short_cols <- na.omit(match(wg_short_en$word, colnames(d_mat_en)))
en_wg_short_dll_cols <- na.omit(match(dll1short$english, colnames(d_mat_en))) # 160 items
en_wg_dll_long_cols <- union(
  en_wg_short_dll_cols,
  na.omit(match(dll1long$english, colnames(d_mat_en)))
) # 234

# Per-child sum scores: all items, then each reduced form.
d_demo_en$production <- rowSums(d_mat_en, na.rm = TRUE)
d_demo_en$DLLsum <- rowSums(d_mat_en[, en_wg_short_dll_cols], na.rm = TRUE)
d_demo_en$WGshort <- rowSums(d_mat_en[, en_wg_short_cols], na.rm = TRUE)
d_demo_en$DLL1long_sum <- rowSums(d_mat_en[, en_wg_dll_long_cols], na.rm = TRUE)

young_en <- filter(d_demo_en, age <= 18, production < 396) # want only WG kids

dll_en_cor <- cor(young_en$production, young_en$DLLsum) # .978
wg_en_cor <- cor(young_en$production, young_en$WGshort) # .963
dll1long_en_cor <- cor(young_en$production, young_en$DLL1long_sum) # .985
# reliability: psych::alpha()

# create SSE for linear extrapolation from DLL / short to full CDI:WG?

# Each panel: full score vs. reduced-form score, with a dashed reference line
# of slope (number of form items) / 395 and a smoother.
# (optionally: group=sex, color=sex)
p1 <- ggplot(young_en, aes(x = production, y = DLLsum)) +
  geom_point(alpha = .3) +
  geom_abline(slope = length(en_wg_short_dll_cols) / 395, intercept = 0, linetype = 'dashed') +
  geom_smooth() +
  stat_cor(aes(label = ..r.label..), r.accuracy = 0.001, label.x = 60, label.y = 100) +
  coord_flip() +
  labs(x = "CDI:WG LF Production Score", y = "DLL-ES1 Matched Score")

p2 <- ggplot(young_en, aes(x = production, y = WGshort)) +
  geom_point(alpha = .3) +
  geom_abline(slope = length(en_wg_short_cols) / 395, intercept = 0, linetype = 'dashed') +
  geom_smooth() +
  stat_cor(aes(label = ..r.label..), r.accuracy = 0.001, label.x = 60, label.y = 60) +
  coord_flip() +
  labs(x = "CDI:WG LF Production Score", y = "CDI:WG Short Score") +
  theme(legend.position = "none")

p3 <- ggplot(young_en, aes(x = production, y = DLL1long_sum)) +
  geom_point(alpha = .3) +
  geom_abline(slope = length(en_wg_dll_long_cols) / 395, intercept = 0, linetype = 'dashed') +
  geom_smooth() +
  stat_cor(aes(label = ..r.label..), r.accuracy = 0.001, label.x = 60, label.y = 100) +
  coord_flip() +
  labs(x = "CDI:WG LF Production Score", y = "DLL-ES1 Extended Score") +
  theme(legend.position = "none")

#grid.arrange(p1, p2, nrow = 1, widths=c(4.5,3.9), heights=3.5)
# Panel order is matched, extended, short.
ggarrange(p1, p3, p2, nrow = 1, common.legend = TRUE)
g <- arrangeGrob(p1, p3, p2, nrow = 1, widths = c(3.2, 3.2, 3.2), heights = 3.2)
ggsave("DLL1short_vs_CDIWG_prod.pdf", g)
```

Overall, the correlation of children's CDI:WG short + DLL scores and their full CDI production scores is quite high ($r=`r round(dll_en_cor,2)`$), but as shown above, for small vocabulary sizes the DLL score overestimates full CDI:WG production scores, while for higher full CDI:WG scores the DLL underestimates vocab size (dashed line has slope $=`r length(en_wg_short_dll_cols)` / `r 395`$).
However, the CDI:WG short form alone (right panel) shows a similar (and more extreme) overestimation for small vocabulary sizes.

### English DLL Level 1: Comprehension

Now we do the same for comprehension (receptive vocabulary) using Wordbank's CDI:WG English data.

```{r cdi-short-DLL1-vs-full-cdi-English-comp, echo=FALSE, message=FALSE, fig.width=10, fig.height=4}
# Restores d_demo and d_mat_wg (both are used below) into the workspace.
load(here("data/comp/wordbank_eng_wg_webcdi.Rds"))

# Children under 12 months are excluded here as well.
too_young <- which(d_demo$age < 12)

# Use a logical mask rather than d_demo[-too_young, ]: with no child under 12
# months, too_young would be integer(0) and the negative index would silently
# remove every row. Rows with a missing age are kept, exactly as before.
old_enough <- !(seq_len(nrow(d_demo)) %in% too_young)

d_demo_en_comp <- d_demo[old_enough, ]
d_mat_en_comp <- d_mat_wg[old_enough, , drop = FALSE]

# Column positions of each form's items; words with no matching column are dropped.
en_wg_short_colsC <- na.omit(match(wg_short_en$word, colnames(d_mat_en_comp)))
en_wg_short_dll_colsC <- na.omit(match(dll1short$english, colnames(d_mat_en_comp))) # 160 items
d_demo_en_comp$comprehension <- rowSums(d_mat_en_comp, na.rm = TRUE)
d_demo_en_comp$DLLsum <- rowSums(d_mat_en_comp[, en_wg_short_dll_colsC], na.rm = TRUE)
d_demo_en_comp$WGshort <- rowSums(d_mat_en_comp[, en_wg_short_colsC], na.rm = TRUE)

# Extended form = matched-form items plus the Level 1 long supplement
en_wg_dll_long_colsC <- union(en_wg_short_dll_colsC, na.omit(match(dll1long$english, colnames(d_mat_en_comp)))) # 224

d_demo_en_comp$DLL1long_sum <- rowSums(d_mat_en_comp[, en_wg_dll_long_colsC], na.rm = TRUE)


dll_en_cor_comp <- cor(d_demo_en_comp$comprehension, d_demo_en_comp$DLLsum) # .982
wg_en_cor_comp <- cor(d_demo_en_comp$comprehension, d_demo_en_comp$WGshort) # .959
dll1long_en_cor_comp <- cor(d_demo_en_comp$comprehension, d_demo_en_comp$DLL1long_sum) # .987

# Each panel: full score vs. reduced-form score, with a dashed reference line
# of slope (number of form items) / 395 and a smoother.
p1 <- d_demo_en_comp %>%
  ggplot(aes(x=comprehension, y=DLLsum)) + # , group=sex, color=sex
  geom_point(alpha=.3) + coord_flip() + 
  xlab("CDI:WG LF Comp. Score") + ylab("DLL-ES1 Matched Score") + 
  geom_abline(slope=length(en_wg_short_dll_colsC) / 395, intercept=0, linetype = 'dashed') +
  geom_smooth() +
  stat_cor(aes(label = ..r.label..), r.accuracy = 0.001, label.x = 60, label.y = 100)

p2 <- d_demo_en_comp %>%
  ggplot(aes(x=comprehension, y=WGshort)) + # , group=sex, color=sex
  geom_point(alpha=.3) + coord_flip() + 
  xlab("CDI:WG LF Comp. Score") + ylab("CDI:WG Short Score") + 
  geom_abline(slope=length(en_wg_short_colsC) / 395, intercept=0, linetype = 'dashed') +
  geom_smooth() + theme(legend.position = "none") +
  stat_cor(aes(label = ..r.label..), r.accuracy = 0.001, label.x = 60, label.y = 60)

p3 <- d_demo_en_comp %>% 
  ggplot(aes(x=comprehension, y=DLL1long_sum)) + # , group=sex, color=sex
  geom_point(alpha=.1) + coord_flip() + 
  xlab("CDI:WG LF Comp. Score") + ylab("DLL-ES1 Extended Score") + 
  geom_abline(slope=length(en_wg_dll_long_colsC) / 395, intercept=0, linetype = 'dashed') +
  geom_smooth() + theme(legend.position = "none") +
  stat_cor(aes(label = ..r.label..), r.accuracy = 0.001, label.x = 60, label.y = 100)

#grid.arrange(p1, p2, p3, nrow = 1, widths=c(4.5,3.9, 3.9), heights=3.5)
# Panel order is matched, extended, short.
ggarrange(p1, p3, p2, nrow=1, common.legend = TRUE)
g <- arrangeGrob(p1, p3, p2, nrow = 1, widths=c(3.2,3.2,3.2), heights=3.2)
ggsave("DLL1short_vs_CDIWG_comp_EN.pdf", g)
```

The correlation of children's DLL1 scores and their CDI:WG LF comprehension scores is quite high ($r=`r round(dll_en_cor_comp,2)`$), and extrapolating from the DLL-ES1 to full CDI:WG scores shows very little overestimation (less than when using the CDI:WG short form alone).

### English DLL Level 2 (Production)

Using data from `r nrow(subset(d_demo_en, age>15 & age<31))` English-speaking children 16-30 months of age from Wordbank, we test how well sumscores from the DLL-ES2 matched form predict children's full production scores from the CDI:WS.
The left panel shows full CDI scores vs. the DLL-ES2 matched inventory (A) score, the middle panel shows full CDI scores vs. the DLL-ES2 Extended score, and the right panel shows full CDI scores vs. just the CDI:WS short form (A) score.

```{r cdi-short-DLL2-vs-full-cdi-English, echo=FALSE, message=FALSE, fig.width=10, fig.height=4}
# Column positions of each form's items in the English response matrix;
# words with no matching column are dropped.
en_ws_short_cols <- na.omit(match(ws_short_enA$word, colnames(d_mat_en))) # 
en_ws_short_dll_cols <- na.omit(match(dll2short$english, colnames(d_mat_en))) # 162 items
d_demo_en$production <- rowSums(d_mat_en, na.rm = TRUE)
d_demo_en$DLL2sum <- rowSums(d_mat_en[, en_ws_short_dll_cols], na.rm = TRUE)
d_demo_en$WSshort <- rowSums(d_mat_en[, en_ws_short_cols], na.rm = TRUE)

# Extended form = matched-form items plus the Level 2 long supplement
en_ws_dll_long_cols <- union(en_ws_short_dll_cols, na.omit(match(dll2long$english, colnames(d_mat_en)))) # 293

d_demo_en$DLL2long_sum <- rowSums(d_mat_en[, en_ws_dll_long_cols], na.rm = TRUE)


old_en <- d_demo_en %>% filter(age > 15, age < 31)

dll2_en_cor <- cor(old_en$production, old_en$DLL2sum) # .992
ws_en_cor <- cor(old_en$production, old_en$WSshort) # .989

dll2long_en_cor <- cor(old_en$production, old_en$DLL2long_sum) # .988

# Each panel: full score vs. reduced-form score, with a dashed reference line
# of slope (number of form items) / 680 and a smoother.
p1 <- old_en %>% 
  ggplot(aes(x=production, y=DLL2sum)) + # , group=sex, color=sex
  geom_point(alpha=.1) + coord_flip() + 
  xlab("CDI:WS LF Production Score") + ylab("DLL-ES2 Matched Score") + 
  geom_abline(slope=length(en_ws_short_dll_cols) / 680, intercept=0, linetype = 'dashed') +
  geom_smooth() +
  stat_cor(aes(label = ..r.label..), r.accuracy = 0.001, label.x = 60, label.y = 100)

p2 <- old_en %>% 
  ggplot(aes(x=production, y=WSshort)) + # , group=sex, color=sex
  geom_point(alpha=.1) + coord_flip() + 
  xlab("CDI:WS LF Production Score") + ylab("CDI:WS Short Score") + 
  geom_abline(slope=length(en_ws_short_cols) / 680, intercept=0, linetype = 'dashed') +
  geom_smooth() + theme(legend.position = "none") +
  stat_cor(aes(label = ..r.label..), r.accuracy = 0.001, label.x = 60, label.y = 60)

# This panel plots the Level 2 extended score (DLL2long_sum), so the axis is
# labelled DLL-ES2 (it previously read DLL-ES1).
p3 <- old_en %>% 
  ggplot(aes(x=production, y=DLL2long_sum)) + # , group=sex, color=sex
  geom_point(alpha=.1) + coord_flip() + 
  xlab("CDI:WS LF Production Score") + ylab("DLL-ES2 Extended Score") + 
  geom_abline(slope=length(en_ws_dll_long_cols) / 680, intercept=0, linetype = 'dashed') +
  geom_smooth() + theme(legend.position = "none") +
  stat_cor(aes(label = ..r.label..), r.accuracy = 0.001, label.x = 60, label.y = 100)

#grid.arrange(p1, p2, p3, nrow = 1, widths=c(4.5,3.9, 3.9), heights=3.5)
# Panel order is matched, extended, short.
ggarrange(p1, p3, p2, nrow=1, common.legend = TRUE)
g <- arrangeGrob(p1, p3, p2, nrow = 1, widths=c(3.2,3.2,3.2), heights=3.2)
ggsave("DLL2short_vs_CDIWS_EN.pdf", g)
```

Overall, the correlation of children's CDI:WS short + DLL2 scores and their full CDI production scores is quite high ($r=`r round(dll2_en_cor,2)`$), but as shown above, the DLL2 again mostly overestimates production scores on the full CDI (dashed line has slope $=`r length(en_ws_short_dll_cols)` / `r 680`$).
In comparison, the CDI:WS short form (A) score only overestimates full CDI scores for smaller vocabulary sizes (<400).


### Spanish DLL Level 1

Now we look at overestimation for Spanish DLLs + CDI short forms.

```{r DLL-vs-full-score-Spanish, echo=FALSE, message=FALSE, fig.width=10, fig.height=4}
# Restores d_demo and d_mat into the workspace; copy them to Spanish-specific
# names straight away, because other chunks load objects with the same names.
load(here("data/wordbank_sp_ws_wg_webcdi12-30mos.Rds"))
d_demo_sp <- d_demo
d_mat_sp <- d_mat

# Column positions of each form's items in the Spanish response matrix;
# words with no matching column are dropped.
sp_wg_short_cols <- na.omit(match(wg_short_sp$word, colnames(d_mat_sp)))
sp_ws_short_cols <- na.omit(match(ws_short_sp$word, colnames(d_mat_sp)))


sp_wg_short_dll_cols <- na.omit(match(dll1short$spanish, colnames(d_mat_sp)))
sp_ws_short_dll_cols <- na.omit(match(dll2short$spanish, colnames(d_mat_sp)))

# Sum over d_mat_sp (not the generic d_mat) so the score cannot silently pick
# up a different matrix if d_mat is overwritten and this code is re-run.
d_demo_sp$production <- rowSums(d_mat_sp, na.rm = TRUE)
d_demo_sp$DLL1sum <- rowSums(d_mat_sp[, sp_wg_short_dll_cols], na.rm = TRUE)
d_demo_sp$DLL2sum <- rowSums(d_mat_sp[, sp_ws_short_dll_cols], na.rm = TRUE)
d_demo_sp$WGshort <- rowSums(d_mat_sp[, sp_wg_short_cols], na.rm = TRUE)
d_demo_sp$WSshort <- rowSums(d_mat_sp[, sp_ws_short_cols], na.rm = TRUE)

# DLL2 supplement
sp_ws_dll_long_cols <- union(sp_ws_short_dll_cols, na.omit(match(dll2long$spanish, colnames(d_mat_sp))))
d_demo_sp$DLL2long_sum <- rowSums(d_mat_sp[, sp_ws_dll_long_cols], na.rm = TRUE)

# DLL1 supplement
sp_wg_dll_long_cols <- union(sp_wg_short_dll_cols, na.omit(match(dll1long$spanish, colnames(d_mat_sp))))
d_demo_sp$DLL1long_sum <- rowSums(d_mat_sp[, sp_wg_dll_long_cols], na.rm = TRUE)


young_sp <- d_demo_sp %>% filter(age<19, production<=428)
old_sp <- d_demo_sp %>% filter(age>15)

dll1_sp_cor <- cor(young_sp$production, young_sp$DLL1sum) # .988
dll1long_sp_cor <- cor(young_sp$production, young_sp$DLL1long_sum) # .992
dll2_sp_cor <- cor(old_sp$production, old_sp$DLL2sum) # .989
wg_sp_cor <- cor(young_sp$production, young_sp$WGshort) # .983
ws_sp_cor <- cor(old_sp$production, old_sp$WSshort) # .983

dll2long_sp_cor <- cor(old_sp$production, old_sp$DLL2long_sum) # .982

# Each panel: full score vs. reduced-form score, with a dashed reference line
# of slope (number of form items) / 428 and a smoother.
p1 <- young_sp %>% ggplot(aes(x=production, y=DLL1sum)) + # , group=sex, color=sex
  geom_point(alpha=.3) + coord_flip() + 
  xlab("CDI:WG LF Production Score") + ylab("DLL-ES1 Matched Score") + 
  geom_abline(slope=length(sp_wg_short_dll_cols) / 428, intercept=0, linetype = 'dashed') +
  geom_smooth() +
  stat_cor(aes(label = ..r.label..), r.accuracy = 0.001, label.x = 60, label.y = 100)

p2 <- young_sp %>% ggplot(aes(x=production, y=WGshort)) + # , group=sex, color=sex
  geom_point(alpha=.3) + coord_flip() + 
  xlab("CDI:WG LF Production Score") + ylab("CDI:WG Short Score") + 
  geom_abline(slope=length(sp_wg_short_cols) / 428, intercept=0, linetype = 'dashed') +
  geom_smooth() + theme(legend.position = "none") +
  stat_cor(aes(label = ..r.label..), r.accuracy = 0.001, label.x = 60, label.y = 60)

p3 <- young_sp %>% ggplot(aes(x=production, y=DLL1long_sum)) + # , group=sex, color=sex
  geom_point(alpha=.3) + coord_flip() + 
  xlab("CDI:WG LF Production Score") + ylab("DLL-ES1 Extended Score") + 
  geom_abline(slope=length(sp_wg_dll_long_cols) / 428, intercept=0, linetype = 'dashed') +
  geom_smooth() + theme(legend.position = "none") +
  stat_cor(aes(label = ..r.label..), r.accuracy = 0.001, label.x = 60, label.y = 100)

#grid.arrange(p1, p2, nrow = 1, widths=c(4.5,3.9), heights=3.5)
# Panel order is matched, extended, short.
ggarrange(p1, p3, p2, nrow=1, common.legend = TRUE)
g <- arrangeGrob(p1, p3, p2, nrow = 1, widths=c(3.2,3.2,3.2), heights=3.2)
ggsave("DLL1short_vs_CDIWG_prod_SP.pdf", g)
```


Using Wordbank data from `r nrow(young_sp)` Spanish-speaking children aged 12-18 months, we test how well sumscores from the DLL-ES1 Matched form correlate with children's full CDI:WG production scores.

As for English, the correlation of Spanish-speaking children's DLL scores and their full CDI:WG production scores is quite high ($r=`r round(dll1_sp_cor,2)`$), but as shown above, their DLL score overestimates the production score on the full CDI at smaller vocabulary sizes (dashed line has slope $=`r length(sp_wg_short_dll_cols)` / `r 428`$).
Do note that few children in this dataset have large productive vocabularies.

### Spanish DLL Level 1: Comprehension

Now we do the same for comprehension (receptive vocabulary) using Wordbank's CDI:WG Spanish data.

```{r cdi-short-DLL1-vs-full-cdi-Spanish-comp, echo=FALSE, message=FALSE, fig.width=10, fig.height=4}
# Restores d_demo and d_mat_wg (both are used below) into the workspace.
load(here("data/comp/wordbank_sp_wg_webcdi.Rds"))

too_young <- which(d_demo$age < 12) # 242 children under 12 months...leave in for comp?

# Use a logical mask rather than d_demo[-too_young, ]: with no child under 12
# months, too_young would be integer(0) and the negative index would silently
# remove every row. Rows with a missing age are kept, exactly as before.
old_enough <- !(seq_len(nrow(d_demo)) %in% too_young)

d_demo_sp_comp <- d_demo[old_enough, ]
d_mat_sp_comp <- d_mat_wg[old_enough, , drop = FALSE]

# Column positions of each form's items; words with no matching column are dropped.
sp_wg_short_colsC <- na.omit(match(wg_short_sp$word, colnames(d_mat_sp_comp)))
sp_wg_short_dll_colsC <- na.omit(match(dll1short$spanish, colnames(d_mat_sp_comp))) # 144 items

# DLL1 supplement: extended form = matched-form items plus the Level 1 long
# supplement. The base set is sp_wg_short_dll_colsC, as in the other
# extended-form scores in this document; it was previously the CDI:WG short
# form columns, which left the matched DLL items out of the "Extended" score.
sp_wg_dll_long_colsC <- union(sp_wg_short_dll_colsC, na.omit(match(dll1long$spanish, colnames(d_mat_sp_comp))))

d_demo_sp_comp$comprehension <- rowSums(d_mat_sp_comp, na.rm = TRUE)
d_demo_sp_comp$DLLsum <- rowSums(d_mat_sp_comp[, sp_wg_short_dll_colsC], na.rm = TRUE)
d_demo_sp_comp$WGshort <- rowSums(d_mat_sp_comp[, sp_wg_short_colsC], na.rm = TRUE)
d_demo_sp_comp$DLL1long_sum <- rowSums(d_mat_sp_comp[, sp_wg_dll_long_colsC], na.rm = TRUE)

dll_sp_cor_comp <- cor(d_demo_sp_comp$comprehension, d_demo_sp_comp$DLLsum) # .982
wg_sp_cor_comp <- cor(d_demo_sp_comp$comprehension, d_demo_sp_comp$WGshort) # .974
# NOTE(review): the value previously noted here (.981) was computed with the
# old column set; re-check after re-running.
dll1long_sp_cor_comp <- cor(d_demo_sp_comp$comprehension, d_demo_sp_comp$DLL1long_sum)

# Each panel: full score vs. reduced-form score, with a dashed reference line
# of slope (number of form items) / 428 and a smoother.
p1 <- d_demo_sp_comp %>%
  ggplot(aes(x=comprehension, y=DLLsum)) + # , group=sex, color=sex
  geom_point(alpha=.3) + coord_flip() + 
  xlab("CDI:WG LF Comp. Score") + ylab("DLL-ES1 Matched Score") + 
  geom_abline(slope=length(sp_wg_short_dll_colsC) / 428, intercept=0, linetype = 'dashed') +
  geom_smooth() +
  stat_cor(aes(label = ..r.label..), r.accuracy = 0.001, label.x = 60, label.y = 100)

p2 <- d_demo_sp_comp %>%
  ggplot(aes(x=comprehension, y=WGshort)) + # , group=sex, color=sex
  geom_point(alpha=.3) + coord_flip() + 
  xlab("CDI:WG LF Comp. Score") + ylab("CDI:WG Short Score") + 
  geom_abline(slope=length(sp_wg_short_colsC) / 428, intercept=0, linetype = 'dashed') +
  geom_smooth() + theme(legend.position = "none") +
  stat_cor(aes(label = ..r.label..), r.accuracy = 0.001, label.x = 60, label.y = 60)

p3 <- d_demo_sp_comp %>% ggplot(aes(x=comprehension, y=DLL1long_sum)) + # , group=sex, color=sex
  geom_point(alpha=.3) + coord_flip() + 
  xlab("CDI:WG LF Comp. Score") + ylab("DLL-ES1 Extended Score") + 
  geom_abline(slope=length(sp_wg_dll_long_colsC) / 428, intercept=0, linetype = 'dashed') +
  geom_smooth() + theme(legend.position = "none") +
  stat_cor(aes(label = ..r.label..), r.accuracy = 0.001, label.x = 60, label.y = 100)

#grid.arrange(p1, p2, nrow = 1, widths=c(4.5,3.9), heights=3.5)
# Panel order is matched, extended, short.
ggarrange(p1, p3, p2, nrow=1, common.legend = TRUE)
g <- arrangeGrob(p1, p3, p2, nrow = 1, widths=c(3.2,3.2,3.2), heights=3.2)
ggsave("DLL1short_vs_CDIWG_comp_SP.pdf", g)
```

The correlation of children's DLL1 Matched scores and their full CDI:WG comprehension scores is quite high ($r=`r round(dll_sp_cor_comp,2)`$), and extrapolating from DLL to full CDI:WG scores shows very little overestimation--similar to the level shown by using the CDI:WG short form alone.

### Spanish DLL Level 2 (Production)

```{r spanish-dll2, echo=F, message=F, fig.width=10, fig.height=4}

# Scatter plots of the full Spanish CDI:WS production score (older children,
# old_sp) against three shorter scores. Each dashed reference line has slope
# (number of columns in the shorter list) / 680; 680 is presumably the item
# count of the full form -- TODO confirm. coord_flip() puts the full-form
# score on the vertical axis.

# DLL-ES2 matched form.
# Fixed: this panel previously plotted DLL1sum (the Level 1 score) even though
# its axis label and reference line (sp_ws_short_dll_cols) are for Level 2.
p1 <- old_sp %>% ggplot(aes(x=production, y=DLL2sum)) + # , group=sex, color=sex
  geom_point(alpha=.3) + coord_flip() + 
  xlab("CDI:WS LF Production Score") + ylab("DLL-ES2 Matched Score") + 
  geom_abline(slope=length(sp_ws_short_dll_cols) / 680, intercept=0, linetype = 'dashed') +
  geom_smooth() +
  stat_cor(aes(label = ..r.label..), r.accuracy = 0.001, label.x = 60, label.y = 100)

# CDI:WS short form.
# NOTE(review): y is the column named WGshort while the label and reference
# line refer to the WS short form (sp_ws_short_cols) -- confirm that WGshort
# holds the WS short-form score for these children.
p2 <- old_sp %>% ggplot(aes(x=production, y=WGshort)) + # , group=sex, color=sex
  geom_point(alpha=.3) + coord_flip() + 
  xlab("CDI:WS LF Production Score") + ylab("CDI:WS Short Score") + 
  geom_abline(slope=length(sp_ws_short_cols) / 680, intercept=0, linetype = 'dashed') +
  geom_smooth() + theme(legend.position = "none") +
  stat_cor(aes(label = ..r.label..), r.accuracy = 0.001, label.x = 60, label.y = 60)

# DLL-ES2 extended form.
p3 <- old_sp %>% ggplot(aes(x=production, y=DLL2long_sum)) + # , group=sex, color=sex
  geom_point(alpha=.3) + coord_flip() + 
  xlab("CDI:WS LF Production Score") + ylab("DLL-ES2 Extended Score") + 
  geom_abline(slope=length(sp_ws_dll_long_cols) / 680, intercept=0, linetype = 'dashed') +
  geom_smooth() + theme(legend.position = "none") +
  stat_cor(aes(label = ..r.label..), r.accuracy = 0.001, label.x = 60, label.y = 100)

#grid.arrange(p1, p2, p3, nrow = 1, widths=c(4.5,3.9,3.9), heights=3.5)
# Show the panels in the report (matched, extended, short), then save the same
# layout to PDF.
ggarrange(p1, p3, p2, nrow=1, common.legend = TRUE)
g <- arrangeGrob(p1, p3, p2, nrow = 1, widths=c(3.2,3.2,3.2), heights=3.2)
ggsave("DLL2short_vs_CDIWS_SP.pdf", g)
```

## Full CDI vs. Short CDI / DLL Correlations

```{r, echo=F}
# Summary table: for each language and full-form score, the sample size and
# the correlation of the full score with the CDI short form, the DLL-ES
# matched form, and the DLL-ES extended form.
# data.frame() converts the quoted column names to syntactic ones
# (e.g. "Full.Form.Score"); those names are left untouched for any later use
# of cortab, and readable headers are supplied at render time instead.
cortab <- data.frame(Language = c(rep("English", 3), rep("Spanish", 3)),
                "Full Form Score" = rep(c("CDI:WG production", "CDI:WG comprehension", "CDI:WS production"), 2),
                N = c(nrow(young_en), nrow(young_en), nrow(old_en), 
                      nrow(young_sp), nrow(young_sp), nrow(old_sp)),
                "r with CDI short form" = c(wg_en_cor, wg_en_cor_comp, ws_en_cor,
                                            wg_sp_cor, wg_sp_cor_comp, ws_sp_cor),
                "r with DLL-ES matched" = c(dll_en_cor, dll_en_cor_comp, dll2_en_cor,
                                                    dll1_sp_cor, dll_sp_cor_comp, dll2_sp_cor),
                "r with DLL-ES extended" = c(dll1long_en_cor, dll1long_en_cor_comp, dll2long_en_cor,
                                        dll1long_sp_cor, dll1long_sp_cor_comp, dll2long_sp_cor))

# Human-readable headers, in the same order as the columns of cortab.
cortab_headers <- c("Language", "Full Form Score", "N",
                    "r with CDI short form", "r with DLL-ES matched",
                    "r with DLL-ES extended")

cortab %>%
  kable(., digits=3, col.names = cortab_headers,
        table.attr = "style='width:30%;'") %>% 
  kable_classic(full_width = TRUE, position = "center")
```


## Recommendations

```{r dll-item-ease, echo=F}
# For each word list, split the CDI items into "not on the list" (flag 0) and
# "on the list" (flag 1) and report the mean of the item parameter d
# (described as easiness in the report text) and the item count per group.
# The report text reads row 1 as the off-list group and row 2 as the on-list
# group.

# English items vs. DLL-ES1 matched form
en_dll1_ease <- coefs$en %>%
  mutate(onDLL = as.numeric(definition %in% dll1short$english)) %>%
  group_by(onDLL) %>%
  summarise(easiness = mean(d), n = n())

# English items vs. DLL-ES2 matched form
en_dll2_ease <- coefs$en %>%
  mutate(onDLL = as.numeric(definition %in% dll2short$english)) %>%
  group_by(onDLL) %>%
  summarise(easiness = mean(d), n = n())

# English items vs. CDI:WG short form
en_wg_short_ease <- coefs$en %>%
  mutate(onWGshort = as.numeric(definition %in% wg_short_en$word)) %>%
  group_by(onWGshort) %>%
  summarise(easiness = mean(d), n = n())

# English items vs. CDI:WS short form A
en_ws_short_ease <- coefs$en %>%
  mutate(onWSshort = as.numeric(definition %in% ws_short_enA$word)) %>%
  group_by(onWSshort) %>%
  summarise(easiness = mean(d), n = n())

# Spanish items vs. DLL-ES1 matched form
sp_dll1_ease <- coefs$sp %>%
  mutate(onDLL = as.numeric(definition %in% dll1short$spanish)) %>%
  group_by(onDLL) %>%
  summarise(easiness = mean(d), n = n())

# Spanish items vs. DLL-ES2 matched form
sp_dll2_ease <- coefs$sp %>%
  mutate(onDLL = as.numeric(definition %in% dll2short$spanish)) %>%
  group_by(onDLL) %>%
  summarise(easiness = mean(d), n = n())
```


Overall, it seems that many of the items on the DLL are somewhat easier than average, and thus these forms tend to overestimate children's full CDI scores (indeed, for English items on the DLL1 Matched form, the average easiness is `r round(en_dll1_ease[2,]$easiness,2)`, while the mean easiness of items not on the DLL is `r round(en_dll1_ease[1,]$easiness,2)`).
This is also true of the CDI:WG short English form: the average easiness is `r round(en_wg_short_ease[2,]$easiness,2)` and the average ease of items not on the WG short form is `r round(en_wg_short_ease[1,]$easiness,2)`.
The CDI:WS short English form (A) is less biased towards easy items: average easiness is `r round(en_ws_short_ease[2,]$easiness,2)` vs. `r round(en_ws_short_ease[1,]$easiness,2)` for items not on the short WS.
The histograms below show the distribution of easiness parameters for English (left) and Spanish (right) CDI words. 
Solid lines show the average ease of DLL items (DLL 1 = red, DLL 2 = orange), and dashed lines show the average of non-DLL items.

Spanish DLL1 items have an average ease of `r round(sp_dll1_ease[2,]$easiness,2)`, while other items on the full CDI have a mean ease of `r round(sp_dll1_ease[1,]$easiness,2)`.

Spanish DLL2 items have an average ease of `r round(sp_dll2_ease[2,]$easiness,2)`, while other CDI items have a mean of `r round(sp_dll2_ease[1,]$easiness,2)`.

English DLL2 items have an average ease of `r round(en_dll2_ease[2,]$easiness,2)`, while other items on the full CDI have a mean ease of `r round(en_dll2_ease[1,]$easiness,2)`.


```{r, echo=F, message=F, fig.width=8, fig.height=4}
# Histograms of the item parameter d for English and Spanish CDI words.
# Vertical lines mark the group means computed earlier: the first value of
# each *_ease table (off-list items) is drawn dashed and the second (on-list
# items) solid; red = DLL 1, orange = DLL 2.
p1 <- ggplot(coefs$en, aes(x = d)) +
  geom_histogram() +
  geom_vline(
    xintercept = en_dll1_ease$easiness,
    color = "red", linetype = c("dashed", "solid")
  ) +
  geom_vline(
    xintercept = en_dll2_ease$easiness,
    color = "orange", linetype = c("dashed", "solid")
  ) +
  labs(x = "English Item Easiness")

p2 <- ggplot(coefs$sp, aes(x = d)) +
  geom_histogram() +
  geom_vline(
    xintercept = sp_dll1_ease$easiness,
    color = "red", linetype = c("dashed", "solid")
  ) +
  geom_vline(
    xintercept = sp_dll2_ease$easiness,
    color = "orange", linetype = c("dashed", "solid")
  ) +
  labs(x = "Spanish Item Easiness")

# Draw the two panels side by side in the report, then save a PDF copy.
grid.arrange(p1, p2, nrow = 1, widths = c(4, 4), heights = 3)
g <- arrangeGrob(p1, p2, nrow = 1, widths = c(3.2, 3.2), heights = 3.2)
ggsave(filename = "EN_vs_SP_item_easiness_distros.pdf", plot = g)
```


We recommend bringing the overall mean estimated IRT difficulty of the words selected for the DLLs closer to the mean difficulty of the words on the rest of the CDI.

To start, we examine IRT easiness parameters for the doublets on the existing DLL lists, looking for items with large mismatch between their English and Spanish ease.

## Do doublets have similar difficulties?

We want to assess whether doublet items have similar difficulty (operationalized by their IRT parameters) in English and in Spanish. 
For example, consider if "perro" was for some reason much more difficult than "dog", then you wouldn't want to include it because it wouldn't be a good item for estimating vocabulary overlap!

Below are shown the parameters for items from the DLL-ES1 Matched form (en_d = English easiness, sp_d = Spanish easiness), ordered from most to least discrepant (difficulty difference squared). The _a1 columns show item discriminations (slopes), and en_sp_d_diff shows the difference (English - Spanish) in the easiness parameter.

### DLL Level 1 Short Form

For the Matched form, we merely want to identify items that have very different difficulty in English and Spanish, and recommend that researchers interested in estimating conceptual overlap in bilinguals not include these items in their calculations due to the bias.
Below is shown the distribution of the squared difference in difficulty for doublets from the DLL Level 1 short form.

```{r compare-doublet-difficulties-dll1-short, echo=F}
# Item parameters: a1 = discrimination, d = difficulty (the report text treats
# d as easiness), g = lower-bound, u = upper-bound

# One row per DLL-ES1 matched doublet with its Spanish and English a1 / d
# parameters, the signed (English - Spanish) difference in d, and the squared
# difference. Doublets lacking a parameter in either language are dropped and
# rows are sorted from most to least mismatched.
pars_dll1short <- dll1short %>% 
  left_join(coefs$sp, by=c("spanish"="definition")) %>% 
  select(english, spanish, a1, d) %>% 
  rename(sp_a1 = a1, sp_d = d) %>% 
  left_join(coefs$en, by=c("english"="definition")) %>%
  select(english, spanish, sp_a1, sp_d, a1, d) %>% 
  rename(en_a1 = a1, en_d = d)

pars_dll1short <- pars_dll1short %>% 
  mutate(en_sp_d_diff = en_d - sp_d,
         d_diff_sq = (sp_d - en_d)^2) %>% # want to also maximize discrimination..
  filter(!is.na(d_diff_sq)) %>%
  arrange(desc(d_diff_sq))

# Exclusion cutoff = mean + 1.5 * SD of the squared differences. Computed from
# the data so it stays correct if the models or word lists change (it was
# previously hard-coded as 7.17).
threshold <- mean(pars_dll1short$d_diff_sq) + 1.5 * sd(pars_dll1short$d_diff_sq)
bad_dll1 <- subset(pars_dll1short, d_diff_sq >= threshold)

# Distribution of squared differences with the cutoff marked in red.
hist(pars_dll1short$d_diff_sq, main="", xlab="(EN diff - SP diff)^2")
abline(v=threshold,col="red")
```

The mean squared difference in difficulty of doublets on the DLL-ES1 Matched form is `r round(mean(pars_dll1short$d_diff_sq), 2)` (SD=`r round(sd(pars_dll1short$d_diff_sq), 2)`), and as shown above this distribution is highly skewed: most doublets are fairly well-matched (median d_diff_sq=`r round(median(pars_dll1short$d_diff_sq), 2)`), but a few items are extremely mismatched.
We recommend that the `r nrow(bad_dll1)` items (shown below) with a squared difficulty difference of `r round(threshold, 2)` (mean + 1.5 * SD) or more be excluded from calculations of conceptual overlap.

```{r, echo=F}
# Render the flagged DLL-ES1 doublets as a centered, full-width table.
bad_dll1 %>%
  knitr::kable(digits = 2, table.attr = "style='width:30%;'") %>%
  kable_classic(full_width = TRUE, position = "center")
```

### DLL Level 2 Matched Form

```{r compare-doublet-difficulties-dll2-short, echo=F}
# One row per DLL-ES2 matched doublet with its Spanish and English a1 / d
# parameters, the signed (English - Spanish) difference in d, and the squared
# difference. Doublets lacking a parameter in either language are dropped and
# rows are sorted from most to least mismatched.
pars_dll2short <- dll2short %>% 
  left_join(coefs$sp, by=c("spanish"="definition")) %>% 
  select(english, spanish, a1, d) %>% 
  rename(sp_a1 = a1, sp_d = d) %>% 
  left_join(coefs$en, by=c("english"="definition")) %>%
  select(english, spanish, sp_a1, sp_d, a1, d) %>% 
  rename(en_a1 = a1, en_d = d)

pars_dll2short <- pars_dll2short %>% 
  mutate(en_sp_d_diff = en_d - sp_d,
         d_diff_sq = (sp_d - en_d)^2) %>% 
  filter(!is.na(d_diff_sq)) %>%
  arrange(desc(d_diff_sq))

# Exclusion cutoff = mean + 1.5 * SD of the squared differences. Computed from
# the data so it stays correct if the models or word lists change (it was
# previously hard-coded as 8.22).
threshold <- mean(pars_dll2short$d_diff_sq) + 1.5 * sd(pars_dll2short$d_diff_sq)
bad_dll2 <- subset(pars_dll2short, d_diff_sq >= threshold)

# Distribution of squared differences with the cutoff marked in red.
hist(pars_dll2short$d_diff_sq, main="", xlab="(EN diff - SP diff)^2")
abline(v=threshold,col="red")
```

The mean squared difference in difficulty of doublets on the DLL-ES2 Matched form is `r round(mean(pars_dll2short$d_diff_sq), 2)` (SD=`r round(sd(pars_dll2short$d_diff_sq), 2)`), and once again while most doublets are fairly well-matched, several items are extremely mismatched.
We recommend that the `r nrow(bad_dll2)` items (shown below) with a squared difficulty difference of `r round(threshold, 2)` (mean + 1.5 * SD) or more be excluded from calculations of conceptual overlap.

```{r, echo=F}
# Render the flagged DLL-ES2 doublets as a centered, full-width table.
bad_dll2 %>%
  knitr::kable(digits = 2, table.attr = "style='width:30%;'") %>%
  kable_classic(full_width = TRUE, position = "center")
```

```{r save-dll-diffs, echo=F}
# Export the per-doublet parameter tables for the two matched forms as CSV
# files in the current working directory.
write.csv(x = pars_dll1short, file = "DLL1short-IRT-parms.csv")
write.csv(x = pars_dll2short, file = "DLL2short-IRT-parms.csv")
```


Now we examine the doublets on the long supplemental DLL forms, and make recommendations of mismatched items to swap out.

### DLL Level 1 Extended

Below are shown the parameters for items from the DLL Level 1 extended form.
Clearly some of these items have quite different difficulty levels in English and Spanish, and we should try to find items that are more equivalent and swap them onto the long supplement.

```{r compare-doublet-difficulties-dll1-long, echo=F}
# Item parameters: a1 = discrimination, d = difficulty (treated as easiness in
# the report text), g = lower-bound, u = upper-bound

# Parameters for the DLL Level 1 long-list doublets: Spanish and English
# a1 / d, the signed (English - Spanish) difference in d and its square.
# Doublets without a squared difference are dropped; rows are sorted from most
# to least mismatched.
pars_dll <- dll1long %>%
  left_join(coefs$sp, by = c("spanish" = "definition")) %>%
  select(english, spanish, sp_a1 = a1, sp_d = d) %>%
  left_join(coefs$en, by = c("english" = "definition")) %>%
  select(english, spanish, sp_a1, sp_d, en_a1 = a1, en_d = d) %>%
  mutate(
    en_sp_d_diff = en_d - sp_d,
    d_diff_sq = (sp_d - en_d)^2
  ) %>%
  filter(!is.na(d_diff_sq)) %>%
  arrange(desc(d_diff_sq))

pars_dll %>%
  knitr::kable(digits = 2, table.attr = "style='width:30%;'") %>%
  kable_classic(full_width = TRUE, position = "center")

#pars_dll <- pars_dll %>% filter(!is.element(english, wg_short_en$word),
#                                !is.element(spanish, wg_short_sp$word))
# that gets rid of all of them

#pars_dll %>% ggplot(aes(x=en_d, y=sp_d)) +
#  geom_point() + theme_minimal() # geom_text_repel() # but can only use 1lang..

# mean(diff): Spanish = -.75,  English = -.25
#colMeans(pars_dll[,c("sp_d","en_d")])
```

### DLL Level 2 Extended

```{r compare-doublet-difficulties-dll2, echo=F}
# Item parameters: a1 = discrimination, d = difficulty (treated as easiness in
# the report text), g = lower-bound, u = upper-bound

# Parameters for the DLL Level 2 long-list doublets: Spanish and English
# a1 / d, the signed (English - Spanish) difference in d and its square.
# Doublets without a squared difference are dropped; rows are sorted from most
# to least mismatched.
pars_dll2 <- dll2long %>%
  left_join(coefs$sp, by = c("spanish" = "definition")) %>%
  select(english, spanish, sp_a1 = a1, sp_d = d) %>%
  left_join(coefs$en, by = c("english" = "definition")) %>%
  select(english, spanish, sp_a1, sp_d, en_a1 = a1, en_d = d) %>%
  mutate(
    en_sp_d_diff = en_d - sp_d,
    d_diff_sq = (sp_d - en_d)^2
  ) %>%
  filter(!is.na(d_diff_sq)) %>%
  arrange(desc(d_diff_sq))

pars_dll2 %>%
  knitr::kable(digits = 2, table.attr = "style='width:30%;'") %>%
  kable_classic(full_width = TRUE, position = "center")

# mean(diff): Spanish = -.72,  English = -.35
#colMeans(pars_dll2[,c("sp_d","en_d")])
```

## Recommended Item Swaps

We will use Wordbank's unilemmas to find translation-equivalent pairs that have smaller d_diff_sq values than current DLL extended items.
We first get the English / Spanish unilemmas from wordbank (both WS and WG), and below simply show the Spanish vs. English easiness parameters.

```{r unilemmas, echo=F, message=F}
# Load the English-Spanish translation-equivalent table (`dict`),
# created in get_L1-L2_unilemmas.R
load("data/English-Spanish-dict.Rdata")

# (unfinished author note) should we filter out the words from the

# Attach unilemma, easiness values and lexical class from `dict` to each long
# list, drop the word-origin / alternative-translation columns, and where an
# English word matches several dict rows keep the one with the smallest
# d_diff_sq (rows are sorted by d_diff_sq before the first row per word is taken).
dll1long <- dll1long %>%
  left_join(
    select(dict, english, uni_lemma, d_diff_sq, en_d, sp_d, lexical_class),
    by = "english"
  ) %>%
  select(
    -`English Word Origin`,
    -`Spanish Word Origin`,
    -`Spanish Alternative Translations (experimenter or MCDI)`
  ) %>%
  group_by(english) %>%
  arrange(d_diff_sq) %>%
  slice(1) %>%
  ungroup()

dll2long <- dll2long %>%
  left_join(
    select(dict, english, uni_lemma, d_diff_sq, en_d, sp_d, lexical_class),
    by = "english"
  ) %>%
  select(
    -`English Word Origin`,
    -`Spanish Word Origin`,
    -`Spanish Alternative Translations (experimenter or MCDI)`
  ) %>%
  group_by(english) %>%
  arrange(d_diff_sq) %>%
  slice(1) %>%
  ungroup()

# Spanish vs. English easiness for every pair in dict, colored by lexical
# class, with the identity line dashed and a linear fit per class.
ggplot(dict, aes(x = en_d, y = sp_d, color = lexical_class)) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  geom_point(alpha = .7) +
  geom_smooth(method = 'lm') +
  labs(x = "English word easiness", y = "Spanish word easiness")

# original
#dict_ <- dict %>% filter(!is.element(english, dll1short$english),
#                        !is.element(spanish, dll1short$spanish), # was 380, now only 218
#                        !is.element(english, dll2short$english),
#                        !is.element(spanish, dll2short$spanish),
#                        !is.element(english, dll1long$english),
#                        !is.element(spanish, dll2long$spanish)) # 183 

# Keep only pairs that are not already on the DLL lists.
# NOTE(review): the long-list exclusion is asymmetric -- English words are
# checked against dll1long only and Spanish words against dll2long only --
# while the report text says pairs on any list are excluded. Confirm intent.
# (author note: only DLL2 that are excluded)
dict <- dict %>%
  filter(
    !(english %in% dll1short$english),
    !(spanish %in% dll1short$spanish), # was 380, now only 218
    !(english %in% dll2short$english),
    !(spanish %in% dll2short$spanish),
    !(english %in% dll1long$english),
    !(spanish %in% dll2long$spanish)
  ) # 183
```


Below are shown the `r nrow(dict)` unilemmas for which we have English and Spanish IRT parameters and that are *not* already on any of the DLL-ES lists (Level 1 or Level 2, short or supplemental lists).
Items with a small difficulty mismatch (d_diff_sq) can be chosen from this list to replace the worse-than-average (i.e., more mismatched) items on the DLL-ES supplemental forms.
The median d_diff_sq on the DLL-ES Level 1 supplement is `r round(median(dll1long$d_diff_sq, na.rm=T),3)` (sd=`r round(sd(dll1long$d_diff_sq, na.rm=T),3)`).
The median d_diff_sq on the DLL-ES Level 2 supplement is `r round(median(dll2long$d_diff_sq, na.rm=T),3)` (sd=`r round(sd(dll2long$d_diff_sq, na.rm=T),3)`).

Finally, for the hand-picked swaps we will report the original DLL list's easiness SSE, as well as the improvement (in easiness SSE) after the final swaps are made.

```{r, echo=F}
#dict %>% arrange(d_diff_sq) %>% select(-uni_lemma) %>%
#  kable(., digits=3, table.attr = "style='width:30%;'") %>% 
#  kable_classic(full_width = T, position = "center")
# Interactive table of the remaining candidate pairs, with the easiness and
# difference columns rounded to three decimals.
DT::formatRound(
  DT::datatable(dict),
  columns = c("en_d", "sp_d", "en_sp_d_diff", "d_diff_sq"),
  digits = 3
)
```

## DLL-ES1 Extended Form Swaps

```{r, echo=F}
# Reload the complete dict: the copy in memory was filtered down to pairs not
# on any DLL list, but the swap evaluation needs the full table.
load("data/English-Spanish-dict.Rdata")

# Baseline for the original Level 1 long list: mean squared easiness
# difference and mean signed (English - Spanish) difference, over items that
# have a d_diff_sq value.
orig_l1_dds <- dll1long %>%
  filter(!is.na(d_diff_sq)) %>%
  summarise(
    mean_diff_sq = mean(d_diff_sq),
    mean_diff = mean(en_d - sp_d)
  )
# original list mean d_diff_sq per item = 1.26 (and SP-EN diff = -.515, so EN is easier)

# Hand-picked swaps: items taken off the list ...
l1_removed <- c(
  "give", "ear", "cheese", "crib", "banana", "shirt", "chicken (food)",
  "moo", "nap", "apple", "stop", "drawer", "face", "swing (object)",
  "belly button"
)

# ... and the items put on in their place.
l1_added <- c(
  "draw", "elephant", "animal", "scissors", "brother", "sister", "grandpa",
  "hammer", "hurry", "squirrel", "show", "firetruck", "boots", "penguin",
  "turtle"
)

# setdiff(l1_removed, dict$english) # nap
# setdiff(l1_added, dict$english) # firetruck

# Revised list, built from dict rows: original long-list words (71/74 in dict)
# minus the removed ones (57), followed by the added ones (73).
new_dll1long <- bind_rows(
  filter(dict, english %in% dll1long$english, !(english %in% l1_removed)),
  filter(dict, english %in% l1_added)
)

# Same two summary statistics for the revised list.
new_l1_dds <- new_dll1long %>%
  filter(!is.na(d_diff_sq)) %>%
  summarise(
    mean_diff_sq = mean(d_diff_sq),
    mean_diff = mean(en_d - sp_d)
  )
# now .535 and SP-EN diff = -.187

# Element-wise ratio of original to revised statistics.
ratio_l1_improv <- orig_l1_dds / new_l1_dds
```

The average discrepancy on the original DLL-ES1 Extended was `r round(orig_l1_dds[1], 2)`, with the Spanish items being on average more difficult than their English equivalents (English - Spanish ease: `r round(orig_l1_dds[2],2)`).
After substituting 15 of the items with the largest discrepancy, the average discrepancy was `r round(new_l1_dds[1], 2)`, and the overall difference between languages was closer to zero (English - Spanish ease: `r round(new_l1_dds[2],2)`).

## DLL-ES2 Extended Form Swaps

Now we evaluate the swaps for the DLL-ES2 Extended form.

```{r, echo=F}
# Baseline for the original Level 2 long list: mean squared easiness
# difference and mean signed (English - Spanish) difference, over items that
# have a d_diff_sq value.
orig_l2_dds <- dll2long %>%
  filter(!is.na(d_diff_sq)) %>%
  summarise(
    mean_diff_sq = mean(d_diff_sq),
    mean_diff = mean(en_d - sp_d)
  )
            #med_diff_sq = median(d_diff_sq))
# original list mean d_diff_sq per item = 1.31, and EN-SP diff = -.31

# Hand-picked swaps: items taken off the list ...
# instead of dropping "girl" (15th), drop "nose" (16th)
l2_removed <- c(
  "bubbles", "water (beverage)", "soup", "bread", "table", "coat",
  "pencil", "bee", "chocolate", "cheese", "crib", "motorcycle",
  "nose", "soda/pop", "chicken (food)"
)

# ... and the items put on in their place.
l2_added <- c(
  "elephant", "animal", "scissors", "helicopter", "brother",
  "sister", "vagina*", "toothbrush", "hammer", "closet", "shower",
  "squirrel", "firetruck", "woof woof", "penguin"
)

# setdiff(l2_removed, dict$english) # bubbles coat pencil chocolate
# setdiff(l2_added, dict$english)

# Revised list, built from dict rows: original long-list words (116/131 in
# dict) minus the removed ones (102 - 1.01 and -.25 after removal..),
# followed by the added ones (117).
new_dll2long <- bind_rows(
  filter(dict, english %in% dll2long$english, !(english %in% l2_removed)),
  filter(dict, english %in% l2_added)
)

# Same two summary statistics for the revised list.
new_l2_dds <- new_dll2long %>%
  filter(!is.na(d_diff_sq)) %>%
  summarise(
    mean_diff_sq = mean(d_diff_sq),
    mean_diff = mean(en_d - sp_d)
  )
# now .961 and SP-EN diff = -.211

# Element-wise ratio of original to revised statistics.
ratio_l2_improv <- orig_l2_dds / new_l2_dds
```

The average discrepancy on the original DLL-ES2 Extended was `r round(orig_l2_dds[1], 2)`, with the Spanish items being on average more difficult than their English equivalents (English - Spanish ease: `r round(orig_l2_dds[2],2)`).
After substituting the 15 most discrepant items, the average discrepancy was `r round(new_l2_dds[1], 2)`, and the overall difference between languages was closer to zero (English - Spanish ease: `r round(new_l2_dds[2],2)`).

## New DLL vs. Full CDI Correlations

Finally, we re-evaluate the correlations between full CDI scores and DLL scores after the above swaps on the DLL-ES1 and DLL-ES2 Extended forms.

```{r, echo=F}
# Re-score the DLL-ES extended forms after the item swaps and correlate them
# with the full-form scores. Each "new extended" score is the sum over the
# revised supplement items plus the existing matched-form score.
# NOTE(review): any item that is on both the revised supplement and the
# matched form is counted twice by this addition -- confirm the lists are
# disjoint.

# English production: column positions of the revised Level 1 / Level 2
# supplement words; words with no matching column are dropped.
en_wg_dll_long_cols <- na.omit(match(new_dll1long$english, colnames(d_mat_en)))
en_ws_dll_long_cols <- na.omit(match(new_dll2long$english, colnames(d_mat_en)))

d_demo_en$newDLL1long_sum <- rowSums(d_mat_en[, en_wg_dll_long_cols], na.rm = TRUE) +
  d_demo_en$DLLsum
# Fixed: this previously summed the matched-form columns
# (en_ws_short_dll_cols), so the matched form was added twice and the revised
# Level 2 supplement (en_ws_dll_long_cols) was never scored.
d_demo_en$newDLL2long_sum <- rowSums(d_mat_en[, en_ws_dll_long_cols], na.rm = TRUE) +
  d_demo_en$DLL2sum

# Spanish production
sp_wg_dll_long_cols <- na.omit(match(new_dll1long$spanish, colnames(d_mat_sp)))
sp_ws_dll_long_cols <- na.omit(match(new_dll2long$spanish, colnames(d_mat_sp)))

d_demo_sp$newDLL1long_sum <- rowSums(d_mat_sp[, sp_wg_dll_long_cols], na.rm = TRUE) +
  d_demo_sp$DLL1sum
# Fixed: same copy-paste error as the English Level 2 score above
# (was sp_ws_short_dll_cols).
d_demo_sp$newDLL2long_sum <- rowSums(d_mat_sp[, sp_ws_dll_long_cols], na.rm = TRUE) +
  d_demo_sp$DLL2sum
# (author note) are these *just* the DLL long supplement scores? need to add to Dll1sum / Dll2sum /

# comprehension: the column positions are recomputed against the
# comprehension matrices, overwriting the production-matrix indices above.
en_wg_dll_long_cols <- na.omit(match(new_dll1long$english, colnames(d_mat_en_comp)))
d_demo_en_comp$newDLL1long_sum <- rowSums(d_mat_en_comp[, en_wg_dll_long_cols], na.rm = TRUE) +
  d_demo_en_comp$DLLsum

sp_wg_dll_long_cols <- na.omit(match(new_dll1long$spanish, colnames(d_mat_sp_comp)))
d_demo_sp_comp$newDLL1long_sum <- rowSums(d_mat_sp_comp[, sp_wg_dll_long_cols], na.rm = TRUE) +
  d_demo_sp_comp$DLLsum

# Age-based subsets, rebuilt so they carry the new score columns.
# NOTE(review): the English cut is strict (production < 396) while the
# Spanish cut is inclusive (production <= 428) -- confirm this is intended.
young_en <- d_demo_en %>% filter(age<19, production<396)
old_en <- d_demo_en %>% filter(age>15)

young_sp <- d_demo_sp %>% filter(age<19, production<=428)
old_sp <- d_demo_sp %>% filter(age>15)

# Correlations of full-form production with the new extended scores
# (these overwrite the earlier dll*long_*_cor values).
dll1long_en_cor <- cor(young_en$production, young_en$newDLL1long_sum)
dll2long_en_cor <- cor(old_en$production, old_en$newDLL2long_sum)

dll1long_sp_cor <- cor(young_sp$production, young_sp$newDLL1long_sum)
dll2long_sp_cor <- cor(old_sp$production, old_sp$newDLL2long_sum)

# Correlations of full-form comprehension with the new Level 1 extended score.
dll1long_new_en_cor_comp <- with(d_demo_en_comp, cor(comprehension, newDLL1long_sum))
dll1long_new_sp_cor_comp <- with(d_demo_sp_comp, cor(comprehension, newDLL1long_sum))
```

New DLL-ES1 Extended vs. English CDI:WG LF comprehension scores: $r = `r round(dll1long_new_en_cor_comp, 3)`$. 
New DLL-ES1 Extended vs. Spanish CDI:WG LF comprehension scores: $r = `r round(dll1long_new_sp_cor_comp, 3)`$.

New DLL-ES1 Extended vs. English CDI:WG LF production scores: $r = `r round(dll1long_en_cor, 3)`$.
New DLL-ES1 Extended vs. Spanish CDI:WG LF production scores: $r = `r round(dll1long_sp_cor, 3)`$.

New DLL-ES2 Extended vs. English CDI:WS LF production scores: $r = `r round(dll2long_en_cor, 3)`$.
New DLL-ES2 Extended vs. Spanish CDI:WS LF production scores: $r = `r round(dll2long_sp_cor, 3)`$.