-
Notifications
You must be signed in to change notification settings - Fork 0
/
2_get_adj_uses.R
126 lines (96 loc) · 5.09 KB
/
2_get_adj_uses.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# Author: Jeremy Boyd ([email protected])
# Summary: Identifies adjective uses in CHILDES corpus files;
# organizes them for later analysis.
# Specify the location of corpus files.
corpus_dir <- "~/Documents/studies/A-ADJ/AP/get_adj_uses/corpora"

#**********HELPERS**********

# Prepare raw CHAT lines for analysis. Continuation lines (lines that begin
# with whitespace) are folded into the line they continue and runs of spaces
# are squeezed to one, so every returned element is one complete header or
# tier.
#   raw_lines: character vector, one element per physical line of the file.
#   Returns:   character vector, one element per logical line.
join_continuation_lines <- function(raw_lines) {
  # Join everything with a "___" sentinel standing in for the newline ...
  one_line <- paste(raw_lines, collapse = "___")
  # ... so that "newline + leading whitespace" can be replaced by one space.
  one_line <- gsub("___\\s+", " ", one_line, perl = TRUE)
  # The sentinels that remain mark the real line breaks.
  logical_lines <- unlist(strsplit(one_line, "___", perl = TRUE))
  gsub(" +", " ", logical_lines, perl = TRUE)
}

# Look up the child's age in the "@ID" header of participant CHI.
#   tiers:   character vector returned by join_continuation_lines().
#   Returns: the fourth "|"-separated field of that header. When there is not
#            exactly one matching header (e.g. macwhinney/85a1.cha, which
#            codes two children as MAR and ROS rather than CHI) the fourth
#            field of a placeholder line is returned instead, i.e.
#            " problem_getting_age" (the leading space is part of the value).
get_child_age <- function(tiers) {
  id_lines <- grep("@ID.+CHI\\|[0-9]", tiers, value = TRUE, perl = TRUE)
  if (length(id_lines) == 1) {
    ageline <- id_lines
  } else {
    ageline <- "problem_getting_age | problem_getting_age | problem_getting_age | problem_getting_age"
  }
  unlist(strsplit(ageline, "\\|"))[4]
}

# Find every adjective annotated on the %mor tiers of one transcript.
#   tiers:   character vector returned by join_continuation_lines().
#   Returns: list of four equal-length character vectors (speaker, adjective,
#            utterance, mor) with one entry per adjective token.
find_adjective_uses <- function(tiers) {
  no_uses <- list(speaker = character(0), adjective = character(0),
                  utterance = character(0), mor = character(0))

  # Only %mor tiers that mention an adjective are of interest.
  mor_rows <- which(grepl("^%mor", tiers, perl = TRUE) &
                      grepl("adj\\|", tiers, perl = TRUE))
  if (length(mor_rows) == 0) {
    return(no_uses)
  }

  # A dependent tier belongs to the closest preceding main tier ("*XXX:").
  # Other dependent tiers (%act, %com, ...) may sit between the two, so the
  # physically previous line is not necessarily the utterance. For every
  # line, look up the most recent main tier ("" when there is none yet).
  is_main <- substr(tiers, 1, 1) == "*"
  owner_line <- c("", tiers[is_main])[cumsum(is_main) + 1L]
  owner_line <- owner_line[mor_rows]

  # Drop the 6-character tier prefix ("%mor:" plus the tab).
  mor_lines <- tiers[mor_rows]
  mor <- substr(mor_lines, 7, nchar(mor_lines))

  # All "adj|<stem>" tokens per tier. A tier with no well-formed token
  # contributes nothing rather than an empty-string adjective.
  tokens <- regmatches(mor, gregexpr("adj\\|[a-zA-Z&#\\-]+", mor, perl = TRUE))
  n_tokens <- vapply(tokens, length, integer(1))
  tokens <- unlist(tokens, use.names = FALSE)

  list(
    speaker = rep(substr(owner_line, 2, 4), n_tokens),
    # Strip the leading "adj|" from each token.
    adjective = substring(tokens, 5),
    utterance = rep(substr(owner_line, 7, nchar(owner_line)), n_tokens),
    mor = rep(mor, n_tokens)
  )
}

#**********COLLECT ADJECTIVE USES**********

# Create list; adjcount is the (1-based) index of the next entry to write.
corpus.file.list <- list()
adjcount <- 1

# Only process .cha files. Paths are listed relative to the corpus directory
# so that the corpus name (the first directory below corpus_dir) does not
# depend on how deeply corpus_dir itself is nested.
corpus_root <- path.expand(corpus_dir)
cha_files <- dir(path = corpus_root, pattern = "\\.cha$",
                 full.names = FALSE, recursive = TRUE)

# Results are gathered per file and combined once at the end instead of
# growing seven vectors one element at a time.
uses_by_file <- vector("list", length(cha_files))

# Iterate over all corpus files
for (i in seq_along(cha_files)) {
  rel_path <- cha_files[i]
  cha_path <- file.path(corpus_root, rel_path)

  # User output
  cat("Processing ", cha_path, "...", "\n", sep = "")

  # Store corpus and filenames
  corpusname <- unlist(strsplit(rel_path, "/", fixed = TRUE))[1]
  filename <- basename(rel_path)

  # Read in file (scan() skips blank lines) and prep it for analysis
  corpus.file <- scan(cha_path, what = character(), sep = "\n", quiet = TRUE)
  tiers <- join_continuation_lines(corpus.file)

  age <- get_child_age(tiers)
  uses <- find_adjective_uses(tiers)
  n_uses <- length(uses[["adjective"]])

  uses_by_file[[i]] <- list(
    corpus = rep(corpusname, n_uses),
    file = rep(filename, n_uses),
    speaker = uses[["speaker"]],
    age = rep(age, n_uses),
    adjective = uses[["adjective"]],
    utterance = uses[["utterance"]],
    mor = uses[["mor"]]
  )
}

#**********POPULATE LIST**********
n_total <- sum(vapply(uses_by_file,
                      function(rows) length(rows[["adjective"]]),
                      integer(1)))

# corpus.file.list stays an empty list when nothing was found.
if (n_total > 0) {
  for (field in c("corpus", "file", "speaker", "age",
                  "adjective", "utterance", "mor")) {
    corpus.file.list[[field]] <- unlist(lapply(uses_by_file, `[[`, field),
                                        use.names = FALSE)
  }
}
adjcount <- n_total + 1
#**********CONVERT TO DATAFRAME AND SAVE**********
# Fail with a clear message (and without touching an existing output file)
# when no adjective uses were collected.
if (length(corpus.file.list) == 0) {
  stop("No adjective uses were collected; adj_data.txt was not written.",
       call. = FALSE)
}

# Convert list to dataframe. Each list element becomes one column; shorter
# elements are padded with NA up to the longest one. Building the columns
# with lapply() keeps the result a 7-column table even when only a single
# adjective was found (sapply() would collapse that case to a plain vector).
n_rows <- max(vapply(corpus.file.list, length, integer(1)))
adjdata <- as.data.frame(
  lapply(corpus.file.list, function(column) column[seq_len(n_rows)]),
  stringsAsFactors = FALSE
)

# Save dataframe to file (tab-separated, unquoted, with a header row)
write.table(adjdata, file = "adj_data.txt", sep = "\t", eol = "\n",
            quote = FALSE, row.names = FALSE, col.names = TRUE,
            append = FALSE)