Skip to content

Commit 1ddfdbc

Browse files
committed
Related to #27 (crfsuite). Allow to reconstruct the original text + allow to add a from/to field in as.data.frame (useful but undocumented feature).
1 parent 50931b0 commit 1ddfdbc

File tree

6 files changed

+157
-4
lines changed

6 files changed

+157
-4
lines changed

.Rbuildignore

+1
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,5 @@ LICENSE
77
dutch-ud-2.0-170801.udpipe
88
sanskrit-ud-2.0-170801.udpipe
99
dutch-lassysmall-ud-2.0-170801.udpipe
10+
spanish-ud-2.0-170801.udpipe
1011
docusaurus/

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,5 @@ dev
1010
.udpipe_process.log
1111
dutch-ud-2.0-170801.udpipe
1212
sanskrit-ud-2.0-170801.udpipe
13-
dutch-lassysmall-ud-2.0-170801.udpipe
13+
dutch-lassysmall-ud-2.0-170801.udpipe
14+
spanish-ud-2.0-170801.udpipe

DESCRIPTION

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Package: udpipe
22
Type: Package
33
Title: Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing with the 'UDPipe' 'NLP' Toolkit
4-
Version: 0.6.1
4+
Version: 0.6.2
55
Maintainer: Jan Wijffels <[email protected]>
66
Authors@R: c(person('Jan', 'Wijffels', role = c('aut', 'cre', 'cph'), email = '[email protected]'),
77
person('BNOSAC', role = 'cph'),

NEWS.md

+4
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
# CHANGES IN udpipe VERSION 0.6.2
2+
3+
- Allow reconstructing the original text and allow adding from/to fields in as.data.frame (useful but undocumented feature)
4+
15
# CHANGES IN udpipe VERSION 0.6.1
26

37
- src/udpipe.cpp: at the request of CRAN: remove dynamic exception specifications, which g++-7 and later complain about, by removing the throw statements

R/udpipe_parse.R

+13-2
Original file line numberDiff line numberDiff line change
@@ -160,13 +160,20 @@ read_connlu <- function(x, is_udpipe_annotation = FALSE, ...){
160160
output_fields <- append(output_fields, values = "term_id", after = 4)
161161
}
162162
}
163+
if("from_to" %in% names(ldots)){
164+
if(isTRUE(ldots$from_to)){
165+
output_fields <- append(output_fields, values = c("from", "to"), after = 4)
166+
}
167+
}
163168
## Default output
164169
default <- data.frame(doc_id = character(),
165170
paragraph_id = integer(),
166171
sentence_id = character(),
167-
sentence = character(),
172+
sentence = character(),
173+
from = integer(),
174+
to = integer(),
168175
term_id = integer(),
169-
token_id = character(),
176+
token_id = character(),
170177
token = character(),
171178
lemma = character(),
172179
upos = character(),
@@ -240,6 +247,10 @@ read_connlu <- function(x, is_udpipe_annotation = FALSE, ...){
240247
out[, dep_rel := underscore_as_na(dep_rel)]
241248
out[, deps := underscore_as_na(deps)]
242249
out[, misc := underscore_as_na(misc)]
250+
if(all(c("from", "to") %in% output_fields)){
251+
out[, c("from", "to") := udpipe_reconstruct(sentence_id = sentence_id, token = token, token_id = token_id, misc = misc, only_from_to = TRUE),
252+
by = list(doc_id)]
253+
}
243254
out <- out[, output_fields, with = FALSE]
244255
out <- data.table::setDF(out)
245256
out

R/udpipe_reconstruct.R

+136
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
## Manual usage example, wrapped in if(FALSE) so it is never executed when this file is sourced.
## It downloads and loads the Spanish model, annotates a sample text containing leading/trailing
## spaces, newlines and a backslash, converts the annotation to a data.frame with from_to = TRUE
## and finally rebuilds the original text with udpipe_reconstruct().
if(FALSE){
2+
library(udpipe)
3+
txt <- " Maxime y su mujer\\ hicieron que nuestra estancia
4+
fuera lo mas comoda posible. \n
5+
El primer dia Maxime nos espero hasta tarde para recibirnos y
6+
darnos todas las indicaciones posibles del apartamento y
7+
de la situacion de aparcamiento en el barrio ya que fuimos
8+
desde Espana con el coche ( es todo zona azul de 9:00 a 18:00 pero
9+
como saliamos pronto y llegabamos tarde no nos afectaba).\n
10+
El apartamento es muy completo, la verdad es como aparece
11+
en el anuncio, es mas, incluso tiene una barandilla
12+
en la escaleras que dan a la habitacion que en la foto no sale.\n
13+
El jardin esta muy bien para desayunar o cenar ya que
14+
tiene una mesa grande para ello.\n
15+
El barrio es muy tranquilo con bastantes tiendas y restaurantes.\n
16+
En general estuvimos muy comodos durante nuestra estancia,
17+
repetiriamos ahora mismo.\n Muchas gracias por todo Maxime. "
18+
ud_model <- udpipe_download_model(language = "spanish")
19+
ud_model <- udpipe_load_model(ud_model$file_model)
20+
x <- udpipe_annotate(ud_model, x = txt)
21+
x <- as.data.frame(x, from_to = TRUE)
22+
original <- udpipe_reconstruct(sentence_id = x$sentence_id, token = x$token, token_id = x$token_id, misc = x$misc)
23+
}
24+
25+
26+
## Reconstruct the original text of ONE document from its CoNLL-U tokens and compute, for every
## token, the 1-based character positions (from/to) of the token inside that reconstructed text.
##
## Arguments (parallel vectors, one element per CoNLL-U row, in document order):
##   sentence_id   identifier of the sentence the row belongs to
##   token         the FORM of the token
##   token_id      the CoNLL-U ID of the row; a range such as "3-4" marks a multi-word token
##   misc          the MISC column (NA if empty), a '|'-separated list of KEY=value entries
##   only_from_to  if TRUE return only list(from, to), otherwise list(text, from, to)
##
## Rows which are the words of a multi-word token do not occur in the original text: they
## contribute nothing to the text and get NA as from/to, the range row carries the surface form.
##
## FROM THE UDPIPE DOCS:
##
## The markup uses the following MISC fields on tokens (not words in multi-word tokens):
##   SpacesBefore=content (by default empty): spaces/other content preceding the token
##   SpacesAfter=content (by default a space if SpaceAfter=No feature is not present, empty otherwise):
##     spaces/other content following the token
##   SpacesInToken=content (by default equal to the FORM of the token): FORM of the token including
##     original spaces (this is needed only if tokens are allowed to contain spaces and a token
##     contains a tab or newline characters)
##
## The content of all the three fields must be escaped to allow storing tabs and newlines.
## The following C-like schema is used:
##   \s: space
##   \t: tab
##   \r: CR character
##   \n: LF character
##   \p: | (pipe character)
##   \\: \ (backslash character)
udpipe_reconstruct <- function(sentence_id, token, token_id, misc, only_from_to = FALSE){

  ## Value of the KEY=value entry of the '|'-separated MISC column, NA where the key is absent.
  ## Anchoring on start-of-string / '|' and stopping at the next '|' makes sure neighbouring
  ## entries (e.g. SpacesBefore=..|SpacesAfter=..) never leak into the extracted value.
  misc_field <- function(misc, key){
    pattern <- sprintf("^(.*[|])?%s=([^|]*).*$", key)
    value <- rep(NA_character_, length(misc))
    found <- grepl(pattern, misc)
    value[found] <- sub(pattern, "\\2", misc[found])
    value
  }

  ## Decode the C-like escapes in ONE left-to-right pass. Decoding escape by escape is wrong as
  ## an escaped backslash followed by a letter (\\s) would be read as backslash + escaped space.
  unescape <- function(x){
    if(length(x) == 0){
      return(x)
    }
    decoded <- c("\\s" = " ", "\\t" = "\t", "\\r" = "\r", "\\n" = "\n", "\\p" = "|", "\\\\" = "\\")
    hits <- gregexpr("\\\\(s|t|r|n|p|\\\\)", x)
    regmatches(x, hits) <- lapply(regmatches(x, hits), function(esc) unname(decoded[esc]))
    x
  }

  n <- length(token)
  spaces_after    <- misc_field(misc, "SpacesAfter")
  spaces_before   <- misc_field(misc, "SpacesBefore")
  spaces_in_token <- misc_field(misc, "SpacesInToken")
  has_spaceafter_no <- grepl(pattern = "SpaceAfter=No", misc)

  ##
  ## Spaces after: a single space by default, nothing if SpaceAfter=No, else the SpacesAfter content
  ##
  after <- rep(" ", n)
  after[has_spaceafter_no] <- ""
  idx <- which(!is.na(spaces_after))
  after[idx] <- unescape(spaces_after[idx])

  ##
  ## Spaces before: empty by default, else the SpacesBefore content
  ##
  before <- rep("", n)
  idx <- which(!is.na(spaces_before))
  before[idx] <- unescape(spaces_before[idx])

  ##
  ## SpacesInToken - MISC field stores the form of the token including original spaces
  ## if there is a space in the token which can not be handled by FORM
  ##
  idx <- which(!is.na(spaces_in_token))
  token[idx] <- unescape(spaces_in_token[idx])

  ##
  ## Words which are part of a multi-word token are not part of the original text.
  ## A range a-b covers ALL words a, a+1, ..., b and not only its 2 end points.
  ##
  is_multi_word <- grepl("-", token_id)
  ids <- sprintf("%s.%s", sentence_id, token_id)
  ids_remove <- unlist(Map(function(sentence_id, token_id){
    bounds <- strsplit(token_id, split = "-", fixed = TRUE)[[1]]
    limits <- suppressWarnings(as.integer(bounds))
    if(length(limits) == 2L && !anyNA(limits)){
      members <- seq(limits[1], limits[2])
    }else{
      ## not a clean numeric range: fall back to the literal parts of the identifier
      members <- bounds
    }
    sprintf("%s.%s", sentence_id, members)
  }, sentence_id[is_multi_word], as.character(token_id[is_multi_word])), use.names = FALSE)
  idx <- which(ids %in% ids_remove)

  ## Fix for using std::istringstream in udpipe_annotate as it always ends with a newline character.
  ## Strip it from the last row which really is part of the text: the very last row can be a word
  ## of a multi-word token, in which case the newline sits on the preceding range row.
  kept <- setdiff(seq_len(n), idx)
  if(length(kept) > 0){
    last <- kept[length(kept)]
    after[last] <- sub("\n$", "", after[last])
  }

  ##
  ## Construct original text
  ##
  original_txt <- sprintf("%s%s%s", before, token, after)
  original_txt[idx] <- ""

  ##
  ## Construct from-to: positions of the token itself, excluding the spaces before/after it
  ##
  before[idx] <- ""
  after[idx]  <- ""

  nchars <- nchar(original_txt)
  original_to   <- cumsum(nchars)
  original_from <- original_to - nchars + 1L
  from <- original_from + nchar(before)
  to   <- original_to - nchar(after)
  from[idx] <- NA_integer_
  to[idx]   <- NA_integer_

  if(only_from_to){
    return(list(from = from, to = to))
  }
  list(text = paste(original_txt, collapse = ""),
       from = from,
       to = to)
}

0 commit comments

Comments
 (0)