1
## Usage example, wrapped in `if (FALSE)` so it is never executed when the
## file is sourced. It annotates a Spanish text with udpipe and then rebuilds
## the original text from the token-level annotation via udpipe_reconstruct().
## Running it by hand requires the udpipe package and downloads a model.
if (FALSE) {
  library(udpipe)
  txt <- "Maxime y su mujer\\ hicieron que nuestra estancia
fuera lo mas comoda posible. \n
El primer dia Maxime nos espero hasta tarde para recibirnos y
darnos todas las indicaciones posibles del apartamento y
de la situacion de aparcamiento en el barrio ya que fuimos
desde Espana con el coche ( es todo zona azul de 9:00 a 18:00 pero
como saliamos pronto y llegabamos tarde no nos afectaba).\n
El apartamento es muy completo, la verdad es como aparece
en el anuncio, es mas, incluso tiene una barandilla
en la escaleras que dan a la habitacion que en la foto no sale.\n
El jardin esta muy bien para desayunar o cenar ya que
tiene una mesa grande para ello.\n
El barrio es muy tranquilo con bastantes tiendas y restaurantes.\n
En general estuvimos muy comodos durante nuestra estancia,
repetiriamos ahora mismo.\n Muchas gracias por todo Maxime."
  ud_model <- udpipe_download_model(language = "spanish")
  ud_model <- udpipe_load_model(ud_model$file_model)
  x <- udpipe_annotate(ud_model, x = txt)
  x <- as.data.frame(x, from_to = TRUE)
  original <- udpipe_reconstruct(
    sentence_id = x$sentence_id,
    token = x$token,
    token_id = x$token_id,
    misc = x$misc
  )
}
24
+
25
+
26
## Reconstruct the original text, and the character offsets of every token in
## it, from token-level UDPipe output.
##
## Arguments (parallel vectors, one element per row of the annotation):
##   sentence_id   sentence identifier of each row
##   token         the token FORM
##   token_id      the CoNLL-U token id; multi-word tokens use a range such as
##                 "3-4" and are followed by rows for their component words
##   misc          the CoNLL-U MISC column (NA when empty)
##   only_from_to  if TRUE, return only the offsets and skip the pasted text
##
## Value: list(from, to) when only_from_to is TRUE, otherwise
## list(text, from, to). `from`/`to` are 1-based, inclusive character
## positions of each token in `text`; they are NA for the component words of a
## multi-word token, because only the surface (range) token occurs in the text.
##
## FROM THE UDPIPE DOCS:
##
## The markup uses the following MISC fields on tokens (not words in
## multi-word tokens):
##   SpacesBefore=content (by default empty): spaces/other content preceding
##     the token
##   SpacesAfter=content (by default a space if SpaceAfter=No feature is not
##     present, empty otherwise): spaces/other content following the token
##   SpacesInToken=content (by default equal to the FORM of the token): FORM of
##     the token including original spaces (this is needed only if tokens are
##     allowed to contain spaces and a token contains a tab or newline
##     characters)
##
## The content of all the three fields must be escaped to allow storing tabs
## and newlines. The following C-like schema is used:
##   \s: space
##   \t: tab
##   \r: CR character
##   \n: LF character
##   \p: | (pipe character)
##   \\: \ (backslash character)
udpipe_reconstruct <- function(sentence_id, token, token_id, misc, only_from_to = FALSE) {
  token <- as.character(token)
  token_id <- as.character(token_id)
  misc <- as.character(misc)
  n <- length(token)

  ## Decode the C-like escapes used inside MISC values. Escaped backslashes are
  ## parked behind a placeholder first, so that e.g. the three characters
  ## `\\s` decode to a backslash followed by "s" and not to a space.
  unescape <- function(x) {
    placeholder <- "\001"
    x <- gsub("\\\\", placeholder, x, fixed = TRUE)
    x <- gsub("\\s", " ", x, fixed = TRUE)
    x <- gsub("\\n", "\n", x, fixed = TRUE)
    x <- gsub("\\t", "\t", x, fixed = TRUE)
    x <- gsub("\\r", "\r", x, fixed = TRUE)
    x <- gsub("\\p", "|", x, fixed = TRUE)
    gsub(placeholder, "\\", x, fixed = TRUE)
  }

  ## Extract the raw (still escaped) value of `key` from the |-separated MISC
  ## column; NA where the key is absent. Values cannot contain a literal pipe
  ## (it is escaped as \p), so a value runs up to the next "|" or the end.
  misc_field <- function(key) {
    value <- rep(NA_character_, length(misc))
    hit <- regexpr(sprintf("(^|\\|)%s=[^|]*", key), misc)
    found <- !is.na(hit) & hit > 0L
    value[found] <- sub(sprintf("^\\|?%s=", key), "", regmatches(misc, hit))
    value
  }

  spaces_after <- misc_field("SpacesAfter")
  spaces_before <- misc_field("SpacesBefore")
  spaces_in_token <- misc_field("SpacesInToken")
  has_spaceafter_no <- grepl("(^|\\|)SpaceAfter=No($|\\|)", misc)

  ##
  ## Spaces after
  ##
  ## by default (no feature, or MISC missing) a token is followed by one space
  after <- rep(" ", n)
  ## SpaceAfter=No: nothing follows the token
  after[has_spaceafter_no] <- ""
  ## SpacesAfter=...: the exact trailing content is given
  idx <- which(!is.na(spaces_after))
  after[idx] <- unescape(spaces_after[idx])
  ## Fix for using std::istringstream in udpipe_annotate as it always ends
  ## with a newline character
  if (n > 0L) {
    after[n] <- sub("\n$", "", after[n])
  }

  ##
  ## Spaces before
  ##
  before <- rep("", n)
  idx <- which(!is.na(spaces_before))
  before[idx] <- unescape(spaces_before[idx])

  ##
  ## SpacesInToken - MISC field stores the form of the token including
  ## original spaces if the token contains whitespace FORM cannot represent
  ##
  idx <- which(!is.na(spaces_in_token))
  token[idx] <- unescape(spaces_in_token[idx])

  ##
  ## Construct original text
  ##
  original_txt <- sprintf("%s%s%s", before, token, after)

  ##
  ## The component words of a multi-word token are not part of the text: only
  ## the range row (e.g. "3-5") is. Expand every range to all the word ids it
  ## covers so that words in the middle of a range are dropped as well.
  ##
  is_multi_word <- grepl("-", token_id, fixed = TRUE)
  ids <- sprintf("%s.%s", sentence_id, token_id)
  ids_remove <- unlist(
    Map(
      function(sid, tid) {
        parts <- strsplit(tid, split = "-", fixed = TRUE)[[1L]]
        bounds <- suppressWarnings(as.integer(parts))
        if (length(bounds) == 2L && !anyNA(bounds)) {
          parts <- seq(bounds[1L], bounds[2L])
        }
        sprintf("%s.%s", sid, parts)
      },
      sentence_id[is_multi_word],
      token_id[is_multi_word]
    ),
    use.names = FALSE
  )
  idx <- which(ids %in% ids_remove)
  original_txt[idx] <- ""

  ##
  ## Construct from-to
  ##
  before[idx] <- ""
  after[idx] <- ""

  nchars <- nchar(original_txt)
  original_to <- cumsum(nchars)
  original_from <- original_to - nchars + 1L
  from <- original_from + nchar(before)
  to <- original_to - nchar(after)
  from[idx] <- NA_integer_
  to[idx] <- NA_integer_

  if (only_from_to) {
    return(list(from = from, to = to))
  } else {
    return(list(text = paste(original_txt, collapse = ""),
                from = from,
                to = to))
  }
}
0 commit comments