Related to #27 (crfsuite). Allow reconstructing the original text and allow adding from/to fields in as.data.frame (useful but undocumented feature).

jwijffels · jwijffels · commit 1ddfdbcee34a · 2018-08-28T23:58:29.000+02:00
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -7,4 +7,5 @@ LICENSE
 dutch-ud-2.0-170801.udpipe
 sanskrit-ud-2.0-170801.udpipe
 dutch-lassysmall-ud-2.0-170801.udpipe
+spanish-ud-2.0-170801.udpipe
 docusaurus/
diff --git a/.gitignore b/.gitignore
@@ -10,4 +10,5 @@ dev
 .udpipe_process.log
 dutch-ud-2.0-170801.udpipe
 sanskrit-ud-2.0-170801.udpipe
-dutch-lassysmall-ud-2.0-170801.udpipe
+dutch-lassysmall-ud-2.0-170801.udpipe
+spanish-ud-2.0-170801.udpipe
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: udpipe
 Type: Package
 Title: Tokenization, Parts of Speech Tagging, Lemmatization and Dependency Parsing with the 'UDPipe' 'NLP' Toolkit
-Version: 0.6.1
+Version: 0.6.2
 Maintainer: Jan Wijffels <jwijffels@bnosac.be>
 Authors@R: c(person('Jan', 'Wijffels', role = c('aut', 'cre', 'cph'), email = 'jwijffels@bnosac.be'), 
     person('BNOSAC', role = 'cph'), 
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,7 @@
+# CHANGES IN udpipe VERSION 0.6.2
+
+- Allow reconstructing the original text and allow adding from/to fields in as.data.frame (useful but undocumented feature)
+
 # CHANGES IN udpipe VERSION 0.6.1
 
 - src/udpipe.cpp: at the request of CRAN: remove dynamic execution specification which g++-7 and later complain about by removing the throw statements
diff --git a/R/udpipe_parse.R b/R/udpipe_parse.R
@@ -160,13 +160,20 @@ read_connlu <- function(x, is_udpipe_annotation = FALSE, ...){
       output_fields <- append(output_fields, values = "term_id", after = 4)
     }
   }
+  if("from_to" %in% names(ldots)){
+    if(isTRUE(ldots$from_to)){
+      output_fields <- append(output_fields, values = c("from", "to"), after = 4)
+    }
+  }
   ## Default output 
   default <- data.frame(doc_id = character(), 
                         paragraph_id = integer(), 
                         sentence_id = character(), 
-                        sentence = character(), 
+                        sentence = character(),
+                        from = integer(),
+                        to = integer(),
                         term_id = integer(),
-                        token_id = character(), 
+                        token_id = character(),
                         token = character(), 
                         lemma = character(), 
                         upos = character(), 
@@ -240,6 +247,10 @@ read_connlu <- function(x, is_udpipe_annotation = FALSE, ...){
   out[, dep_rel := underscore_as_na(dep_rel)]
   out[, deps := underscore_as_na(deps)]
   out[, misc := underscore_as_na(misc)]
+  if(all(c("from", "to") %in% output_fields)){
+    out[, c("from", "to") := udpipe_reconstruct(sentence_id = sentence_id, token = token, token_id = token_id, misc = misc, only_from_to = TRUE), 
+        by = list(doc_id)]
+  }
   out <- out[, output_fields, with = FALSE]
   out <- data.table::setDF(out)
   out
diff --git a/R/udpipe_reconstruct.R b/R/udpipe_reconstruct.R
@@ -0,0 +1,136 @@
+## Development example for udpipe_reconstruct. It is wrapped in if(FALSE) so it is never
+## executed when this file is sourced; run the statements by hand to try the function.
+if(FALSE){
+  library(udpipe)
+  ## Sample text with leading/trailing spaces, embedded newlines and a backslash,
+  ## i.e. content which has to be recovered from the whitespace markup in MISC
+  txt <- "  Maxime y su mujer\\ hicieron que nuestra estancia 
+    fuera lo mas comoda posible. \n  
+    El primer dia Maxime nos espero hasta tarde para recibirnos y 
+    darnos todas las indicaciones posibles del apartamento y 
+    de la situacion de aparcamiento en el barrio ya que fuimos 
+    desde Espana con el coche ( es todo zona azul de 9:00 a 18:00 pero 
+    como saliamos pronto y llegabamos tarde no nos afectaba).\n  
+    El apartamento es muy completo, la verdad es como aparece 
+    en el anuncio, es mas, incluso tiene una barandilla 
+    en la escaleras que dan a la habitacion que en la foto no sale.\n   
+    El jardin esta muy bien para desayunar o cenar ya que 
+    tiene una mesa grande para ello.\n   
+    El barrio es muy tranquilo con bastantes tiendas y restaurantes.\n
+       En general estuvimos muy comodos durante nuestra estancia, 
+       repetiriamos ahora mismo.\n   Muchas gracias por todo Maxime.  "
+  ## Get the Spanish model and annotate the text with it
+  ud_model <- udpipe_download_model(language = "spanish")
+  ud_model <- udpipe_load_model(ud_model$file_model)
+  x <- udpipe_annotate(ud_model, x = txt)
+  ## from_to = TRUE asks for the from/to columns next to the regular annotation fields
+  x <- as.data.frame(x, from_to = TRUE)
+  ## Rebuild the text from the annotation; presumably original$text should equal txt - TODO confirm
+  original <- udpipe_reconstruct(sentence_id = x$sentence_id, token = x$token, token_id = x$token_id, misc = x$misc)
+}
+
+
+udpipe_reconstruct <- function(sentence_id, token, token_id, misc, only_from_to = FALSE){
+  ##
+  ## Rebuild the original text from the tokens and the whitespace markup stored in MISC
+  ## and compute for each token its 1-based character span (from/to) in that rebuilt text.
+  ##
+  ## Arguments (parallel vectors, one element per annotation row):
+  ##   sentence_id:  identifier of the sentence the row belongs to
+  ##   token:        FORM of the token
+  ##   token_id:     identifier of the token within the sentence; a multi-word token has a range (e.g. 3-4)
+  ##   misc:         MISC field of the token, NA if there is none
+  ##   only_from_to: logical, if TRUE the reconstructed text is left out of the result
+  ## Value:
+  ##   list(from, to) if only_from_to is TRUE, otherwise list(text, from, to).
+  ##   from/to are NA for the words which are covered by a multi-word token.
+  ##
+
+  ##
+  ## FROM THE UDPIPE DOCS: 
+  ##
+  
+  # The markup uses the following MISC fields on tokens (not words in multi-word tokens):
+  # SpacesBefore=content (by default empty): spaces/other content preceding the token
+  # SpacesAfter=content (by default a space if SpaceAfter=No feature is not present, empty otherwise): spaces/other content following the token
+  # SpacesInToken=content (by default equal to the FORM of the token): FORM of the token including original spaces (this is needed only if tokens are allowed to contain spaces and a token contains a tab or newline characters)
+  
+  # The content of all the three fields must be escaped to allow storing tabs and newlines. The following C-like schema is used:
+  # \s: space
+  # \t: tab
+  # \r: CR character
+  # \n: LF character
+  # \p: | (pipe character)
+  # \\: \ (backslash character)
+
+  ## Value of one key of the |-separated key=value list in MISC, NA where the key is absent.
+  ## A value can not contain a literal | (it is escaped as \p), so it runs up to the next | or the end.
+  misc_value <- function(x, key){
+    pattern <- sprintf("^(.*[|])?%s=([^|]*).*$", key)
+    value <- rep(NA_character_, length(x))
+    found <- grepl(pattern, x)
+    value[found] <- sub(pattern, "\\2", x[found])
+    value
+  }
+  ## Decode the escape schema in one left-to-right pass, so that an escaped backslash
+  ## followed by a letter (\\s) is decoded as backslash + s and not as backslash + space.
+  unescape <- function(x){
+    if(length(x) == 0L){
+      return(character())
+    }
+    lookup <- c("\\s" = " ", "\\t" = "\t", "\\r" = "\r", "\\n" = "\n", "\\p" = "|", "\\\\" = "\\")
+    hits <- gregexpr("\\\\[strnp\\\\]", x, perl = TRUE)
+    regmatches(x, hits) <- lapply(regmatches(x, hits), FUN = function(esc) unname(lookup[esc]))
+    x
+  }
+
+  token <- as.character(token)
+  token_id <- as.character(token_id)
+  misc <- as.character(misc)
+  n <- length(token)
+  
+  spaces_after <- misc_value(misc, "SpacesAfter")
+  spaces_before <- misc_value(misc, "SpacesBefore")
+  spaces_in_token <- misc_value(misc, "SpacesInToken")
+  
+  ##
+  ## Spaces after
+  ##
+  ## by default (also if MISC is missing) there is 1 space after the token
+  after <- rep(" ", n)
+  ## if contains SpaceAfter=No, there is nothing to add
+  after[grepl(pattern = "SpaceAfter=No", misc, fixed = TRUE)] <- ""
+  ## if contains SpacesAfter=, that content is what follows the token
+  idx <- which(!is.na(spaces_after))
+  after[idx] <- unescape(spaces_after[idx])
+  ## Fix for using std::istringstream in udpipe_annotate as it always ends with a newline character
+  if(n > 0L){
+    after[n] <- sub("\n$", "", after[n])
+  }
+  
+  ##
+  ## Spaces before
+  ##
+  before <- rep("", n)
+  ## if contains SpacesBefore=, add the spaces to the before part
+  idx <- which(!is.na(spaces_before))
+  before[idx] <- unescape(spaces_before[idx])
+  
+  ##
+  ## SpacesInToken - MISC field stores form of the token including original spaces if there is a space in the token which can not be handled by FORM
+  ##
+  idx <- which(!is.na(spaces_in_token))
+  token[idx] <- unescape(spaces_in_token[idx])
+  
+  ##
+  ## Construct original text
+  ##
+  original_txt <- sprintf("%s%s%s", before, token, after)
+  
+  ##
+  ## Multi-word tokens: the text is on the row with the range (e.g. 3-5),
+  ## all the words inside that range (3, 4 and 5) are not part of the original text
+  ##
+  is_multi_word <- grepl("-", token_id, fixed = TRUE)
+  ids <- sprintf("%s.%s", sentence_id, token_id)
+  ids_remove <- unlist(Map(sentence_id[is_multi_word], token_id[is_multi_word], 
+                           f = function(sid, tid){
+                             members <- unlist(strsplit(tid, split = "-", fixed = TRUE))
+                             bounds <- suppressWarnings(as.integer(members))
+                             if(length(bounds) == 2L && !anyNA(bounds)){
+                               ## expand the range, a multi-word token can cover more than 2 words
+                               members <- seq(bounds[1], bounds[2])
+                             }
+                             sprintf("%s.%s", sid, members)
+                           }), use.names = FALSE)
+  idx <- which(ids %in% ids_remove)
+  original_txt[idx] <- ""
+  
+  ##
+  ## Construct from-to
+  ##
+  before[idx] <- ""
+  after[idx] <- ""
+  
+  ## original_from/original_to delimit before + token + after, from/to delimit the token only
+  nchars <- nchar(original_txt)
+  original_to <- cumsum(nchars)
+  original_from <- original_to - nchars + 1L
+  from <- original_from + nchar(before)
+  to <- original_to - nchar(after)
+  from[idx] <- NA_integer_
+  to[idx] <- NA_integer_
+
+  
+  if(only_from_to){
+    return(list(from = from, to = to))  
+  }else{
+    return(list(text = paste(original_txt, collapse = ""),
+                from = from,
+                to = to))  
+  }
+}