updating grobidAnalisers to consider break line in tokenizeToLayoutToken, when a \n is encountered #180
kermitt2 · Apr 14, 2018 · 27ffd3b · 27ffd3b
1 parent 34fba0a
commit 27ffd3b
Show file tree

Hide file tree

Showing 4 changed files with 164 additions and 49 deletions.
diff --git a/grobid-core/src/main/java/org/grobid/core/analyzers/GrobidAnalyzer.java b/grobid-core/src/main/java/org/grobid/core/analyzers/GrobidAnalyzer.java
@@ -3,6 +3,7 @@
 import org.grobid.core.lang.Language;
 import org.grobid.core.layout.LayoutToken;
 
+import org.grobid.core.utilities.UnicodeUtil;
 import org.wipo.nlp.textboundaries.ReTokenizer;
 import org.wipo.nlp.textboundaries.ReTokenizerFactory;
 
@@ -158,15 +159,21 @@ public List<LayoutToken> tokenizeWithLayoutToken(String text) {
 
 	public List<LayoutToken> tokenizeWithLayoutToken(String text, Language lang) {
         List<LayoutToken> result = new ArrayList<>();
+        text = UnicodeUtil.normaliseText(text);
         List<String> tokens = tokenize(text, lang);
         int pos = 0;
-        for(String tok : tokens) {
-        	LayoutToken layoutToken = new LayoutToken();
+        for (int i = 0; i < tokens.size(); i++) {
+            String tok = tokens.get(i);
+            LayoutToken layoutToken = new LayoutToken();
             layoutToken.setText(tok);
             layoutToken.setOffset(pos);
             result.add(layoutToken);
             pos += tok.length();
+            if (i < tokens.size() - 1 && tokens.get(i + 1).equals("\n")) {
+                layoutToken.setNewLineAfter(true);
+            }
         }
+
         return result;
     }
 }
diff --git a/grobid-core/src/main/java/org/grobid/core/analyzers/GrobidDefaultAnalyzer.java b/grobid-core/src/main/java/org/grobid/core/analyzers/GrobidDefaultAnalyzer.java
@@ -5,9 +5,9 @@
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -25,22 +25,22 @@
 import java.util.ArrayList;
 import java.util.List;
 import java.util.StringTokenizer;
-	
+
 /**
  * Default tokenizer adequate for all Indo-European languages.
  *
  * @author Patrice Lopez
  */
 public class GrobidDefaultAnalyzer implements Analyzer {
 
-	private static volatile GrobidDefaultAnalyzer instance;
+    private static volatile GrobidDefaultAnalyzer instance;
 
-	public static GrobidDefaultAnalyzer getInstance() {
+    public static GrobidDefaultAnalyzer getInstance() {
         if (instance == null) {
             //double check idiom
             // synchronized (instanceController) {
-                if (instance == null)
-					getNewInstance();
+            if (instance == null)
+                getNewInstance();
             // }
         }
         return instance;
@@ -49,63 +49,75 @@ public static GrobidDefaultAnalyzer getInstance() {
     /**
      * Creates a new instance.
      */
-	private static synchronized void getNewInstance() {
-		instance = new GrobidDefaultAnalyzer();
-	}
+    private static synchronized void getNewInstance() {
+        instance = new GrobidDefaultAnalyzer();
+    }
 
     /**
      * Hidden constructor
      */
     private GrobidDefaultAnalyzer() {
-	}
+    }
 
     public static final String delimiters = TextUtilities.delimiters;
     //" \n\r\t([,:;?.!/)-–−\"“”‘’'`$]*\u2666\u2665\u2663\u2660\u00A0";
 
     public String getName() {
-		return "DefaultGrobidAnalyzer";
-	} 
-
-	public List<String> tokenize(String text) {
-		// as a default analyzer, language is not considered
-		return tokenize(text, null);
-	}
-
-	public List<String> tokenize(String text, Language lang) {
-		List<String> result = new ArrayList<String>();
-		text = UnicodeUtil.normaliseText(text);
-		StringTokenizer st = new StringTokenizer(text, delimiters, true);
-		while(st.hasMoreTokens()) {
-			result.add(st.nextToken());
-		}
-		return result;
-	}
-
-	public List<String> retokenize(List<String> chunks) {
-		StringTokenizer st = null;
-		List<String> result = new ArrayList<String>();
-		for(String chunk : chunks) {
-			chunk = UnicodeUtil.normaliseText(chunk);
-			st = new StringTokenizer(chunk, delimiters, true);
-			while(st.hasMoreTokens()) {
-				result.add(st.nextToken());
-			}
-		}
-		return result;
-	}
-
-	public List<LayoutToken> tokenizeWithLayoutToken(String text) {
+        return "DefaultGrobidAnalyzer";
+    }
+
+    public List<String> tokenize(String text) {
+        // as a default analyzer, language is not considered
+        return tokenize(text, null);
+    }
+
+    public List<String> tokenize(String text, Language lang) {
+        List<String> result = new ArrayList<>();
+        text = UnicodeUtil.normaliseText(text);
+        StringTokenizer st = new StringTokenizer(text, delimiters, true);
+        while (st.hasMoreTokens()) {
+            result.add(st.nextToken());
+        }
+        return result;
+    }
+
+    public List<String> retokenize(List<String> chunks) {
+        StringTokenizer st = null;
+        List<String> result = new ArrayList<>();
+        for (String chunk : chunks) {
+            chunk = UnicodeUtil.normaliseText(chunk);
+            st = new StringTokenizer(chunk, delimiters, true);
+            while (st.hasMoreTokens()) {
+                result.add(st.nextToken());
+            }
+        }
+        return result;
+    }
+
+    public List<LayoutToken> tokenizeWithLayoutToken(String text) {
+        return tokenizeWithLayoutToken(text, null);
+    }
+
+    /**
+     * Tokenize text returning list of LayoutTokens.
+     */
+    public List<LayoutToken> tokenizeWithLayoutToken(String text, Language language) {
         List<LayoutToken> result = new ArrayList<>();
         text = UnicodeUtil.normaliseText(text);
-        List<String> tokens = tokenize(text);
+        List<String> tokens = tokenize(text, language);
         int pos = 0;
-        for(String tok : tokens) {
-        	LayoutToken layoutToken = new LayoutToken();
+        for (int i = 0; i < tokens.size(); i++) {
+            String tok = tokens.get(i);
+            LayoutToken layoutToken = new LayoutToken();
             layoutToken.setText(tok);
             layoutToken.setOffset(pos);
             result.add(layoutToken);
             pos += tok.length();
+            if (i < tokens.size() - 1 && tokens.get(i + 1).equals("\n")) {
+                layoutToken.setNewLineAfter(true);
+            }
         }
+
         return result;
     }
 }
diff --git a/grobid-core/src/test/java/org/grobid/core/analyzers/GrobidAnalyzerTest.java b/grobid-core/src/test/java/org/grobid/core/analyzers/GrobidAnalyzerTest.java
@@ -0,0 +1,48 @@
+package org.grobid.core.analyzers;
+
+import org.grobid.core.layout.LayoutToken;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.util.List;
+
+import static org.hamcrest.Matchers.hasSize;
+import static org.hamcrest.Matchers.is;
+import static org.junit.Assert.*;
+
+public class GrobidAnalyzerTest {
+    GrobidAnalyzer target;
+
+    @Before
+    public void setUp() throws Exception {
+        target = GrobidAnalyzer.getInstance();
+    }
+
+    @Test
+    public void testTokenizeWithLayoutToken() {
+        final List<LayoutToken> layoutTokens = target.tokenizeWithLayoutToken("This is a normal \ntext,\n\n\n on several lines.\n");
+
+        assertThat(layoutTokens, hasSize(22));
+        assertThat(layoutTokens.get(0).getText(), is("This"));
+        assertThat(layoutTokens.get(1).getText(), is(" "));
+        assertThat(layoutTokens.get(6).getText(), is("normal"));
+        assertThat(layoutTokens.get(7).getText(), is(" "));
+        assertThat(layoutTokens.get(7).isNewLineAfter(), is(true));
+        assertThat(layoutTokens.get(8).getText(), is("\n"));
+        assertThat(layoutTokens.get(8).isNewLineAfter(), is(false));
+        assertThat(layoutTokens.get(10).getText(), is(","));
+        assertThat(layoutTokens.get(10).isNewLineAfter(), is(true));
+        assertThat(layoutTokens.get(11).getText(), is("\n"));
+        assertThat(layoutTokens.get(11).isNewLineAfter(), is(true));
+        assertThat(layoutTokens.get(12).getText(), is("\n"));
+        assertThat(layoutTokens.get(12).isNewLineAfter(), is(true));
+        assertThat(layoutTokens.get(13).getText(), is("\n"));
+        assertThat(layoutTokens.get(13).isNewLineAfter(), is(false));
+    }
+
+    @Test
+    public void testTokenizeWithLayoutToken_emptyText() {
+        assertThat(target.tokenizeWithLayoutToken(""), hasSize(0));
+    }
+
+}
diff --git a/grobid-core/src/test/java/org/grobid/core/analyzers/GrobidDefaultAnalyzerTest.java b/grobid-core/src/test/java/org/grobid/core/analyzers/GrobidDefaultAnalyzerTest.java
@@ -0,0 +1,48 @@
+package org.grobid.core.analyzers;
+
+import org.grobid.core.layout.LayoutToken;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.util.List;
+
+import static org.hamcrest.Matchers.hasSize;
+import static org.hamcrest.Matchers.is;
+import static org.junit.Assert.*;
+
+public class GrobidDefaultAnalyzerTest {
+
+    GrobidDefaultAnalyzer target;
+
+    @Before
+    public void setUp() throws Exception {
+        target = GrobidDefaultAnalyzer.getInstance();
+    }
+
+    @Test
+    public void testTokenizeWithLayoutToken() {
+        final List<LayoutToken> layoutTokens = target.tokenizeWithLayoutToken("This is a normal \ntext,\n\n\n on several lines.\n");
+
+        assertThat(layoutTokens, hasSize(22));
+        assertThat(layoutTokens.get(0).getText(), is("This"));
+        assertThat(layoutTokens.get(1).getText(), is(" "));
+        assertThat(layoutTokens.get(6).getText(), is("normal"));
+        assertThat(layoutTokens.get(7).getText(), is(" "));
+        assertThat(layoutTokens.get(7).isNewLineAfter(), is(true));
+        assertThat(layoutTokens.get(8).getText(), is("\n"));
+        assertThat(layoutTokens.get(8).isNewLineAfter(), is(false));
+        assertThat(layoutTokens.get(10).getText(), is(","));
+        assertThat(layoutTokens.get(10).isNewLineAfter(), is(true));
+        assertThat(layoutTokens.get(11).getText(), is("\n"));
+        assertThat(layoutTokens.get(11).isNewLineAfter(), is(true));
+        assertThat(layoutTokens.get(12).getText(), is("\n"));
+        assertThat(layoutTokens.get(12).isNewLineAfter(), is(true));
+        assertThat(layoutTokens.get(13).getText(), is("\n"));
+        assertThat(layoutTokens.get(13).isNewLineAfter(), is(false));
+    }
+
+    @Test
+    public void testTokenizeWithLayoutToken_emptyText() {
+        assertThat(target.tokenizeWithLayoutToken(""), hasSize(0));
+    }
+}