Managing model specializations/variants (flavors) #1151

Draft · wants to merge 18 commits into base: master
6 changes: 3 additions & 3 deletions .github/workflows/ci-build-unstable.yml
@@ -13,11 +13,11 @@ jobs:

       steps:
       - uses: actions/checkout@v4
-      - name: Set up JDK 17
+      - name: Set up JDK 11
         uses: actions/setup-java@v4
         with:
-          java-version: '17.0.10+7'
-          distribution: 'temurin'
+          java-version: '11'
+          distribution: 'adopt'
           cache: 'gradle'
       - name: Build with Gradle
         run: ./gradlew clean assemble --info --stacktrace --no-daemon
23 changes: 23 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,29 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [0.8.1] - 2024-06-10

### Added
- Identified URLs are now added to the TEI output #1099
- Added DL models for patent processing #1082
- Copyright and licence identification models #1078
- Added research infrastructure recognition for funding processing #1085

### Changed
- Improved the recognition of URLs using PDF annotations (such as clickable links) when available
- Updated TEI schema #1084
- Reviewed the patent processing #1082
- Added Kotlin language to support development and testing #1096

### Fixed
- Sentence segmentation no longer splits sentences with a URL in the middle #1097
- Sentence segmentation is now applied to the funding and acknowledgement sections #1106
- Optimized the Docker image to reduce the required disk space #1088
- Fixed OOBE when processing large quantities of notes #1075
- Corrected `<title>` coordinate attribute name #1070
- Fixed missing coordinates in paragraph continuation #1076
- Fixed JSON log output

## [0.8.0] - 2023-11-19

### Added
3 changes: 1 addition & 2 deletions Readme.md
@@ -105,11 +105,10 @@ Detailed end-to-end [benchmarking](https://grobid.readthedocs.io/en/latest/Bench
A series of additional modules have been developed for performing __structure aware__ text mining directly on scholarly PDFs, reusing GROBID's PDF processing and sequence labelling weaponry:

 - [software-mention](https://github.com/ourresearch/software-mentions): recognition of software mentions and associated attributes in scientific literature
-- [datastet](https://github.com/kermitt2/datastet): identification of named and implicit research datasets and associated attributes in scientific articles
+- [datastet](https://github.com/kermitt2/datastet): identification of sections and sentences introducing datasets in a scientific article, identification of dataset names and attributes (implicit and named datasets), and classification of the type of datasets
 - [grobid-quantities](https://github.com/kermitt2/grobid-quantities): recognition and normalization of physical quantities/measurements
 - [grobid-superconductors](https://github.com/lfoppiano/grobid-superconductors): recognition of superconductor material and properties in scientific literature
 - [entity-fishing](https://github.com/kermitt2/entity-fishing), a tool for extracting Wikidata entities from text and documents, which can also use Grobid to pre-process scientific articles in PDF, leading to more precise and relevant entity extraction and the capacity to annotate the PDF with an interactive layout
-- [datastet](https://github.com/kermitt2/datastet): identification of sections and sentences introducing datasets in a scientific article, identification of dataset names (implict and named datasets) and classification of the type of these datasets
- [grobid-ner](https://github.com/kermitt2/grobid-ner): named entity recognition
- [grobid-astro](https://github.com/kermitt2/grobid-astro): recognition of astronomical entities in scientific papers
- [grobid-bio](https://github.com/kermitt2/grobid-bio): a toy bio-entity tagger using BioNLP/NLPBA 2004 dataset
29 changes: 25 additions & 4 deletions build.gradle
@@ -64,12 +64,12 @@ subprojects {
// targetCompatibility = 1.11

kotlin {
-        jvmToolchain(17)
+        jvmToolchain(11)
}

java {
toolchain {
-            languageVersion.set(JavaLanguageVersion.of(17))
+            languageVersion.set(JavaLanguageVersion.of(11))
}
}

@@ -490,20 +490,29 @@ project(":grobid-trainer") {
"train_name_header" : "org.grobid.trainer.NameHeaderTrainer",
"train_name_citation" : "org.grobid.trainer.NameCitationTrainer",
"train_affiliation_address" : "org.grobid.trainer.AffiliationAddressTrainer",
"train_header" : "org.grobid.trainer.HeaderTrainer",
// "train_header" : "org.grobid.trainer.HeaderTrainer",
"train_fulltext" : "org.grobid.trainer.FulltextTrainer",
"train_shorttext" : "org.grobid.trainer.ShorttextTrainer",
"train_figure" : "org.grobid.trainer.FigureTrainer",
"train_table" : "org.grobid.trainer.TableTrainer",
"train_citation" : "org.grobid.trainer.CitationTrainer",
"train_date" : "org.grobid.trainer.DateTrainer",
"train_segmentation" : "org.grobid.trainer.SegmentationTrainer",
// "train_segmentation" : "org.grobid.trainer.SegmentationTrainer",
"train_reference_segmentation": "org.grobid.trainer.ReferenceSegmenterTrainer",
"train_ebook_model" : "org.grobid.trainer.EbookTrainer",
"train_patent_citation" : "org.grobid.trainer.PatentParserTrainer",
"train_funding_acknowledgement" : "org.grobid.trainer.FundingAcknowledgementTrainer"
]

def complexTrainerTasks = [
"train_header" : ["org.grobid.trainer.HeaderTrainer", ""],
"train_header_light" : ["org.grobid.trainer.HeaderTrainer", "light"],
"train_header_ietf" : ["org.grobid.trainer.HeaderTrainer", "ietf"],
"train_segmentation" : ["org.grobid.trainer.SegmentationTrainer", ""],
"train_segmentation_light" : ["org.grobid.trainer.SegmentationTrainer", "light"],
"train_segmentation_ietf" : ["org.grobid.trainer.SegmentationTrainer", "ietf"]
]

def libraries = ""
if (Os.isFamily(Os.FAMILY_MAC)) {
if (Os.OS_ARCH.equals("aarch64")) {
@@ -528,6 +537,18 @@
}
}

    complexTrainerTasks.each { taskName, mainClassNameAndArgs ->
        tasks.create(name: taskName, type: JavaExec, group: 'modeltraining') {
            main = mainClassNameAndArgs[0]
            classpath = sourceSets.main.runtimeClasspath
            // on post-Java 8 JVMs, raise the heap limit and open java.lang for reflection
            if (JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0)
                jvmArgs '-Xmx3072m', "--add-opens", "java.base/java.lang=ALL-UNNAMED"
            // the second element is the flavor argument passed to the trainer ("" for the generic model)
            args mainClassNameAndArgs[1]
        }
    }
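
With these tasks registered, a flavor-specific model can be trained by invoking the corresponding Gradle task. As a usage sketch (assuming a standard GROBID checkout with the relevant training data in place), the task names follow the `complexTrainerTasks` map above:

```console
# train the generic header model (empty flavor argument)
./gradlew train_header

# train the "light" and "ietf" flavors of the header and segmentation models
./gradlew train_header_light
./gradlew train_segmentation_ietf
```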

// evaluation tasks
ext.getArg = { propName, defaultVal ->
return project.hasProperty(propName) ? project.getProperty(propName) : defaultVal;
5 changes: 3 additions & 2 deletions doc/Configuration.md
@@ -85,7 +85,7 @@ CORS for the GROBID web API service can be configured by the following yaml pa

GROBID uses external implementations for recognizing the language used in a publication and for performing sentence segmentation.

-There is currently only one possible language recognition implementation possible (Cybozu Language Detector) and two possible sentence segmenters (OpenNLP, default and the Pragmatic Segmenter).
+There is currently only one language recognition implementation available (the Cybozu Language Detector) and two possible sentence segmenters (OpenNLP, the default, and the Pragmatic Segmenter).

```yml
# the actual implementation for language recognition to be used
@@ -95,6 +95,7 @@ There is currently only one possible language recognition implementation possibl
#sentenceDetectorFactory: "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory"
sentenceDetectorFactory: "org.grobid.core.lang.impl.OpenNLPSentenceDetectorFactory"
```
**NOTE**: While OpenNLP is about 60 times faster than the Pragmatic Segmenter, it performs slightly worse. The Pragmatic Segmenter runs with the JRuby interpreter.

### Service configuration

@@ -121,7 +122,7 @@ When executing the service, models can be loaded in a lazy manner (if you plan t
modelPreload: true
```

-Finally the following part specifies the port to be used by the GROBID web service:
+Finally, the following part specifies the port to be used by the GROBID web service:

```yml
server:
```
6 changes: 3 additions & 3 deletions doc/Deep-Learning-models.md
@@ -20,7 +20,7 @@ Current neural models can be up to 50 times slower than CRF, depending on the ar

By default, only CRF models are used by Grobid. You need to select the Deep Learning models you would like to use in the GROBID configuration yaml file (`grobid/grobid-home/config/grobid.yaml`). See [here](https://grobid.readthedocs.io/en/latest/Configuration/#configuring-the-models) for more details on how to select these models. The most convenient way to use the Deep Learning models is to use the full GROBID Docker image and pass a configuration file at launch of the container describing the selected models to be used instead of the default CRF ones. Note that the full GROBID Docker image is already configured to use Deep Learning models for bibliographical reference and affiliation-address parsing.

-For current GROBID version 0.8.0, we recommend considering the usage of the following Deep Learning models:
+For current GROBID version 0.8.1, we recommend considering the usage of the following Deep Learning models:

- `citation` model: for bibliographical parsing, the `BidLSTM_CRF_FEATURES` architecture currently provides the best accuracy, significantly better than CRF (+3 to +5 points in F1-score). With a GPU, there is normally no runtime impact when selecting this model. The fine-tuned SciBERT model currently performs at lower accuracy.
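
For illustration, switching the `citation` model to its Deep Learning implementation in `grobid-home/config/grobid.yaml` would look roughly as follows (a sketch only; see the configuration documentation for the exact set of keys):

```yaml
grobid:
  models:
    - name: "citation"
      # switch the engine from the default "wapiti" (CRF) to "delft"
      engine: "delft"
      delft:
        architecture: "BidLSTM_CRF_FEATURES"
```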

@@ -57,7 +57,7 @@ DeLFT version `0.3.2` has been tested successfully with Python 3.7 and 3.8. For

```shell
 cd delft/
-python3 grobidTagger.py delft/applications/citation tag --architecture BidLSTM_CRF
+python -m delft.applications.grobidTagger citation tag --architecture BidLSTM_CRF
```

If it works (you see some annotations in JSON format), you can be sure you have a working DeLFT environment for **all** GROBID models. The next steps address the native bridge between DeLFT and the JVM running GROBID.
@@ -98,7 +98,7 @@ If you are using a Python environment for the DeLFT installation, you can set th

```yaml
delft:
  python_virtualEnv: /where/my/damned/python/virtualenv/is/
```

Normally, by setting the Python environment path in the config file (e.g. `pythonVirtualEnv: "../delft/env"`), you will not need to launch GROBID in the same activated environment.
2 changes: 1 addition & 1 deletion doc/Frequently-asked-questions.md
@@ -56,7 +56,7 @@ In addition, consider more RAM memory when running Deep Learning model on CPU, e
You will get the embedded images converted into `.png` by using the normal batch command. For instance:

```console
-java -Xmx4G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.0-onejar.jar -gH grobid-home -dIn ~/test/in/ -dOut ~/test/out -exe processFullText
+java -Xmx4G -Djava.library.path=grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep -jar grobid-core/build/libs/grobid-core-0.8.1-onejar.jar -gH grobid-home -dIn ~/test/in/ -dOut ~/test/out -exe processFullText
```

There is a web service doing the same, `processFulltextAssetDocument`, which returns everything in a single zip file; it is still usable but deprecated.
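
As an illustration, a call to this service could look like the following (a sketch, assuming a local GROBID server on the default port 8070 and an input file `article.pdf`):

```console
curl -v --form input=@./article.pdf localhost:8070/api/processFulltextAssetDocument -o assets.zip
```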