From 6570c8ea079235de175bfad6f646a095ba2c29ce Mon Sep 17 00:00:00 2001 From: johnfouf Date: Mon, 13 Sep 2021 14:37:00 +0300 Subject: [PATCH 1/4] add chest_era project mining --- .../main_sqlite/oozie_app/lib/scripts/projects.sql | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql b/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql index f060a304e..732bd61f6 100644 --- a/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql +++ b/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql @@ -70,6 +70,14 @@ regexprmatches("support|project|grant|fund|thanks|agreement|research|acknowledge --DFG union all +-- CHIST-ERA +select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'textsnippet', (prev||" <<< "||middle||" >>> "||next)) as C1, docid, id, fundingclass1, grantid from +(setschema 'docid,prev,middle,next' select c1, textwindow2s(comprspaces(lower(keywords(c2))),10,2,10,"\bchist era\b") from pubs where c2 is not null), grants +where (regexprmatches(lower(acronym), prev||" "||middle||" "||next) or grantid = "unidentified") and fundingclass1 = "CHIST-ERA" group by docid, id + +union all + + -- Canadian funders select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'textsnippet', textsnippet) as C1, docid, id, fundingclass1, grantid from ( @@ -255,6 +263,7 @@ select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', sqroot(min ) ) where confidence > 0.16) group by docid,id); +delete from output_table where fundingClass1="CHIST-ERA" and grantid="unidentified" and docid in (select docid from output_table where grantid!="unidentified"); delete from matched_undefined_miur_only where docid in (select docid from output_table where fundingClass1="MIUR"); delete from matched_undefined_wt_only where docid in (select docid from output_table where fundingClass1="WT"); delete from matched_undefined_gsri where docid in (select docid from output_table where fundingClass1="GSRI"); From 88d7ab7d2c4a2a9ab5878efcce42e69fc412bbd6 Mon Sep 17 00:00:00 2001 From: johnfouf Date: Tue, 14 Sep 2021 14:35:31 +0300 Subject: [PATCH 2/4] Update projects.sql chist-era, edit pattern for acronyms add boundaries --- .../project/main_sqlite/oozie_app/lib/scripts/projects.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql b/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql index 732bd61f6..e4679c91c 100644 --- a/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql +++ b/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql @@ -73,7 +73,7 @@ union all -- CHIST-ERA select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'textsnippet', (prev||" <<< "||middle||" >>> "||next)) as C1, docid, id, fundingclass1, grantid from (setschema 'docid,prev,middle,next' select c1, textwindow2s(comprspaces(lower(keywords(c2))),10,2,10,"\bchist era\b") from pubs where c2 is not null), grants -where (regexprmatches(lower(acronym), prev||" "||middle||" "||next) or grantid = "unidentified") and fundingclass1 = "CHIST-ERA" group by docid, id +where (regexprmatches("\b"||lower(acronym)||"\b", prev||" "||middle||" "||next) or grantid = "unidentified") and fundingclass1 = "CHIST-ERA" group by docid, id union all From c029265afdf6af9db60f019aecc8a7a376f8569f Mon Sep 17 00:00:00 2001 From: johnfouf Date: Wed, 15 Sep 2021 14:23:21 +0300 Subject: [PATCH 3/4] tokenize acronyms + chistera without whitespaces --- .../project/main_sqlite/oozie_app/lib/scripts/projects.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql b/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql index e4679c91c..d7a9f18e1 100644 --- a/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql +++ b/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql @@ -72,8 +72,8 @@ regexprmatches("support|project|grant|fund|thanks|agreement|research|acknowledge union all -- CHIST-ERA select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'textsnippet', (prev||" <<< "||middle||" >>> "||next)) as C1, docid, id, fundingclass1, grantid from -(setschema 'docid,prev,middle,next' select c1, textwindow2s(comprspaces(lower(keywords(c2))),10,2,10,"\bchist era\b") from pubs where c2 is not null), grants -where (regexprmatches("\b"||lower(acronym)||"\b", prev||" "||middle||" "||next) or grantid = "unidentified") and fundingclass1 = "CHIST-ERA" group by docid, id +(setschema 'docid,prev,middle,next' select c1, textwindow2s(comprspaces(lower(keywords(c2))),10,2,10,"\bchist era\b|\bchistera\b") from pubs where c2 is not null), grants +where (regexprmatches("\b"||keywords(lower(acronym))||"\b", prev||" "||middle||" "||next) or grantid = "unidentified") and fundingclass1 = "CHIST-ERA" group by docid, id union all From c555b92f9449eb9a36e1c12ea4a47670c7371809 Mon Sep 17 00:00:00 2001 From: johnfouf Date: Wed, 15 Sep 2021 14:54:34 +0300 Subject: [PATCH 4/4] drop chistera without whitespace --- .../project/main_sqlite/oozie_app/lib/scripts/projects.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql b/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql index d7a9f18e1..c67ada9fc 100644 --- a/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql +++ b/iis-wf/iis-wf-referenceextraction/src/main/resources/eu/dnetlib/iis/wf/referenceextraction/project/main_sqlite/oozie_app/lib/scripts/projects.sql @@ -72,7 +72,7 @@ regexprmatches("support|project|grant|fund|thanks|agreement|research|acknowledge union all -- CHIST-ERA select jdict('documentId', docid, 'projectId', id, 'confidenceLevel', 0.8, 'textsnippet', (prev||" <<< "||middle||" >>> "||next)) as C1, docid, id, fundingclass1, grantid from -(setschema 'docid,prev,middle,next' select c1, textwindow2s(comprspaces(lower(keywords(c2))),10,2,10,"\bchist era\b|\bchistera\b") from pubs where c2 is not null), grants +(setschema 'docid,prev,middle,next' select c1, textwindow2s(comprspaces(lower(keywords(c2))),10,2,10,"\bchist era\b") from pubs where c2 is not null), grants where (regexprmatches("\b"||keywords(lower(acronym))||"\b", prev||" "||middle||" "||next) or grantid = "unidentified") and fundingclass1 = "CHIST-ERA" group by docid, id union all