diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1cde967
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+# Project source #
+###################
+*.project
+*.classpath
+
+# Package Files #
+#################
+*.jar
diff --git a/arff/triage0_50_ngrams_size1_stopwords.arff b/arff/triage0_50_ngrams_size1_stopwords.arff
new file mode 100644
index 0000000..c7525b4
--- /dev/null
+++ b/arff/triage0_50_ngrams_size1_stopwords.arff
@@ -0,0 +1,38 @@
+% Weka training file - HIV triage - 2015
+
+@RELATION triage
+@ATTRIBUTE docID REAL %PMID of paper
+@ATTRIBUTE Ngram0trees REAL %trees
+@ATTRIBUTE Ngram1model REAL %model
+@ATTRIBUTE Ngram2triage REAL %triage
+@ATTRIBUTE Ngram3genes REAL %genes
+@ATTRIBUTE Ngram4sampling REAL %sampling
+@ATTRIBUTE Ngram5classification REAL %classification
+@ATTRIBUTE Ngram6processing REAL %processing
+@ATTRIBUTE Ngram7fungal REAL %fungal
+@ATTRIBUTE Ngram8enzymes REAL %enzymes
+@ATTRIBUTE Ngram9manual REAL %manual
+@ATTRIBUTE Ngram10literature REAL %literature
+@ATTRIBUTE Ngram11annotation REAL %annotation
+@ATTRIBUTE Ngram12mycoclapfungalgenomicsca REAL %mycoclapfungalgenomicsca
+@ATTRIBUTE Ngram13machine REAL %machine
+@ATTRIBUTE Ngram14first REAL %first
+@ATTRIBUTE Ngram15features REAL %features
+@ATTRIBUTE Ngram16mycoclap REAL %mycoclap
+@ATTRIBUTE Ngram17results REAL %results
+@ATTRIBUTE Ngram18abstracttext REAL %abstracttext
+@ATTRIBUTE Ngram19task REAL %task
+@ATTRIBUTE Ngram20http REAL %http
+@ATTRIBUTE Ngram21support REAL %support
+@ATTRIBUTE Ngram22learning REAL %learning
+@ATTRIBUTE Ngram23database REAL %database
+@ATTRIBUTE Ngram24curation REAL %curation
+@ATTRIBUTE Ngram25logistic REAL %logistic
+@ATTRIBUTE Ngram26applications REAL %applications
+@ATTRIBUTE Ngram27articletitle REAL %articletitle
+@ATTRIBUTE class {positive, negative}
+@DATA
+
+25754864,0,0,0,2,0,0,2,5,3,1,1,2,2,0,1,0,6,1,0,0,2,2,0,4,2,0,2,0,negative
+25551575,2,4,3,0,2,2,0,0,0,1,2,0,0,4,1,2,0,1,0,2,0,2,3,0,1,2,0,0,negative
+
diff --git a/arff/triage1_50_ngrams_size1_stopwords.arff b/arff/triage1_50_ngrams_size1_stopwords.arff
new file mode 100644
index 0000000..ddfc07d
--- /dev/null
+++ b/arff/triage1_50_ngrams_size1_stopwords.arff
@@ -0,0 +1,38 @@
+% Weka test file - HIV triage - 2015
+
+@RELATION triage
+@ATTRIBUTE docID REAL %PMID of paper
+@ATTRIBUTE Ngram0trees REAL %trees
+@ATTRIBUTE Ngram1model REAL %model
+@ATTRIBUTE Ngram2triage REAL %triage
+@ATTRIBUTE Ngram3genes REAL %genes
+@ATTRIBUTE Ngram4sampling REAL %sampling
+@ATTRIBUTE Ngram5classification REAL %classification
+@ATTRIBUTE Ngram6processing REAL %processing
+@ATTRIBUTE Ngram7fungal REAL %fungal
+@ATTRIBUTE Ngram8enzymes REAL %enzymes
+@ATTRIBUTE Ngram9manual REAL %manual
+@ATTRIBUTE Ngram10literature REAL %literature
+@ATTRIBUTE Ngram11annotation REAL %annotation
+@ATTRIBUTE Ngram12mycoclapfungalgenomicsca REAL %mycoclapfungalgenomicsca
+@ATTRIBUTE Ngram13machine REAL %machine
+@ATTRIBUTE Ngram14first REAL %first
+@ATTRIBUTE Ngram15features REAL %features
+@ATTRIBUTE Ngram16mycoclap REAL %mycoclap
+@ATTRIBUTE Ngram17results REAL %results
+@ATTRIBUTE Ngram18abstracttext REAL %abstracttext
+@ATTRIBUTE Ngram19task REAL %task
+@ATTRIBUTE Ngram20http REAL %http
+@ATTRIBUTE Ngram21support REAL %support
+@ATTRIBUTE Ngram22learning REAL %learning
+@ATTRIBUTE Ngram23database REAL %database
+@ATTRIBUTE Ngram24curation REAL %curation
+@ATTRIBUTE Ngram25logistic REAL %logistic
+@ATTRIBUTE Ngram26applications REAL %applications
+@ATTRIBUTE Ngram27articletitle REAL %articletitle
+@ATTRIBUTE class {positive, negative}
+@DATA
+
+25754864,0,0,0,2,0,0,2,5,3,1,1,2,2,0,1,0,6,1,0,0,2,2,0,4,2,0,2,0,negative
+25551575,2,4,3,0,2,2,0,0,0,1,2,0,0,4,1,2,0,1,0,2,0,2,3,0,1,2,0,0,negative
+
diff --git a/build.xml b/build.xml
new file mode 100644
index 0000000..efbea36
--- /dev/null
+++ b/build.xml
@@ -0,0 +1,153 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/corpus/mycoSORTSampleTriagecorpus_test.xml b/corpus/mycoSORTSampleTriagecorpus_test.xml
new file mode 100644
index 0000000..bdd6cf2
--- /dev/null
+++ b/corpus/mycoSORTSampleTriagecorpus_test.xml
@@ -0,0 +1,468 @@
+
+
+
+ 25754864
+
+ 2015
+ 03
+ 10
+
+
+ 2015
+ 03
+ 18
+
+
+
+ 1758-0463
+
+ 2015
+
+ 2015
+
+
+ Database : the journal of biological databases and curation
+ Database (Oxford)
+
+ mycoCLAP, the database for characterized lignocellulose-active proteins of fungal origin: resource and text mining curation support.
+
+
+
+ 10.1093/database/bav008
+ bav008
+
+ Enzymes active on components of lignocellulosic biomass are used for industrial applications ranging from food processing to biofuels production. These include a diverse array of glycoside hydrolases, carbohydrate esterases, polysaccharide lyases and oxidoreductases. Fungi are prolific producers of these enzymes, spurring fungal genome sequencing efforts to identify and catalogue the genes that encode them. To facilitate the functional annotation of these genes, biochemical data on over 800 fungal lignocellulose-degrading enzymes have been collected from the literature and organized into the searchable database, mycoCLAP (http://mycoclap.fungalgenomics.ca). First implemented in 2011, and updated as described here, mycoCLAP is capable of ranking search results according to closest biochemically characterized homologues: this improves the quality of the annotation, and significantly decreases the time required to annotate novel sequences. The database is freely available to the scientific community, as are the open source applications based on natural language processing developed to support the manual curation of mycoCLAP. Database URL: http://mycoclap.fungalgenomics.ca.
+ © The Author(s) 2015. Published by Oxford University Press.
+
+
+
+ Strasser
+ Kimchi
+ K
+
+ Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA.
+
+
+
+ McDonnell
+ Erin
+ E
+
+ Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA.
+
+
+
+ Nyaga
+ Carol
+ C
+
+ Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA.
+
+
+
+ Wu
+ Min
+ M
+
+ Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA.
+
+
+
+ Wu
+ Sherry
+ S
+
+ Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA.
+
+
+
+ Almeida
+ Hayda
+ H
+
+ Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA.
+
+
+
+ Meurs
+ Marie-Jean
+ MJ
+
+ Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA.
+
+
+
+ Kosseim
+ Leila
+ L
+
+ Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA.
+
+
+
+ Powlowski
+ Justin
+ J
+
+ Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA.
+
+
+
+ Butler
+ Greg
+ G
+
+ Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA gregb@encs.concordia.ca.
+
+
+
+ Tsang
+ Adrian
+ A
+
+ Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA.
+
+
+
+ eng
+
+ Journal Article
+ Research Support, Non-U.S. Gov't
+
+
+ 2015
+ 03
+ 08
+
+
+
+ England
+ Database (Oxford)
+ 101517697
+ 1758-0463
+
+ IM
+
+
+ Nat Biotechnol. 2004 Jun;22(6):695-700
+ 15122302
+
+
+ Appl Environ Microbiol. 2013 Aug;79(15):4620-34
+ 23709508
+
+
+ J Mol Biol. 1990 Oct 5;215(3):403-10
+ 2231712
+
+
+ Nature. 2008 Sep 4;455(7209):47-50
+ 18769432
+
+
+ Nucleic Acids Res. 2009 Jan;37(Database issue):D233-8
+ 18838391
+
+
+ Nucleic Acids Res. 2009 Jan;37(Database issue):D588-92
+ 18984617
+
+
+ Database (Oxford). 2011;2011:bar020
+ 21622642
+
+
+ Genome Res. 2011 Jun;21(6):885-97
+ 21543515
+
+
+ Nat Methods. 2011;8(10):785-6
+ 21959131
+
+
+ Nat Biotechnol. 2011 Oct;29(10):922-7
+ 21964414
+
+
+ Enzyme Microb Technol. 2011 Apr 7;48(4-5):397-403
+ 22112956
+
+
+ BMC Med Inform Decis Mak. 2012;12 Suppl 1:S5
+ 22595090
+
+
+ Science. 2012 Jun 29;336(6089):1715-9
+ 22745431
+
+
+ Nucleic Acids Res. 2013 Jan;41(Database issue):D43-7
+ 23161681
+
+
+ Nucleic Acids Res. 2013 Jan;41(Database issue):D36-42
+ 23193287
+
+
+ Nat Genet. 2000 May;25(1):25-9
+ 10802651
+
+
+ PLoS One. 2014;9(12):e115892
+ 25551575
+
+
+ PMC4352688
+
+
+
+
+ 2015
+
+
+
+
+ 2015
+ 3
+ 11
+ 6
+ 0
+
+
+ 2015
+ 3
+ 11
+ 6
+ 0
+
+
+ 2015
+ 3
+ 11
+ 6
+ 0
+
+
+ epublish
+
+ bav008
+ 10.1093/database/bav008
+ 25754864
+ PMC4352688
+
+
+
+
+
+
+ 25551575
+
+ 2015
+ 01
+ 01
+
+
+ 2015
+ 01
+ 13
+
+
+
+ 1932-6203
+
+ 9
+ 12
+
+ 2014
+
+
+ PloS one
+ PLoS ONE
+
+ Machine learning for biomedical literature triage.
+
+ e115892
+
+ 10.1371/journal.pone.0115892
+
+ This paper presents a machine learning system for supporting the first task of the biological literature manual curation process, called triage. We compare the performance of various classification models, by experimenting with dataset sampling factors and a set of features, as well as three different machine learning algorithms (Naive Bayes, Support Vector Machine and Logistic Model Trees). The results show that the most fitting model to handle the imbalanced datasets of the triage classification task is obtained by using domain relevant features, an under-sampling technique, and the Logistic Model Trees algorithm.
+
+
+
+ Almeida
+ Hayda
+ H
+
+ Department of Computer Science and Software Engineering, Concordia University, Montreal, QC, Canada.
+
+
+
+ Meurs
+ Marie-Jean
+ MJ
+
+ Centre for Structural and Functional Genomics, Concordia University, Montreal, QC, Canada.
+
+
+
+ Kosseim
+ Leila
+ L
+
+ Department of Computer Science and Software Engineering, Concordia University, Montreal, QC, Canada.
+
+
+
+ Butler
+ Greg
+ G
+
+ Department of Computer Science and Software Engineering, Concordia University, Montreal, QC, Canada; Centre for Structural and Functional Genomics, Concordia University, Montreal, QC, Canada.
+
+
+
+ Tsang
+ Adrian
+ A
+
+ Centre for Structural and Functional Genomics, Concordia University, Montreal, QC, Canada.
+
+
+
+ eng
+
+ Journal Article
+ Research Support, Non-U.S. Gov't
+
+
+ 2014
+ 12
+ 31
+
+
+
+ United States
+ PLoS One
+ 101285081
+ 1932-6203
+
+ IM
+
+
+ Proc AMIA Symp. 2001;:17-21
+ 11825149
+
+
+ Artif Intell Med. 2005 Sep-Oct;35(1-2):121-34
+ 16024240
+
+
+ Mol Cell. 2006 Mar 3;21(5):589-94
+ 16507357
+
+
+ Bioinformatics. 2006 Mar 15;22(6):658-64
+ 16287934
+
+
+ Artif Intell Med. 2006 May;37(1):7-18
+ 16233974
+
+
+ Nature. 2008 Sep 4;455(7209):47-50
+ 18769432
+
+
+ IEEE Trans Syst Man Cybern B Cybern. 2009 Feb;39(1):281-8
+ 19068445
+
+
+ Database (Oxford). 2011;2011:bar020
+ 21622642
+
+
+ J Integr Bioinform. 2011;8(3):176
+ 21926439
+
+
+ Database (Oxford). 2012;2012:bas020
+ 22513129
+
+
+ BMC Med Inform Decis Mak. 2012;12 Suppl 1:S5
+ 22595090
+
+
+ PLoS One. 2013;8(6):e65848
+ 23785456
+
+
+ PLoS One. 2013;8(12):e80503
+ 24312478
+
+
+ PLoS One. 2014;9(4):e91315
+ 24705246
+
+
+ PLoS One. 2014;9(7):e102039
+ 25036529
+
+
+ PMC4281078
+
+
+
+
+ 2014
+
+
+
+
+ 2014
+ 9
+ 4
+
+
+ 2014
+ 11
+ 27
+
+
+ 2014
+ 12
+ 31
+
+
+ 2015
+ 1
+ 1
+ 6
+ 0
+
+
+ 2015
+ 1
+ 1
+ 6
+ 0
+
+
+ 2015
+ 1
+ 1
+ 6
+ 0
+
+
+ epublish
+
+ 10.1371/journal.pone.0115892
+ PONE-D-14-39858
+ 25551575
+ PMC4281078
+
+
+
+
+
diff --git a/corpus/mycoSORTSampleTriagecorpus_train_50.xml b/corpus/mycoSORTSampleTriagecorpus_train_50.xml
new file mode 100644
index 0000000..bdd6cf2
--- /dev/null
+++ b/corpus/mycoSORTSampleTriagecorpus_train_50.xml
@@ -0,0 +1,468 @@
+
+
+
+ 25754864
+
+ 2015
+ 03
+ 10
+
+
+ 2015
+ 03
+ 18
+
+
+
+ 1758-0463
+
+ 2015
+
+ 2015
+
+
+ Database : the journal of biological databases and curation
+ Database (Oxford)
+
+ mycoCLAP, the database for characterized lignocellulose-active proteins of fungal origin: resource and text mining curation support.
+
+
+
+ 10.1093/database/bav008
+ bav008
+
+ Enzymes active on components of lignocellulosic biomass are used for industrial applications ranging from food processing to biofuels production. These include a diverse array of glycoside hydrolases, carbohydrate esterases, polysaccharide lyases and oxidoreductases. Fungi are prolific producers of these enzymes, spurring fungal genome sequencing efforts to identify and catalogue the genes that encode them. To facilitate the functional annotation of these genes, biochemical data on over 800 fungal lignocellulose-degrading enzymes have been collected from the literature and organized into the searchable database, mycoCLAP (http://mycoclap.fungalgenomics.ca). First implemented in 2011, and updated as described here, mycoCLAP is capable of ranking search results according to closest biochemically characterized homologues: this improves the quality of the annotation, and significantly decreases the time required to annotate novel sequences. The database is freely available to the scientific community, as are the open source applications based on natural language processing developed to support the manual curation of mycoCLAP. Database URL: http://mycoclap.fungalgenomics.ca.
+ © The Author(s) 2015. Published by Oxford University Press.
+
+
+
+ Strasser
+ Kimchi
+ K
+
+ Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA.
+
+
+
+ McDonnell
+ Erin
+ E
+
+ Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA.
+
+
+
+ Nyaga
+ Carol
+ C
+
+ Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA.
+
+
+
+ Wu
+ Min
+ M
+
+ Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA.
+
+
+
+ Wu
+ Sherry
+ S
+
+ Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA.
+
+
+
+ Almeida
+ Hayda
+ H
+
+ Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA.
+
+
+
+ Meurs
+ Marie-Jean
+ MJ
+
+ Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA.
+
+
+
+ Kosseim
+ Leila
+ L
+
+ Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA.
+
+
+
+ Powlowski
+ Justin
+ J
+
+ Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA.
+
+
+
+ Butler
+ Greg
+ G
+
+ Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA gregb@encs.concordia.ca.
+
+
+
+ Tsang
+ Adrian
+ A
+
+ Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA.
+
+
+
+ eng
+
+ Journal Article
+ Research Support, Non-U.S. Gov't
+
+
+ 2015
+ 03
+ 08
+
+
+
+ England
+ Database (Oxford)
+ 101517697
+ 1758-0463
+
+ IM
+
+
+ Nat Biotechnol. 2004 Jun;22(6):695-700
+ 15122302
+
+
+ Appl Environ Microbiol. 2013 Aug;79(15):4620-34
+ 23709508
+
+
+ J Mol Biol. 1990 Oct 5;215(3):403-10
+ 2231712
+
+
+ Nature. 2008 Sep 4;455(7209):47-50
+ 18769432
+
+
+ Nucleic Acids Res. 2009 Jan;37(Database issue):D233-8
+ 18838391
+
+
+ Nucleic Acids Res. 2009 Jan;37(Database issue):D588-92
+ 18984617
+
+
+ Database (Oxford). 2011;2011:bar020
+ 21622642
+
+
+ Genome Res. 2011 Jun;21(6):885-97
+ 21543515
+
+
+ Nat Methods. 2011;8(10):785-6
+ 21959131
+
+
+ Nat Biotechnol. 2011 Oct;29(10):922-7
+ 21964414
+
+
+ Enzyme Microb Technol. 2011 Apr 7;48(4-5):397-403
+ 22112956
+
+
+ BMC Med Inform Decis Mak. 2012;12 Suppl 1:S5
+ 22595090
+
+
+ Science. 2012 Jun 29;336(6089):1715-9
+ 22745431
+
+
+ Nucleic Acids Res. 2013 Jan;41(Database issue):D43-7
+ 23161681
+
+
+ Nucleic Acids Res. 2013 Jan;41(Database issue):D36-42
+ 23193287
+
+
+ Nat Genet. 2000 May;25(1):25-9
+ 10802651
+
+
+ PLoS One. 2014;9(12):e115892
+ 25551575
+
+
+ PMC4352688
+
+
+
+
+ 2015
+
+
+
+
+ 2015
+ 3
+ 11
+ 6
+ 0
+
+
+ 2015
+ 3
+ 11
+ 6
+ 0
+
+
+ 2015
+ 3
+ 11
+ 6
+ 0
+
+
+ epublish
+
+ bav008
+ 10.1093/database/bav008
+ 25754864
+ PMC4352688
+
+
+
+
+
+
+ 25551575
+
+ 2015
+ 01
+ 01
+
+
+ 2015
+ 01
+ 13
+
+
+
+ 1932-6203
+
+ 9
+ 12
+
+ 2014
+
+
+ PloS one
+ PLoS ONE
+
+ Machine learning for biomedical literature triage.
+
+ e115892
+
+ 10.1371/journal.pone.0115892
+
+ This paper presents a machine learning system for supporting the first task of the biological literature manual curation process, called triage. We compare the performance of various classification models, by experimenting with dataset sampling factors and a set of features, as well as three different machine learning algorithms (Naive Bayes, Support Vector Machine and Logistic Model Trees). The results show that the most fitting model to handle the imbalanced datasets of the triage classification task is obtained by using domain relevant features, an under-sampling technique, and the Logistic Model Trees algorithm.
+
+
+
+ Almeida
+ Hayda
+ H
+
+ Department of Computer Science and Software Engineering, Concordia University, Montreal, QC, Canada.
+
+
+
+ Meurs
+ Marie-Jean
+ MJ
+
+ Centre for Structural and Functional Genomics, Concordia University, Montreal, QC, Canada.
+
+
+
+ Kosseim
+ Leila
+ L
+
+ Department of Computer Science and Software Engineering, Concordia University, Montreal, QC, Canada.
+
+
+
+ Butler
+ Greg
+ G
+
+ Department of Computer Science and Software Engineering, Concordia University, Montreal, QC, Canada; Centre for Structural and Functional Genomics, Concordia University, Montreal, QC, Canada.
+
+
+
+ Tsang
+ Adrian
+ A
+
+ Centre for Structural and Functional Genomics, Concordia University, Montreal, QC, Canada.
+
+
+
+ eng
+
+ Journal Article
+ Research Support, Non-U.S. Gov't
+
+
+ 2014
+ 12
+ 31
+
+
+
+ United States
+ PLoS One
+ 101285081
+ 1932-6203
+
+ IM
+
+
+ Proc AMIA Symp. 2001;:17-21
+ 11825149
+
+
+ Artif Intell Med. 2005 Sep-Oct;35(1-2):121-34
+ 16024240
+
+
+ Mol Cell. 2006 Mar 3;21(5):589-94
+ 16507357
+
+
+ Bioinformatics. 2006 Mar 15;22(6):658-64
+ 16287934
+
+
+ Artif Intell Med. 2006 May;37(1):7-18
+ 16233974
+
+
+ Nature. 2008 Sep 4;455(7209):47-50
+ 18769432
+
+
+ IEEE Trans Syst Man Cybern B Cybern. 2009 Feb;39(1):281-8
+ 19068445
+
+
+ Database (Oxford). 2011;2011:bar020
+ 21622642
+
+
+ J Integr Bioinform. 2011;8(3):176
+ 21926439
+
+
+ Database (Oxford). 2012;2012:bas020
+ 22513129
+
+
+ BMC Med Inform Decis Mak. 2012;12 Suppl 1:S5
+ 22595090
+
+
+ PLoS One. 2013;8(6):e65848
+ 23785456
+
+
+ PLoS One. 2013;8(12):e80503
+ 24312478
+
+
+ PLoS One. 2014;9(4):e91315
+ 24705246
+
+
+ PLoS One. 2014;9(7):e102039
+ 25036529
+
+
+ PMC4281078
+
+
+
+
+ 2014
+
+
+
+
+ 2014
+ 9
+ 4
+
+
+ 2014
+ 11
+ 27
+
+
+ 2014
+ 12
+ 31
+
+
+ 2015
+ 1
+ 1
+ 6
+ 0
+
+
+ 2015
+ 1
+ 1
+ 6
+ 0
+
+
+ 2015
+ 1
+ 1
+ 6
+ 0
+
+
+ epublish
+
+ 10.1371/journal.pone.0115892
+ PONE-D-14-39858
+ 25551575
+ PMC4281078
+
+
+
+
+
diff --git a/features/docIDs.txt b/features/docIDs.txt
new file mode 100644
index 0000000..8da8a8a
--- /dev/null
+++ b/features/docIDs.txt
@@ -0,0 +1,1198 @@
+14565843 positive
+23073100 negative
+11501467 negative
+20208428 positive
+9074500 negative
+986853 negative
+8787388 positive
+20826217 positive
+11471729 positive
+16059706 negative
+11298744 positive
+21168763 negative
+10424099 negative
+18415096 positive
+15866877 positive
+2396985 negative
+7838157 negative
+44415 negative
+11170563 negative
+16128806 positive
+12435269 positive
+19756584 positive
+21382036 negative
+15830675 negative
+7579664 negative
+9114071 negative
+21635140 negative
+20070371 positive
+19505579 positive
+14987996 positive
+12882162 positive
+6767680 negative
+963703 negative
+15278289 positive
+17119968 positive
+9486422 negative
+8698653 positive
+16278932 positive
+16488199 positive
+15466516 positive
+21829351 negative
+17302745 negative
+1479358 positive
+15450181 positive
+9199426 negative
+14976875 negative
+2703464 negative
+15290142 negative
+21972816 negative
+16431275 positive
+15541296 positive
+19060407 negative
+18499583 negative
+22260051 negative
+17027758 positive
+12209794 negative
+1400249 positive
+21466636 negative
+11916668 positive
+1452095 negative
+9508797 negative
+19277742 positive
+22226198 positive
+18175902 negative
+20803138 negative
+17341093 negative
+8948426 negative
+9011379 negative
+14595695 negative
+22906713 negative
+8524797 positive
+2226842 positive
+19590866 positive
+9485595 positive
+2332056 negative
+10957961 negative
+9013549 positive
+10923795 positive
+24212486 negative
+10713452 positive
+13306714 negative
+22031024 positive
+8978090 negative
+22067437 negative
+19205049 positive
+16926418 negative
+19393602 positive
+8597544 negative
+3595987 negative
+19690850 positive
+21306947 positive
+23790084 negative
+8597548 positive
+15686849 positive
+9931476 positive
+1398098 positive
+18936994 negative
+18045411 positive
+17229143 positive
+22350290 negative
+8135518 positive
+1588915 negative
+13031603 negative
+16789551 negative
+23218368 positive
+14766566 negative
+11930943 negative
+8647098 positive
+18806001 negative
+21360092 negative
+10802187 positive
+12788920 positive
+17910720 negative
+1368680 positive
+16404950 positive
+21897016 positive
+9468803 negative
+23071108 positive
+8647081 positive
+10434062 negative
+8250548 positive
+23226882 negative
+12162562 positive
+19809198 negative
+9140529 negative
+21575132 negative
+15054207 negative
+11064202 negative
+18800599 positive
+23589840 negative
+1392588 negative
+7603444 positive
+18388475 negative
+17381511 positive
+8804409 negative
+22132148 positive
+8765754 negative
+9140977 positive
+23625219 negative
+17351093 positive
+22075023 negative
+16462863 positive
+19556747 positive
+15944854 negative
+15054209 positive
+8856078 positive
+17662982 negative
+16473771 positive
+8243636 negative
+8810077 positive
+22795531 negative
+23497862 negative
+3114236 positive
+15449305 negative
+12591897 positive
+22733825 negative
+19996679 positive
+7763357 negative
+9803534 negative
+13637982 negative
+19479322 negative
+18555305 negative
+18500632 negative
+16232740 positive
+1368254 positive
+10583968 negative
+16133102 positive
+12882555 negative
+16041128 positive
+16761182 negative
+20109094 positive
+10952011 positive
+11358516 positive
+20518356 negative
+6169675 negative
+19645671 negative
+9802217 positive
+18587856 negative
+21237221 positive
+12485115 positive
+23625216 negative
+9872754 positive
+8503847 positive
+9463945 positive
+17928699 positive
+1768103 negative
+7614556 negative
+14586108 positive
+12363086 negative
+12543554 positive
+3117961 negative
+21402188 negative
+17043824 positive
+7811079 positive
+16343463 positive
+8486628 positive
+10908793 negative
+8964516 negative
+9648215 positive
+11849507 positive
+3080320 negative
+12513977 negative
+23508400 negative
+16662849 negative
+12406766 positive
+23725035 negative
+2337347 positive
+2560409 negative
+22961332 negative
+2510150 positive
+14716497 negative
+23412069 positive
+1735428 negative
+21168322 negative
+8987884 positive
+9830097 negative
+15809023 negative
+15892742 negative
+12908861 negative
+22496740 negative
+19205687 positive
+9492270 positive
+1429462 positive
+7191044 negative
+12889 negative
+17111131 positive
+9430631 negative
+9438354 negative
+17021873 positive
+20562284 positive
+10101286 positive
+20552260 positive
+24113511 negative
+23170978 positive
+21510637 negative
+8509335 positive
+9611196 negative
+18388455 negative
+15756814 positive
+17168300 negative
+23666150 negative
+8987855 positive
+23236275 positive
+22776993 positive
+22835655 positive
+1644702 negative
+1567377 negative
+21369980 negative
+17665191 positive
+10606774 positive
+18490176 positive
+10508057 positive
+11988501 negative
+24077704 negative
+8987622 positive
+15500988 positive
+12909730 negative
+9813313 negative
+10672446 negative
+18460787 negative
+16822232 positive
+5861997 negative
+2629784 negative
+11136466 negative
+8640604 positive
+12147340 positive
+12555575 negative
+24274505 negative
+10939261 negative
+22425351 negative
+1490609 negative
+12172603 negative
+15998406 negative
+12073090 positive
+16349883 negative
+11178973 negative
+1368193 positive
+12455695 positive
+9330667 negative
+2331322 positive
+17376674 positive
+22624316 negative
+1878999 positive
+8960907 negative
+19507068 positive
+7859305 positive
+8299175 positive
+12892894 positive
+8575021 positive
+2146364 negative
+8575023 positive
+2126511 negative
+16524914 negative
+9732526 positive
+11523809 positive
+19761044 negative
+8724139 negative
+8589415 positive
+15291818 positive
+16361785 negative
+23124346 positive
+19500674 positive
+988467 negative
+22432613 negative
+18068392 negative
+12945177 negative
+20652740 positive
+8276068 negative
+10091328 positive
+7640003 positive
+21971070 negative
+18414798 negative
+12743761 negative
+7984103 positive
+10215597 positive
+16637705 negative
+8589407 positive
+8945534 positive
+8589408 negative
+12325291 negative
+19527927 positive
+17838811 negative
+16233798 positive
+18668421 negative
+20129093 positive
+15294290 positive
+23240568 positive
+20823521 negative
+16244441 positive
+7896713 positive
+15870328 positive
+22579450 negative
+16614858 positive
+16039872 positive
+19288093 positive
+18694928 negative
+19189377 negative
+15651 negative
+19575195 positive
+7488173 positive
+12409103 positive
+6358191 negative
+16284933 negative
+7574556 negative
+9830143 negative
+18845181 negative
+15362290 negative
+16233531 positive
+17955189 positive
+19507018 positive
+16233536 positive
+23298573 negative
+18704748 positive
+24186432 negative
+12715256 negative
+18550352 positive
+16380244 negative
+15090228 negative
+18944813 positive
+8598062 positive
+14323029 negative
+16232670 positive
+19054103 positive
+16232432 positive
+6406022 negative
+826291 negative
+11829749 positive
+19513709 positive
+7906649 positive
+9058977 negative
+18023045 positive
+21181156 positive
+21243443 positive
+8381338 negative
+15136043 negative
+18378599 negative
+14685768 positive
+22407682 negative
+2707445 positive
+18083533 positive
+22132219 negative
+12226497 negative
+14763977 negative
+14532063 positive
+18563407 positive
+19545999 negative
+19967375 negative
+16233515 positive
+1787790 positive
+22705517 negative
+8975597 positive
+23470758 negative
+23728162 negative
+9309656 negative
+10779688 positive
+1447290 negative
+11257513 positive
+11115392 positive
+10049844 negative
+19088319 positive
+15950056 negative
+12754825 negative
+20569406 negative
+21307589 positive
+23844185 negative
+6541478 negative
+18408068 positive
+20077114 positive
+9169610 positive
+16677342 negative
+23508952 positive
+12602898 negative
+17433483 positive
+10675564 positive
+18443829 negative
+17625262 positive
+17599813 positive
+32175 positive
+7574590 positive
+20212162 negative
+8781176 positive
+18923909 negative
+10491168 positive
+18264680 negative
+23500559 negative
+6863431 negative
+9805384 positive
+14524699 positive
+8400376 positive
+2135869 positive
+14523125 positive
+16129506 positive
+8400378 negative
+18943122 negative
+16275128 positive
+10493932 positive
+23199732 positive
+17651154 negative
+12665550 positive
+12224649 positive
+7439182 negative
+8959766 negative
+9608522 negative
+23199738 positive
+19734721 positive
+20143777 positive
+23306879 positive
+11376609 negative
+9334183 negative
+10049864 negative
+16697997 positive
+23489323 negative
+21040747 positive
+21442271 positive
+24372593 negative
+15746364 negative
+9212440 negative
+1952931 negative
+11179652 negative
+8955395 negative
+12843664 positive
+2158993 positive
+23299456 positive
+18512263 positive
+14665735 positive
+20043150 positive
+21626020 positive
+23836384 negative
+7487028 negative
+7961884 positive
+22653604 positive
+12619666 positive
+3111887 negative
+19107534 positive
+32833 positive
+15580593 positive
+16901567 positive
+7824933 positive
+10499260 negative
+9506837 positive
+19527524 positive
+11061997 negative
+22373601 negative
+9758774 positive
+22074954 negative
+19473250 positive
+9758775 positive
+17977149 negative
+12845603 negative
+2506439 negative
+10553664 positive
+22150279 negative
+18456943 positive
+15270720 negative
+18675351 positive
+10385327 negative
+7629010 negative
+1592808 positive
+17505783 negative
+18722542 negative
+12843680 positive
+8593683 positive
+16980715 positive
+16734792 positive
+7487009 negative
+23285046 positive
+22524557 negative
+16520923 positive
+16107755 positive
+19912637 positive
+3936420 positive
+7626800 negative
+9464371 positive
+19269961 positive
+2152162 positive
+22444635 positive
+16666407 negative
+3268297 negative
+8900004 positive
+10659715 negative
+2760033 positive
+1612414 positive
+1368837 positive
+23931690 negative
+8756392 positive
+20424835 negative
+18548669 positive
+10514255 negative
+19922433 negative
+16233124 positive
+15174310 positive
+17043085 negative
+16140328 positive
+8669913 negative
+20429042 negative
+3125847 negative
+20592022 positive
+24212538 negative
+1425667 positive
+7262712 negative
+10525153 positive
+21710260 positive
+7926830 positive
+14674022 negative
+14735222 positive
+18975142 negative
+18935968 positive
+20014432 negative
+23184220 negative
+18722595 positive
+4779294 negative
+15838031 positive
+15025429 negative
+9464399 negative
+23129650 positive
+7493964 positive
+27428 negative
+3561490 positive
+8436950 positive
+9761741 positive
+21945415 negative
+1367522 positive
+20382376 negative
+4269377 negative
+9118231 negative
+7012186 negative
+19756576 negative
+22360347 negative
+15288024 negative
+10586505 positive
+9805373 positive
+22442229 positive
+15782637 negative
+8905923 positive
+15246667 negative
+8688436 positive
+14988022 positive
+18850325 negative
+7549103 positive
+8901566 negative
+8595661 positive
+9547139 negative
+15668816 negative
+1368843 positive
+12702357 positive
+16874542 positive
+23326459 positive
+8935788 negative
+16374635 positive
+1368603 positive
+9153431 positive
+9987124 positive
+16478498 negative
+10029988 positive
+8595669 positive
+22080345 positive
+22754023 positive
+22080343 negative
+8065265 positive
+8961569 negative
+9165762 positive
+20573014 positive
+22940311 positive
+23303647 negative
+12726996 positive
+24479319 negative
+9450333 positive
+20727822 positive
+17922847 positive
+16664778 negative
+16134120 negative
+22072708 negative
+9371889 positive
+8948110 negative
+20619350 positive
+16474906 positive
+11754346 negative
+2579525 negative
+20734107 negative
+8464071 negative
+22805919 negative
+22709462 negative
+9841776 negative
+23100915 negative
+7788716 positive
+7788717 positive
+8654984 positive
+4040855 negative
+45611 negative
+12233746 negative
+17614952 negative
+22685137 positive
+18233 negative
+21948841 positive
+19202090 positive
+16137662 negative
+21726361 negative
+1368777 positive
+3527986 negative
+9654123 positive
+18307762 positive
+12597025 negative
+2508563 negative
+8000538 positive
+560223 negative
+2063624 negative
+1632643 negative
+21193820 positive
+15194814 positive
+11856 negative
+9301101 negative
+24020787 negative
+8190078 positive
+19835139 negative
+22712405 negative
+18377882 positive
+9000377 positive
+21622 negative
+21364303 negative
+16186619 negative
+7987261 positive
+8997712 negative
+22940347 negative
+16523351 negative
+24085297 negative
+1654681 negative negative
+8358833 positive
+8358835 positive
+19725536 positive
+23897210 negative
+15757176 positive
+2187435 positive
+8616259 negative
+9370370 positive
+8358830 positive
+17503147 positive
+2509432 positive
+4281647 negative
+1781689 negative
+22349190 positive
+10508113 positive
+24035805 negative
+21748379 positive
+1815765 negative
+20541633 negative
+11494757 negative
+12668107 negative
+10376824 positive
+21490699 positive
+20851958 positive
+15519295 positive
+24128930 negative
+10773459 negative
+23268348 positive
+11768539 positive
+16233094 positive
+8709949 negative
+18595320 negative
+10725538 positive
+11193399 positive
+17646981 positive
+11217409 negative
+15280013 positive
+15006424 positive
+22309761 positive
+24316358 negative
+16896601 positive
+24528642 negative
+17115208 positive
+22584433 positive
+22437835 positive
+23094334 negative
+10517025 positive
+2076554 positive
+8768520 positive
+23619241 negative
+20623432 negative
+10347026 positive
+11722900 positive
+15921894 negative
+23190610 positive
+17345128 positive
+15130150 negative
+8514419 negative
+9990729 positive
+8941946 negative
+22796724 negative
+19669931 positive
+22160328 positive
+1889394 negative
+12427996 positive
+9324248 positive
+16233072 positive
+23508399 negative
+9797312 positive
+9694679 negative
+9128738 positive
+3907189 negative
+6184962 negative
+3128741 negative
+8431310 negative
+8433972 positive
+23537284 positive
+22846889 positive
+9929401 negative
+19784554 positive
+21848609 negative
+18668373 negative
+19239548 positive
+23990297 negative
+8837440 positive
+8020743 negative
+19736001 positive
+23356577 positive
+16283301 negative
+2688929 positive
+23858710 negative
+16844780 positive
+234905 negative
+16121227 negative
+23959893 negative
+16333341 negative
+11357511 positive
+23261999 positive
+18040681 negative
+8390128 negative
+20579868 negative
+1172175 negative
+106849 negative
+20102533 negative
+1511691 positive
+12723619 positive
+16461639 positive
+11722552 positive
+17905460 negative
+7670182 positive
+20406672 positive
+17623028 positive
+1748872 negative
+9682473 positive
+15564668 positive
+16272431 negative
+23263965 positive
+20735824 negative
+18938241 negative
+2070799 positive
+7750151 positive
+6791629 negative
+10395989 negative
+10052139 negative
+11925050 negative
+11376040 negative
+10052135 positive
+15135402 positive
+10933800 negative negative
+12670686 negative
+9019140 negative
+7763458 negative
+7764306 positive
+14642815 positive
+21532326 negative
+14648113 positive
+7586029 positive
+23615741 positive
+10974100 negative
+19174189 positive
+8670100 positive
+8947054 negative
+10588045 positive
+12356463 positive
+20512738 negative
+1406248 positive
+23844364 negative
+12450128 positive
+8057846 positive
+19934038 negative
+2834092 negative
+5661593 negative
+1814275 positive
+20039188 negative
+18553693 negative
+18524918 negative
+19777823 positive
+16232837 positive
+8297343 positive
+20680265 positive
+18483792 negative
+22860913 negative
+9535817 positive
+8390581 negative
+1369024 positive
+7646037 positive
+10092840 positive
+18490069 negative
+18020405 positive
+22033931 positive
+7708682 positive
+8590631 positive
+15262228 positive
+6787335 negative
+11157256 positive
+9797333 negative
+16556727 negative
+12649442 positive
+12501406 negative
+22689149 positive
+24237246 negative
+6984129 negative
+3265327 negative
+18347828 negative
+1369269 negative
+19060392 positive
+8452520 positive
+12657297 negative
+12297320 negative
+11396904 negative
+24416614 negative
+9167273 negative
+15649508 negative
+20169200 negative
+19039584 positive
+18720841 positive
+20464942 negative
+12623067 positive
+15716038 positive
+23330392 positive
+22859955 positive
+23241981 negative
+9023952 negative
+16349528 positive
+8590658 negative
+6765603 positive
+6440004 positive
+11911612 negative
+17067546 negative
+18998121 positive
+9514754 positive
+1761224 positive
+17928959 positive
+1369161 positive
+11692674 negative
+7764056 positive
+7766233 negative
+15502357 positive
+12137954 positive
+12922166 positive
+15342117 negative
+20645085 positive
+16887562 negative
+12018245 negative
+16650812 positive
+6420649 negative
+9756616 positive
+7574642 positive
+3012284 negative
+10422230 negative
+21124049 negative
+22906186 positive
+15691940 positive
+9742698 positive
+9169553 negative
+7747967 negative
+21708265 positive
+12619703 negative
+22225502 positive
+18716810 negative
+6049382 negative
+10642523 positive
+8477731 positive
+15322773 positive
+18725302 positive
+10931904 positive
+16657425 negative
+19110429 negative
+18346891 positive
+19809200 positive
+14633044 negative
+11997095 positive
+18944759 negative
+23399248 positive
+12949620 negative
+10906956 negative
+11255010 positive
+3932329 negative
+9043114 negative
+10993164 positive
+22365717 negative
+21801352 negative
+19502758 positive
+9466262 negative
+10386374 positive
+20510474 negative
+11368016 positive
+8988359 negative
+23204424 positive
+21564548 positive
+6248742 negative
+24315640 negative
+22684857 positive
+19156406 positive
+11272822 negative negative
+21161225 positive
+19661691 positive
+17651209 positive
+8474449 negative
+18620557 negative
+8437 positive
+23824666 negative
+23525113 positive
+4255900 negative
+11254576 positive
+9791893 positive
+11281712 negative
+22120123 negative
+1970434 positive
+13278322 negative
+23252695 positive
+23583262 negative
+12567244 negative
+23333949 negative
+23180124 positive
+7864815 positive
+8824176 negative
+17627774 negative
+9008887 positive
+2308855 positive
+22448043 negative
+7789795 positive
+12767807 positive
+10377251 positive
+23171402 negative
+16833 negative
+405328 negative
+1624111 negative
+15607743 positive
+9758835 positive
+23833180 negative
+11166820 negative
+18551552 negative
+4661766 negative
+6409895 positive
+17363438 positive
+15917612 negative
+20936239 negative
+7670194 positive
+1339327 positive
+16653055 negative
+8440481 positive
+22073551 negative
+8806739 negative
+15998305 negative
+10665422 negative
+17988729 negative
+17419071 positive
+20879842 negative
+23332834 positive
+20879840 negative
+1934116 positive
+22112956 positive
+19453169 positive
+20591661 positive
+1896470 negative
+19000618 positive
+19580870 negative
+9406381 positive
+17002602 positive
+6771030 negative
+15555940 positive
+15555941 negative
+1320186 positive
+8836148 positive
+23101390 positive
+21531609 negative
+3384334 positive
+23624166 negative
+24313660 negative
+17551789 positive
+21632240 negative
+15555935 positive
+23318568 negative
+16121561 negative
+8929394 positive
+22203550 negative
+3440521 positive
+12466887 positive
+8815461 negative
+8837470 positive
+19924304 negative
+12630320 negative
+17194495 positive
+1606968 positive
+21498763 negative
+9335167 positive
+3153146 negative
+15116339 negative
+17277884 positive
+23850557 negative
+8017902 negative
+15784980 positive
+18661293 negative
+8085821 positive
+20222446 positive
+22048567 negative
+20336338 positive
+956129 negative
+1934135 positive
+20652693 positive
+14704857 negative
+9884411 positive
+24329860 negative
+16988781 positive
+20400566 negative
+20400560 negative
+16701547 negative
+18072936 positive
+12167544 positive
+12094738 negative
+19336219 negative
+9145525 positive
+11795847 positive
+573117 negative
+12237858 positive
+16887700 negative
+16366715 negative
+17884661 negative
+23107704 positive
+3314909 negative
+15469730 positive
+16202538 positive
+15280646 positive
+23365723 negative
+18479937 positive
+9546185 positive
+3596237 positive
+9766241 negative
+3290051 negative
+11553760 negative
+10584016 positive
+10586675 negative
+11319115 positive
+23898996 negative
+16694 negative
+12506981 positive
+15749766 negative
+16362326 negative
+2450787 negative
+15821912 positive
+15808943 positive
+18633609 positive
+8577701 negative
+20414741 negative
+507620 negative
+15715951 negative
+8939815 positive
+23869387 negative
+3246351 positive
+15988573 positive
+7121328 negative
+10636904 positive
+16572843 negative
+15715941 positive
+18839231 negative
+17487548 positive
+1366983 negative
+16330537 positive
+9756469 positive
+8529895 positive
+23768357 negative
+10427736 negative
+16944135 positive
+16614901 positive
+15850449 negative
+7592488 positive
+23463247 positive
+16668702 negative
+17851776 negative
+19044008 negative
+16328626 positive
+16739943 negative
+8455560 negative
+19129654 positive
+15629130 positive
+12374797 positive
+9325167 negative
+12850270 positive
+15666544 negative
+22353731 positive
+9002269 negative
+20419375 positive
+1366537 negative
+17964183 positive
+20235799 negative
+560243 negative
+3124870 negative
+15316684 negative
+12145937 negative
+4736235 negative
+2113524 positive
+20850111 negative
+24232491 negative
+7805053 positive
+4796770 negative
+23292745 negative
+6540443 negative
+18942586 negative
+10572260 positive
+9003585 negative
+2925681 positive
+17455791 positive
+16407250 negative
+12664153 positive
+1899374 negative
+16776296 positive
+2785629 negative
+15819855 negative
+21350668 negative
+23280774 negative
+9177963 negative
+12489121 negative
+12383257 positive
+16535476 negative
+11402645 negative
+10830498 positive
+7961928 negative
+22738957 negative
+11601607 negative
+17876815 positive
+1930835 negative
+8572698 positive
+23454546 negative
+20675115 negative
+18060506 positive
+3240864 positive
+2762318 negative
+20807550 negative
+21920035 positive
+9546178 positive
+9546179 positive
+23956415 negative
+22036533 positive
+22750808 positive
+23077275 positive
+1643283 negative
+12387863 negative
+11955286 negative
+3912145 negative
+14558143 negative
+16233469 positive
+8419289 positive
+2227420 positive
+12400688 positive
+1979298 positive
+19941023 positive
+10222181 positive
+18327544 negative
+6068063 negative
+22579385 negative
+6068064 negative
\ No newline at end of file
diff --git a/src/analyse/.gitignore b/src/analyse/.gitignore
new file mode 100644
index 0000000..6b468b6
--- /dev/null
+++ b/src/analyse/.gitignore
@@ -0,0 +1 @@
+*.class
diff --git a/src/analyse/Extractor.java b/src/analyse/Extractor.java
new file mode 100644
index 0000000..dfb2d48
--- /dev/null
+++ b/src/analyse/Extractor.java
@@ -0,0 +1,455 @@
+/*
+ * The MIT License (MIT)
+
+Copyright (c) 2014
+
+Hayda Almeida
+Marie-Jean Meurs
+
+Concordia University
+Tsang Lab
+
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package analyse;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+
+import configure.ConfigConstants;
+/**
+ * Implements tools common to the FeatureExtractor
+ * and NgramExtractor classes, which are used to
+ * extract features from document instances.
+ *
+ * @author halmeida
+ *
+ */
+public class Extractor {
+
+
+ String id;
+ String endId;
+ String openFile;
+ String endFile;
+ String openAbst;
+ String closeAbst;
+ String abstractLabel;
+ String openEC;
+ String closeEC;
+ String classTag;
+ String openTitle;
+ String closeTitle;
+ String openJournal;
+ String closeJournal;
+ String copyR;
+ String closeCopyR;
+
+ /**
+ * Cleans text for tokenizing: deletes the brackets, quotes and punctuation
+ * listed below and replaces '/', backslash and '-' with a single space.
+ * NOTE(review): the triple-quote and lone-space literals look mangled - confirm.
+ * @param str text to be cleaned
+ * @return string with cleaned text
+ */
+ public String removeSpecialChar(String str){
+ str = str.replace("}", "");
+ str = str.replace("{", "");
+ str = str.replace("]", "");
+ str = str.replace("[", "");
+ str = str.replace("\"", "");
+ str = str.replace("<", "");
+ str = str.replace(">", "");
+ str = str.replace("/", " ");
+ str = str.replace("\\", " ");
+ str = str.replace("#", "");
+ str = str.replace("*", "");
+ str = str.replace(">", "");
+ str = str.replace("&apos", "");
+ str = str.replace("%", "");
+ str = str.replace(""", "");
+ str = str.replace("&", "");
+ str = str.replace("=", "");
+ str = str.replace("?", "");
+ str = str.replace("!", "");
+ str = str.replace(";", "");
+ str = str.replace(":", "");
+// str = str.replace(",", "");
+// str = str.replace(".", "");
+ str = str.replace(")", "");
+ str = str.replace("(", "");
+ str = str.replace("\t\t", "\t");
+ //hyphens between names were causing ngrams to be lost, so they become spaces
+ str = str.replace("-", " ");
+ str = str.replace(" ", "");
+
+ return str;
+ }
+
+ /**
+ * Cleans the text of an abstract: walks it one character at a time,
+ * skips what follows "Copyright " up to and including the next period,
+ * and passes the remaining text through removeAbstractTags.
+ * @param str abstract content
+ * @return string without external tags
+ */
+ public String processAbstract(String str){
+ str = str.replace(" ", "");
+ //NOTE(review): several literals below are empty or unbalanced as shown (tag names appear lost) - confirm
+ if(str.contains("Copyright") && !(str.contains("."))) str = str.replace("", ".");
+
+ String[] remove = str.split("");
+ StringBuilder sb = new StringBuilder();
+ String temp = "";
+ String abstrac = "";
+
+ for(int i = 0; i < remove.length; i++){
+ temp = temp + remove[i];
+
+ if(temp.contains("")));
+ }
+
+ //Handling the word "Copyright" before the end of the abstract
+ if(temp.contains("Copyright ")){
+ temp = "";
+ do{
+ i++;
+ //an exception here can mean that the content of a copyright
+ //information tag did not end with a period
+ }while(!(remove[i]).equalsIgnoreCase("."));
+ }
+ else sb.append(remove[i]);
+ }
+
+ abstrac = sb.toString();
+ abstrac = removeAbstractTags(abstrac);
+
+ return abstrac;
+ }
+
+
+ /**
+ * Removes specific tags encountered in abstract texts.
+ * This is used to clean the abstract text before
+ * processing the feature count on the model.
+ * @param str abstract text to be cleaned
+ * @return the text after every replacement below has been applied
+ */
+
+ public String removeAbstractTags(String str){
+ //the order in which the tags are removed matters:
+ //it is what excludes the first tag from the abstracts.
+ //NOTE(review): the empty search strings look like tag names lost from this listing - confirm
+ str = str.replace("", "");
+ str = str.replace("", "");
+ str = str.replace("", "");
+ str = str.replace("copyright", "");
+ str = str.replace("", "");
+ str = str.replace("", "");
+ str = str.replace("", "");
+ str = str.replace("", "");
+
+ return str;
+ }
+
+
+ /**
+  * Removes the markup annotations of a text field and keeps its content:
+  * everything from a '<' through the next '>' is dropped, every other
+  * character is copied unchanged. A '<' that is never closed discards
+  * the rest of the text instead of reading past the end of the input.
+  *
+  * @param str text containing markups
+  * @return string with cleaned text
+  */
+ public String removeTags(String str) {
+     StringBuilder sb = new StringBuilder(str.length());
+     boolean insideTag = false;
+
+     for (int i = 0; i < str.length(); i++) {
+         char c = str.charAt(i);
+         if (insideTag) {
+             //skipping the content until finding closing tag
+             if (c == '>') { insideTag = false; }
+         } else if (c == '<') {
+             insideTag = true;
+         } else {
+             sb.append(c);
+         }
+     }
+     return sb.toString();
+ }
+
+
+ /**
+ * Displays the keys and values of the
+ * maps created.
+ *
+ * @param hash HashMap containing list,
+ * values, counts
+ */
+ public void displayList(HashMap hash){
+ Iterator