diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1cde967
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+# Project source #
+###################
+*.project
+*.classpath
+
+# Package Files #
+#################
+*.jar
diff --git a/arff/triage0_50_ngrams_size1_stopwords.arff b/arff/triage0_50_ngrams_size1_stopwords.arff
new file mode 100644
index 0000000..c7525b4
--- /dev/null
+++ b/arff/triage0_50_ngrams_size1_stopwords.arff
@@ -0,0 +1,38 @@
+% Weka training file - mycoSORT triage - 2015
+
+@RELATION triage
+@ATTRIBUTE docID REAL %PMID of paper
+@ATTRIBUTE Ngram0trees REAL %trees
+@ATTRIBUTE Ngram1model REAL %model
+@ATTRIBUTE Ngram2triage REAL %triage
+@ATTRIBUTE Ngram3genes REAL %genes
+@ATTRIBUTE Ngram4sampling REAL %sampling
+@ATTRIBUTE Ngram5classification REAL %classification
+@ATTRIBUTE Ngram6processing REAL %processing
+@ATTRIBUTE Ngram7fungal REAL %fungal
+@ATTRIBUTE Ngram8enzymes REAL %enzymes
+@ATTRIBUTE Ngram9manual REAL %manual
+@ATTRIBUTE Ngram10literature REAL %literature
+@ATTRIBUTE Ngram11annotation REAL %annotation
+@ATTRIBUTE Ngram12mycoclapfungalgenomicsca REAL %mycoclapfungalgenomicsca
+@ATTRIBUTE Ngram13machine REAL %machine
+@ATTRIBUTE Ngram14first REAL %first
+@ATTRIBUTE Ngram15features REAL %features
+@ATTRIBUTE Ngram16mycoclap REAL %mycoclap
+@ATTRIBUTE Ngram17results REAL %results
+@ATTRIBUTE Ngram18abstracttext REAL %abstracttext
+@ATTRIBUTE Ngram19task REAL %task
+@ATTRIBUTE Ngram20http REAL %http
+@ATTRIBUTE Ngram21support REAL %support
+@ATTRIBUTE Ngram22learning REAL %learning
+@ATTRIBUTE Ngram23database REAL %database
+@ATTRIBUTE Ngram24curation REAL %curation
+@ATTRIBUTE Ngram25logistic REAL %logistic
+@ATTRIBUTE Ngram26applications REAL %applications
+@ATTRIBUTE Ngram27articletitle REAL %articletitle
+@ATTRIBUTE class {positive, negative}
+@DATA
+
+25754864,0,0,0,2,0,0,2,5,3,1,1,2,2,0,1,0,6,1,0,0,2,2,0,4,2,0,2,0,negative
+25551575,2,4,3,0,2,2,0,0,0,1,2,0,0,4,1,2,0,1,0,2,0,2,3,0,1,2,0,0,negative
+
diff --git a/arff/triage1_50_ngrams_size1_stopwords.arff b/arff/triage1_50_ngrams_size1_stopwords.arff
new file mode 100644
index 0000000..ddfc07d
--- /dev/null
+++ b/arff/triage1_50_ngrams_size1_stopwords.arff
@@ -0,0 +1,38 @@
+% Weka test file - mycoSORT triage - 2015
+
+@RELATION triage
+@ATTRIBUTE docID REAL %PMID of paper
+@ATTRIBUTE Ngram0trees REAL %trees
+@ATTRIBUTE Ngram1model REAL %model
+@ATTRIBUTE Ngram2triage REAL %triage
+@ATTRIBUTE Ngram3genes REAL %genes
+@ATTRIBUTE Ngram4sampling REAL %sampling
+@ATTRIBUTE Ngram5classification REAL %classification
+@ATTRIBUTE Ngram6processing REAL %processing
+@ATTRIBUTE Ngram7fungal REAL %fungal
+@ATTRIBUTE Ngram8enzymes REAL %enzymes
+@ATTRIBUTE Ngram9manual REAL %manual
+@ATTRIBUTE Ngram10literature REAL %literature
+@ATTRIBUTE Ngram11annotation REAL %annotation
+@ATTRIBUTE Ngram12mycoclapfungalgenomicsca REAL %mycoclapfungalgenomicsca
+@ATTRIBUTE Ngram13machine REAL %machine
+@ATTRIBUTE Ngram14first REAL %first
+@ATTRIBUTE Ngram15features REAL %features
+@ATTRIBUTE Ngram16mycoclap REAL %mycoclap
+@ATTRIBUTE Ngram17results REAL %results
+@ATTRIBUTE Ngram18abstracttext REAL %abstracttext
+@ATTRIBUTE Ngram19task REAL %task
+@ATTRIBUTE Ngram20http REAL %http
+@ATTRIBUTE Ngram21support REAL %support
+@ATTRIBUTE Ngram22learning REAL %learning
+@ATTRIBUTE Ngram23database REAL %database
+@ATTRIBUTE Ngram24curation REAL %curation
+@ATTRIBUTE Ngram25logistic REAL %logistic
+@ATTRIBUTE Ngram26applications REAL %applications
+@ATTRIBUTE Ngram27articletitle REAL %articletitle
+@ATTRIBUTE class {positive, negative}
+@DATA
+
+25754864,0,0,0,2,0,0,2,5,3,1,1,2,2,0,1,0,6,1,0,0,2,2,0,4,2,0,2,0,negative
+25551575,2,4,3,0,2,2,0,0,0,1,2,0,0,4,1,2,0,1,0,2,0,2,3,0,1,2,0,0,negative
+
diff --git a/build.xml b/build.xml
new file mode 100644
index 0000000..efbea36
--- /dev/null
+++ b/build.xml
@@ -0,0 +1,153 @@
+<!-- 153 lines of Ant build XML: the markup was stripped when this diff was captured, leaving only blank lines; no content is recoverable -->
\ No newline at end of file
diff --git a/corpus/mycoSORTSampleTriagecorpus_test.xml b/corpus/mycoSORTSampleTriagecorpus_test.xml
new file mode 100644
index 0000000..bdd6cf2
--- /dev/null
+++ b/corpus/mycoSORTSampleTriagecorpus_test.xml
@@ -0,0 +1,468 @@
+
+
+
+ 25754864
+
+ 2015
+ 03
+ 10
+
+
+ 2015
+ 03
+ 18
+
+ + 1758-0463 + + 2015 + + 2015 + + + Database : the journal of biological databases and curation + Database (Oxford) + + mycoCLAP, the database for characterized lignocellulose-active proteins of fungal origin: resource and text mining curation support. + + + + 10.1093/database/bav008 + bav008 + + Enzymes active on components of lignocellulosic biomass are used for industrial applications ranging from food processing to biofuels production. These include a diverse array of glycoside hydrolases, carbohydrate esterases, polysaccharide lyases and oxidoreductases. Fungi are prolific producers of these enzymes, spurring fungal genome sequencing efforts to identify and catalogue the genes that encode them. To facilitate the functional annotation of these genes, biochemical data on over 800 fungal lignocellulose-degrading enzymes have been collected from the literature and organized into the searchable database, mycoCLAP (http://mycoclap.fungalgenomics.ca). First implemented in 2011, and updated as described here, mycoCLAP is capable of ranking search results according to closest biochemically characterized homologues: this improves the quality of the annotation, and significantly decreases the time required to annotate novel sequences. The database is freely available to the scientific community, as are the open source applications based on natural language processing developed to support the manual curation of mycoCLAP. Database URL: http://mycoclap.fungalgenomics.ca. + © The Author(s) 2015. Published by Oxford University Press. + + + + Strasser + Kimchi + K + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + McDonnell + Erin + E + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Nyaga + Carol + C + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Wu + Min + M + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Wu + Sherry + S + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Almeida + Hayda + H + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Meurs + Marie-Jean + MJ + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Kosseim + Leila + L + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. 
+ + + + Powlowski + Justin + J + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Butler + Greg + G + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA gregb@encs.concordia.ca. + + + + Tsang + Adrian + A + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + eng + + Journal Article + Research Support, Non-U.S. Gov't + + + 2015 + 03 + 08 + +
+ + England + Database (Oxford) + 101517697 + 1758-0463 + + IM + + + Nat Biotechnol. 2004 Jun;22(6):695-700 + 15122302 + + + Appl Environ Microbiol. 2013 Aug;79(15):4620-34 + 23709508 + + + J Mol Biol. 1990 Oct 5;215(3):403-10 + 2231712 + + + Nature. 2008 Sep 4;455(7209):47-50 + 18769432 + + + Nucleic Acids Res. 2009 Jan;37(Database issue):D233-8 + 18838391 + + + Nucleic Acids Res. 2009 Jan;37(Database issue):D588-92 + 18984617 + + + Database (Oxford). 2011;2011:bar020 + 21622642 + + + Genome Res. 2011 Jun;21(6):885-97 + 21543515 + + + Nat Methods. 2011;8(10):785-6 + 21959131 + + + Nat Biotechnol. 2011 Oct;29(10):922-7 + 21964414 + + + Enzyme Microb Technol. 2011 Apr 7;48(4-5):397-403 + 22112956 + + + BMC Med Inform Decis Mak. 2012;12 Suppl 1:S5 + 22595090 + + + Science. 2012 Jun 29;336(6089):1715-9 + 22745431 + + + Nucleic Acids Res. 2013 Jan;41(Database issue):D43-7 + 23161681 + + + Nucleic Acids Res. 2013 Jan;41(Database issue):D36-42 + 23193287 + + + Nat Genet. 2000 May;25(1):25-9 + 10802651 + + + PLoS One. 2014;9(12):e115892 + 25551575 + + + PMC4352688 +
+ + + + 2015 + + + + + 2015 + 3 + 11 + 6 + 0 + + + 2015 + 3 + 11 + 6 + 0 + + + 2015 + 3 + 11 + 6 + 0 + + + epublish + + bav008 + 10.1093/database/bav008 + 25754864 + PMC4352688 + + +
+ + + + 25551575 + + 2015 + 01 + 01 + + + 2015 + 01 + 13 + +
+ + 1932-6203 + + 9 + 12 + + 2014 + + + PloS one + PLoS ONE + + Machine learning for biomedical literature triage. + + e115892 + + 10.1371/journal.pone.0115892 + + This paper presents a machine learning system for supporting the first task of the biological literature manual curation process, called triage. We compare the performance of various classification models, by experimenting with dataset sampling factors and a set of features, as well as three different machine learning algorithms (Naive Bayes, Support Vector Machine and Logistic Model Trees). The results show that the most fitting model to handle the imbalanced datasets of the triage classification task is obtained by using domain relevant features, an under-sampling technique, and the Logistic Model Trees algorithm. + + + + Almeida + Hayda + H + + Department of Computer Science and Software Engineering, Concordia University, Montreal, QC, Canada. + + + + Meurs + Marie-Jean + MJ + + Centre for Structural and Functional Genomics, Concordia University, Montreal, QC, Canada. + + + + Kosseim + Leila + L + + Department of Computer Science and Software Engineering, Concordia University, Montreal, QC, Canada. + + + + Butler + Greg + G + + Department of Computer Science and Software Engineering, Concordia University, Montreal, QC, Canada; Centre for Structural and Functional Genomics, Concordia University, Montreal, QC, Canada. + + + + Tsang + Adrian + A + + Centre for Structural and Functional Genomics, Concordia University, Montreal, QC, Canada. + + + + eng + + Journal Article + Research Support, Non-U.S. Gov't + + + 2014 + 12 + 31 + +
+ + United States + PLoS One + 101285081 + 1932-6203 + + IM + + + Proc AMIA Symp. 2001;:17-21 + 11825149 + + + Artif Intell Med. 2005 Sep-Oct;35(1-2):121-34 + 16024240 + + + Mol Cell. 2006 Mar 3;21(5):589-94 + 16507357 + + + Bioinformatics. 2006 Mar 15;22(6):658-64 + 16287934 + + + Artif Intell Med. 2006 May;37(1):7-18 + 16233974 + + + Nature. 2008 Sep 4;455(7209):47-50 + 18769432 + + + IEEE Trans Syst Man Cybern B Cybern. 2009 Feb;39(1):281-8 + 19068445 + + + Database (Oxford). 2011;2011:bar020 + 21622642 + + + J Integr Bioinform. 2011;8(3):176 + 21926439 + + + Database (Oxford). 2012;2012:bas020 + 22513129 + + + BMC Med Inform Decis Mak. 2012;12 Suppl 1:S5 + 22595090 + + + PLoS One. 2013;8(6):e65848 + 23785456 + + + PLoS One. 2013;8(12):e80503 + 24312478 + + + PLoS One. 2014;9(4):e91315 + 24705246 + + + PLoS One. 2014;9(7):e102039 + 25036529 + + + PMC4281078 +
+ + + + 2014 + + + + + 2014 + 9 + 4 + + + 2014 + 11 + 27 + + + 2014 + 12 + 31 + + + 2015 + 1 + 1 + 6 + 0 + + + 2015 + 1 + 1 + 6 + 0 + + + 2015 + 1 + 1 + 6 + 0 + + + epublish + + 10.1371/journal.pone.0115892 + PONE-D-14-39858 + 25551575 + PMC4281078 + + +
+ +
diff --git a/corpus/mycoSORTSampleTriagecorpus_train_50.xml b/corpus/mycoSORTSampleTriagecorpus_train_50.xml new file mode 100644 index 0000000..bdd6cf2 --- /dev/null +++ b/corpus/mycoSORTSampleTriagecorpus_train_50.xml @@ -0,0 +1,468 @@ + + + + 25754864 + + 2015 + 03 + 10 + + + 2015 + 03 + 18 + +
+ + 1758-0463 + + 2015 + + 2015 + + + Database : the journal of biological databases and curation + Database (Oxford) + + mycoCLAP, the database for characterized lignocellulose-active proteins of fungal origin: resource and text mining curation support. + + + + 10.1093/database/bav008 + bav008 + + Enzymes active on components of lignocellulosic biomass are used for industrial applications ranging from food processing to biofuels production. These include a diverse array of glycoside hydrolases, carbohydrate esterases, polysaccharide lyases and oxidoreductases. Fungi are prolific producers of these enzymes, spurring fungal genome sequencing efforts to identify and catalogue the genes that encode them. To facilitate the functional annotation of these genes, biochemical data on over 800 fungal lignocellulose-degrading enzymes have been collected from the literature and organized into the searchable database, mycoCLAP (http://mycoclap.fungalgenomics.ca). First implemented in 2011, and updated as described here, mycoCLAP is capable of ranking search results according to closest biochemically characterized homologues: this improves the quality of the annotation, and significantly decreases the time required to annotate novel sequences. The database is freely available to the scientific community, as are the open source applications based on natural language processing developed to support the manual curation of mycoCLAP. Database URL: http://mycoclap.fungalgenomics.ca. + © The Author(s) 2015. Published by Oxford University Press. + + + + Strasser + Kimchi + K + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + McDonnell + Erin + E + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Nyaga + Carol + C + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Wu + Min + M + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Wu + Sherry + S + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Almeida + Hayda + H + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Meurs + Marie-Jean + MJ + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Kosseim + Leila + L + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. 
+ + + + Powlowski + Justin + J + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Butler + Greg + G + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA gregb@encs.concordia.ca. + + + + Tsang + Adrian + A + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + eng + + Journal Article + Research Support, Non-U.S. Gov't + + + 2015 + 03 + 08 + +
+ + England + Database (Oxford) + 101517697 + 1758-0463 + + IM + + + Nat Biotechnol. 2004 Jun;22(6):695-700 + 15122302 + + + Appl Environ Microbiol. 2013 Aug;79(15):4620-34 + 23709508 + + + J Mol Biol. 1990 Oct 5;215(3):403-10 + 2231712 + + + Nature. 2008 Sep 4;455(7209):47-50 + 18769432 + + + Nucleic Acids Res. 2009 Jan;37(Database issue):D233-8 + 18838391 + + + Nucleic Acids Res. 2009 Jan;37(Database issue):D588-92 + 18984617 + + + Database (Oxford). 2011;2011:bar020 + 21622642 + + + Genome Res. 2011 Jun;21(6):885-97 + 21543515 + + + Nat Methods. 2011;8(10):785-6 + 21959131 + + + Nat Biotechnol. 2011 Oct;29(10):922-7 + 21964414 + + + Enzyme Microb Technol. 2011 Apr 7;48(4-5):397-403 + 22112956 + + + BMC Med Inform Decis Mak. 2012;12 Suppl 1:S5 + 22595090 + + + Science. 2012 Jun 29;336(6089):1715-9 + 22745431 + + + Nucleic Acids Res. 2013 Jan;41(Database issue):D43-7 + 23161681 + + + Nucleic Acids Res. 2013 Jan;41(Database issue):D36-42 + 23193287 + + + Nat Genet. 2000 May;25(1):25-9 + 10802651 + + + PLoS One. 2014;9(12):e115892 + 25551575 + + + PMC4352688 +
+ + + + 2015 + + + + + 2015 + 3 + 11 + 6 + 0 + + + 2015 + 3 + 11 + 6 + 0 + + + 2015 + 3 + 11 + 6 + 0 + + + epublish + + bav008 + 10.1093/database/bav008 + 25754864 + PMC4352688 + + +
+ + + + 25551575 + + 2015 + 01 + 01 + + + 2015 + 01 + 13 + +
+ + 1932-6203 + + 9 + 12 + + 2014 + + + PloS one + PLoS ONE + + Machine learning for biomedical literature triage. + + e115892 + + 10.1371/journal.pone.0115892 + + This paper presents a machine learning system for supporting the first task of the biological literature manual curation process, called triage. We compare the performance of various classification models, by experimenting with dataset sampling factors and a set of features, as well as three different machine learning algorithms (Naive Bayes, Support Vector Machine and Logistic Model Trees). The results show that the most fitting model to handle the imbalanced datasets of the triage classification task is obtained by using domain relevant features, an under-sampling technique, and the Logistic Model Trees algorithm. + + + + Almeida + Hayda + H + + Department of Computer Science and Software Engineering, Concordia University, Montreal, QC, Canada. + + + + Meurs + Marie-Jean + MJ + + Centre for Structural and Functional Genomics, Concordia University, Montreal, QC, Canada. + + + + Kosseim + Leila + L + + Department of Computer Science and Software Engineering, Concordia University, Montreal, QC, Canada. + + + + Butler + Greg + G + + Department of Computer Science and Software Engineering, Concordia University, Montreal, QC, Canada; Centre for Structural and Functional Genomics, Concordia University, Montreal, QC, Canada. + + + + Tsang + Adrian + A + + Centre for Structural and Functional Genomics, Concordia University, Montreal, QC, Canada. + + + + eng + + Journal Article + Research Support, Non-U.S. Gov't + + + 2014 + 12 + 31 + +
+ + United States + PLoS One + 101285081 + 1932-6203 + + IM + + + Proc AMIA Symp. 2001;:17-21 + 11825149 + + + Artif Intell Med. 2005 Sep-Oct;35(1-2):121-34 + 16024240 + + + Mol Cell. 2006 Mar 3;21(5):589-94 + 16507357 + + + Bioinformatics. 2006 Mar 15;22(6):658-64 + 16287934 + + + Artif Intell Med. 2006 May;37(1):7-18 + 16233974 + + + Nature. 2008 Sep 4;455(7209):47-50 + 18769432 + + + IEEE Trans Syst Man Cybern B Cybern. 2009 Feb;39(1):281-8 + 19068445 + + + Database (Oxford). 2011;2011:bar020 + 21622642 + + + J Integr Bioinform. 2011;8(3):176 + 21926439 + + + Database (Oxford). 2012;2012:bas020 + 22513129 + + + BMC Med Inform Decis Mak. 2012;12 Suppl 1:S5 + 22595090 + + + PLoS One. 2013;8(6):e65848 + 23785456 + + + PLoS One. 2013;8(12):e80503 + 24312478 + + + PLoS One. 2014;9(4):e91315 + 24705246 + + + PLoS One. 2014;9(7):e102039 + 25036529 + + + PMC4281078 +
+ + + + 2014 + + + + + 2014 + 9 + 4 + + + 2014 + 11 + 27 + + + 2014 + 12 + 31 + + + 2015 + 1 + 1 + 6 + 0 + + + 2015 + 1 + 1 + 6 + 0 + + + 2015 + 1 + 1 + 6 + 0 + + + epublish + + 10.1371/journal.pone.0115892 + PONE-D-14-39858 + 25551575 + PMC4281078 + + +
+ +
diff --git a/features/docIDs.txt b/features/docIDs.txt new file mode 100644 index 0000000..8da8a8a --- /dev/null +++ b/features/docIDs.txt @@ -0,0 +1,1198 @@ +14565843 positive +23073100 negative +11501467 negative +20208428 positive +9074500 negative +986853 negative +8787388 positive +20826217 positive +11471729 positive +16059706 negative +11298744 positive +21168763 negative +10424099 negative +18415096 positive +15866877 positive +2396985 negative +7838157 negative +44415 negative +11170563 negative +16128806 positive +12435269 positive +19756584 positive +21382036 negative +15830675 negative +7579664 negative +9114071 negative +21635140 negative +20070371 positive +19505579 positive +14987996 positive +12882162 positive +6767680 negative +963703 negative +15278289 positive +17119968 positive +9486422 negative +8698653 positive +16278932 positive +16488199 positive +15466516 positive +21829351 negative +17302745 negative +1479358 positive +15450181 positive +9199426 negative +14976875 negative +2703464 negative +15290142 negative +21972816 negative +16431275 positive +15541296 positive +19060407 negative +18499583 negative +22260051 negative +17027758 positive +12209794 negative +1400249 positive +21466636 negative +11916668 positive +1452095 negative +9508797 negative +19277742 positive +22226198 positive +18175902 negative +20803138 negative +17341093 negative +8948426 negative +9011379 negative +14595695 negative +22906713 negative +8524797 positive +2226842 positive +19590866 positive +9485595 positive +2332056 negative +10957961 negative +9013549 positive +10923795 positive +24212486 negative +10713452 positive +13306714 negative +22031024 positive +8978090 negative +22067437 negative +19205049 positive +16926418 negative +19393602 positive +8597544 negative +3595987 negative +19690850 positive +21306947 positive +23790084 negative +8597548 positive +15686849 positive +9931476 positive +1398098 positive +18936994 negative +18045411 positive +17229143 positive +22350290 negative +8135518 positive +1588915 negative +13031603 negative +16789551 negative +23218368 positive +14766566 negative +11930943 negative +8647098 positive +18806001 negative +21360092 negative +10802187 positive +12788920 positive +17910720 negative +1368680 positive +16404950 positive +21897016 positive +9468803 negative +23071108 positive +8647081 positive +10434062 negative +8250548 positive +23226882 negative +12162562 positive +19809198 negative +9140529 negative +21575132 negative +15054207 negative +11064202 negative +18800599 positive +23589840 negative +1392588 negative +7603444 positive +18388475 negative +17381511 positive +8804409 negative +22132148 positive +8765754 negative +9140977 positive +23625219 negative +17351093 positive +22075023 negative +16462863 positive +19556747 positive +15944854 negative +15054209 positive +8856078 positive +17662982 negative +16473771 positive +8243636 negative +8810077 positive +22795531 negative +23497862 negative +3114236 positive +15449305 negative +12591897 positive +22733825 negative +19996679 positive +7763357 negative +9803534 negative +13637982 negative +19479322 negative +18555305 negative +18500632 negative +16232740 positive +1368254 positive +10583968 negative +16133102 positive +12882555 negative +16041128 positive +16761182 negative +20109094 positive +10952011 positive +11358516 positive +20518356 negative +6169675 negative +19645671 negative +9802217 positive +18587856 negative +21237221 positive +12485115 positive +23625216 negative +9872754 
positive +8503847 positive +9463945 positive +17928699 positive +1768103 negative +7614556 negative +14586108 positive +12363086 negative +12543554 positive +3117961 negative +21402188 negative +17043824 positive +7811079 positive +16343463 positive +8486628 positive +10908793 negative +8964516 negative +9648215 positive +11849507 positive +3080320 negative +12513977 negative +23508400 negative +16662849 negative +12406766 positive +23725035 negative +2337347 positive +2560409 negative +22961332 negative +2510150 positive +14716497 negative +23412069 positive +1735428 negative +21168322 negative +8987884 positive +9830097 negative +15809023 negative +15892742 negative +12908861 negative +22496740 negative +19205687 positive +9492270 positive +1429462 positive +7191044 negative +12889 negative +17111131 positive +9430631 negative +9438354 negative +17021873 positive +20562284 positive +10101286 positive +20552260 positive +24113511 negative +23170978 positive +21510637 negative +8509335 positive +9611196 negative +18388455 negative +15756814 positive +17168300 negative +23666150 negative +8987855 positive +23236275 positive +22776993 positive +22835655 positive +1644702 negative +1567377 negative +21369980 negative +17665191 positive +10606774 positive +18490176 positive +10508057 positive +11988501 negative +24077704 negative +8987622 positive +15500988 positive +12909730 negative +9813313 negative +10672446 negative +18460787 negative +16822232 positive +5861997 negative +2629784 negative +11136466 negative +8640604 positive +12147340 positive +12555575 negative +24274505 negative +10939261 negative +22425351 negative +1490609 negative +12172603 negative +15998406 negative +12073090 positive +16349883 negative +11178973 negative +1368193 positive +12455695 positive +9330667 negative +2331322 positive +17376674 positive +22624316 negative +1878999 positive +8960907 negative +19507068 positive +7859305 positive +8299175 positive +12892894 positive +8575021 positive +2146364 negative +8575023 positive +2126511 negative +16524914 negative +9732526 positive +11523809 positive +19761044 negative +8724139 negative +8589415 positive +15291818 positive +16361785 negative +23124346 positive +19500674 positive +988467 negative +22432613 negative +18068392 negative +12945177 negative +20652740 positive +8276068 negative +10091328 positive +7640003 positive +21971070 negative +18414798 negative +12743761 negative +7984103 positive +10215597 positive +16637705 negative +8589407 positive +8945534 positive +8589408 negative +12325291 negative +19527927 positive +17838811 negative +16233798 positive +18668421 negative +20129093 positive +15294290 positive +23240568 positive +20823521 negative +16244441 positive +7896713 positive +15870328 positive +22579450 negative +16614858 positive +16039872 positive +19288093 positive +18694928 negative +19189377 negative +15651 negative +19575195 positive +7488173 positive +12409103 positive +6358191 negative +16284933 negative +7574556 negative +9830143 negative +18845181 negative +15362290 negative +16233531 positive +17955189 positive +19507018 positive +16233536 positive +23298573 negative +18704748 positive +24186432 negative +12715256 negative +18550352 positive +16380244 negative +15090228 negative +18944813 positive +8598062 positive +14323029 negative +16232670 positive +19054103 positive +16232432 positive +6406022 negative +826291 negative +11829749 positive +19513709 positive +7906649 positive +9058977 negative +18023045 positive +21181156 positive 
+21243443 positive +8381338 negative +15136043 negative negative +18378599 negative +14685768 positive +22407682 negative +2707445 positive +18083533 positive +22132219 negative +12226497 negative +14763977 negative +14532063 positive +18563407 positive +19545999 negative +19967375 negative +16233515 positive +1787790 positive +22705517 negative +8975597 positive +23470758 negative +23728162 negative +9309656 negative +10779688 positive +1447290 negative +11257513 positive +11115392 positive +10049844 negative +19088319 positive +15950056 negative +12754825 negative +20569406 negative +21307589 positive +23844185 negative +6541478 negative +18408068 positive +20077114 positive +9169610 positive +16677342 negative +23508952 positive +12602898 negative +17433483 positive +10675564 positive +18443829 negative +17625262 positive +17599813 positive +32175 positive +7574590 positive +20212162 negative +8781176 positive +18923909 negative +10491168 positive +18264680 negative +23500559 negative +6863431 negative +9805384 positive +14524699 positive +8400376 positive +2135869 positive +14523125 positive +16129506 positive +8400378 negative +18943122 negative +16275128 positive +10493932 positive +23199732 positive +17651154 negative +12665550 positive +12224649 positive +7439182 negative +8959766 negative +9608522 negative +23199738 positive +19734721 positive +20143777 positive +23306879 positive +11376609 negative negative +9334183 negative +10049864 negative +16697997 positive +23489323 negative +21040747 positive +21442271 positive +24372593 negative +15746364 negative +9212440 negative +1952931 negative +11179652 negative +8955395 negative +12843664 positive +2158993 positive +23299456 positive +18512263 positive +14665735 positive +20043150 positive +21626020 positive +23836384 negative +7487028 negative +7961884 positive +22653604 positive +12619666 positive +3111887 negative +19107534 positive +32833 positive +15580593 positive +16901567 positive +7824933 positive +10499260 negative +9506837 positive +19527524 positive +11061997 negative +22373601 negative +9758774 positive +22074954 negative +19473250 positive +9758775 positive +17977149 negative +12845603 negative +2506439 negative +10553664 positive +22150279 negative +18456943 positive +15270720 negative +18675351 positive +10385327 negative +7629010 negative +1592808 positive +17505783 negative +18722542 negative +12843680 positive +8593683 positive +16980715 positive +16734792 positive +7487009 negative +23285046 positive +22524557 negative +16520923 positive +16107755 positive +19912637 positive +3936420 positive +7626800 negative +9464371 positive +19269961 positive +2152162 positive +22444635 positive +16666407 negative +3268297 negative +8900004 positive +10659715 negative +2760033 positive +1612414 positive +1368837 positive +23931690 negative +8756392 positive +20424835 negative +18548669 positive +10514255 negative +19922433 negative +16233124 positive +15174310 positive +17043085 negative +16140328 positive +8669913 negative +20429042 negative +3125847 negative +20592022 positive +24212538 negative +1425667 positive +7262712 negative +10525153 positive +21710260 positive +7926830 positive +14674022 negative +14735222 positive +18975142 negative +18935968 positive +20014432 negative +23184220 negative +18722595 positive +4779294 negative +15838031 positive +15025429 negative +9464399 negative +23129650 positive +7493964 positive +27428 negative +3561490 positive +8436950 positive +9761741 positive +21945415 negative +1367522 
positive +20382376 negative +4269377 negative +9118231 negative +7012186 negative +19756576 negative +22360347 negative +15288024 negative +10586505 positive +9805373 positive +22442229 positive +15782637 negative +8905923 positive +15246667 negative +8688436 positive +14988022 positive +18850325 negative +7549103 positive +8901566 negative +8595661 positive +9547139 negative +15668816 negative +1368843 positive +12702357 positive +16874542 positive +23326459 positive +8935788 negative +16374635 positive +1368603 positive +9153431 positive +9987124 positive +16478498 negative +10029988 positive +8595669 positive +22080345 positive +22754023 positive +22080343 negative +8065265 positive +8961569 negative +9165762 positive +20573014 positive +22940311 positive +23303647 negative +12726996 positive +24479319 negative +9450333 positive +20727822 positive +17922847 positive +16664778 negative +16134120 negative +22072708 negative +9371889 positive +8948110 negative +20619350 positive +16474906 positive +11754346 negative +2579525 negative +20734107 negative +8464071 negative +22805919 negative +22709462 negative +9841776 negative +23100915 negative +7788716 positive +7788717 positive +8654984 positive +4040855 negative +45611 negative +12233746 negative +17614952 negative +22685137 positive +18233 negative +21948841 positive +19202090 positive +16137662 negative +21726361 negative +1368777 positive +3527986 negative +9654123 positive +18307762 positive +12597025 negative +2508563 negative +8000538 positive +560223 negative +2063624 negative +1632643 negative +21193820 positive +15194814 positive +11856 negative +9301101 negative +24020787 negative +8190078 positive +19835139 negative +22712405 negative +18377882 positive +9000377 positive +21622 negative +21364303 negative +16186619 negative +7987261 positive +8997712 negative +22940347 negative +16523351 negative +24085297 negative +1654681 negative negative +8358833 positive +8358835 positive +19725536 positive +23897210 negative +15757176 positive +2187435 positive +8616259 negative +9370370 positive +8358830 positive +17503147 positive +2509432 positive +4281647 negative +1781689 negative +22349190 positive +10508113 positive +24035805 negative +21748379 positive +1815765 negative +20541633 negative +11494757 negative +12668107 negative +10376824 positive +21490699 positive +20851958 positive +15519295 positive +24128930 negative +10773459 negative +23268348 positive +11768539 positive +16233094 positive +8709949 negative +18595320 negative +10725538 positive +11193399 positive +17646981 positive +11217409 negative +15280013 positive +15006424 positive +22309761 positive +24316358 negative +16896601 positive +24528642 negative +17115208 positive +22584433 positive +22437835 positive +23094334 negative +10517025 positive +2076554 positive +8768520 positive +23619241 negative +20623432 negative +10347026 positive +11722900 positive +15921894 negative +23190610 positive +17345128 positive +15130150 negative +8514419 negative +9990729 positive +8941946 negative +22796724 negative +19669931 positive +22160328 positive +1889394 negative +12427996 positive +9324248 positive +16233072 positive +23508399 negative +9797312 positive +9694679 negative +9128738 positive +3907189 negative +6184962 negative +3128741 negative +8431310 negative +8433972 positive +23537284 positive +22846889 positive +9929401 negative +19784554 positive +21848609 negative +18668373 negative +19239548 positive +23990297 negative +8837440 positive +8020743 negative +19736001 
positive +23356577 positive +16283301 negative +2688929 positive +23858710 negative +16844780 positive +234905 negative +16121227 negative +23959893 negative +16333341 negative +11357511 positive +23261999 positive +18040681 negative +8390128 negative +20579868 negative +1172175 negative +106849 negative +20102533 negative +1511691 positive +12723619 positive +16461639 positive +11722552 positive +17905460 negative +7670182 positive +20406672 positive +17623028 positive +1748872 negative +9682473 positive +15564668 positive +16272431 negative +23263965 positive +20735824 negative +18938241 negative +2070799 positive +7750151 positive +6791629 negative +10395989 negative +10052139 negative +11925050 negative +11376040 negative +10052135 positive +15135402 positive +10933800 negative negative +12670686 negative +9019140 negative +7763458 negative +7764306 positive +14642815 positive +21532326 negative +14648113 positive +7586029 positive +23615741 positive +10974100 negative +19174189 positive +8670100 positive +8947054 negative +10588045 positive +12356463 positive +20512738 negative +1406248 positive +23844364 negative +12450128 positive +8057846 positive +19934038 negative +2834092 negative +5661593 negative +1814275 positive +20039188 negative +18553693 negative +18524918 negative +19777823 positive +16232837 positive +8297343 positive +20680265 positive +18483792 negative +22860913 negative +9535817 positive +8390581 negative +1369024 positive +7646037 positive +10092840 positive +18490069 negative +18020405 positive +22033931 positive +7708682 positive +8590631 positive +15262228 positive +6787335 negative +11157256 positive +9797333 negative +16556727 negative +12649442 positive +12501406 negative +22689149 positive +24237246 negative +6984129 negative +3265327 negative +18347828 negative +1369269 negative +19060392 positive +8452520 positive +12657297 negative +12297320 negative +11396904 negative +24416614 negative +9167273 negative +15649508 negative +20169200 negative +19039584 positive +18720841 positive +20464942 negative +12623067 positive +15716038 positive +23330392 positive +22859955 positive +23241981 negative +9023952 negative +16349528 positive +8590658 negative +6765603 positive +6440004 positive +11911612 negative +17067546 negative +18998121 positive +9514754 positive +1761224 positive +17928959 positive +1369161 positive +11692674 negative +7764056 positive +7766233 negative +15502357 positive +12137954 positive +12922166 positive +15342117 negative +20645085 positive +16887562 negative +12018245 negative +16650812 positive +6420649 negative +9756616 positive +7574642 positive +3012284 negative +10422230 negative +21124049 negative +22906186 positive +15691940 positive +9742698 positive +9169553 negative +7747967 negative +21708265 positive +12619703 negative +22225502 positive +18716810 negative +6049382 negative +10642523 positive +8477731 positive +15322773 positive +18725302 positive +10931904 positive +16657425 negative +19110429 negative +18346891 positive +19809200 positive +14633044 negative +11997095 positive +18944759 negative +23399248 positive +12949620 negative +10906956 negative +11255010 positive +3932329 negative +9043114 negative +10993164 positive +22365717 negative +21801352 negative +19502758 positive +9466262 negative +10386374 positive +20510474 negative +11368016 positive +8988359 negative +23204424 positive +21564548 positive +6248742 negative +24315640 negative +22684857 positive +19156406 positive +11272822 negative negative +21161225 
positive +19661691 positive +17651209 positive +8474449 negative +18620557 negative +8437 positive +23824666 negative +23525113 positive +4255900 negative +11254576 positive +9791893 positive +11281712 negative +22120123 negative +1970434 positive +13278322 negative +23252695 positive +23583262 negative +12567244 negative +23333949 negative +23180124 positive +7864815 positive +8824176 negative +17627774 negative +9008887 positive +2308855 positive +22448043 negative +7789795 positive +12767807 positive +10377251 positive +23171402 negative +16833 negative +405328 negative +1624111 negative +15607743 positive +9758835 positive +23833180 negative +11166820 negative +18551552 negative +4661766 negative +6409895 positive +17363438 positive +15917612 negative +20936239 negative +7670194 positive +1339327 positive +16653055 negative +8440481 positive +22073551 negative +8806739 negative +15998305 negative +10665422 negative +17988729 negative +17419071 positive +20879842 negative +23332834 positive +20879840 negative +1934116 positive +22112956 positive +19453169 positive +20591661 positive +1896470 negative +19000618 positive +19580870 negative +9406381 positive +17002602 positive +6771030 negative +15555940 positive +15555941 negative +1320186 positive +8836148 positive +23101390 positive +21531609 negative +3384334 positive +23624166 negative +24313660 negative +17551789 positive +21632240 negative +15555935 positive +23318568 negative +16121561 negative +8929394 positive +22203550 negative +3440521 positive +12466887 positive +8815461 negative +8837470 positive +19924304 negative +12630320 negative +17194495 positive +1606968 positive +21498763 negative +9335167 positive +3153146 negative +15116339 negative +17277884 positive +23850557 negative +8017902 negative +15784980 positive +18661293 negative +8085821 positive +20222446 positive +22048567 negative +20336338 positive +956129 negative +1934135 positive +20652693 positive +14704857 negative +9884411 positive +24329860 negative +16988781 positive +20400566 negative +20400560 negative +16701547 negative +18072936 positive +12167544 positive +12094738 negative +19336219 negative +9145525 positive +11795847 positive +573117 negative +12237858 positive +16887700 negative +16366715 negative +17884661 negative +23107704 positive +3314909 negative +15469730 positive +16202538 positive +15280646 positive +23365723 negative +18479937 positive +9546185 positive +3596237 positive +9766241 negative +3290051 negative +11553760 negative +10584016 positive +10586675 negative +11319115 positive +23898996 negative +16694 negative +12506981 positive +15749766 negative +16362326 negative +2450787 negative +15821912 positive +15808943 positive +18633609 positive +8577701 negative +20414741 negative +507620 negative +15715951 negative +8939815 positive +23869387 negative +3246351 positive +15988573 positive +7121328 negative +10636904 positive +16572843 negative +15715941 positive +18839231 negative +17487548 positive +1366983 negative +16330537 positive +9756469 positive +8529895 positive +23768357 negative +10427736 negative +16944135 positive +16614901 positive +15850449 negative +7592488 positive +23463247 positive +16668702 negative +17851776 negative +19044008 negative +16328626 positive +16739943 negative +8455560 negative +19129654 positive +15629130 positive +12374797 positive +9325167 negative +12850270 positive +15666544 negative +22353731 positive +9002269 negative +20419375 positive +1366537 negative +17964183 positive +20235799 negative +560243 
negative +3124870 negative +15316684 negative +12145937 negative +4736235 negative +2113524 positive +20850111 negative +24232491 negative +7805053 positive +4796770 negative +23292745 negative +6540443 negative +18942586 negative +10572260 positive +9003585 negative +2925681 positive +17455791 positive +16407250 negative +12664153 positive +1899374 negative +16776296 positive +2785629 negative +15819855 negative +21350668 negative +23280774 negative +9177963 negative +12489121 negative +12383257 positive +16535476 negative +11402645 negative +10830498 positive +7961928 negative +22738957 negative +11601607 negative +17876815 positive +1930835 negative +8572698 positive +23454546 negative +20675115 negative +18060506 positive +3240864 positive +2762318 negative +20807550 negative +21920035 positive +9546178 positive +9546179 positive +23956415 negative +22036533 positive +22750808 positive +23077275 positive +1643283 negative +12387863 negative +11955286 negative +3912145 negative +14558143 negative +16233469 positive +8419289 positive +2227420 positive +12400688 positive +1979298 positive +19941023 positive +10222181 positive +18327544 negative +6068063 negative +22579385 negative +6068064 negative \ No newline at end of file diff --git a/src/analyse/.gitignore b/src/analyse/.gitignore new file mode 100644 index 0000000..6b468b6 --- /dev/null +++ b/src/analyse/.gitignore @@ -0,0 +1 @@ +*.class diff --git a/src/analyse/Extractor.java b/src/analyse/Extractor.java new file mode 100644 index 0000000..dfb2d48 --- /dev/null +++ b/src/analyse/Extractor.java @@ -0,0 +1,455 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */
+
+package analyse;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+
+import configure.ConfigConstants;
+
+/**
+ * Implements common tools for the FeatureExtractor
+ * and NgramExtractor classes that are used to
+ * extract features from doc instances
+ *
+ * @author halmeida
+ *
+ */
+public class Extractor {
+
+    String id;
+    String endId;
+    String openFile;
+    String endFile;
+    String openAbst;
+    String closeAbst;
+    String abstractLabel;
+    String openEC;
+    String closeEC;
+    String classTag;
+    String openTitle;
+    String closeTitle;
+    String openJournal;
+    String closeJournal;
+    String copyR;
+    String closeCopyR;
+
+    /**
+     * Replaces special characters to clean
+     * text for tokenizing.
+     *
+     * @param str text to be cleaned
+     * @return string with cleaned text
+     */
+    public String removeSpecialChar(String str){
+        //XML entities must be removed before the single characters
+        //below, otherwise stripping "&" and ">" first would break them
+        str = str.replace("&quot;", "");
+        str = str.replace("&apos;", "");
+        str = str.replace("&gt;", "");
+        str = str.replace("}", "");
+        str = str.replace("{", "");
+        str = str.replace("]", "");
+        str = str.replace("[", "");
+        str = str.replace("\"", "");
+        str = str.replace("<", "");
+        str = str.replace(">", "");
+        str = str.replace("/", " ");
+        str = str.replace("\\", " ");
+        str = str.replace("#", "");
+        str = str.replace("*", "");
+        str = str.replace("%", "");
+        str = str.replace("&", "");
+        str = str.replace("=", "");
+        str = str.replace("?", "");
+        str = str.replace("!", "");
+        str = str.replace(";", "");
+        str = str.replace(":", "");
+//        str = str.replace(",", "");
+//        str = str.replace(".", "");
+        str = str.replace(")", "");
+        str = str.replace("(", "");
+        str = str.replace("\t\t", "\t");
+        //losing ngrams because of hyphens between names
+        str = str.replace("-", " ");
+        str = str.replace("  ", " ");
+
+        return str;
+    }
+
+    /**
+     * Handles external tags (and multiple abstract
+     * text tags) present in a single paper
+     * @param str abstract content
+     * @return string without external tags
+     */
+    public String processAbstract(String str){
+        str = str.replace("  ", " ");
+
+        //splitting the abstract into single characters so that
+        //copyright statements can be skipped in place
+        String[] remove = str.split("");
+        StringBuilder sb = new StringBuilder();
+
+        for(int i = 0; i < remove.length; i++){
+            //Handling the word "Copyright" before the end of abstract:
+            //skip everything from the word up to the next period
+            if(str.startsWith("Copyright ", i)){
+                do{
+                    i++;
+                    //stopping at the end of the text means a copyright
+                    //statement did not end with a period
+                }while(i < remove.length - 1 && !(remove[i]).equalsIgnoreCase("."));
+            }
+            else sb.append(remove[i]);
+        }
+
+        String abstrac = sb.toString();
+        abstrac = removeAbstractTags(abstrac);
+
+        return abstrac;
+    }
+
+
+    /**
+     * Removes specific tags encountered on Abstract texts.
+     * This is used to clean the abstract text before
+     * processing the feature count on the model.
+     * @param str
+     * @return
+     */
+    public String removeAbstractTags(String str){
+        //this order of removing tags matters to
+        //exclude the first tag from the abstracts.
+ + str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("copyright", ""); + str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("", ""); + + return str; + } + + + /** + * Removes the markup annotations of a + * text field, and keeps its content + * + * @param str text containing markups + * @return string with cleaned text + */ + public String removeTags(String str) { + String[] remove = str.split(""); + StringBuilder sb = new StringBuilder(); + + for(int i = 0; i < remove.length; i++){ + + //iterating over the text until finding opening tag + if(remove[i].equalsIgnoreCase("<")){ + do{ + i++; + } + //skipping the content until finding closing tag + while(!(remove[i].equalsIgnoreCase(">"))); + } + else sb.append(remove[i]); + } + + return sb.toString(); + } + + + /** + * Displays the keys and values of the + * maps created. + * + * @param hash HashMap containing list, + * values, counts + */ + public void displayList(HashMap hash){ + Iterator itr = hash.keySet().iterator(); + int sum = 0; + while(itr.hasNext()){ + Object str = itr.next(); + System.out.println("key: "+str+"\t value: "+hash.get(str)); + } + } + + + /** + * Exports hashmap of values extracted + * from dataset to external file + * + * @param location folder, file name and file extension + * @param list values to be exported + */ + public void exportFile(String location, HashMap list){ + + String SEPARATOR = "\t"; + StringBuffer line = new StringBuffer(); + Iterator itr = list.keySet().iterator(); + + try{ + BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(location), "UTF-8")); + + while(itr.hasNext()){ + Object str = itr.next(); + if(str != null){ + line.append(str).append(SEPARATOR).append(list.get(str)); + if(line.toString().contains("=")) + line.replace(line.indexOf("="), line.indexOf("=")+1,SEPARATOR); + //handling specificities from title content extraction + if(line.toString().contains(",")) + line.replace(line.indexOf(","), line.indexOf(",")+1,SEPARATOR); + } + if(itr.hasNext()){ + line.append(System.getProperty("line.separator")); + + } + writer.write(removeSpecialChar(line.toString())); + line.replace(0, line.length(), ""); + } + writer.flush(); + writer.close(); + } + catch(UnsupportedEncodingException e){ + e.printStackTrace(); + } + catch(FileNotFoundException e){ + e.printStackTrace(); + } + catch(IOException e){ + e.printStackTrace(); + } + + + //} + } + + + /** + * Exports list of values extracted + * from dataset to a string variable + * + * @param list list of values to be exported + * @return string containing values on list + * @deprecated + */ + public String exportContent(HashMap list){ + String SEPARATOR = "\t"; + Iterator itr = list.keySet().iterator(); + StringBuffer export = new StringBuffer(); + //try{ + while(itr.hasNext()){ + String str = itr.next(); + if(str != null){ + export.append(str).append(SEPARATOR).append(list.get(str)); + + if(export.toString().contains("=")) + export.replace(export.indexOf("="), export.indexOf("=")+1,SEPARATOR); + } + + if(itr.hasNext()){ + export.append("\n"); + } + } + return removeSpecialChar(export.toString()); + } + + + /** + * Exports list of values extracted + * from dataset to external file + * + * @param location folder, file name and file extension + * @param list list of values to be exported + * + */ + public void exportList(String location, ArrayList list){ + + String SEPARATOR = "\n"; + StringBuffer line = new 
StringBuffer(); + + try{ + BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(location), "UTF-8")); + + for(int i = 0; i < list.size(); i++){ + String str = list.get(i); + if(str != null){ + line.append(str).append(SEPARATOR); + } + } + writer.write(removeSpecialChar(line.toString())); + + writer.flush(); + writer.close(); + } + catch(UnsupportedEncodingException e){ + e.printStackTrace(); + } + catch(FileNotFoundException e){ + e.printStackTrace(); + } + catch(IOException e){ + e.printStackTrace(); + } + + } + + + public void initialize(File featureDir, ConfigConstants pathVars){ + try{ + featureDir.mkdir(); + + }catch(Exception e){ + System.out.println("Error creating" + featureDir + "folder."); + System.exit(0); + } + } + + + /** + * Accessors and mutators methods + * for Extractor variables. + * @return + */ + + public String getid() { + return id; + } + public void setid(String id) { + this.id = id; + } + public String getendId() { + return endId; + } + public void setendId(String endId) { + this.endId = endId; + } + public String getOpenFile() { + return openFile; + } + public void setOpenFile(String openFile) { + this.openFile = openFile; + } + public String getendFile() { + return endFile; + } + public void setendFile(String endFile) { + this.endFile = endFile; + } + public String getopenAbst() { + return openAbst; + } + public void setopenAbst(String openAbst) { + this.openAbst = openAbst; + } + public String getcloseAbst() { + return closeAbst; + } + public void setcloseAbst(String closeAbst) { + this.closeAbst = closeAbst; + } + public String getOpenEC() { + return openEC; + } + public void setOpenEC(String openEC) { + this.openEC = openEC; + } + public String getCloseEC() { + return closeEC; + } + public void setCloseEC(String closeEC) { + this.closeEC = closeEC; + } + public String getAbstractLabel() { + return abstractLabel; + } + public void setAbstractLabel(String abstractLabel) { + this.abstractLabel = abstractLabel; + } + public String getClassTag() { + return classTag; + } + public void setClassTag(String classTag) { + this.classTag = classTag; + } + public String getOpenTitle() { + return openTitle; + } + public void setOpenTitle(String titleTag) { + this.openTitle = titleTag; + } + public String getCloseTitle() { + return closeTitle; + } + public void setCloseTitle(String closeTitle) { + this.closeTitle = closeTitle; + } + public String getOpenJournal() { + return openJournal; + } + public void setOpenJournal(String openJournal) { + this.openJournal = openJournal; + } + public String getCloseJournal() { + return closeJournal; + } + public void setCloseJournal(String closeJournal) { + this.closeJournal = closeJournal; + } + +} \ No newline at end of file diff --git a/src/analyse/FeatureExtractor.java b/src/analyse/FeatureExtractor.java new file mode 100644 index 0000000..1593c01 --- /dev/null +++ b/src/analyse/FeatureExtractor.java @@ -0,0 +1,544 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice 
and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package analyse; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import configure.ConfigConstants; +import filter.NaiveFilter; + + +/** + * This class extracts and parses domain + * annotation features from doc instances + * + * @author Hayda Almeida + * @since 2014 + * + */ + +public class FeatureExtractor extends Extractor{ + + public FeatureExtractor(){ + + this.id = "PMID"; + this.openAbst = "AbstractText"; + this.abstractLabel = "AbstractText "; + this.openEC = "RegistryNumber"; + this.classTag = "TRIAGE"; + this.openJournal = "Title"; + this.openTitle = "ArticleTitle"; + } + + + public static void main(String[] args) { + + ConfigConstants pathVars = new ConfigConstants(); + boolean verbose = false; + + String AnCorpus = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAINING_FILE; + FeatureExtractor fextrac = new FeatureExtractor(); + NaiveFilter featFilter = new NaiveFilter(); + + File featureDir = new File(pathVars.HOME_DIR + pathVars.FEATURE_DIR + "/"); + + fextrac.initialize(featureDir, pathVars); + featFilter.loadStopWords(pathVars.HOME_DIR + pathVars.STOP_LIST); + + //store all features, type and count + HashMap,Integer> abstract_count = new HashMap,Integer>(); + //store title features, type and count + HashMap, Integer> title_count = new HashMap, Integer>(); + //store title features, whole journal title content and classification + HashMap,String> title_content = new HashMap,String>(); + //store title content and EC numbers + ArrayList ec_numbers = new ArrayList(); + + //store ID, class and features + HashMap PMIDs = new HashMap(); + + + int jTitle = 0; + + try + { + //Loading file + File input = new File(AnCorpus); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); + + Elements corpus = doc.body().getElementsByTag("pubmedarticle"); + + //Fetching elements + + for(Element paper : corpus ){ + + //Fetching elements + Elements journalTitle = paper.getElementsByTag(fextrac.getOpenJournal()); + Elements title = paper.getElementsByTag(fextrac.getOpenTitle()); + Elements abstractC = paper.getElementsByTag(fextrac.getopenAbst()); + Elements ECnumber = paper.getElementsByTag(fextrac.getOpenEC()); + Elements classDoc = paper.getElementsByTag(fextrac.getClassTag()); + + String journal = ""; + String docID = ""; + String label = ""; + ArrayList tempList = new ArrayList(); + StringBuffer sb = new StringBuffer(); + + //fetching the paper ID - + //for all items in a paper, retrieve only PMIDs + for(Element e : paper.select(fextrac.getid())){ + //only consider the ID if the parent is medline citation + 
if(e.parentNode().nodeName().contains("medline")){ + docID = e.text(); + } + } + //fetch the doc label as well + if(classDoc.hasText()){ + label = classDoc.text(); + } + + PMIDs.put(docID, label); + + if(journalTitle.hasText()){ + + jTitle++; + journal = journalTitle.toString(); + journal = fextrac.removeSpecialChar(journal); + journal = fextrac.removeTags(journal); + } + + String title_annotation = ""; + if(title.hasText()){ + title_annotation = title.toString(); + // title_annotation = fextrac.removeSpecialChar(title_annotation); + + tempList.addAll(fextrac.annotations(title_annotation, title_count, featFilter, pathVars)); + fextrac.addContent(title_annotation, journal, title_content, featFilter); + } + + String abstrac = ""; + if(abstractC.hasText()){ + abstrac = abstractC.toString(); + //abstrac = fextrac.removeSpecialChar(abstrac); + //abstrac = fextrac.removeAbstractTags(abstrac); + + tempList.addAll(fextrac.annotations(abstrac, abstract_count, featFilter, pathVars)); + } + + String ecnum = ""; + if(ECnumber.hasText()){ + for(Element number : ECnumber){ + ecnum = number.toString(); + if(ecnum.contains("EC")){ + ecnum = fextrac.removeSpecialChar(ecnum); + ecnum = fextrac.removeTags(ecnum); + ec_numbers.add(ecnum); + } + } + } + + String triage = ""; + if(classDoc.hasText()){ + triage = classDoc.toString(); + triage = fextrac.removeSpecialChar(triage); + triage = fextrac.removeTags(triage); + } + + } + + } + + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + if(verbose){ + //print list of extracted features + System.out.println("\n===========TITLE==ANNOTATIONS============="); + fextrac.displayList(title_count);; + fextrac.displayList(title_content); + System.out.println("\n========ABSTRACT==ANNOTATIONS============="); + fextrac.displayList(abstract_count); + + } + + //filter features by occurence + featFilter.considerAnnotationOccurence(abstract_count, pathVars); + featFilter.considerAnnotationOccurence(title_count, pathVars); + + System.out.println("\n===========FEATURE==EXPORT==============="); + fextrac.exportFile(featureDir + "/" + pathVars.DOC_IDS, PMIDs); + System.out.println("..."+ PMIDs.size()+" document IDs listed."); + fextrac.exportList(featureDir + "/" + pathVars.ECNUM_FEATURES, ec_numbers); + System.out.println("..."+ ec_numbers.size()+" EC numbers saved."); + fextrac.exportFile(featureDir + "/" + pathVars.ANNOTATION_FEATURES, abstract_count); + System.out.println("..."+ abstract_count.size()+" unique Abstract annotations saved."); + fextrac.exportFile(featureDir + "/" + pathVars.TITLE_FEATURES, title_count); + System.out.println("..."+ title_count.size() +" unique Title annotations saved."); + fextrac.exportFile(featureDir + "/" + pathVars.JOURNAL_TITLE_FEATURES, title_content); + System.out.println("..."+jTitle+" Journal titles saved."); + System.out.println("\n=========================================\n"); + + } + + /** + * Identifies the classification on doc + * + * @param clas text containing classification (after char removal) + * @return classification of doc + */ + private String getClassif(String clas) { + + //parsing the not edited text into HTML using Jsoup + Document doc = Jsoup.parseBodyFragment(clas); + //saving the text as an Jsoup element, with a main tag (the HTML body), + //attributes and child nodes (TRIAGE tags) + Element text = doc.body(); + + Elements classification = text.getElementsByTag("TRIAGE"); + + return classification.text(); + } + + /** + * Inserts the classification + * on 
the list of features + * + * @param class information to insert on list + * @param list list of features used + */ + private void addClass(String element, HashMap, String> list){ + //going over list to insert + //classif on document instances + Iterator>it = list.keySet().iterator(); + + while(it.hasNext()){ + Map str = it.next(); + + if(list.get(str).contains(element)){ + //if(list.get(str).contains("positive") || list.get(str).contains("negative")){ + + } + else list.put(str, element); + } + } + + + /** + * Extract the annotations from a determined section + * of the document and add them to the specified lists. + * + * @param annotation cleaned and splitted line with annotation + * @param count list that holds annotation, its type and its count + * @param type list that holds annotation, its type and its classification + */ + private ArrayList annotations(String annot, HashMap, Integer> count, NaiveFilter filter, ConfigConstants pathVars) { + HashMap features = loadAnnotationEntities(); + ConfigConstants pathVar = new ConfigConstants(); + NgramExtractor nextrac = new NgramExtractor(); + ArrayList content = new ArrayList(); + + //parsing the not edited text into HTML using Jsoup + Document doc = Jsoup.parseBodyFragment(annot); + //saving the text as an Jsoup element, with a main tag (the HTML body), + //attributes and child nodes (annotation tags) + Element annotations = doc.body(); + + //iterating over list of entities + for(Map.Entry value : features.entrySet()){ + + String an_type = value.getKey(); + String an_level = value.getValue(); + + //for each entity, find the annotations on abstract + Elements annots = annotations.getElementsByTag(an_type); + + //for each annotation found, + for(Element an : annots){ + + //grabbing annotation content: + //if the annotation is made on the sentence level: + if(an_level.contains("sentence")){ + + //checking if sentence contains inner annotations + if(an.childNodeSize() != 0){ + + //going over list of inner annotations + for(Element child : an.children()){ + + //if child is sentence (sentence inside of sentence), + //then add annotations as ngrams on this + if(features.get(child.nodeName()).contains("sentence")) { + content.addAll(nextrac.nGrams(child.text(), filter, pathVar)); + insertAnnotation(content, an.nodeName(), count, pathVars); + } + //adding annotations on sentence as they are - no ngrams on this + else { + content.add(child.text()); + insertAnnotation(content, an.nodeName(), count, pathVars); + } + } + + //removing inner annotations from sentence, they are already added + Element tempAnnot = an.clone(); + tempAnnot.children().remove(); + + //splitting content in ngrams to whats left on the sentence + content.addAll(nextrac.nGrams(tempAnnot.text(), filter, pathVar)); + insertAnnotation(content, an.nodeName(), count, pathVars); + } + + } + else { + //keeping original annotation content for other cases + content.add(an.text()); + insertAnnotation(content, an.nodeName(), count, pathVars); + } + } + + } + return content; + + } + + + /** + * Insert annotation (or ngram list of annotation) + * on lists, used on @annotations method + * @param content content of annotation + * @param an_type type extracted from text (entity) + * @param count list of annotations and their count + */ + private void insertAnnotation(ArrayList content, String an_type, HashMap, Integer> count, ConfigConstants pathVars){ + + //iterating over list of annotations + for(int i = 0; i < content.size(); i++){ + + String current_content = content.get(i); + current_content = 
removeSpecialChar(current_content); + + if(current_content.length() >= Integer.parseInt(pathVars.FEATURE_MIN_LENGTH)){ + + //creating the list key as: content - type mapping + Map an_content = new HashMap(); + an_content.put(current_content, an_type); + + //for each annotation (or ngram on annotation) + //insert content and related type + if(count.containsKey(an_content)){ + try{ + int cnt = count.get(an_content); + count.put(an_content, cnt+1); + + }catch(Exception e){ + count.put(an_content, 1); + } + } + else{ + count.put(an_content, 1); + } + } + } + + content.clear(); + + } + + + /** + * Inserts the text (e.g.title) content into + * a list of features (e.g.title features) + * + * @param annot text with the annotations to be handled + * @param wContent whole field to be added on the list of features + * @param list features used + * + */ + private void addContent(String annot, String wContent, HashMap,String> list, NaiveFilter filter) { + + HashMap features = loadAnnotationEntities(); + ArrayList content = new ArrayList(); + NgramExtractor nextrac = new NgramExtractor(); + ConfigConstants pathVar = new ConfigConstants(); + + //parsing not edited text into HTML using Jsoup + Document doc = Jsoup.parseBodyFragment(annot); + //saving the text as an Jsoup element, with a main tag (the HTML body), + //attributes and child nodes (annotation tags) + Element annotations = doc.body(); + + //iterating over annotation types + for(Map.Entry value : features.entrySet()){ + + String an_type = value.getKey(); + String an_level = value.getValue(); + + //for each annotation type, find all related annotations on the abstract + Elements annots = annotations.getElementsByTag(an_type); + + //for each annotation type, + for(Element an : annots){ + + //grab annotation content + if(an_level.contains("sentence")) + //splitting in ngrams for sentence level annotations + content = nextrac.nGrams(an.text(), filter, pathVar); + else + //keeping original annotation for other cases + content.add(an.text()); + + //iterating over list of annotations + for(int i = 0; i < content.size(); i++){ + + String current_content = content.get(i); + current_content = removeSpecialChar(current_content); + + Map an_content = new HashMap(); + an_content.put(current_content, wContent); + + //populating list of feature_an_types, with: + //feature--an_type--class + list.put(an_content, ""); + } + content.clear(); + } + } + } + + + /** + * Loads list of entities from external file + * + * @param str list of entities + * @param pathVar constants from + * @return + */ + public HashMap loadAnnotationEntities(){ + + String pathEntities = "entities.txt"; + HashMap values = new HashMap(); + + try{ + BufferedReader reader = new BufferedReader(new FileReader(pathEntities)); + + String line = null; + + while((line = reader.readLine()) != null){ + + if(!line.contains("#")){ + String[] value = StringUtils.split(line, " "); + values.put(value[0].toLowerCase(), value[1].toLowerCase()); + } + } + + reader.close(); + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + //String[] entities = values.toArray(new String[values.size()]); + + return values; + } + + @Override + public void initialize(File featureDir, ConfigConstants pathVars){ + + try{ + + if(!featureDir.exists()) + featureDir.createNewFile(); + + File ecnumbers = new File(featureDir + "/" + pathVars.ECNUM_FEATURES); + ecnumbers.createNewFile(); + + File annotations = new File(featureDir + "/" + pathVars.ANNOTATION_FEATURES); + 
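// Note (added): pre-creating these (possibly empty) feature files here
// presumably guards the later FileReader-based loading in CreateVector
// against FileNotFoundException when a feature type yields no output.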
annotations.createNewFile(); + + File titleAnnotations = new File(featureDir + "/" + pathVars.TITLE_FEATURES); + titleAnnotations.createNewFile(); + + File journaltitles = new File(featureDir + "/" + pathVars.JOURNAL_TITLE_FEATURES); + journaltitles.createNewFile(); + + }catch(Exception e){ + System.out.println(e.getMessage()); + System.exit(0); + } + } + + + /** + * Handles the content of annotations; when + * there is multiple elements, they are + * concatenated after extracted + * + * @param str list of annotation elements + * @return single string with all elements + */ + public String contentToString(String[] str){ + String cont = ""; + + for(int i = 0; i < str.length; i++){ + if(cont.contentEquals("")){ + cont = cont + str[i]; + } + else cont = cont+" "+ str[i]; + + } + + return cont; + } + + + +} diff --git a/src/analyse/NgramExtractor.java b/src/analyse/NgramExtractor.java new file mode 100644 index 0000000..b11be56 --- /dev/null +++ b/src/analyse/NgramExtractor.java @@ -0,0 +1,309 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package analyse; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import configure.ConfigConstants; +import filter.NaiveFilter; + +/** + * This class extracts and parses n-grams + * from XML doc instances. 
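 *
 * Usage sketch (added for illustration; assumes the filter's stop list and
 * the NGRAM_SIZE/NGRAM_STOP settings are configured as in {@code main}):
 * <pre>{@code
 * NgramExtractor nex = new NgramExtractor();
 * // with NGRAM_SIZE = "1", returns ["fungal", "enzymes"]
 * ArrayList<String> grams = nex.nGrams("fungal enzymes", filter, pathVars);
 * }</pre>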
+ * + * @author Hayda Almeida + * @since 2014 + * + */ + +public class NgramExtractor extends Extractor{ + + public NgramExtractor(){ + + //defining relevant paper text fields + this.id = "PMID"; + this.openJournal = "Title"; + this.openAbst = "AbstractText"; + this.openEC = "RegistryNumber"; + this.classTag = "TRIAGE"; + this.openTitle = "ArticleTitle"; + } + + + public static void main(String[] args) { + + ConfigConstants pathVars = new ConfigConstants(); + boolean verbose = false; + + String AnCorpus = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAINING_FILE; + NgramExtractor nextrac = new NgramExtractor(); + NaiveFilter featFilter = new NaiveFilter(); + File featureDir = new File(pathVars.HOME_DIR + pathVars.FEATURE_DIR); + + featFilter.loadStopWords(pathVars.HOME_DIR + pathVars.STOP_LIST); + + //store abstract ngrams and its count + HashMap ngram_count = new HashMap(); + //store abstract ngrams and doc ID + HashMap ngram_ID = new HashMap(); + //store title ngrams and its count + HashMap ngram_title_count = new HashMap(); + //store title ngrams, count and "relevance(TBD)" + HashMap,Integer> ngram_title = new HashMap,Integer>(); + //store ID and label of documents + HashMap PMIDs = new HashMap(); + + nextrac.initialize(featureDir, pathVars); + + try + { + + //Loading file + File input = new File(AnCorpus); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); + + Elements corpus = doc.body().getElementsByTag("pubmedarticle"); + + //Fetching elements + + for(Element paper : corpus ){ + + Elements journalTitle = paper.getElementsByTag(nextrac.getOpenJournal()); + Elements title = paper.getElementsByTag(nextrac.getOpenTitle()); + Elements abstractC = paper.getElementsByTag(nextrac.getopenAbst()); + Elements ECnumber = paper.getElementsByTag(nextrac.getOpenEC()); + Elements classDoc = paper.getElementsByTag(nextrac.getClassTag()); + + String journal = ""; + String docID = ""; + String label = ""; + int jTitle = 0; + + //fetching the paper ID - + //for all items in a paper, retrieve only PMIDs + for(Element e : paper.select(nextrac.getid())){ + //only consider the ID if the parent is medline citation + if(e.parentNode().nodeName().contains("medline")){ + docID = e.text(); + } + } + //fetch the doc label as well + if(classDoc.hasText()){ + label = classDoc.text(); + } + + PMIDs.put(docID, label); + + //Extracting the Journal Title + if(journalTitle.hasText()){ + jTitle++; + journal = journalTitle.toString(); + journal = nextrac.removeSpecialChar(journal); + journal = nextrac.removeTags(journal); + } + + String tit_content = ""; + //Extracting the Paper Title + if(title.hasText()){ + tit_content = title.toString(); + tit_content = nextrac.removeSpecialChar(tit_content); + tit_content = nextrac.removeTags(tit_content); + + ArrayList title_c = nextrac.nGrams(tit_content, featFilter, pathVars); + nextrac.addNGram(title_c, ngram_title_count, pathVars); + } + + String abstrac = ""; + //Extracting the Paper abstract + if(abstractC.hasText()){ + abstrac = abstractC.toString(); + //abstrac = nextrac.removeTags(abstrac); + abstrac = nextrac.removeSpecialChar(abstrac); + abstrac = nextrac.removeAbstractTags(abstrac); + + ArrayList abstract_c = nextrac.nGrams(abstrac, featFilter, pathVars); + nextrac.addNGram(abstract_c, ngram_count, pathVars); + } + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + + if(verbose){ + //print list of extracted n-grams + nextrac.displayList(PMIDs); + 
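// displayList (inherited from Extractor) prints one "key: <k>\t value: <v>"
// line per map entry, so this verbose block dumps every PMID with its
// triage label before the n-gram maps printed below.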
System.out.println("\n========ABSTRACT==NGRAMS============="); + nextrac.displayList(ngram_count); + nextrac.displayList(ngram_title); + System.out.println("\n===========TITLE==NGRAMS============="); + nextrac.displayList(ngram_title_count); + } + + //filter features by occurence + featFilter.considerOccurence(ngram_count, pathVars); + featFilter.considerOccurence(ngram_title_count, pathVars); + + System.out.println("\n===========NGRAMS==EXPORT===============\n"); + nextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.DOC_IDS, PMIDs); + System.out.println("..."+ PMIDs.size()+" document IDs listed."); + nextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.NGRAM_FEATURES, ngram_count); + System.out.println("..."+ ngram_count.size()+" unique Abstract ngrams saved."); + nextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.TITLE_NGRAMS, ngram_title_count); + System.out.println("... "+ ngram_title_count.size() +" unique Title ngrams saved."); + System.out.println("\n========================================\n"); + + } + + + /** + * Inserts ngrams into list of features + * with a mapping for ngram count + * @param str relation of ngrams extracted + * @param list_count mapping for ngram counts + * @param pathVars + */ + + private void addNGram(ArrayList str, HashMap list_count, ConfigConstants pathVars){ + + //iterating over ngram list + for(int i = 0; i < str.size(); i++){ + String currentNGram = str.get(i); + + //checking existence of current ngram on list mapping + if(list_count.containsKey(currentNGram)){ + //retrieve the amount of current ngrams on mapping + int count = list_count.get(currentNGram); + //insert the updated count of ngrams + list_count.put(currentNGram, count+1); + } + else { + //insert ngram on mapping list + if(currentNGram.length() >= Integer.parseInt(pathVars.FEATURE_MIN_LENGTH)){ + list_count.put(currentNGram, 1); + } + } + } + } + + /** + * Extracts n-grams from a given content field + * + * @param str text to extract ngrams + * @return list of extracted grams + */ + public ArrayList nGrams(String str, NaiveFilter filter, ConfigConstants pathVar){ + + //removing ASCII special characters + str = str.replace("/", ""); + str = str.replace("\\", ""); + str = str.replace(" ", "-"); + str = str.replaceAll("\\s+"," "); + str = str.replace(" ", "-"); + + //Tokenizing the sentence + String[] words = StringUtils.split(str,"-"); + ArrayList ngramList = new ArrayList(); + + int ngram =Integer.parseInt(pathVar.NGRAM_SIZE); + + //Stop-words removal + if(Boolean.valueOf(pathVar.NGRAM_STOP)){ + words = StringUtils.split(filter.removeStopList(words)," "); + } + + //extracting ngrams according to gram size (1, 2, 3) + for(int i=0; i < words.length - (ngram - 1); i++){ + switch(pathVar.NGRAM_SIZE){ + case "1": + ngramList.add(words[i].toLowerCase()); + break; + case "2": + ngramList.add(words[i].toLowerCase()+" "+words[i+1].toLowerCase()); + break; + case "3": + ngramList.add(words[i].toLowerCase()+" "+words[i+1].toLowerCase()+" "+words[i+2].toLowerCase()); + break; + } + } + + return ngramList; + } + + + @Override + public void initialize(File featureDir, ConfigConstants pathVars){ + try{ + featureDir.mkdir(); + File ngrams = new File(featureDir + pathVars.NGRAM_FEATURES); + ngrams.createNewFile(); + + File titlengrams = new File(featureDir + pathVars.TITLE_NGRAMS); + titlengrams.createNewFile(); + + }catch(Exception e){ + System.out.println(e.getMessage()); + System.exit(0); + } + } + + + /** + * Displays the keys and values of the + * 
maps created with n-grams and counts. + * @param hash HashMap containing n-grams + */ + @Override + public void displayList(HashMap hash){ + super.displayList(hash); + //sum = sum + hash.get(str); + System.out.println("\n=======================================\n"); + System.out.println("Number of unique n-grams: "+hash.size()); + System.out.println("\n=======================================\n"); + } + + + +} diff --git a/src/arffmatrix/.gitignore b/src/arffmatrix/.gitignore new file mode 100644 index 0000000..ec5761d --- /dev/null +++ b/src/arffmatrix/.gitignore @@ -0,0 +1,2 @@ +/buildmodel.class +/buildtest.class diff --git a/src/arffmatrix/BuildModel.class b/src/arffmatrix/BuildModel.class new file mode 100644 index 0000000..022e81f Binary files /dev/null and b/src/arffmatrix/BuildModel.class differ diff --git a/src/arffmatrix/BuildModel.java b/src/arffmatrix/BuildModel.java new file mode 100644 index 0000000..c94cbf3 --- /dev/null +++ b/src/arffmatrix/BuildModel.java @@ -0,0 +1,317 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +*** +* This class re-uses https://code.google.com/p/deft2013/source/browse/trunk/src/corpus/buildmodel.java +* The code authors: Eric Charton http://www.echarton.com twitter.com/ericcharton +* Marie-Jean Meurs http://mjmrsc.com/research/ twitter.com/mjmrsc +* +* This software is free to use, modify and redistribute under Creative Commons by-nc/3.0 License Term +* http://creativecommons.org/licenses/by-nc/3.0/ +*/ + +package arffmatrix; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.text.SimpleDateFormat; +import java.util.Date; + +import com.sun.org.apache.xerces.internal.impl.xs.identity.Selector.Matcher; + +import analyse.Extractor; +import arffvector.CreateVector; +import configure.ConfigConstants; + +/** + * This class reads the corpus instances and uses + * the CreateVector class to generate a model file (ARFF) * + * + * @author Hayda Almeida, Marie-Jean Meurs + * @since 2014 + * + */ + +public class BuildModel { + + + public static void main(String[] args) { + + //----------------------------------- + // instantiate classes of constants + // and configuration file. 
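// An illustrative sketch (added; assumed, not part of this patch) of the
// kind of key/value configuration ConfigConstants is expected to expose,
// inferred from the fields referenced below:
//
//   HOME_DIR=/path/to/project/
//   CORPUS_DIR=corpus/
//   TRAINING_FILE=training_corpus.xml   (hypothetical file name)
//   TEST_FILE=test_corpus.xml           (hypothetical file name)
//   OUTPUT_MODEL=arff/
//   EXP_TYPE=0   (0 = build the training ARFF, 1 = build the test ARFF)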
+ //----------------------------------- + + ConfigConstants pathVars = new ConfigConstants(); + + Extractor model = new Extractor(); + File outputDir = new File(pathVars.HOME_DIR + pathVars.OUTPUT_MODEL); + + model.initialize(outputDir, pathVars); + + CreateVector vectorgenerator = new CreateVector(pathVars); + String attributes = vectorgenerator.informFeatures(pathVars); + System.out.println("Features loaded ..."); + + // name output ARFF files + String timeStamp = new SimpleDateFormat("yyyyMMdd_hh:mm").format(new Date()); + String arffFileName = "triage" + pathVars.EXP_TYPE + "_"+ pathVars.PERCT_POS_TRAIN + attributes +"_"+ timeStamp + ".arff"; + + try + { + //by default + String sortarffFileName = pathVars.HOME_DIR + pathVars.OUTPUT_MODEL + arffFileName; // default + + // create file + BufferedWriter out = new BufferedWriter(new FileWriter(sortarffFileName)); + + // load ARFF header and write it + String outHeaderArff = vectorgenerator.genArffHeader(pathVars,Integer.parseInt(pathVars.EXP_TYPE)); + //System.out.println(outHeaderArff); // verbose + out.write(outHeaderArff + "\n"); + + // reader for corpus + BufferedReader reader = null; + //train corpus + if(Integer.parseInt(pathVars.EXP_TYPE) == 0) + reader = new BufferedReader(new FileReader(pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAINING_FILE)); + //test corpus + else if(Integer.parseInt(pathVars.EXP_TYPE) ==1) + reader = new BufferedReader(new FileReader(pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TEST_FILE)); + + //-------------------------------------------- + // repeat until all lines have been read + // from the file + //-------------------------------------------- + String text = null; + String content = null; + + String abstracttext = ""; + String journaltitle = ""; + String title = ""; + String ecnumber = ""; + String classtriage = ""; + int hasText = 0; + int journaltitlecount = 0; + int abstracttitlecount = 0; + int abstracttextcount = 0; + int positivecount = 0; + int negativecount = 0; + + + while ((text = reader.readLine()) != null) { + + // detect a PubMed abstract + if (text.contains("", ""); + pmid = pmid.replace("", "").trim(); + //System.out.println("PMID : " + pmid); + + // continue to read + content = reader.readLine(); + content = content.replaceAll("\t", ""); + content = content.replaceFirst("\\s+", ""); + + while ( ! 
content.contains("TRIAGE")) { + + if (content.contains("")){ + + journaltitlecount++; + System.out.println("#: " + journaltitlecount + "\t PMID : " + pmid); + + content = content.replace("<Title>", ""); + content = content.replace("", ""); + journaltitle = content; + //System.out.println("Journal title : " + content); + } + + if (content.contains("")){ + + abstracttitlecount++; + + content = content.replace("", ""); + content = content.replace("", ""); + title = content; + //System.out.println("Paper title : " + content); + } + + + if (content.contains("")){ + + abstracttextcount++; + hasText = 1; // use it to indicate if the abstract has some text or not + + content = content.replace("", ""); + + //checks if there are empty lines after AbstractText tag + //and keeps reading until finds the abstract content + while(content.isEmpty()){ + content = reader.readLine(); + } + abstracttext = abstracttext + content; + // clean + abstracttext = model.removeAbstractTags(abstracttext); + + + content = reader.readLine(); + // converting toLowerCase is not relevant in bio context + // because it introduces ambiguities (ie Gene name / Enzyme alias) + // abstracttext = abstracttext.toLowerCase(); + } + + if (content.contains("")){ + temp = temp + model.processAbstract(content); + } + else{ + do{ + temp = temp + model.processAbstract(content); + content = reader.readLine(); + }while(!(content.contains(""))); + } + + newAbs = newAbs + temp; + content = newAbs + ""; + + abstracttext = content; + abstracttext = model.removeAbstractTags(abstracttext); + + content = reader.readLine(); + + } + + if (content.contains("EC ")){ + content = content.substring(content.indexOf("EC ")); + content = content.replace("", ""); + ecnumber = content; + } + +// if (content.contains("")); +// content = content.replace("", ""); +// classtriage = content; +// if(content.contains("positive")){ +// positivecount++; +// } +// if(content.contains("negative")){ +// negativecount++; +// } +// +// //System.out.println("Triage classification : " + content); +// } + + content = reader.readLine(); + content = content.replaceAll("\t", ""); + content = content.replaceFirst("\\s+", ""); + } + + if (content.contains("")); + content = content.replace("", ""); + classtriage = content; + if(content.contains("positive")){ + positivecount++; + } + if(content.contains("negative")){ + negativecount++; + } + + //System.out.println("Triage classification : " + content); + } + + //System.out.println("Abstract : " + abstracttext.toString() + "\n\n"); + + // end of if: collect data and write ARFF + String Arffline = vectorgenerator.getArffLine(pmid, + journaltitle, + title, + abstracttext, + ecnumber, + classtriage, + Integer.parseInt(pathVars.EXP_TYPE) + ); + + Arffline = Arffline + "\n"; + // write line on disc + out.write(Arffline); + // out.write(id + " " + Arffline + "\n"); // + } + + } + + System.out.println( + "Abstracts processed: " + abstracttitlecount + + "\t with text content: " + abstracttextcount + + "\t from " + journaltitlecount + " journals" + + "\nTotal of: \n" + positivecount + " positive" + + "\t and " + negativecount + " negative documents"); + out.write("\n"); + out.close(); + + reader.close(); + + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + + } + +} + + + diff --git a/src/arffvector/.gitignore b/src/arffvector/.gitignore new file mode 100644 index 0000000..bdc0ba3 --- /dev/null +++ b/src/arffvector/.gitignore @@ -0,0 +1,7 @@ +/buildvector.class 
+/FeatureVector.class +/CreateVector.class +/CreateWeightedVector.class +/ArbitraryWeight.class +/CountsWeightedVector.class +/ArbitraryWeightedVector.class diff --git a/src/arffvector/CreateVector.java b/src/arffvector/CreateVector.java new file mode 100644 index 0000000..66aee23 --- /dev/null +++ b/src/arffvector/CreateVector.java @@ -0,0 +1,894 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +*** +* This class re-uses https://code.google.com/p/deft2013/source/browse/trunk/src/vector/buildvector.java +* The code authors: Eric Charton http://www.echarton.com twitter.com/ericcharton +* Marie-Jean Meurs http://mjmrsc.com/research/ twitter.com/mjmrsc +* +* This software is free to use, modify and redistribute under Creative Commons by-nc/3.0 License Term +* http://creativecommons.org/licenses/by-nc/3.0/ +*/ + + + + +package arffvector; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import org.apache.commons.lang3.StringUtils; +import configure.ConfigConstants; + +/** + * Uses the features extracted and the + * generated corpus to create a feature vector + * (a matrix representation of the corpus) + * + * @author Hayda Almeida, Marie-Jean Meurs + * @since 2014 + * + */ +public class CreateVector { + + ArrayList annotations = new ArrayList(); + ArrayList annotationsType = new ArrayList(); + ArrayList journalTitles = new ArrayList(); + ArrayList ecnumbers = new ArrayList(); + ArrayList titleGrams = new ArrayList(); + ArrayList titleAnnot = new ArrayList(); + ArrayList nGrams = new ArrayList(); + ArrayList docID = new ArrayList(); + + ConfigConstants pathVars = null; + + /** + * Constructor to load all features extracted + * from training files. These features will be + * used to generate the ARFF header and the + * ARFF vector lines. 
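 *
 * The feature files read below are assumed to follow the tab-separated
 * layout written by {@code Extractor.exportFile}, one feature per line,
 * e.g. (hypothetical values):
 * <pre>{@code
 * cellulase	12
 * }</pre>
 * Each line is split on the tab; for most feature types the first column
 * is kept (journal titles keep the second), and duplicates are skipped.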
+ * + * @param extVars Variables holding system paths + */ + + public CreateVector(ConfigConstants extVars) { + + pathVars = extVars; + + String pathJournalT = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.JOURNAL_TITLE_FEATURES; + try{ + String journalT = ""; + + //receiving journal title + BufferedReader reader = new BufferedReader(new FileReader(pathJournalT)); + int featcount = 0; + while (( journalT = reader.readLine()) != null) { + + if (Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)){ + + String[] features = StringUtils.split(journalT,"\n"); + + for(int i = 0; i < features.length; i++){ + + String[] featurename = StringUtils.split(features[i],"\t"); + + //checking for journal titles duplicates + if(featurename[1] != "" && !(journalTitles.contains(featurename[1]))){ + journalTitles.add(featurename[1]); + } + } + } + if ( featcount >= Integer.parseInt(pathVars.NB_PARAMS) && Integer.parseInt(pathVars.NB_PARAMS) != -1 ) { break;} + + } + reader.close(); + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + String pathAnnotations = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.ANNOTATION_FEATURES; + String pathTitleAnnot = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.TITLE_FEATURES; + + try{ + String abstAnnot = ""; + String tAnnot = ""; + + //receiving abstract annotations (features) + BufferedReader reader = new BufferedReader(new FileReader(pathAnnotations)); + BufferedReader readerT = new BufferedReader(new FileReader(pathTitleAnnot)); + + int featcount = 0; + + while (( abstAnnot = reader.readLine()) != null) { + + if (Boolean.valueOf(pathVars.USE_ANNOTATION_FEATURE)){ + String[] features = StringUtils.split(abstAnnot,"\n"); + + for(int i = 0; i < features.length; i++){ + + String[] featurename = StringUtils.split(features[i],"\t"); + + //checking for duplicate abstract annotations + if(featurename[0] != "" && !(annotations.contains(featurename[0]))){ + annotations.add(featurename[0]); + } + } + } + if ( featcount >= Integer.parseInt(pathVars.NB_PARAMS) && Integer.parseInt(pathVars.NB_PARAMS) != -1 ) { break;} + } + + + if(!(Boolean.valueOf(pathVars.USE_TITLE_FEATURE))){ + while((tAnnot = readerT.readLine()) != null){ + + String[] features = StringUtils.split(tAnnot,"\n"); + + for(int i = 0; i < features.length; i++){ + + String[] featurename = StringUtils.split(features[i],"\t"); + + //checking for duplicate annotations + if(featurename[0] != "" && !(annotations.contains(featurename[0]))){ + annotations.add(featurename[0]); + } + } + + } + + } + + reader.close(); + readerT.close(); + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + try{ + String abstAnType = ""; + + //receiving abstract annotation types + BufferedReader reader = new BufferedReader(new FileReader(pathAnnotations)); + int featcount = 0; + while (( abstAnType = reader.readLine()) != null) { + + if (Boolean.valueOf(pathVars.USE_ANNOTATION_TYPE)){ + + String[] features = StringUtils.split(abstAnType,"\n"); + + for(int i = 0; i < features.length; i++){ + + String[] featurename = StringUtils.split(features[i],"\t"); + + //checking for duplicate abstract annotation types + if(featurename[1] != "" && !(annotationsType.contains(featurename[1]))){ + annotationsType.add(featurename[1]); + } + + } + } + if ( featcount >= Integer.parseInt(pathVars.NB_PARAMS) && Integer.parseInt(pathVars.NB_PARAMS) != -1 ) { break;} + + } + reader.close(); + } + catch 
(FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + + + try{ + String titAnnot = ""; + + //receiving title annotations (features) + BufferedReader reader = new BufferedReader(new FileReader(pathTitleAnnot)); + // int featcount = 0; + while (( titAnnot = reader.readLine()) != null) { + + if(Boolean.valueOf(pathVars.USE_TITLE_FEATURE)){ + + //String titAnnot = FeatureExtractor.getTitCount(); + + String[] features = StringUtils.split(titAnnot,"\n"); + + for(int i = 0; i < features.length; i++){ + String[] featurename = StringUtils.split(features[i],"\t"); + + //checking for duplicate title annotations + if(!(titleAnnot.contains(featurename[0]))){ + titleAnnot.add(featurename[0]); + } + } + } + } + reader.close(); + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + + String pathECNumFeatures = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.ECNUM_FEATURES; + + try{ + String ECNum = ""; + + //receiving EC numbers (features) + BufferedReader reader = new BufferedReader(new FileReader(pathECNumFeatures)); + // int featcount = 0; + while ((ECNum = reader.readLine()) != null) { + + if(Boolean.valueOf(pathVars.USE_ECNUM_FEATURE)){ + + //String titAnnot = FeatureExtractor.getTitCount(); + + String[] features = StringUtils.split(ECNum,"\n"); + + for(int i = 0; i < features.length; i++){ + String[] featurename = StringUtils.split(features[i],"\t"); + + //checking for duplicate EC numbers + if(!(ecnumbers.contains(featurename[0]))){ + ecnumbers.add(featurename[0]); + } + } + } + } + reader.close(); + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + + String pathTitleGrams = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.TITLE_NGRAMS; + + + try{ + String titCont = ""; + // String grams = ""; + + //receiving title ngrams + BufferedReader reader = new BufferedReader(new FileReader(pathTitleGrams)); + + int featcount = 0; + while (( titCont = reader.readLine()) != null) { + + if(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)){ + + String[] content = StringUtils.split(titCont,"\n"); + + for(int i = 0; i < content.length; i++){ + String[] featurename = StringUtils.split(content[i],"\t"); + + //check for duplicate title ngrams + if(!(titleGrams.contains(featurename[0]))){ + titleGrams.add(featurename[0]); + } + } + } + } + + reader.close(); + + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + String pathNgrams = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.NGRAM_FEATURES; + try{ + String grams = ""; + String tgrams = ""; + + //receiving ngrams + BufferedReader reader = new BufferedReader(new FileReader(pathNgrams)); + BufferedReader readerT = new BufferedReader(new FileReader(pathTitleGrams)); + + // int featcount = 0; + while (( grams = reader.readLine()) != null) { + + if(Boolean.valueOf(pathVars.USE_NGRAM_FEATURE)){ + + String[] features = StringUtils.split(grams,"\n"); + + for(int i = 0; i < features.length; i++){ + String[] featurename = StringUtils.split(features[i],"\t"); + + //check for duplicate abstract ngrams + if(!(nGrams.contains(featurename[0]))){ + nGrams.add(featurename[0]); + } + } + } + + } + + //if not using title grams separately, + // then insert title grams with abstract grams. 
+ if (!(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS))){ + while (( tgrams = readerT.readLine()) != null) { + + String[] features = StringUtils.split(tgrams,"\n"); + + for(int i = 0; i < features.length; i++){ + String[] featurename = StringUtils.split(features[i],"\t"); + + //check for duplicate ngrams + if(!(nGrams.contains(featurename[0]))){ + nGrams.add(featurename[0]); + } + } + } + } + + reader.close(); + readerT.close(); + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + } + + /** + * Gathers the list of features, according to + * experimental configurations. The list of + * features will be written on the ARFF header. + * + * @param pathVars Variables holding system paths + * @param exp experiment type: train or test + * @return a String containing the ARFF header + */ + + public String genArffHeader(ConfigConstants pathVars, int exp){ + + StringBuilder headerArff = new StringBuilder(); + + switch(exp){ + case 0: + headerArff.append("% Weka training file - mycoCLAP triage - CSFG 2015\n\n"); + break; + case 1: + headerArff.append("% Weka test file - mycoCLAP triage - CSFG 2015\n\n"); + break; + } + + headerArff.append("@RELATION triage\n"); + + if(Boolean.valueOf(pathVars.USE_TEXT_SIZE)){ + // writing the list of text sizes + headerArff.append("@ATTRIBUTE sizeoftitle \tREAL \t\t%size of title\n"); + headerArff.append("@ATTRIBUTE sizeoftext \tREAL \t\t%size of text\n"); + } + + if(Boolean.valueOf(pathVars.USE_DOC_ID)){ + //writing the docIDs + headerArff.append("@ATTRIBUTE docID \tREAL \t\t%PMID of paper\n"); + + } + + if(Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)){ + for(int i = 0; i < journalTitles.size(); i++){ + // writing list of journal titles + String feature = journalTitles.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + namefeature = namefeature.replaceAll("<|>", ""); + String ref = "journalTitle" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + + } + } + + if (Boolean.valueOf(pathVars.USE_ANNOTATION_FEATURE)){ + // writing list of annotation features + for(int i = 0; i < annotations.size(); i++){ + + String feature = annotations.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + namefeature = namefeature.replaceAll("<|>", ""); + String ref = "annotation" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + + } + } + + if(Boolean.valueOf(pathVars.USE_ANNOTATION_TYPE)){ + // writing list of annotation entities + for(int i = 0; i < annotationsType.size(); i++){ + String feature = annotationsType.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + namefeature = namefeature.replaceAll("<|>", ""); + String ref = "annotationType" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + + } + } + + if(Boolean.valueOf(pathVars.USE_TITLE_FEATURE)){ + // write list of title features + for( int i = 0; i < titleAnnot.size(); i++){ + + String feature = titleAnnot.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + namefeature = namefeature.replaceAll("<|>", ""); + String ref = "titleAnnot" + String.valueOf(i) + namefeature; + + 
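// Illustration (added): a hypothetical title feature "protein kinase" at
// index 3 yields the attribute name "titleAnnot3protein-kinase"; the raw
// feature text is preserved in the trailing % comment of the @ATTRIBUTE line.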
headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + + } + + } + + if(Boolean.valueOf(pathVars.USE_ECNUM_FEATURE)){ + // writing list of EC numbers + for(int i = 0; i < ecnumbers.size(); i++){ + String feature = ecnumbers.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + namefeature = namefeature.replaceAll("<|>", ""); + String ref = "ECnumber" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + } + } + + if (Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)){ + // writing list of ngrams on titles + for( int i = 0; i < titleGrams.size(); i++){ + + String feature = titleGrams.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + namefeature = namefeature.replaceAll("<|>", ""); + String ref = "titleNgram" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + + } + } + + if (Boolean.valueOf(pathVars.USE_NGRAM_FEATURE)){ + // write list of ngrams + for(int i = 0; i < nGrams.size(); i++){ + + String feature = nGrams.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + String ref = "Ngram" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + + } + } + + // writing the dataset classes + headerArff.append("@ATTRIBUTE class {positive, negative}\n"); + headerArff.append("@DATA\n"); + + return headerArff.toString(); + } + + /** + * Iterates over the list of features and + * counts number of features containing + * on a given document. + * + * @param jTitle title of journal + * @param title title of paper + * @param text abstract content + * @param ecnum paper EC numbers + * @param classTriage triage classification: positive or negative + * @param exp experiment type: train or test + * @return String holding counts for all features found in a document + */ + + public String getArffLine(String paperID, String jTitle, String title, String text, String ecnum, String classTriage, int exp){ + //String vectorArff = ""; + StringBuilder vectorArff = new StringBuilder(); + + paperID = removeSpecialChar(paperID.toLowerCase()); + text = removeSpecialChar(text.toLowerCase()); + title = removeSpecialChar(title.toLowerCase()); + jTitle = removeSpecialChar(jTitle.toLowerCase()); + ecnum = removeSpecialChar(ecnum); + + int emptyabs = 0; + + // fill title and text sizes (number of words) + // annotation markups do not matter because + // they do not introduce blank spaces hence + // they do not modify the number of words found + if (Boolean.valueOf(pathVars.USE_TEXT_SIZE)){ + + String[] titleGrams = StringUtils.split(title," "); + int titlesize = titleGrams.length; + + String[] abstractcontent = StringUtils.split(text," "); + int abstractsize = abstractcontent.length; + + if(abstractsize == 1){ + emptyabs++; + } + + vectorArff.append(titlesize).append(",").append(abstractsize).append(","); + } + + //fill ID of documents + if(Boolean.valueOf(pathVars.USE_DOC_ID)){ + + if(paperID.length()>0){ + vectorArff.append(paperID).append(","); + } + else{ + vectorArff.append("0,"); + } + } + + //fill values of journal titles + if(Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)){ + + for(int i = 0; i < journalTitles.size(); i++){ + String jfeat = ""; + int jfeatcount = 0; + jfeat = journalTitles.get(i).replaceFirst(" ", ""); + 
+ if(jTitle.contains(jfeat)){ + jfeatcount = StringUtils.countMatches(jTitle, jfeat); + vectorArff.append(jfeatcount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + } + + // fill values of annotation types taken into account + // either only the abstract or abstract and title + // adds on vector the count of occurrences + if (Boolean.valueOf(pathVars.USE_ANNOTATION_FEATURE)){ + + for(int i = 0; i < annotations.size(); i++){ + String anfeat = ""; + int anfeatcount = 0; + anfeat = annotations.get(i).replaceFirst(" ", "").toLowerCase(); + + //in case the text has current annotation + if (text.contains(anfeat)){ + //check the count of the annotation + if((Boolean.valueOf(pathVars.USE_TITLE_FEATURE))){ + anfeatcount = StringUtils.countMatches(text, anfeat); + } + //adding title annot count to annotations + else if (!(Boolean.valueOf(pathVars.USE_TITLE_FEATURE))){ + anfeatcount = StringUtils.countMatches(text, anfeat); + //in case title has annotation, add to count + if(title.contains(anfeat)){ + anfeatcount = anfeatcount + StringUtils.countMatches(title, anfeat); + } + } + vectorArff.append(anfeatcount).append(","); + } + //handles the case that only the title (but not abstract) has current annotation + else if((!(Boolean.valueOf(pathVars.USE_TITLE_FEATURE)))){ + if(title.contains(anfeat)){ + anfeatcount = StringUtils.countMatches(title, anfeat); + } + vectorArff.append(anfeatcount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + } + + //fill values of abstract annotation types + if(Boolean.valueOf(pathVars.USE_ANNOTATION_TYPE)){ + + for(int i = 0; i < annotationsType.size(); i++){ + String antype = ""; + int antypecount = 0; + antype = annotationsType.get(i).replaceFirst(" ", "").toLowerCase(); + + if (text.contains(antype)){ + //divided by 2 to match occurance + //(count considers open and close tags) + antypecount = (StringUtils.countMatches(text, antype))/2; + vectorArff.append(antypecount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + + } + + //fill values of title annotations + if (Boolean.valueOf(pathVars.USE_TITLE_FEATURE)){ + + for( int i =0; i < titleAnnot.size(); i++){ + String titfeat = ""; + int titfeatcount = 0; + titfeat = titleAnnot.get(i).replaceFirst(" ", "").toLowerCase(); + + if (title.contains(titfeat)){ + titfeatcount = StringUtils.countMatches(title, titfeat); + vectorArff.append(titfeatcount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + } + + if(Boolean.valueOf(pathVars.USE_ECNUM_FEATURE)){ + + for(int i = 0; i < ecnumbers.size(); i++){ + String ecfeat = ""; + int ecnumcount = 0; + ecfeat = ecnumbers.get(i); + + if(ecnum.contains(ecfeat)){ + ecnumcount = StringUtils.countMatches(ecnum, ecfeat); + vectorArff.append(ecnumcount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + } + + // fill only values of title ngrams + if(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)){ + + String cleanTitle = removeTags(title.toLowerCase()); + + for( int i =0; i < titleGrams.size(); i++){ + String titgram = ""; + int titgramcount = 0; + titgram = titleGrams.get(i).toLowerCase(); + + //in case the title has current ngram + if (cleanTitle.contains(titgram)){ + //check the count of the ngram + titgramcount = StringUtils.countMatches(cleanTitle, titgram); + + //adding weight to current ngram count + if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)){ + titgramcount = applyWeight(titgramcount, Integer.parseInt(pathVars.WEIGHT)); + } + vectorArff.append(titgramcount).append(","); + } + else{ + vectorArff.append("0,"); + } + } 
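// Worked example (added): with USE_WEIGHTED_NGRAM=true and WEIGHT=3, a
// title ngram counted twice is written as 2 * 3 = 6 (see applyWeight below);
// with weighting disabled the raw count 2 is written instead.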
+ } + + // fill values of ngrams + if (Boolean.valueOf(pathVars.USE_NGRAM_FEATURE)){ + String cleanText = removeTags(text.toLowerCase()); + String cleanTitle = removeTags(title.toLowerCase()); + + for( int i = 0; i < nGrams.size(); i++){ + String ngramfeat = ""; + int ngramcount = 0; + ngramfeat = nGrams.get(i).toLowerCase(); + + //in case the text has current ngram + if (cleanText.contains(ngramfeat)){ + //check the count of the ngram + if(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)){ + ngramcount = StringUtils.countMatches(cleanText, ngramfeat); + + //adding weight to current ngram count + if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)){ + ngramcount = applyWeight(ngramcount, Integer.parseInt(pathVars.WEIGHT)); + } + } + //checking if title ngrams should be added to the count + else if(!(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS))){ + ngramcount = StringUtils.countMatches(cleanText, ngramfeat); + + //in case title has ngram, add to count + if(cleanTitle.contains(ngramfeat)){ + ngramcount += StringUtils.countMatches(cleanTitle, ngramfeat); + } + + //adding weight to current ngram count + if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)){ + ngramcount = applyWeight(ngramcount, Integer.parseInt(pathVars.WEIGHT)); + } + } + + vectorArff.append(ngramcount).append(","); + } + ////handles the case that only the title (but not abstract) has current ngram + else if (!(cleanText.contains(ngramfeat))){ + //in case only the title has the ngram, add to count + if(cleanTitle.contains(ngramfeat)){ + ngramcount = StringUtils.countMatches(cleanTitle, ngramfeat); + + //adding weight to ngram count + if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)){ + ngramcount = applyWeight(ngramcount, Integer.parseInt(pathVars.WEIGHT)); + } + } + vectorArff.append(ngramcount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + } + + + //if(exp == 0){ + if (classTriage.contains("positive")){ + vectorArff.append("positive"); + //vectorArff.append("?"); + } + else { + vectorArff.append("negative"); + //vectorArff.append("?"); + } + //} + + /*else if (exp == 1){ + vectorArff.append("?"); + } */ + + return vectorArff.toString(); + } + + /** + * Cleans a given String from special characters + * + * @param str String to be cleaned + * @return String without special characters + */ + + public String removeSpecialChar(String str){ + str = str.replace("}", ""); + str = str.replace("{", ""); + str = str.replace("]", ""); + str = str.replace("[", ""); + str = str.replace("#", ""); + str = str.replace("*", ""); + str = str.replace(">", ""); + str = str.replace(""", ""); + str = str.replace("&apos", ""); + str = str.replace("&", ""); + str = str.replace("%", ""); + str = str.replace("/", ""); + str = str.replace("\\", ""); + str = str.replace("&", ""); + str = str.replace("=", ""); + str = str.replace("?", ""); + str = str.replace(",", ""); + str = str.replace(":", ""); + str = str.replace(";", ""); + str = str.replace(".", ""); + str = str.replace(")", ""); + str = str.replace("(", ""); + str = str.replace("\t\t", "\t"); + str = str.replace("-", ""); + str = str.replace(" ", ""); + + return str; + } + + /** + * + * @param str + * @return + */ + public String removeTags(String str){ + String[] remove = StringUtils.split(str,""); + StringBuilder sb = new StringBuilder(); + + for(int i = 0; i < remove.length; i++){ + + if(remove[i].equalsIgnoreCase("<")){ + do{ + i++; + } + while(!(remove[i].equalsIgnoreCase(">"))); + } + else sb.append(remove[i]); + } + + return sb.toString(); + } + + public int applyWeight(int count, int 
weight){ + + if(weight > 0){ + count = count * weight; + } + return count; + } + + + public String informFeatures(ConfigConstants pathVars){ + String value = ""; + if(Boolean.valueOf(pathVars.USE_ANNOTATION_FEATURE)) + value = value + "_annotations"; + if(Boolean.valueOf(pathVars.USE_ANNOTATION_TYPE)) + value = value + "_types"; + if(Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)) + value = value + "_journal"; + if(Boolean.valueOf(pathVars.USE_TITLE_FEATURE) || Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)) + value = value + "_title"; + if(Boolean.valueOf(pathVars.USE_ECNUM_FEATURE)) + value = value + "_ecnum"; + if(Boolean.valueOf(pathVars.USE_NGRAM_FEATURE)) + value = value + "_ngrams_size"+ pathVars.NGRAM_SIZE; + if(Boolean.valueOf(pathVars.USE_NGRAM_FEATURE) && Boolean.valueOf(pathVars.NGRAM_STOP)) + value = value + "_stopwords"; + if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)) + value = value + "_weight"+ pathVars.WEIGHT; + + return value; + } + + +} diff --git a/src/classifier/.gitignore b/src/classifier/.gitignore new file mode 100644 index 0000000..b92cc15 --- /dev/null +++ b/src/classifier/.gitignore @@ -0,0 +1,3 @@ +/test.class +/train.class +/Trainer.class diff --git a/src/classifier/Trainer.java b/src/classifier/Trainer.java new file mode 100644 index 0000000..5bb317c --- /dev/null +++ b/src/classifier/Trainer.java @@ -0,0 +1,483 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + +package classifier; +import java.util.ArrayList; +import weka.classifiers.Classifier; +import weka.classifiers.Evaluation; +import weka.classifiers.bayes.NaiveBayes; +import weka.classifiers.evaluation.Prediction; +import weka.classifiers.evaluation.output.prediction.PlainText; +import weka.classifiers.functions.LibSVM; +import weka.classifiers.trees.LMT; +import weka.core.Instances; +import weka.core.Range; +import weka.core.converters.ConverterUtils.DataSource; +import weka.filters.Filter; +import weka.filters.unsupervised.attribute.Remove; +import configure.ConfigConstants; +import filter.InformedFilter; + +/** + * Trains and tests a classifier, + * executes k-fold cross validation on train data + * and outputs the classification results. 
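+ *
+ * Typical invocation (a sketch; data locations are resolved from
+ * config.cfg rather than the command line):
+ *
+ *   java classifier.Trainer -svm
+ *
+ * The single argument selects the learner: -lmt, -svm or -nb.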
+ * + * @author Hayda Almeida + * @since 2014 + * + */ + +public class Trainer { + + public static int SEED = 1; //the seed for randomizing the data + public static int FOLDS = 5; //the # of folds to generate + double[][] ranking; + String rank; + + boolean verbose = true; + + + /** + * @param args + * @throws Exception + */ + public static void main(String[] args) throws Exception { + + + String classifier = ""; + + //for(int i = 0; i < args.length; i++){ + try{ + + classifier = args[0]; + + if(classifier.length() > 1){ + if(classifier.contains("lmt")) + classifier = "lmt"; + else if(classifier.contains("svm")) + classifier = "svm"; + else classifier = "nb"; + } + + } + catch(Exception e){ + // else{ + System.out.println("A classifier must be given as argument. Use: \n" + + "-lmt -> a LMT classifier; \n" + + "-svm -> a SVM classifier; \n" + + "-nb -> a Naive Bayes classifier. "); + System.exit(0); + } + // } + + ConfigConstants pathVars = new ConfigConstants(); + Trainer evaluator = new Trainer(); + InformedFilter filter = new InformedFilter(); + Classifier cls; + + //Creating classifier + if(classifier.contains("lmt")) + cls = (Classifier) new LMT(); + else if (classifier.contains("svm")) + cls = (Classifier) new LibSVM(); + else + cls = (Classifier) new NaiveBayes(); + + //Loading train data + DataSource sourceTrain = new DataSource(pathVars.HOME_DIR + pathVars.OUTPUT_MODEL + pathVars.ARFF_TRAIN); + Instances trainData = sourceTrain.getDataSet(); + + //Flagging the class index on the training data + trainData.setClassIndex(trainData.numAttributes()-1); + System.out.println("Class index set on training data."); + + System.out.println("Training data loaded. Number of instances: " + trainData.numInstances() + "\n"); + + + //Loading test data + DataSource sourceTest = new DataSource(pathVars.HOME_DIR + pathVars.OUTPUT_MODEL + pathVars.ARFF_TEST); + Instances testData = sourceTest.getDataSet(); + + //Flagging the class index on the training data + testData.setClassIndex(trainData.numAttributes()-1); + System.out.println("Class index set on testing data."); + + System.out.println("Test data loaded. Number of instances: " + testData.numInstances() + "\n"); + + + //filter the file IDs, consider the new training set + Instances filteredTrainData = evaluator.filteredIDs(trainData); + Instances filteredTestData = evaluator.filteredIDs(testData); + + if(Boolean.valueOf(pathVars.USE_ODDS_RATIO)){ + //Calculate OddsRatio for all instances + double[] OR = evaluator.loadFeatureFilter(filteredTrainData, filter, 1, Integer.parseInt(pathVars.OR_THRESHOLD)); + + //Apply Odds Ratio filtering in instances + filteredTrainData = evaluator.applyFilter(pathVars.OR_THRESHOLD, OR, filteredTrainData); + filteredTestData = evaluator.applyFilter(pathVars.OR_THRESHOLD, OR, filteredTestData); + } + + if(Boolean.valueOf(pathVars.USE_IDF)){ + //Calculate idf for all instances + double[] idf = evaluator.loadFeatureFilter(filteredTrainData, filter, 2, Integer.parseInt(pathVars.IDF_THRESHOLD)); + + //Apply idf filtering in instances + filteredTrainData = evaluator.applyFilter(pathVars.IDF_THRESHOLD, idf, filteredTrainData); + filteredTestData = evaluator.applyFilter(pathVars.IDF_THRESHOLD, idf, filteredTestData); + } + + //Training and testing classifier + evaluator.classify(filteredTrainData, filteredTestData, cls, testData); + + } + + /** + * Loads evaluation of attributes according + * to feature selection method provided. 
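+	 * 
+	 * Example (a sketch mirroring the calls in main; the threshold
+	 * variables stand in for the parsed OR_THRESHOLD / IDF_THRESHOLD):
+	 * 
+	 *   double[] OR  = evaluator.loadFeatureFilter(filteredTrainData, filter, 1, orThreshold);
+	 *   double[] idf = evaluator.loadFeatureFilter(filteredTrainData, filter, 2, idfThreshold);
+	 * 
+	 * A returned score of 0 marks that attribute for removal in applyFilter.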
+	 * 
+	 * @param data data instances
+	 * @param filter informed filter instance
+	 * @param method identifier for selection method (1 = odds ratio, 2 = idf)
+	 * @param threshold selection method threshold
+	 * @return attribute scores under the chosen method; a score of 0 marks the attribute for removal
+	 */
+	private double[] loadFeatureFilter(Instances data, InformedFilter filter, int method, int threshold){
+
+		double[] values = new double[data.numAttributes()];
+
+		switch(method){
+
+			case 1:
+				values = filter.oddsRatio(data, threshold);
+				break;
+			case 2:
+				values = filter.idf(data, threshold);
+				break;
+		}
+
+		return values;
+	}
+
+	/**
+	 * Uses the attribute scores produced by a
+	 * selection method to remove attributes from
+	 * the dataset before the training phase.
+	 * 
+	 * @param threshold selection method threshold
+	 * @param values evaluation of attributes according to method
+	 * @param data dataset instances
+	 * @return filtered dataset instances
+	 * @throws Exception
+	 */
+	private Instances applyFilter(String threshold, double[] values, Instances data) throws Exception{
+		int numberRemoved = 0;
+
+		String indexRemove = "";
+
+		for(int i = 0; i < values.length; i++){
+			if(values[i] == 0){
+
+				int ind = i+1;
+
+				if(indexRemove.length()==0) indexRemove = ind + "";
+				else indexRemove = indexRemove + "," + ind;
+
+				numberRemoved++;
+			}
+		}
+
+		//nothing was marked for removal: hand the dataset back unchanged
+		if(numberRemoved == 0){
+			System.out.println("\n = = = = => Filter threshold did not remove any attribute.");
+			return data;
+		}
+
+		//if(verbose)
+		System.out.println("\n = = = = => Filter removed " + numberRemoved + " attributes: " + indexRemove);
+
+		Remove remove = new Remove();
+		remove.setAttributeIndices(indexRemove);
+		remove.setInvertSelection(false);
+		remove.setInputFormat(data);
+
+		Instances dataSubset = Filter.useFilter(data, remove);
+		return dataSubset;
+	}
+
+
+	/**
+	 * Removes the ID attribute (index 1)
+	 * from a given dataset
+	 * 
+	 * @param data instances
+	 * @return filtered dataset
+	 * @throws Exception
+	 */
+	private Instances filteredIDs(Instances data) throws Exception {
+		Remove remove = new Remove();
+		//setting index to be removed
+		remove.setAttributeIndices("1");
+		remove.setInvertSelection(false);
+		remove.setInputFormat(data);
+
+		Instances dataSubset = Filter.useFilter(data, remove);
+		return dataSubset;
+	}
+
+
+	/**
+	 * Trains and tests a classifier when two separate
+	 * datasets are provided.
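+	 * 
+	 * Sketch of the expected call (mirrors main; names are illustrative):
+	 * 
+	 *   Classifier cls = new NaiveBayes();
+	 *   evaluator.classify(filteredTrainData, filteredTestData, cls, testData);
+	 * 
+	 * The unfiltered test set travels along only so that predictions can be
+	 * printed next to the PMIDs that the ID filter removed.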
+	 * 
+	 * @param filteredTrain training data used to build the classifier
+	 * @param filteredTest test data used to evaluate the classifier
+	 * @param classif type of classifier applied
+	 * @param test unfiltered test data, kept so document IDs can be printed
+	 * @throws Exception
+	 */
+	public void classify(Instances filteredTrain, Instances filteredTest, Classifier classif, Instances test) throws Exception{
+
+		StringBuffer sb = new StringBuffer();
+		PlainText prediction = new PlainText();
+		Range attributesToShow = null;
+		prediction.setBuffer(sb);
+		prediction.setHeader(test);
+		prediction.setOutputDistribution(true);
+
+		classif.buildClassifier(filteredTrain);
+
+		Evaluation evaluateClassifier = new Evaluation(filteredTrain);
+		evaluateClassifier.evaluateModel(classif, filteredTest, prediction, attributesToShow, true);
+		//evaluateClassifier.evaluateModel(classif, filteredTest);
+
+		stats(evaluateClassifier, classif);
+
+		ArrayList<Prediction> output = evaluateClassifier.predictions();
+
+		if(verbose){
+			for(int i = 0; i < output.size(); i++){
+				double act = output.get(i).actual();
+				String actual;
+				if(act == 1.0) actual = "negative"; else actual = "positive";
+
+				double pred = output.get(i).predicted();
+				String predicted;
+				if(pred == 1.0) predicted = "negative"; else predicted = "positive";
+
+				String value = test.instance(i).toString(0);
+
+				System.out.println("PMID: "+ value + "\t"
+						+ "Actual: " + actual + "\t"
+						+ "Predicted: " + predicted
+						);
+			}
+		}
+	}
+
+
+	/**
+	 * Outputs classifier results.
+	 * 
+	 * @param eval Evaluation model built by a classifier
+	 * @param classif type of classifier applied
+	 * @throws Exception
+	 */
+	public void stats(Evaluation eval, Classifier classif) throws Exception{
+		System.out.println("Number of attributes: " + eval.getHeader().numAttributes());
+		System.out.println(eval.toSummaryString("\n======== RESULTS ========\n", false));
+		System.out.println(eval.toClassDetailsString("\n\n======== Detailed accuracy by class ========\n"));
+		System.out.println(eval.toMatrixString("\n\n======== Confusion Matrix ========\n"));
+	}
+
+
+	//Training and testing costSensitive classifier
+	//evaluator.classify(trainData, testData, evaluator.classifySensitive(cls));
+
+//	/**
+//	 * Trains and tests a classifier using a
+//	 * provided Cost matrix
+//	 *
+//	 * @param classif type of classifier to be trained
+//	 * @return a CostSensitiveClassifier wrapping the given classifier and cost matrix
+//	 * @throws Exception
+//	 */
+//	public CostSensitiveClassifier classifySensitive(Classifier classif) throws Exception{
+//		CostSensitiveClassifier costSensitive = new CostSensitiveClassifier();
+//		CostMatrix matrix = new CostMatrix(2);
+//		matrix.setElement(0, 1, 4);
+//		matrix.setElement(1, 0, 1);
+//		costSensitive.setClassifier(classif);
+//		costSensitive.setCostMatrix(matrix);
+//
+//		return costSensitive;
+//	}
+
+	//Executing k-fold cross validation on filtered classifiers
+	//evaluator.crossFold(trainData, PCAclassifier);
+	//evaluator.crossFold(trainData, LSAclassifier);
+
+//	/**
+//	 * Executes k-fold cross validation
+//	 * on a given dataset
+//	 * @param data training data provided
+//	 * @param classif type of classifier used
+//	 * @throws Exception
+//	 */
+//	public void crossFold(Instances data, Classifier classif) throws Exception{
+//
+//		Random random = new Random(SEED); //creating seed number generator
+//		Evaluation evaluateClassifier = new Evaluation(data);
+//
+//		System.out.println("Classifier working...\n\n");
+//		//The classifier should not be trained before cross-validation is executed,
+//		//because subsequent calls to the buildClassifier method would otherwise return the same results.
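+//		//(crossValidateModel makes a fresh copy of the classifier for each of
+//		// the FOLDS folds, and SEED fixes the shuffle so runs are reproducible.)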
+// evaluateClassifier.crossValidateModel(classif, data, FOLDS, random); +// +// stats(evaluateClassifier, classif); +// } + + + //Creating filtered classifiers + //AttributeSelectedClassifier PCAclassifier = evaluator.setPCAFilter(cls); + //AttributeSelectedClassifier LSAclassifier = evaluator.setLSAFilter(cls); + //AttributeSelectedClassifier GRclassifier = evaluator.setGRFilter(cls); + //AttributeSelectedClassifier Corrclassifier = evaluator.setCorrFilter(cls); + +// /** +// * Implements a Filtered GainRatio classifier, +// * using the ranker as a search method. +// * +// * @param classif type of classifier to be used +// * @return filtered classif with Correlation analysis +// */ +// public AttributeSelectedClassifier setGRFilter(Classifier classif){ +// AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); +// +// //Creating evaluator and search method +// GainRatioAttributeEval GR = new GainRatioAttributeEval(); +// Ranker rank = new Ranker(); +// //return the attributes with evaluation greater than 0 +// double threshold = 0.0; +// rank.setThreshold(threshold); +// +// //Setting GainRatio filtered classifier +// fClassif.setClassifier(classif); +// fClassif.setEvaluator(GR); +// fClassif.setSearch(rank); +// +// return fClassif; +// +// } +// +// /** +// * Implements a Filtered Correlation classifier, +// * using the ranker as a search method. +// * +// * @param classif type of classifier to be used +// * @return filtered classif with Correlation analysis +// */ +// public AttributeSelectedClassifier setCorrFilter(Classifier classif){ +// AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); +// +// //Creating evaluator and search method +// CorrelationAttributeEval Corr = new CorrelationAttributeEval(); +// Ranker rank = new Ranker(); +// +// //return the attributes with evaluation greater than 0 +// double threshold = 0.03; +// rank.setThreshold(threshold); +// +// //Setting GainRatio filtered classifier +// fClassif.setClassifier(classif); +// fClassif.setEvaluator(Corr); +// fClassif.setSearch(rank); +// +// return fClassif; +// +// } +// +// /** +// * Implements a Filtered PCA classifier, +// * using the ranker as a search method. 
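+//	 * 
+//	 * (All of these wrappers follow the same sketch:
+//	 *    AttributeSelectedClassifier asc = new AttributeSelectedClassifier();
+//	 *    asc.setClassifier(base);
+//	 *    asc.setEvaluator(eval);    //GainRatio, Correlation, PCA or LSA
+//	 *    asc.setSearch(new Ranker());
+//	 *  only the evaluator and the ranker threshold change per method.)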
+// * +// * @param classif type of classifier to be used +// * @return filtered classif with PCA analysis config +// */ +// public AttributeSelectedClassifier setPCAFilter(Classifier classif){ +// AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); +// +// //Creating evaluator and search method +// PrincipalComponents PCA = new PrincipalComponents(); +// PCA.setMaximumAttributeNames(-1); +// Ranker rank = new Ranker(); +// //return the attributes with evaluation greater than 0 +// rank.setThreshold(0); +// +// //Setting the PCA classifier configurations +// fClassif.setClassifier(classif); +// fClassif.setEvaluator(PCA); +// fClassif.setSearch(rank); +// +// return fClassif; +// } +// +// /** +// * Implements a Filtered LSA classifier, +// * using the ranker as a search method +// * @param classif +// * @return +// */ +// private AttributeSelectedClassifier setLSAFilter(Classifier classif) { +// AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); +// +// //Creating evaluator +// LatentSemanticAnalysis LSA = new LatentSemanticAnalysis(); +// LSA.setMaximumAttributeNames(-1); +// //value between 0 and 1 includes proportion of total latent variables +// //greater than 1 = exact # of variables to include; +// //less than or equal zero = include all; +// //default = 0.95 (proportional) +// double defaul = 0; +// LSA.setRank(defaul); +// //Creating search method +// Ranker rank = new Ranker(); +// rank.setThreshold(0); +// +// //Setting the LSA classifier configurations +// fClassif.setClassifier(classif); +// fClassif.setEvaluator(LSA); +// fClassif.setSearch(rank); +// +// return fClassif; +// } + + + +} diff --git a/src/configure/.gitignore b/src/configure/.gitignore new file mode 100644 index 0000000..26ecd44 --- /dev/null +++ b/src/configure/.gitignore @@ -0,0 +1,2 @@ +/DeprecatedVariables.class +/PathConstants.class diff --git a/src/configure/ConfigConstants.java b/src/configure/ConfigConstants.java new file mode 100644 index 0000000..eb6b602 --- /dev/null +++ b/src/configure/ConfigConstants.java @@ -0,0 +1,220 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*** +* This class re-uses https://code.google.com/p/semlinker/source/browse/trunk/src/configure/NistKBPConfiguration.java +* The code authors: Eric Charton http://www.echarton.com twitter.com/ericcharton +* Marie-Jean Meurs http://mjmrsc.com/research/ twitter.com/mjmrsc +* +* This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License +* as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. +*/ + +package configure; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.HashMap; +import java.util.logging.Level; +import java.util.logging.Logger; + +/** + * + * Variables used by the software + * + * @author Hayda Almeida, Marie-Jean Meurs + * @since 2014 + * + */ +public class ConfigConstants { + + public static String CONFIG_FILE = "config.cfg"; + + /** + * Default constructor + */ + public ConfigConstants() { + initVars(); + } + + /** + * Constructor with custom parameter file. + * @param configfile + */ +// public ConfigConstants(String configfile) { +// CONFIG_FILE = configfile; +// initVars(); +// } + + + //public static String CONFIG_FILE = "config.cfg"; + public HashMap CONFIG_MAP = new HashMap(); + + //Input files + public String HOME_DIR; + public String CORPUS_DIR; + public String DUP_DIR; + public String POS_DIR; + public String NEG_DIR; + public String TRAIN_DIR; + public String TEST_DIR; + public String FEATURE_DIR; + public String OUTPUT_MODEL; + + public String TRAINING_FILE; + public String TEST_FILE; + public String ARFF_TRAIN; + public String ARFF_TEST; + public String STOP_LIST; + + //corpus sampling + public String SAMPLE_TRAIN; + public String SAMPLE_TEST; + public String PERCT_TEST; + public String PERCT_POS_TRAIN; + public String PERCT_POS_TEST; + + //Output files + public String JOURNAL_TITLE_FEATURES; + public String ECNUM_FEATURES; + public String ANNOTATION_FEATURES; + public String TITLE_FEATURES; + public String NGRAM_FEATURES; + public String TITLE_NGRAMS; + public String DOC_IDS; + + //Feature setup + public String USE_TEXT_SIZE; + public String USE_JOURNAL_TITLE_FEATURE; + public String USE_ECNUM_FEATURE; + public String FEATURE_MIN_FREQ; + public String FEATURE_MIN_LENGTH; + + //Feature setup - Annotations + public String USE_ANNOTATION_FEATURE; + public String USE_ANNOTATION_TYPE; + public String USE_TITLE_FEATURE; + public String USE_DOC_ID; + + //Feature setup - Ngrams + public String USE_NGRAM_FEATURE; + public String USE_TITLE_NGRAMS; + public String NGRAM_STOP; + public String NGRAM_SIZE; + public String USE_WEIGHTED_NGRAM; + public String WEIGHT; + + //Feature filtering + public String USE_ODDS_RATIO; + public String OR_THRESHOLD; + public String USE_IDF; + public String IDF_THRESHOLD; + + //Task setup + public String EXP_TYPE; + public String NB_PARAMS; + + + private void initVars() { + String text = null; + + try { + BufferedReader reader = new BufferedReader(new InputStreamReader( + this.getClass().getClassLoader().getResourceAsStream(CONFIG_FILE))); + //BufferedReader reader = new BufferedReader(new InputStreamReader(in)); + while ((text = reader.readLine()) != null) { + if (! 
text.startsWith("#")) { + String label = text.split("=")[0]; + String value = text.split("=")[1]; + CONFIG_MAP.put(label, value); + } + } + reader.close(); + } catch (IOException ex) { + Logger.getLogger(ConfigConstants.class.getName()).log(Level.SEVERE, null, ex); + } + HOME_DIR = CONFIG_MAP.get("HOME_DIR"); + CORPUS_DIR = CONFIG_MAP.get("CORPUS_DIR"); + DUP_DIR = CONFIG_MAP.get("DUP_DIR"); + POS_DIR = CONFIG_MAP.get("POS_DIR"); + NEG_DIR = CONFIG_MAP.get("NEG_DIR"); + TRAIN_DIR = CONFIG_MAP.get("TRAIN_DIR"); + TEST_DIR = CONFIG_MAP.get("TEST_DIR"); + FEATURE_DIR = CONFIG_MAP.get("FEATURE_DIR"); + OUTPUT_MODEL = CONFIG_MAP.get("OUTPUT_MODEL"); + + TRAINING_FILE = CONFIG_MAP.get("TRAINING_FILE"); + TEST_FILE = CONFIG_MAP.get("TEST_FILE"); + ARFF_TRAIN = CONFIG_MAP.get("ARFF_TRAIN"); + ARFF_TEST = CONFIG_MAP.get("ARFF_TEST"); + STOP_LIST = CONFIG_MAP.get("STOP_LIST"); + + SAMPLE_TRAIN = CONFIG_MAP.get("SAMPLE_TRAIN"); + SAMPLE_TEST = CONFIG_MAP.get("SAMPLE_TEST"); + PERCT_TEST = CONFIG_MAP.get("PERCT_TEST"); + PERCT_POS_TRAIN = CONFIG_MAP.get("PERCT_POS_TRAIN"); + PERCT_POS_TEST = CONFIG_MAP.get("PERCT_POS_TEST"); + + JOURNAL_TITLE_FEATURES = CONFIG_MAP.get("JOURNAL_TITLE_FEATURES"); + ECNUM_FEATURES = CONFIG_MAP.get("ECNUM_FEATURES"); + ANNOTATION_FEATURES = CONFIG_MAP.get("ANNOTATION_FEATURES"); + TITLE_FEATURES = CONFIG_MAP.get("TITLE_FEATURES"); + NGRAM_FEATURES = CONFIG_MAP.get("NGRAM_FEATURES"); + TITLE_NGRAMS = CONFIG_MAP.get("TITLE_NGRAMS"); + DOC_IDS = CONFIG_MAP.get("DOC_IDS"); + + USE_TEXT_SIZE = CONFIG_MAP.get("USE_TEXT_SIZE"); + USE_JOURNAL_TITLE_FEATURE = CONFIG_MAP.get("USE_JOURNAL_TITLE_FEATURE"); + USE_ECNUM_FEATURE = CONFIG_MAP.get("USE_ECNUM_FEATURE"); + FEATURE_MIN_FREQ = CONFIG_MAP.get("FEATURE_MIN_FREQ"); + FEATURE_MIN_LENGTH = CONFIG_MAP.get("FEATURE_MIN_LENGTH"); + + USE_ANNOTATION_FEATURE = CONFIG_MAP.get("USE_ANNOTATION_FEATURE"); + USE_ANNOTATION_TYPE = CONFIG_MAP.get("USE_ANNOTATION_TYPE"); + USE_TITLE_FEATURE = CONFIG_MAP.get("USE_TITLE_FEATURE"); + USE_DOC_ID = CONFIG_MAP.get("USE_DOC_ID"); + + USE_NGRAM_FEATURE = CONFIG_MAP.get("USE_NGRAM_FEATURE"); + USE_TITLE_NGRAMS = CONFIG_MAP.get("USE_TITLE_NGRAMS"); + NGRAM_STOP = CONFIG_MAP.get("NGRAM_STOP"); + NGRAM_SIZE = CONFIG_MAP.get("NGRAM_SIZE"); + USE_WEIGHTED_NGRAM = CONFIG_MAP.get("USE_WEIGHTED_NGRAM"); + WEIGHT = CONFIG_MAP.get("WEIGHT"); + + USE_ODDS_RATIO = CONFIG_MAP.get("USE_ODDS_RATIO"); + OR_THRESHOLD = CONFIG_MAP.get("OR_THRESHOLD"); + USE_IDF = CONFIG_MAP.get("USE_IDF"); + IDF_THRESHOLD = CONFIG_MAP.get("IDF_THRESHOLD"); + + EXP_TYPE = CONFIG_MAP.get("EXP_TYPE"); + NB_PARAMS = CONFIG_MAP.get("NB_PARAMS"); + + } +} diff --git a/src/filter/InformedFilter.java b/src/filter/InformedFilter.java new file mode 100644 index 0000000..4b125db --- /dev/null +++ b/src/filter/InformedFilter.java @@ -0,0 +1,182 @@ +package filter; + +import weka.core.Attribute; +import weka.core.Instances; + +/** + * This class implements informed feature selection + * methods, to be used as filters after vector + * generation and pre-model building + * + * @author Hayda Almeida + * @since 2015 + * + */ +public class InformedFilter { + + private boolean verbose = true; + + /** + * Calculates oddsRatio of each feature + * in a given set of Instances + * + * @param data set of instances, read from ARFF file + * @return oddsRatio for each attribute in the matrix + */ + public double[] oddsRatio(Instances data, int threshold){ + + double[] oddsRatio = new double[data.numAttributes()]; + + + for(int i = 0; i < data.numAttributes()-1; 
i++ ){ + + double OR = 0; + + Attribute current = data.attribute(i); + double pos_docs = 0, //number of documents in class C + pos_oc = 0, //number of times term t occured in class C + pos_term_docs = 0, //number of docs in class C that have term + pos_not_docs = 0, //number of docs in class C that do not have term + neg_term_docs = 0, //number of docs not in class C with term + neg_not_docs = 0, //number of docs not in class C nor with term + neg_docs = 0; //number of documents not in class C + + for(int j = 0; j < data.size(); j++){ + + double current_value = data.instance(j).value(current); + double current_class = data.instance(j).classValue(); + + //class is positive + if(current_class < 1){ + pos_docs = pos_docs + 1; + + //the feature occurred in the document + if(current_value > 0){ + pos_oc = pos_oc + current_value; + pos_term_docs = pos_term_docs +1; + } + //the feature did not occur in positive docs + else pos_not_docs = pos_not_docs + 1; + } + //class is negative + else{ + neg_docs = neg_docs+1; + + //the feature occurred in the document + if(current_value > 0){ + neg_term_docs = neg_term_docs +1; + } + //the feature did not occur in negative docs + else neg_not_docs = neg_not_docs + 1; + } + + } + + OR = ( ( (pos_term_docs / pos_docs) / (pos_not_docs/ pos_docs) ) / + ( (neg_term_docs / neg_docs) / (neg_not_docs / neg_docs) ) ); + + // OR = (pos_term_docs / pos_not_docs) / (neg_term_docs / neg_not_docs); + + + //99% confidence: 2.575 + //95% confidence: 1.96 + double confidenceLow = Math.exp(Math.log(OR) - (1.96 * Math.sqrt((1/pos_term_docs) + (1/pos_not_docs) + (1/neg_term_docs) + (1/neg_not_docs)))); + double confidenceHigh = Math.exp(Math.log(OR) + (1.96 * Math.sqrt((1/pos_term_docs) + (1/pos_not_docs) + (1/neg_term_docs) + (1/neg_not_docs)))); + + //checking if OR value is within the confidence interval + //and if it satisfies the threshold + if( ((OR <= confidenceHigh) && (OR >= confidenceLow) + && !(OR == threshold)) + //checking if the confidence interval holds the null hypothesis (i.e., spans 1.0) + && !(confidenceLow <=1 && confidenceHigh >=1)) + oddsRatio[i] = OR; + else + oddsRatio[i] = 0; + + if(verbose){ + System.out.println("Attribute: "+ data.attribute(i).toString() +"\t\t OddsRatio: " + oddsRatio[i] + + "\tConfidenceLow: " + confidenceLow + "\tConfidenceHigh: "+ confidenceHigh); + } + } + + return oddsRatio; + } + + /** + * Calculates the inverse document frequency + * for each attribute in the dataset. 
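+	 * 
+	 * As implemented below, the score is computed over the whole collection:
+	 * 
+	 *   idf(t) = ln( (pos_docs + neg_docs) / (pos_term_docs + neg_term_docs) )
+	 * 
+	 * For instance, a term present in 10 of 100 documents scores ln(10) ~ 2.30;
+	 * attributes scoring at or below the threshold are zeroed out for removal.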
+ * + * @param data instances + * @param threshold + * @return list of idfs for each attribute + */ + public double[] idf(Instances data, int threshold){ + + double[] idf = new double[data.numAttributes()]; + + for(int i = 0; i < data.numAttributes()-1; i++ ){ + + double idf_at = 0; + double idf_at2 = 0; + + Attribute current = data.attribute(i); + double pos_docs = 0, //number of documents in class C + pos_term_docs = 0, //number of docs in class C that have term + neg_term_docs = 0, //number of docs not in class C with term + neg_docs = 0; //number of documents not in class C + + for(int j = 0; j < data.size(); j++){ + + double current_value = data.instance(j).value(current); + double current_class = data.instance(j).classValue(); + + //class is positive + if(current_class < 1){ + pos_docs = pos_docs + 1; + + //the feature occurred in the document + if(current_value > 0){ + pos_term_docs = pos_term_docs +1; + } + } + else{ + //class is negative + neg_docs = neg_docs+1; + + //the feature occurred in the document + if(current_value > 0){ + neg_term_docs = neg_term_docs +1; + } + } + } + +// double idf_pos = Math.log((pos_docs)/(pos_term_docs)); +// double idf_neg = Math.log((neg_docs)/(neg_term_docs)); + + //check if the idf in the "positive" collection + //is greater than the idf in the "negative" collection +// if (idf_pos > idf_neg) +// idf_at = idf_pos; +// +// else idf_at = 0; + + idf_at = Math.log((pos_docs + neg_docs)/(pos_term_docs + neg_term_docs)); + + if(idf_at <= threshold) + idf[i] = 0; + else + idf[i] = idf_at; + } + + if(verbose){ + for(int i = 0; i < idf.length; i++){ + if(idf[i]>0) + System.out.println("Attribute: "+ data.attribute(i).toString()+ "\t\t\t IDF: " + idf[i]); + } + } + + return idf; + } + + +} diff --git a/src/filter/NaiveFilter.java b/src/filter/NaiveFilter.java new file mode 100644 index 0000000..761787c --- /dev/null +++ b/src/filter/NaiveFilter.java @@ -0,0 +1,139 @@ +package filter; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; + +import configure.ConfigConstants; + +/** + * + * This class implements naive feature filtering methods + * to be used by the extractor processes pre-vector building + * + * @author Hayda Almeida + * @since 2015 + * + */ +public class NaiveFilter { + + + private boolean verbose = true; + private String[] stopWords; + + + /** + * Removes from feature list all features with + * frequency not statistically relevant (2 or less) + * @param list to be cleaned + */ + public void considerAnnotationOccurence(HashMap,Integer> list, ConfigConstants vars){ + //going over the list of annotations and removing the + //features with occurance lower than specified. 
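+
+		//(Iterator.remove() is used below because removing entries through the
+		// map itself while iterating would throw ConcurrentModificationException.)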
+ + Iterator> iterator = list.keySet().iterator(); + + while(iterator.hasNext()){ + Map key = iterator.next(); + int valor = list.get(key).intValue(); + + if(valor < Integer.parseInt(vars.FEATURE_MIN_FREQ)){ + iterator.remove(); + } + } + } + + /** + * Removes from feature list all features with + * frequency not statistically relevant (2 or less) + * @param list to be cleaned + */ + public void considerOccurence(HashMap list, ConfigConstants vars){ + //going over the list of annotations and removing the + //statistically not significant features - frequency less than 2 + Iterator iterator = list.values().iterator(); + + while(iterator.hasNext()){ + Integer key = iterator.next(); + + if(key < Integer.parseInt(vars.FEATURE_MIN_FREQ)){ + iterator.remove(); + } + } + } + + /** + * Load the list of PubMed stopwords + * @param path file with stopwords list + * @return list of stopwords + */ + public void loadStopWords(String path){ + + StringBuilder cleaned = new StringBuilder(); + + try{ + + BufferedReader reader = new BufferedReader(new FileReader(path)); + + String line = null; + //loading stop-words list + while((line = reader.readLine()) != null){ + this.stopWords = StringUtils.split(line,","); + System.out.println(""); + //line = reader.readLine(); + } + + reader.close(); + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + + } + + /** + * Removes stopwords from ngrams list + * + * @param str list of ngrams + * @param constants + * @return cleaned list of ngrams + */ + public String removeStopList(String[] str){ + + StringBuilder cleaned = new StringBuilder(); + + //iteraing over text to be cleaned + for(int i = 0; i < str.length; i++){ + //iterating over stop-words list + for(int j = 0; j < this.stopWords.length; j++){ + + //when stop-word is encountered, replace it + if(str[i].equalsIgnoreCase(this.stopWords[j])){ + str[i] = str[i].replace(str[i],"*"); + } + } + //retrieve the text without stop-words replacements + if(!(str[i].contentEquals("*"))){ + cleaned.append(str[i]).append(" "); + } + } + return cleaned.toString().replace(" ", " "); + } + + public String[] getStopWords() { + return stopWords; + } + + public void setStopWords(String[] stopWords) { + this.stopWords = stopWords; + } + +} diff --git a/src/preprocessing/ConcatXML.java b/src/preprocessing/ConcatXML.java new file mode 100644 index 0000000..89e255f --- /dev/null +++ b/src/preprocessing/ConcatXML.java @@ -0,0 +1,717 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + +package preprocessing; + +import java.io.BufferedOutputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.FilenameFilter; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.PrintWriter; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.Date; +import java.util.List; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import configure.PathConstants; + +/** + * Generates a corpus from raw XML doc instances, + * so that features can be extracted from it + * + * @author Hayda Almeida + * @since 2014 + * + */ +public class ConcatXML{ + + private String tag1; + private String tag2; + private String tag3; + private String tag4; + private String id; + private String corpusTag; + private String corpusTagC; + + + public ConcatXML(){ + + this.setId("PMID"); + this.setTag1("(?s)<.*?xml.*?>"); + this.setTag2("(?s)<.*?!DOCTYPE.*?>"); + this.setTag3("(?s)<.*?corpus.*?>"); + this.seTag4("(?s)<.*?/corpus.*?>"); + this.setCorpusTag(""); + this.setCorpusTag(""); + } + + + + public static void main(String[] args) throws Exception { + + PathConstants pathVars = new PathConstants(); + + String xmlDir = ""; + if(Integer.parseInt(pathVars.EXP_TYPE)== 1) + xmlDir = "test"; + else xmlDir = "train"; + + String sourceDir = "", duplicatesDir = ""; + + Boolean dc = false, df = false, cl = false, cc = false; + + for(int i = 0; i < args.length; i++){ + try{ + if(args[i].matches("-dc")) dc = true; + if(args[i].matches("-df")) df = true; + if(args[i].matches("-cl")) cl = true; + if(args[i].matches("-cc")) cc = true; + } + catch(Exception e){ + System.out.println("Use: \n" + + "-tr -> train, -ts -> test; \n " + + "-dc -> check duplicates in corpus vs. 
folder; \n " + + "-df -> check duplicates in two folders; \n" + + "-cl -> clean a source folder; \n" + + "-cc -> concatenate files in a folder "); + System.exit(0); + }; + } + + String timeStamp = new SimpleDateFormat("yyyyMMdd_hh:mm").format(new Date()); + String trainCorpusPath = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAINING_FILE; + + sourceDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + xmlDir; + duplicatesDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.DUP_DIR; + + String concatCorpus = pathVars.HOME_DIR + pathVars.CORPUS_DIR +"triagecorpus_"+ xmlDir +"_"+timeStamp+".xml"; + String tagCorpus = concatCorpus; + + ConcatXML concat = new ConcatXML(); + + //================= Checking for duplicates =====================// + if(dc) concat.checkDupCorpus(trainCorpusPath, sourceDir); + if(df) concat.checkDupFolder(sourceDir, duplicatesDir); + + //================== Creating corpus ==========================// + if(cl){ + concat.cleanXML(sourceDir); + if(duplicatesDir.length()>1) + concat.cleanXML(duplicatesDir); + } + if(cc){ + concat.concatenateXML(sourceDir, "", concatCorpus); + concat.tagCorpus(tagCorpus); + } + } + + /** + * Returns the ID of a XML jsoup document + * @param doc a XML doc parsed by jsoup + * @return ID string + * @throws IOException + */ + public String returnID(Document doc) throws IOException{ + + String id = ""; + + Elements paper = doc.body().getElementsByTag("pubmedarticleset"); + + //fetching the paper ID - + //for all items in a paper, retrieve only PMIDs + for(Element e : paper.select(getId())){ + //only consider the ID if the parent is medline citation + if(e.parentNode().nodeName().contains("medline")){ + id = e.text(); + } + } + return id; + } + + /** + * Reads the file IDs in a folder and + * checks a second folder for duplicates. 
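+	 * 
+	 * Sketch of the intended call (paths are illustrative):
+	 * 
+	 *   concat.checkDupFolder("corpus/train", "corpus/duplicates");
+	 * 
+	 * Files in the second folder whose PMIDs already occur in the first are
+	 * renamed in place with a ".duplicated" suffix.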
+ * + * @param dirSrc source folder + * @param dirDup folder to check for duplicates + */ + + public void checkDupFolder(String dirSrc, String dirDup){ + ArrayList sourceIDs = new ArrayList(); + ArrayList duplicated = new ArrayList(); + ArrayList dupIDs = new ArrayList(); + int ids = 0; + + if(dirSrc.contentEquals(dirDup)){ + System.out.println("Source and duplicates directories are the same.\n\n========================\n"); + } + else { + + File sourceDir = new File(dirSrc); + File[] srcXMLs = sourceDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + try{ + //for each file on the source dir + for (File xml : srcXMLs){ + + try{ + + String id = ""; + //Loading file + File input = new File(xml.getPath()); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); + + //fetching the document ID + id = returnID(doc); + + if(!id.isEmpty()){ + sourceIDs.add(id); + ids++; + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(Exception e){ + throw new RuntimeException(e); + } + + System.out.println(ids + " source file IDs encountered."); + ids = 0; + + File dupDir = new File(dirDup); + + File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + try{ + //for each file on the possibly duplicated dir + for (File xml : dupXMLs){ + + try{ + String id = ""; + //Loading file + File input = new File(xml.getPath()); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); + + //fetching the document ID + id = returnID(doc); + + if(!id.isEmpty()){ + dupIDs.add(id); + String dupFileID = id; + ids++; + + for(int j = 0; j < sourceIDs.size(); j++){ + if(sourceIDs.get(j).equalsIgnoreCase(dupFileID)){ + + //add ID to duplicated list + duplicated.add(dupFileID); + + //rename the original file + Path from = xml.toPath(); //convert from File to Path + Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path + Files.move(from, to, StandardCopyOption.REPLACE_EXISTING); + } + } + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(Exception e){ + throw new RuntimeException(e); + } + + //count number of existing papers on possibly duplicated folder + //just to make sure we are gathering all IDs + System.out.println(ids + " new file IDs encountered."); + ids = 0; + //sorting the list of duplicated IDs + Collections.sort(duplicated, new Comparator(){ + @Override + public int compare(String one, String two){ + return one.compareTo(two); + } + }); + + System.out.println("\nReaded source files: " + sourceIDs.size()); + System.out.println("Readed new files: " + dupIDs.size()); + + System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n"); + + System.out.println("\nDuplicated files IDs: "); + for(int i = 0; i < duplicated.size(); i++){ + System.out.println(duplicated.get(i)); + } + + System.out.println("\n========================\n"); + } + + + } + + /** + * Reads the corpus and checks the papers IDs + * to identify duplicates in case new papers + * are being concatenated to corpus. 
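+	 * 
+	 * Sketch (paths illustrative): checkDupCorpus("corpus/triagecorpus_train.xml",
+	 * "corpus/train") renames any new file whose PMID already appears in the
+	 * corpus, so only unseen papers survive to the concatenation step.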
+ * + * @param corpus path to current corpora to check + * @param dir path to folder with new files to be concatenated + */ + + public void checkDupCorpus(String corpus, String dir){ + ArrayList trainingIDs = new ArrayList(); + ArrayList duplicated = new ArrayList(); + ArrayList newFiles = new ArrayList(); + + int ids = 0; + + try + { + File input = new File(corpus); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); + Elements corp = doc.body().getElementsByTag("pubmedarticleset"); + + String id = ""; + + for(Element paper : corp){ + Document thisDoc = Jsoup.parseBodyFragment(paper.toString()); + + //fetching the document ID + id = returnID(thisDoc); + + if(!id.isEmpty()){ + trainingIDs.add(id); + ids++; + } + } + }catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + + System.out.println(ids + " training file IDs encountered."); + ids = 0; + + File corpusDir = new File(dir); + File[] newXMLs = corpusDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + try{ + //for each file on the corpus dir + for (File xml : newXMLs){ + + try{ + String id = ""; + //Loading file + File input = new File(xml.getPath()); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); + + //fetching the document ID + id = returnID(doc); + + if(!id.isEmpty()){ + + newFiles.add(id); + String newFileID = id; + ids++; + + + for(int j = 0; j < trainingIDs.size(); j++){ + if(trainingIDs.get(j).equalsIgnoreCase(newFileID)){ + + //add ID to duplicated list + duplicated.add(newFileID); + + //moving the original file + Path from = xml.toPath(); //convert from File to Path + Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path + Files.move(from, to, StandardCopyOption.REPLACE_EXISTING); + } + } + } + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(Exception e){ + throw new RuntimeException(e); + } + + //count number of existing papers on the training file + //just to make sure we are gathering all IDs + System.out.println(ids + " new file IDs encountered."); + ids = 0; + + + //sorting the list of duplicated IDs + Collections.sort(duplicated, new Comparator(){ + @Override + public int compare(String one, String two){ + return one.compareTo(two); + } + }); + + System.out.println("\nReaded training files: " + trainingIDs.size()); + System.out.println("Readed new files: " + newFiles.size()); + + System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n"); + + System.out.println("\nDuplicated files IDs: "); + for(int i = 0; i < duplicated.size(); i++){ + System.out.println(duplicated.get(i)); + } + + System.out.println("\n========================\n"); + + } + + + /** + * Reads and edits a list of XMLs files in a folder + * to remove XML and previous corpus tags, + * preparing the files to be concatenated. + * + * @param dir string with folder path + */ + + public void cleanXML(String dir){ + + //listing files on corpus dir + File sourceDir = new File(dir); + + File[] newXMLs = sourceDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + System.out.println("... 
Files list loaded."); + + try{ + //for each file on the corpus dir + for (File xml : newXMLs){ + + try{ + BufferedReader reader = new BufferedReader(new FileReader(xml.getPath())); + + String line = null; + ArrayList allLines = new ArrayList(); + String content = null; + + while((line = reader.readLine()) != null){ + content = line; + + //cleaning XML markups + content = content.replaceFirst(getTag1(), ""); + content = content.replaceFirst(getTag2(), ""); + //cleaning previous corpus tags + content = content.replaceFirst(getTag3(), ""); + content = content.replaceFirst(getTag4(), ""); + allLines.add(content); + } + + PrintWriter writer = new PrintWriter(xml.getPath()); + + for (String l : allLines){ + writer.println(l); + } + reader.close(); + writer.close(); + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(Exception e){ + throw new RuntimeException(e); + } + + System.out.println("... Files cleaned and saved."); + System.out.println("Ready for concatenation."); + System.out.println("\n========================\n"); + } + + + + /** + * Concatenates all XMLs in one folder or between two folders. + * @param sourceDir main directory with XML files. + * @param duplicDir second directory with duplicated XML files + * @param concatFile path name to saved concatenated corpus + */ + + public void concatenateXML(String sourceDir, String duplicDir, String concatFile){ + + final int BUFFER = 1024 << 8; + byte[] buffer = new byte[BUFFER]; + + //listing files on corpus dir + File srcDir = new File(sourceDir); + File[] srcXMLs = srcDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + File dupDir = new File(duplicDir); + File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name) { + return name.endsWith(".xml"); + } + }); + + System.out.println("... Files list loaded."); + + //defining the output file (concatenated) + File newCorpus = new File(concatFile); + + try{ + OutputStream output = new BufferedOutputStream(new FileOutputStream(newCorpus)); + + + //for each file on the corpus dir + for (File xmls : srcXMLs){ + InputStream input = new FileInputStream(xmls); + int count; + + //if the file is not empty/finished + try{ + while((count = input.read(buffer)) >= 0){ + + //write it on the concatenated final file + output.write(buffer, 0, count); + } + }finally{ + input.close(); + } + } + + if(dupXMLs != null){ + for(File xmld : dupXMLs){ + InputStream input = new FileInputStream(xmld); + int count; + + //if the file is not empty/finished + try{ + while((count = input.read(buffer)) >= 0){ + + //write it on the concatenated final file + output.write(buffer, 0, count); + } + }finally{ + input.close(); + } + } + } + output.flush(); + output.close(); + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(Exception e){ + throw new RuntimeException(e); + } + + System.out.println("... 
File concatenated and saved."); + System.out.println("Ready for corpus tagging."); + System.out.println("\n========================\n"); + } + + /** + * Inserts corpus tag on XML file + * + * @param pathToCorpus path to + * concatenated corpus + */ + + public void tagCorpus(String pathToCorpus){ + + //tagging as corpus + try{ + BufferedReader reader = new BufferedReader(new FileReader(pathToCorpus)); + + String line = null; + String edit = null; + List allLines = new ArrayList(); + + //adds tag at beggining of corpus + allLines.add(getCorpusTag()); + + while((line = reader.readLine()) != null){ + + allLines.add(line); + } + //adds tag at the end of corpus + allLines.add(getCorpusTagC()); + + System.out.println("... Corpus loaded and tagged."); + //re-writting the file + PrintWriter writer = new PrintWriter(pathToCorpus); + + for (String l : allLines){ + writer.println(l); + } + reader.close(); + writer.close(); + + System.out.println("... File saved as tagged corpus."); + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(IOException e){ + e.printStackTrace(); + } + } + + private String getCorpusTagC() { + return corpusTagC; + } + + private String getCorpusTag() { + // TODO Auto-generated method stub + return corpusTag; + } + + public String getTag1() { + return tag1; + } + + public void setTag1(String tag1) { + this.tag1 = tag1; + } + + public String getTag2() { + return tag2; + } + + public void setTag2(String tag2) { + this.tag2 = tag2; + } + + private String getTag4() { + // TODO Auto-generated method stub + return tag4; + } + + private String getTag3() { + // TODO Auto-generated method stub + return tag3; + } + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + private void setCorpusTag(String string) { + this.corpusTag = string; + + } + + private void seTag4(String string) { + this.tag4 = string; + + } + + private void setTag3(String string) { + this.tag3 = string; + + } + +} + + diff --git a/src/preprocessing/CorpusHandler.java b/src/preprocessing/CorpusHandler.java new file mode 100644 index 0000000..94b5786 --- /dev/null +++ b/src/preprocessing/CorpusHandler.java @@ -0,0 +1,754 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + + +package preprocessing; + +import java.io.BufferedOutputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.FilenameFilter; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.PrintWriter; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.Date; +import java.util.List; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import configure.ConfigConstants; + +/** + * Generates a corpus from raw XML doc instances, + * so that features can be extracted from it + * + * @author Hayda Almeida + * @since 2014 + * + */ +public class CorpusHandler{ + + private String tag1; + private String tag2; + private String tag3; + private String tag4; + private String id; + private String corpusTag; + private String corpusTagC; + + + public CorpusHandler(){ + + this.setId("PMID"); + this.setTag1("(?s)<.*?xml.*?>"); + this.setTag2("(?s)<.*?!DOCTYPE.*?>"); + this.setTag3("(?s)<.*?corpus.*?>"); + this.seTag4("(?s)<.*?/corpus.*?>"); + this.setCorpusTag(""); + this.setCorpusTag(""); + } + + + + public static void main(String[] args) throws Exception { + + ConfigConstants pathVars = new ConfigConstants(); + + String xmlDir = ""; + if(Integer.parseInt(pathVars.EXP_TYPE)== 1) + xmlDir = pathVars.TEST_DIR.substring(0, pathVars.TEST_DIR.length()-1); + else xmlDir = pathVars.TRAIN_DIR.substring(0, pathVars.TRAIN_DIR.length()-1) + "_" + pathVars.PERCT_POS_TRAIN; + + String sourceDir = "", duplicatesDir = ""; + + Boolean dc = false, df = false, cl = false, cc = false; + + String param = ""; + + try{ + param = args[0]; + + if(param.length() > 1){ + if(param.contains("dc")) + dc = true; + if(param.contains("df")) + df = true; + if(param.contains("cl")) + cl = true; + if(param.contains("cc")) + cc = true; + } + } + catch(Exception e){ + System.out.println("Use: \n" + + "-dc -> check duplicates in corpus vs. 
folder; \n " + + "-df -> check duplicates in two folders; \n" + + "-cl -> clean a source folder; \n" + + "-cc -> concatenate files in a folder "); + System.exit(0); + }; + + String timeStamp = new SimpleDateFormat("yyyyMMdd_hh:mm").format(new Date()); + String trainCorpusPath = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAINING_FILE; + + sourceDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + xmlDir; + duplicatesDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.DUP_DIR; + + String concatCorpus = pathVars.HOME_DIR + pathVars.CORPUS_DIR +"triagecorpus_"+ xmlDir +"_"+timeStamp+".xml"; + String tagCorpus = concatCorpus; + + CorpusHandler concat = new CorpusHandler(); + + //================= Checking for duplicates =====================// + // + //Check for duplicates between training file and a specific folder + if(dc) concat.checkDupCorpus(trainCorpusPath, sourceDir); + // + //---------------------------------------------------- + // + //Check for duplicates between two folders (duplicates found being sinalized in duplicatesDir) + if(df) concat.checkDupFolder(sourceDir, duplicatesDir); + // + //==================== Creating corpus ==========================// + // + //Removing XML tags from files + if(cl){ + concat.cleanXML(sourceDir, xmlDir); + if(duplicatesDir.length()>1 && (dc || df)) + concat.cleanXML(duplicatesDir, xmlDir); + } + // + //------------------------------------ + // + //Concatenating files from folders and outputting a corpus file + //Inserting tag in file + if(cc){ + concat.concatenateXML(sourceDir, "", concatCorpus, xmlDir); + concat.tagCorpus(tagCorpus, xmlDir); + } + // + //===============================================================// + } + + /** + * Returns the ID of a XML jsoup document + * @param doc a XML doc parsed by jsoup + * @return ID string + * @throws IOException + */ + public String returnID(Document doc) throws IOException{ + + String id = ""; + + Elements paper = doc.body().getElementsByTag("pubmedarticleset"); + + //fetching the paper ID - + //for all items in a paper, retrieve only PMIDs + for(Element e : paper.select(getId())){ + //only consider the ID if the parent is medline citation + if(e.parentNode().nodeName().contains("medline")){ + id = e.text(); + } + } + return id; + } + + /** + * Reads the file IDs in a folder and + * checks a second folder for duplicates. 
+ * + * @param dirSrc source folder + * @param dirDup folder to check for duplicates + */ + + public void checkDupFolder(String dirSrc, String dirDup){ + ArrayList sourceIDs = new ArrayList(); + ArrayList duplicated = new ArrayList(); + ArrayList dupIDs = new ArrayList(); + int ids = 0; + + if(dirSrc.contentEquals(dirDup)){ + System.out.println("Source and duplicates directories are the same.\n\n========================\n"); + } + else { + + System.out.println("Source directory: "+ dirSrc + " \n"); + System.out.println("Duplicates directory: " + dirDup + " \n"); + + //Loading files in the source folder + File sourceDir = new File(dirSrc); + File[] srcXMLs = sourceDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + try{ + //for each file on the source dir + for (File xml : srcXMLs){ + + try{ + + String id = ""; + //Loading file + File input = new File(xml.getPath()); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); + + //fetching the document ID + id = returnID(doc); + + if(!id.isEmpty()){ + sourceIDs.add(id); + ids++; + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(Exception e){ + throw new RuntimeException(e); + } + + System.out.println(ids + " source file IDs encountered."); + ids = 0; + + File dupDir = new File(dirDup); + + //Loading files in the duplicated folder + File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + try{ + //for each file on the possibly duplicated dir + for (File xml : dupXMLs){ + + try{ + String id = ""; + //Loading file + File input = new File(xml.getPath()); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); + + //fetching the document ID + id = returnID(doc); + + if(!id.isEmpty()){ + dupIDs.add(id); + String dupFileID = id; + ids++; + + for(int j = 0; j < sourceIDs.size(); j++){ + if(sourceIDs.get(j).equalsIgnoreCase(dupFileID)){ + + //add ID to duplicated list + duplicated.add(dupFileID); + + //rename the original file + Path from = xml.toPath(); //convert from File to Path + Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path + Files.move(from, to, StandardCopyOption.REPLACE_EXISTING); + } + } + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(Exception e){ + throw new RuntimeException(e); + } + + //count number of existing papers on possibly duplicated folder + //just to make sure we are gathering all IDs + System.out.println(ids + " new file IDs encountered."); + ids = 0; + //sorting the list of duplicated IDs + Collections.sort(duplicated, new Comparator(){ + @Override + public int compare(String one, String two){ + return one.compareTo(two); + } + }); + + System.out.println("\nReaded source files: " + sourceIDs.size()); + System.out.println("Readed new files: " + dupIDs.size()); + + System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n"); + + System.out.println("\nDuplicated files IDs: "); + for(int i = 0; i < duplicated.size(); i++){ + System.out.println(duplicated.get(i)); + } + + System.out.println("\n========================\n"); + } + + + } + + /** + * Reads the corpus and checks the papers IDs + * to identify duplicates in case new papers + * are being concatenated to corpus. 
+     *
+     * @param corpus path to the current corpus to check
+     * @param dir path to the folder with new files to be concatenated
+     */
+    public void checkDupCorpus(String corpus, String dir){
+        ArrayList<String> trainingIDs = new ArrayList<String>();
+        ArrayList<String> duplicated = new ArrayList<String>();
+        ArrayList<String> newFiles = new ArrayList<String>();
+
+        int ids = 0;
+
+        try
+        {
+            System.out.println("Corpus directory: "+ corpus + " \n");
+            System.out.println("Duplicates directory: " + dir + " \n");
+
+            File input = new File(corpus);
+            //Jsoup parse
+            Document doc = Jsoup.parse(input, "UTF-8");
+            Elements corp = doc.body().getElementsByTag("pubmedarticleset");
+
+            String id = "";
+
+            for(Element paper : corp){
+                Document thisDoc = Jsoup.parseBodyFragment(paper.toString());
+
+                //fetching the document ID
+                id = returnID(thisDoc);
+
+                if(!id.isEmpty()){
+                    trainingIDs.add(id);
+                    ids++;
+                }
+            }
+        }catch (FileNotFoundException e) {
+            e.printStackTrace();
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+
+        System.out.println(ids + " training file IDs encountered.");
+        ids = 0;
+
+        File corpusDir = new File(dir);
+        File[] newXMLs = corpusDir.listFiles(new FilenameFilter(){
+            @Override
+            public boolean accept(File dir, String name){
+                return name.endsWith(".xml");
+            }
+        });
+
+        try{
+            //for each file in the corpus dir
+            for (File xml : newXMLs){
+
+                try{
+                    String id = "";
+                    //Loading file
+                    File input = new File(xml.getPath());
+                    //Jsoup parse
+                    Document doc = Jsoup.parse(input, "UTF-8");
+
+                    //fetching the document ID
+                    id = returnID(doc);
+
+                    if(!id.isEmpty()){
+
+                        newFiles.add(id);
+                        String newFileID = id;
+                        ids++;
+
+                        for(int j = 0; j < trainingIDs.size(); j++){
+                            if(trainingIDs.get(j).equalsIgnoreCase(newFileID)){
+
+                                //add ID to duplicated list
+                                duplicated.add(newFileID);
+
+                                //renaming the duplicated file
+                                Path from = xml.toPath(); //convert from File to Path
+                                Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path
+                                Files.move(from, to, StandardCopyOption.REPLACE_EXISTING);
+                            }
+                        }
+                    }
+                }catch (FileNotFoundException e) {
+                    e.printStackTrace();
+                }
+            }
+
+        }catch (FileNotFoundException e) {
+            e.printStackTrace();
+        }
+        catch(Exception e){
+            throw new RuntimeException(e);
+        }
+
+        //count the number of papers listed in the training file,
+        //just to make sure we are gathering all IDs
+        System.out.println(ids + " new file IDs encountered.");
+        ids = 0;
+
+        //sorting the list of duplicated IDs
+        Collections.sort(duplicated, new Comparator<String>(){
+            @Override
+            public int compare(String one, String two){
+                return one.compareTo(two);
+            }
+        });
+
+        System.out.println("\nTraining files read: " + trainingIDs.size());
+        System.out.println("New files read: " + newFiles.size());
+
+        System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n");
+
+        System.out.println("\nDuplicated file IDs: ");
+        for(int i = 0; i < duplicated.size(); i++){
+            System.out.println(duplicated.get(i));
+        }
+
+        System.out.println("\n========================\n");
+    }
+
+    /**
+     * Reads and edits a list of XML files in a folder
+     * to remove XML and previous corpus tags,
+     * preparing the files to be concatenated.
+     *
+     * @param dir string with folder path
+     * @param xmlDir corpus source label used in log messages
+     */
+    public void cleanXML(String dir, String xmlDir){
+
+        //listing files in the corpus dir
+        File sourceDir = new File(dir);
+
+        File[] newXMLs = sourceDir.listFiles(new FilenameFilter(){
+            @Override
+            public boolean accept(File dir, String name){
+                return name.endsWith(".xml");
+            }
+        });
+
+        System.out.println("... Files list loaded: "+ dir);
+
+        try{
+            //for each file in the corpus dir
+            for (File xml : newXMLs){
+
+                try{
+                    BufferedReader reader = new BufferedReader(new FileReader(xml.getPath()));
+
+                    String line = null;
+                    ArrayList<String> allLines = new ArrayList<String>();
+                    String content = null;
+
+                    while((line = reader.readLine()) != null){
+                        content = line;
+
+                        //cleaning XML markup
+                        content = content.replaceFirst(getTag1(), "");
+                        content = content.replaceFirst(getTag2(), "");
+                        //cleaning previous corpus tags
+                        content = content.replaceFirst(getTag3(), "");
+                        content = content.replaceFirst(getTag4(), "");
+                        allLines.add(content);
+                    }
+
+                    PrintWriter writer = new PrintWriter(xml.getPath());
+
+                    for (String l : allLines){
+                        writer.println(l);
+                    }
+                    reader.close();
+                    writer.close();
+
+                }catch (FileNotFoundException e) {
+                    e.printStackTrace();
+                }
+
+            }
+
+        }catch (FileNotFoundException e) {
+            e.printStackTrace();
+        }
+        catch(Exception e){
+            throw new RuntimeException(e);
+        }
+
+        System.out.println("... Files cleaned and saved for " + xmlDir + ".");
+        System.out.println("Ready for concatenation.");
+        System.out.println("\n========================\n");
+
+    }
+
+    /**
+     * Concatenates all XML files in one folder or between two folders.
+     * @param sourceDir main directory with XML files
+     * @param duplicDir second directory with duplicated XML files
+     * @param concatFile path name of the saved concatenated corpus
+     * @param xmlDir corpus source label used in log messages
+     */
+    public void concatenateXML(String sourceDir, String duplicDir, String concatFile, String xmlDir){
+
+        final int BUFFER = 1024 << 8;
+        byte[] buffer = new byte[BUFFER];
+
+        //listing files in the corpus dir
+        File srcDir = new File(sourceDir);
+        File[] srcXMLs = srcDir.listFiles(new FilenameFilter(){
+            @Override
+            public boolean accept(File dir, String name){
+                return name.endsWith(".xml");
+            }
+        });
+
+        File dupDir = new File(duplicDir);
+        File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){
+            @Override
+            public boolean accept(File dir, String name) {
+                return name.endsWith(".xml");
+            }
+        });
+
+        System.out.println("... Files list loaded: "+ xmlDir + ".");
+
+        //defining the output file (concatenated)
+        File newCorpus = new File(concatFile);
+
+        try{
+            OutputStream output = new BufferedOutputStream(new FileOutputStream(newCorpus));
+
+            //for each file in the corpus dir
+            for (File xmls : srcXMLs){
+                InputStream input = new FileInputStream(xmls);
+                int count;
+
+                //copy the file contents in buffered chunks
+                try{
+                    while((count = input.read(buffer)) >= 0){
+
+                        //append the chunk to the concatenated file
+                        output.write(buffer, 0, count);
+                    }
+                }finally{
+                    input.close();
+                }
+            }
+
+            if(dupXMLs != null){
+                for(File xmld : dupXMLs){
+                    InputStream input = new FileInputStream(xmld);
+                    int count;
+
+                    //copy the file contents in buffered chunks
+                    try{
+                        while((count = input.read(buffer)) >= 0){
+
+                            //append the chunk to the concatenated file
+                            output.write(buffer, 0, count);
+                        }
+                    }finally{
+                        input.close();
+                    }
+                }
+            }
+            output.flush();
+            output.close();
+
+        }catch (FileNotFoundException e) {
+            e.printStackTrace();
+        }
+        catch(Exception e){
+            throw new RuntimeException(e);
+        }
+
+        System.out.println("... File concatenated and saved for "+ xmlDir+ ".");
+        System.out.println("Ready for corpus tagging.");
+        System.out.println("\n========================\n");
+    }
+
+    /**
+     * Inserts the corpus tags into an XML file.
+     *
+     * @param pathToCorpus path to the
+     * concatenated corpus
+     * @param xmlDir corpus source label used in log messages
+     */
+    public void tagCorpus(String pathToCorpus, String xmlDir){
+
+        //tagging as corpus
+        try{
+            BufferedReader reader = new BufferedReader(new FileReader(pathToCorpus));
+
+            String line = null;
+            List<String> allLines = new ArrayList<String>();
+
+            //adds tag at the beginning of the corpus
+            allLines.add(getCorpusTag());
+
+            while((line = reader.readLine()) != null){
+
+                allLines.add(line);
+            }
+            //adds tag at the end of the corpus
+            allLines.add(getCorpusTagC());
+
+            System.out.println("... Corpus loaded and tagged.");
+            //rewriting the file
+            PrintWriter writer = new PrintWriter(pathToCorpus);
+
+            for (String l : allLines){
+                writer.println(l);
+            }
+            reader.close();
+            writer.close();
+
+            System.out.println("... File saved as tagged " + xmlDir + " corpus.");
+            System.out.println("... DONE!");
+        }
+        catch (FileNotFoundException e) {
+            e.printStackTrace();
+        }
+        catch(IOException e){
+            e.printStackTrace();
+        }
+    }
+
+    private String getCorpusTagC() {
+        return corpusTagC;
+    }
+
+    private String getCorpusTag() {
+        return corpusTag;
+    }
+
+    public String getTag1() {
+        return tag1;
+    }
+
+    public void setTag1(String tag1) {
+        this.tag1 = tag1;
+    }
+
+    public String getTag2() {
+        return tag2;
+    }
+
+    public void setTag2(String tag2) {
+        this.tag2 = tag2;
+    }
+
+    private String getTag4() {
+        return tag4;
+    }
+
+    private String getTag3() {
+        return tag3;
+    }
+
+    public String getId() {
+        return id;
+    }
+
+    public void setId(String id) {
+        this.id = id;
+    }
+
+    private void setCorpusTag(String string) {
+        this.corpusTag = string;
+    }
+
+    private void setCorpusTagC(String string) {
+        this.corpusTagC = string;
+    }
+
+    private void seTag4(String string) {
+        this.tag4 = string;
+    }
+
+    private void setTag3(String string) {
+        this.tag3 = string;
+    }
+
+}
diff --git a/src/preprocessing/SampleCorpus.java b/src/preprocessing/SampleCorpus.java
new file mode 100644
index 0000000..bc65331
--- /dev/null
+++ b/src/preprocessing/SampleCorpus.java
@@ -0,0 +1,233 @@
+package preprocessing;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.nio.file.Files;
+import java.nio.file.StandardCopyOption;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+import configure.ConfigConstants;
+
+/**
+ * Performs sampling of document instances,
+ * generating training and test files with a
+ * class balance specified by the user.
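+ * For instance, with a collection of 1,000 documents, PERCT_TEST=20
+ * and PERCT_POS_TEST=50 would yield a 200-document test set containing
+ * 100 positives (an illustrative scenario, not a shipped default).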
+ *
+ * @author Hayda Almeida
+ * @since 2015
+ *
+ */
+public class SampleCorpus {
+
+    public static void main(String[] args) throws Exception {
+
+        ConfigConstants pathVars = new ConfigConstants();
+        SampleCorpus sampling = new SampleCorpus();
+
+        String pathToLiteratureFolder = pathVars.HOME_DIR + pathVars.CORPUS_DIR;
+
+        String positiveDir = pathToLiteratureFolder + pathVars.POS_DIR;
+        List<File> positives = new LinkedList<File>();
+
+        String negativeDir = pathToLiteratureFolder + pathVars.NEG_DIR;
+        List<File> negatives = new LinkedList<File>();
+
+        //train or test sampling
+        Boolean training = Boolean.valueOf(pathVars.SAMPLE_TRAIN);
+        Boolean testing = Boolean.valueOf(pathVars.SAMPLE_TEST);
+
+        //% of test corpus WRT the collection, % positive on test set, % positive on training set
+        int percTs = Integer.parseInt(pathVars.PERCT_TEST);
+        int percPosTr = Integer.parseInt(pathVars.PERCT_POS_TRAIN);
+        int percPosTs = Integer.parseInt(pathVars.PERCT_POS_TEST);
+
+        if(!(training || testing)){
+            System.out.println("Training or Test sampling: not set up.\n" +
+                    "Please define the sampling parameters in file: \n" +
+                    "@ config.cfg.");
+            System.exit(0);
+        }
+
+        positives = sampling.loadFiles(positiveDir);
+        negatives = sampling.loadFiles(negativeDir);
+
+        if(testing) sampling.sampleTest(pathToLiteratureFolder + pathVars.TEST_DIR, positives, negatives, percTs, percPosTs);
+
+        if(training) sampling.sampleTrain(pathToLiteratureFolder + pathVars.TRAIN_DIR, positives, negatives, percPosTr);
+
+    }
+
+    /**
+     * Lists XML files within a folder
+     * @param dirSrc folder path
+     * @return list of XML files found in the folder
+     */
+    public List<File> loadFiles(String dirSrc){
+
+        List<File> fileIDs = new LinkedList<File>();
+
+        File sourceDir = new File(dirSrc);
+        File[] srcXMLs = sourceDir.listFiles(new FilenameFilter(){
+            @Override
+            public boolean accept(File dir, String name){
+                return name.endsWith(".xml");
+            }
+        });
+
+        fileIDs = new LinkedList<File>(Arrays.asList(srcXMLs));
+
+        return fileIDs;
+    }
+
+    /**
+     * Moves a specific number of files
+     * in a list from the origin folder to a test folder
+     * @param path destination test folder path
+     * @param files list of files
+     * @param numFiles number of files to be moved
+     */
+    public void moveFile(String path, List<File> files, int numFiles){
+
+        Iterator<File> filesList = files.iterator();
+        File testDir = new File(path);
+
+        if(!testDir.exists()){
+            try{
+                testDir.mkdir();
+            }catch(Exception e){
+                System.out.println("Error creating Test folder.");
+                System.exit(0);
+            }
+        }
+
+        while(filesList.hasNext() && numFiles > 0){
+            try{
+                File file = (File) filesList.next();
+                File newFile = new File(testDir + "/" + file.getName());
+
+                Files.move(file.toPath(), newFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
+
+                filesList.remove();
+                numFiles--;
+            }
+            catch(Exception e){
+                System.out.println("Error moving files.");
+                System.exit(0);
+            }
+        }
+
+    }
+
+    /**
+     * Copies a specific number of files
+     * in a list from the origin folder to a train folder
+     * @param path destination training folder path
+     * @param files list of files
+     * @param numFiles number of files to be copied
+     */
+    public void copyFile(String path, List<File> files, int numFiles, int percPos){
+
+        Iterator<File> filesList = files.iterator();
+        String trainPath = path.substring(0, path.length()-1) + "_" + percPos + "/";
+        File trainDir = new File(trainPath);
+
+        if(!trainDir.exists())
+            try{
+                trainDir.mkdir();
+            }catch(Exception e){
+                System.out.println("Error creating Training folder.");
+                System.exit(0);
+            }
+
+        while(filesList.hasNext() && numFiles > 0){
+            try{
+                File file = (File) filesList.next();
+                File newFile = new File(trainDir + "/"+ file.getName());
+
+                Files.copy(file.toPath(), newFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
+
+                //decrement the counter so that only the requested number
+                //of files is copied (the loop otherwise copies every
+                //file in the list)
+                numFiles--;
+            }
+            catch(Exception e){
+                System.out.println("Error copying files.");
+                System.exit(0);
+            }
+        }
+
+    }
+
+    /**
+     * Samples document instances from the collection
+     * to generate a test set.
+     *
+     * @param path destination test folder path
+     * @param positives list of positive document files
+     * @param negatives list of negative document files
+     * @param total percentage of the document collection used for test
+     * @param pos percentage of positive documents in the test set
+     */
+    public void sampleTest(String path, List<File> positives, List<File> negatives, int total, int pos){
+
+        int instances = positives.size() + negatives.size();
+        int testSize = (instances * total) / 100;
+        int posSize = (testSize * pos) / 100;
+        int negSize = testSize - posSize;
+
+        Collections.shuffle(negatives);
+        System.out.println("===== Test > Negative instances shuffled for test set.");
+        moveFile(path, negatives, negSize);
+        System.out.println("===== Test > Negative instances moved to test folder. \n");
+
+        Collections.shuffle(positives);
+        System.out.println("===== Test > Positive instances shuffled for test set.");
+        moveFile(path, positives, posSize);
+        System.out.println("===== Test > Positive instances moved to test folder. \n");
+
+    }
+
+    /**
+     * Samples document instances from the collection
+     * to generate a training set.
+     *
+     * @param path destination training folder path
+     * @param positives list of positive document files
+     * @param negatives list of negative document files
+     * @param percPos percentage of positive documents in the training set
+     */
+    public void sampleTrain(String path, List<File> positives, List<File> negatives, int percPos){
+
+        //all positives are kept; the training set size follows
+        //from the requested positive percentage
+        int posSize = positives.size();
+        int trainSize = (100 * posSize) / percPos;
+
+        int negSize = trainSize - posSize;
+
+        if(negatives.size() < negSize){
+            System.out.println("Not enough negative instances for training set.");
+            System.exit(0);
+        }
+        else{
+            Collections.shuffle(negatives);
+            System.out.println("===== Training > Negative instances shuffled for training set.");
+            copyFile(path, negatives, negSize, percPos);
+            System.out.println("===== Training > Negative instances copied to training folder. \n");
+
+            Collections.shuffle(positives);
+            System.out.println("===== Training > Positive instances shuffled for training set.");
+            copyFile(path, positives, posSize, percPos);
+            System.out.println("===== Training > Positive instances copied to training folder. \n");
+        }
+
+    }
+
+}
diff --git a/usermanual/.gitignore b/usermanual/.gitignore
new file mode 100644
index 0000000..5190f77
--- /dev/null
+++ b/usermanual/.gitignore
@@ -0,0 +1,12 @@
+# User manual files #
+###################
+*.aux
+*.bbl
+*.blg
+*.log
+*.out
+*.backup
+*.toc
+*.*~
+
+
diff --git a/usermanual/Makefile b/usermanual/Makefile
new file mode 100644
index 0000000..822e19f
--- /dev/null
+++ b/usermanual/Makefile
@@ -0,0 +1,31 @@
+DOC=usermanual
+BIB=usermanual
+PDFLATEX=pdflatex
+BIBTEX=bibtex
+GRAPHICSIDR=graphics
+
+all: paper
+#all: images paper
+
+paper: $(DOC).tex $(BIB).bib
+	$(PDFLATEX) $(DOC).tex
+	$(BIBTEX) $(DOC)
+	$(PDFLATEX) $(DOC).tex
+	$(PDFLATEX) $(DOC).tex
+	$(BIBTEX) $(DOC)
+
+#images:
+#	cd $(GRAPHICSIDR) && make
+
+check:
+	$(TEXIDATE) $(DOC).tex
+
+wc:
+	@echo "Paper has:" `pdftotext $(DOC).pdf - | wc -w 2> /dev/null` "words!"
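+
+# Typical usage, assuming pdflatex, bibtex and pdftotext are on the PATH:
+#   make         # build usermanual.pdf
+#   make wc      # report the word count of the generated PDF
+#   make clean   # remove LaTeX build artifacts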
+ +clean: + -rm $(DOC).log $(DOC).aux $(DOC).blg $(DOC).bbl $(DOC).dvi $(DOC).ps $(DOC).out $(DOC).toc $(DOC).lof $(DOC).lot +# cd $(GRAPHICSIDR) && make clean + +hyperclean: clean + -rm *~ *.backup diff --git a/usermanual/datetime.sty b/usermanual/datetime.sty new file mode 100644 index 0000000..cc6580a --- /dev/null +++ b/usermanual/datetime.sty @@ -0,0 +1,487 @@ +%% +%% This is file `datetime.sty', +%% generated with the docstrip utility. +%% +%% The original source files were: +%% +%% datetime.dtx (with options: `datetime') +%% Copyright (C) 2000 Nicola Talbot, all rights reserved. +%% If you modify this file, you must change its name first. +%% You are NOT ALLOWED to distribute this file alone. You are NOT +%% ALLOWED to take money for the distribution or use of either this +%% file or a changed version, except for a nominal charge for copying +%% etc. +%% \CharacterTable +%% {Upper-case \A\B\C\D\E\F\G\H\I\J\K\L\M\N\O\P\Q\R\S\T\U\V\W\X\Y\Z +%% Lower-case \a\b\c\d\e\f\g\h\i\j\k\l\m\n\o\p\q\r\s\t\u\v\w\x\y\z +%% Digits \0\1\2\3\4\5\6\7\8\9 +%% Exclamation \! Double quote \" Hash (number) \# +%% Dollar \$ Percent \% Ampersand \& +%% Acute accent \' Left paren \( Right paren \) +%% Asterisk \* Plus \+ Comma \, +%% Minus \- Point \. Solidus \/ +%% Colon \: Semicolon \; Less than \< +%% Equals \= Greater than \> Question mark \? +%% Commercial at \@ Left bracket \[ Backslash \\ +%% Right bracket \] Circumflex \^ Underscore \_ +%% Grave accent \` Left brace \{ Vertical bar \| +%% Right brace \} Tilde \~} +\NeedsTeXFormat{LaTeX2e} +\ProvidesPackage{datetime}[2004/11/01 Date Time Package version 2.42] +\RequirePackage{fmtcount} +\newif\if@dt@nodate +\@dt@nodatefalse +\newif\ifshowdow % show the day of week if true + +\providecommand{\formatdate}[3]{} + +\newcount\@day +\newcount\@month +\newcount\@year + +\providecommand{\longdate}{% +\renewcommand{\formatdate}[3]{\ifshowdow\dayofweekname{##1}{##2}{##3} \fi +\@day=##1\relax\@month=##2\relax\@year=##3\relax +\@ordinal{\@day}\ \monthname[\@month], \the\@year}} + +\providecommand{\shortdate}{% +\renewcommand{\formatdate}[3]{\ifshowdow\shortdayofweekname{##1}{##2}{##3} \fi +\@day=##1\relax\@month=##2\relax\@year=##3\relax +\@ordinal{\@day}\ \shortmonthname[\@month], \the\@year}} + +\providecommand{\twodigit}[1]{{\@dtctr=#1\relax\ifnum\@dtctr<10 0\fi\the\@dtctr}} + +\providecommand{\ddmmyyyydate}{% +\renewcommand{\formatdate}[3]{\@day=##1\relax\@month=##2\relax\@year=##3\relax +\twodigit\@day/\twodigit\@month/\the\@year}} + +\providecommand{\dmyyyydate}{% +\renewcommand{\formatdate}[3]{\@day=##1\relax\@month=##2\relax\@year=##3\relax +\the\@day/\the\@month/\the\@year}} + +\providecommand{\ddmmyydate}{\renewcommand{\formatdate}[3]{% +\@day=##1\relax\@month=##2\relax\@year=##3\relax +\@dtctr=\@year% +\@modulo{\@dtctr}{100}% +\twodigit\@day/\twodigit\@month/\twodigit\@dtctr}} + +\providecommand{\dmyydate}{\renewcommand{\formatdate}[3]{% +\@day=##1\relax\@month=##2\relax\@year=##3\relax +\@dtctr=\@year% +\@modulo{\@dtctr}{100}% +\the\@day/\the\@month/\twodigit\@dtctr}} + +\providecommand{\textdate}{% +\renewcommand{\formatdate}[3]{\ifshowdow\dayofweekname{##1}{##2}{##3} the \fi +\@day=##1\relax\@month=##2\relax\@year=##3\relax +\@Ordinalstring\@day\ of \monthname[\@month], \@Numberstring\@year}} + +\providecommand{\usdate}{% +\renewcommand{\formatdate}[3]{\@day=##1\relax\@month=##2\relax\@year=##3\relax +\monthname[\@month]\ \the\@day, \the\@year}} + +\providecommand{\mmddyyyydate}{% 
+\renewcommand{\formatdate}[3]{\@day=##1\relax\@month=##2\relax\@year=##3\relax +\twodigit\@month/\twodigit\@day/\the\@year}} + +\providecommand{\mdyyyydate}{% +\renewcommand{\formatdate}[3]{\@day=##1\relax\@month=##2\relax\@year=##3\relax +\the\@month/\the\@day/\the\@year}} + +\providecommand{\mmddyydate}{\renewcommand{\formatdate}[3]{% +\@day=##1\relax\@month=##2\relax\@year=##3\relax +\@dtctr=\@year% +\@modulo{\@dtctr}{100}% +\twodigit\@month/\twodigit\@day/\twodigit\@dtctr}} + +\providecommand{\mdyydate}{\renewcommand{\formatdate}[3]{% +\@day=##1\relax\@month=##2\relax\@year=##3\relax +\@dtctr=\@year% +\@modulo{\@dtctr}{100}% +\the\@month/\the\@day/\twodigit\@dtctr}} + +\providecommand{\currenttime}{\xxivtime} + +\providecommand{\settimeformat}[1]{\def\currenttime{\csname#1\endcsname}} +\longdate\renewcommand{\fmtord}[1]{\textsuperscript{#1}}\showdowtrue +\InputIfFileExists{datetime.cfg}{\typeout{Loading local datetime configurations}}{\typeout{No datetime.cfg file found}} +\RequirePackage{fmtcount} +\DeclareOption{long}{\longdate} +\DeclareOption{short}{\shortdate} +\DeclareOption{ddmmyyyy}{\ddmmyyyydate} +\DeclareOption{dmyyyy}{\dmyyyydate} +\DeclareOption{ddmmyy}{\ddmmyydate} +\DeclareOption{dmyy}{\dmyydate} +\DeclareOption{text}{\textdate} +\DeclareOption{us}{\usdate} +\DeclareOption{mmddyyyy}{\mmddyyyydate} +\DeclareOption{mdyyyy}{\mdyyyydate} +\DeclareOption{mmddyy}{\mmddyydate} +\DeclareOption{mdyy}{\mdyydate} +\DeclareOption{level}{\renewcommand{\fmtord}[1]{#1}} +\DeclareOption{raise}{\renewcommand{\fmtord}[1]{\textsuperscript{#1}}} +\DeclareOption{dayofweek}{\showdowtrue} +\DeclareOption{nodayofweek}{\showdowfalse} +\DeclareOption{nodate}{\@dt@nodatetrue} +\DeclareOption{24hr}{\settimeformat{xxivtime}} +\DeclareOption{12hr}{\settimeformat{ampmtime}} +\DeclareOption{oclock}{\settimeformat{oclock}} + + +\DeclareOption{austrian}{\input{dt-austrian.def}} +\DeclareOption{bahasa}{\input{dt-bahasa.def}} +\DeclareOption{basque}{\input{dt-basque.def}} +\DeclareOption{breton}{\input{dt-breton.def}} +\DeclareOption{bulgarian}{\input{dt-bulgarian.def}} +\DeclareOption{catalan}{\input{dt-catalan.def}} +\DeclareOption{croatian}{\input{dt-croatian.def}} +\DeclareOption{czech}{\input{dt-czech.def}} +\DeclareOption{danish}{\input{dt-danish.def}} +\DeclareOption{dutch}{\input{dt-dutch.def}} +\DeclareOption{esperanto}{\input{dt-esperanto.def}} +\DeclareOption{estonian}{\input{dt-estonian.def}} +\DeclareOption{finnish}{\input{dt-finnish.def}} +\DeclareOption{french}{\input{dt-french.def}} +\DeclareOption{galician}{\input{dt-galician.def}} +\DeclareOption{german}{\input{dt-german.def}} +\DeclareOption{greek}{\input{dt-greek.def}} +\DeclareOption{hebrew}{\input{dt-hebrew.def}} +\DeclareOption{icelandic}{\input{dt-icelandic.def}} +\DeclareOption{irish}{\input{dt-irish.def}} +\DeclareOption{italian}{\input{dt-italian.def}} +\DeclareOption{latin}{\input{dt-latin.def}} +\DeclareOption{lsorbian}{\input{dt-lsorbian.def}} +\DeclareOption{magyar}{\input{dt-magyar.def}} +\DeclareOption{naustrian}{\input{dt-naustrian.def}} +\DeclareOption{ngerman}{\input{dt-ngerman.def}} +\DeclareOption{norsk}{\input{dt-norsk.def}} +\DeclareOption{polish}{\input{dt-polish.def}} +\DeclareOption{portuges}{\input{dt-portuges.def}} +\DeclareOption{romanian}{\input{dt-romanian.def}} +\DeclareOption{russian}{\input{dt-russian.def}} +\DeclareOption{samin}{\input{dt-samin.def}} +\DeclareOption{scottish}{\input{dt-scottish.def}} +\DeclareOption{serbian}{\input{dt-serbian.def}} +\DeclareOption{slovak}{\input{dt-slovak.def}} 
+\DeclareOption{slovene}{\input{dt-slovene.def}} +\DeclareOption{spanish}{\input{dt-spanish.def}} +\DeclareOption{swedish}{\input{dt-swedish.def}} +\DeclareOption{turkish}{\input{dt-turkish.def}} +\DeclareOption{ukraineb}{\input{dt-ukraineb.def}} +\DeclareOption{usorbian}{\input{dt-usorbian.def}} +\DeclareOption{welsh}{\input{dt-welsh.def}} + +\ProcessOptions +\RequirePackage{ifthen} +\if@dt@nodate +\typeout{datetime package message : option "nodate" used, so not defining \protect\monthname} +\else +\providecommand{\monthnameenglish}[1][\month]{% +\@orgargctr=#1\relax +\ifcase\@orgargctr +\PackageError{datetime}{Invalid Month number \the\@orgargctr}{Month numbers should go from 1 (January) to 12 (December)}% +\or January% +\or February% +\or March% +\or April% +\or May% +\or June% +\or July% +\or August% +\or September% +\or October% +\or November% +\or December% +\else \PackageError{datetime}{Invalid Month number \the\@orgargctr}{Month numbers should go from 1 (January) to 12 (December)}% +\fi} + +\let\monthname=\monthnameenglish +\fi +\if@dt@nodate +\typeout{datetime package message : option "nodate" used, so not defining \protect\shortmonthname} +\else +\providecommand{\shortmonthnameenglish}[1][\month]{% +\@orgargctr=#1\relax +\ifcase\@orgargctr +\PackageError{datetime}{Invalid Month number \the\@orgargctr}{Month numbers should go from 1 (jan) to 12 (dec)}% +\or Jan% +\or Feb% +\or Mar% +\or Apr% +\or May% +\or Jun% +\or Jul% +\or Aug% +\or Sept% +\or Oct% +\or Nov% +\or Dec% +\else% +\PackageError{datetime}{Invalid Month number \the\@orgargctr}{Month numbers should go from 1 (jan) to 12 (dec)}% +\fi} + +\let\shortmonthname=\shortmonthnameenglish + +\fi +\newif\ifleapyear + +\newcount\@dtctr + +\if@dt@nodate +\typeout{datetime package message : option "nodate" used, so not defining \protect\dayofweek \space or \protect\shortdayofweek} +\else + +\providecommand{\testifleapyear}[1]{% +\leapyearfalse +\@year=#1\relax +\@dtctr=\@year +\@modulo{\@dtctr}{400}% +\ifnum\@dtctr=0\relax +\leapyeartrue % year mod 400 = 0 => leap year +\else +\@dtctr=\@year +\@modulo{\@dtctr}{100}% +\ifnum\@dtctr=0\relax +\leapyearfalse % year mod 100 = 0 && year mod 400 != 0 => not a leap year +\else +\@dtctr=\@year +\@modulo{\@dtctr}{4}% +\ifnum\@dtctr=0\relax +\leapyeartrue % year mod 4 = 0 && year mod 100 != 0 => leap year +\fi +\fi +\fi +} + +\newcount\dayofyear + + +\providecommand{\computedayofyear}[3]{% +\testifleapyear{#3}% +\dayofyear=0\relax +\@day=#1\relax \@month=#2\relax \@year=#3\relax +\ifcase\@month +\or +\or \advance\dayofyear by 31\relax +\or \advance\dayofyear by 59\relax +\or \advance\dayofyear by 90\relax +\or \advance\dayofyear by 120\relax +\or \advance\dayofyear by 151\relax +\or \advance\dayofyear by 181\relax +\or \advance\dayofyear by 212\relax +\or \advance\dayofyear by 243\relax +\or \advance\dayofyear by 273\relax +\or \advance\dayofyear by 304\relax +\or \advance\dayofyear by 334\relax +\else +\PackageError{datetime}{Invalid month number}{The second argument to \protect\computedayofyear +should lie in the range 1-12} +\fi +\ifnum\@month>2\relax +\ifleapyear\advance\dayofyear by 1\relax\fi +\fi +\advance\dayofyear by \@day\relax +} + +\newcount\dayofweek + + +\providecommand{\computedayofweek}[3]{% +\computedayofyear{#1}{#2}{#3}% +\@dtctr=#3\relax +\advance\@dtctr by -1901\relax +\@modulo{\@dtctr}{28}% +\dayofweek=\@dtctr +\divide\dayofweek by 4\relax +\advance\dayofweek by \@dtctr +\advance\dayofweek by 2\relax +\@modulo{\dayofweek}{7}% +\advance\dayofweek by \dayofyear 
+\advance\dayofweek by -1\relax +\@modulo{\dayofweek}{7}% +\advance\dayofweek by 1\relax} + +\providecommand{\dayofweekname}[3]{% +\computedayofweek{#1}{#2}{#3}% +\ifcase\dayofweek +\or Sunday% +\or Monday% +\or Tuesday% +\or Wednesday% +\or Thursday% +\or Friday% +\or Saturday% +\fi} + +\providecommand{\shortdayofweekname}[3]{% +\computedayofweek{#1}{#2}{#3}% +\ifcase\dayofweek +\or Sun% +\or Mon% +\or Tue% +\or Wed% +\or Thu% +\or Fri% +\or Sat% +\fi} +\fi +\if@dt@nodate +\else +\def\today{\formatdate{\day}{\month}{\year}} +\fi +\if@dt@nodate +\else +\@ifundefined{dateenglish}{}{\let\dateenglish\longdate} +\@ifundefined{dateaustrian}{}{\input{dt-austrian.def}} +\@ifundefined{datebahasa}{}{\input{dt-bahasa.def}} +\@ifundefined{datebasque}{}{\input{dt-basque.def}} +\@ifundefined{datebreton}{}{\input{dt-breton.def}} +\@ifundefined{datebulgarian}{}{\input{dt-bulgarian.def}} +\@ifundefined{datecatalan}{}{\input{dt-catalan.def}} +\@ifundefined{datecroatian}{}{\input{dt-croatian.def}} +\@ifundefined{dateczech}{}{\input{dt-czech.def}} +\@ifundefined{datedanish}{}{\input{dt-danish.def}} +\@ifundefined{datedutch}{}{\input{dt-dutch.def}} +\@ifundefined{dateesperanto}{}{\input{dt-esperanto.def}} +\@ifundefined{dateestonian}{}{\input{dt-estonian.def}} +\@ifundefined{datefinnish}{}{\input{dt-finnish.def}} +\@ifundefined{datefrench}{}{\input{dt-french.def}} +\@ifundefined{dategalician}{}{\input{dt-galician.def}} +\@ifundefined{dategerman}{}{\input{dt-german.def}} +\@ifundefined{dategreek}{}{\input{dt-greek.def}} +\@ifundefined{datehebrew}{}{\input{dt-hebrew.def}} +\@ifundefined{dateicelandic}{}{\input{dt-icelandic.def}} +\@ifundefined{dateirish}{}{\input{dt-irish.def}} +\@ifundefined{dateitalian}{}{\input{dt-italian.def}} +\@ifundefined{datelatin}{}{\input{dt-latin.def}} +\@ifundefined{datelsorbian}{}{\input{dt-lsorbian.def}} +\@ifundefined{datemagyar}{}{\input{dt-magyar.def}} +\@ifundefined{datenaustrian}{}{\input{dt-naustrian.def}} +\@ifundefined{datengerman}{}{\input{dt-ngerman.def}} +\@ifundefined{datenorsk}{}{\input{dt-norsk.def}} +\@ifundefined{datepolish}{}{\input{dt-polish.def}} +\@ifundefined{dateportuges}{}{\input{dt-portuges.def}} +\@ifundefined{dateromanian}{}{\input{dt-romanian.def}} +\@ifundefined{daterussian}{}{\input{dt-russian.def}} +\@ifundefined{datesamin}{}{\input{dt-samin.def}} +\@ifundefined{datescottish}{}{\input{dt-scottish.def}} +\@ifundefined{dateserbian}{}{\input{dt-serbian.def}} +\@ifundefined{dateslovak}{}{\input{dt-slovak.def}} +\@ifundefined{dateslovene}{}{\input{dt-slovene.def}} +\@ifundefined{datespanish}{}{\input{dt-spanish.def}} +\@ifundefined{dateswedish}{}{\input{dt-swedish.def}} +\@ifundefined{dateturkish}{}{\input{dt-turkish.def}} +\@ifundefined{dateukraineb}{}{\input{dt-ukraineb.def}} +\@ifundefined{dateusorbian}{}{\input{dt-usorbian.def}} +\@ifundefined{datewelsh}{}{\input{dt-welsh.def}} +\fi +\if@dt@nodate +\typeout{datetime package message : option "nodate" used, so not defining \protect\newdateformat} +\else + +\providecommand\THEDAY{\the\@day} +\providecommand\THEMONTH{\the\@month} +\providecommand\THEYEAR{\the\@year} +\let\c@DAY=\@day +\let\c@MONTH=\@month +\let\c@YEAR=\@year + +\providecommand{\dateformat}[4]{\@day=#2\relax\@month=#3\relax\@year=#4\relax#1} + +\providecommand{\newdateformat}[2]{% +\@ifundefined{#1}{\expandafter\def\csname#1\endcsname{\renewcommand{\formatdate}{\dateformat{#2}}}}{% +\PackageError{datetime}{Can't create new date format, command \textbackslash#1 already defined}{You will need to +give you new date format a different name}}} 
+\fi + +\newcount\c@HOUR +\newcount\c@HOURXII +\newcount\c@MINUTE +\newcount\c@TOHOUR +\newcount\c@TOMINUTE +\def\THEHOUR{\the\c@HOUR} +\def\THEHOURXII{\the\c@HOURXII} +\def\THEMINUTE{\the\c@MINUTE} +\def\THETOHOUR{\the\c@TOHOUR} +\def\THETOMINUTE{\the\c@TOMINUTE} + +\providecommand{\newtimeformat}[2]{% +\@ifundefined{#1}{% +\expandafter\def\csname#1\endcsname{% +\c@HOUR=\time% +\divide\c@HOUR by 60\relax +\c@HOURXII=\c@HOUR +\ifnum\c@HOURXII>12 +\advance\c@HOURXII by -12\relax +\fi +\c@MINUTE=\time% +\@modulo{\c@MINUTE}{60}% +\c@TOHOUR=\c@HOURXII +\advance\c@TOHOUR by 1\relax +\@modulo{\c@TOHOUR}{12}% +\c@TOMINUTE=\c@MINUTE +\advance\c@TOMINUTE by -60\relax +\multiply\c@TOMINUTE by -1\relax +#2 +}}{% +\PackageError{datetime}{Command \textbackslash#1 already defined}{% +You can't create a new time format called "#1" as the command \textbackslash#1 already exists}}} +\newtimeformat{xxivtime}{\twodigit\THEHOUR:\twodigit\THEMINUTE} + +\newtimeformat{ampmtime}{\THEHOURXII:\twodigit\THEMINUTE\ifthenelse{\value{HOUR}<12}{\amname}{\ifthenelse{\time=720}{ \noon}{\pmname}}} + +\newtimeformat{oclock}{\ifthenelse{\time=0 \or \time=720}{% +\ifthenelse{\time=0}{\midnight}{\noon}}{% +\ifthenelse{\value{MINUTE}=0}{% +\Numberstring{HOUR} \oclock}{% +\ifthenelse{\value{MINUTE}=15}{% +\quarterpast\ \Numberstring{HOUR}}{% +\ifthenelse{\value{MINUTE}=30}{% +\halfpast\ \Numberstring{HOUR}}{% +\ifthenelse{\value{MINUTE}=45}{% +\quarterto\ \Numberstring{TOHOUR}}{% +\ifthenelse{\value{MINUTE}<30}{% +\Numberstring{MINUTE} \ifthenelse{\value{MINUTE}=1}{minute}{minutes} past \Numberstring{HOURXII}}{% +\Numberstring{TOMINUTE} \ifthenelse{\value{TOMINUTE}=1}{minute}{minutes} to \Numberstring{TOHOUR}}}}}} +\ifthenelse{\value{HOUR}<12}{% +\amstring}{% +\pmstring}}} + +\providecommand{\amname}{am} +\providecommand{\pmname}{pm} +\providecommand{\amstring}{in the morning} +\providecommand{\pmstring}{in the afternoon} +\providecommand{\halfpast}{Half past} +\providecommand{\quarterpast}{Quarter past} +\providecommand{\quarterto}{Quarter to} +\providecommand{\noon}{Noon} +\providecommand{\midnight}{Midnight} +\providecommand{\oclockstring}{O'Clock} + +\newcount\pdftimectr +\newcount\pdfdatectr + +\pdftimectr=0\relax +\@dtctr=\time% +\divide\@dtctr by 60\relax +\multiply\@dtctr by 10000\relax +\pdftimectr=\@dtctr +\@dtctr=\time% +\@modulo{\@dtctr}{60}% +\multiply\@dtctr by 100\relax +\advance\pdftimectr by \@dtctr +\pdfdatectr=\day +\@dtctr=\month +\multiply\@dtctr by 100\relax +\advance\pdfdatectr by \@dtctr +\@dtctr=\year +\multiply\@dtctr by 10000\relax +\advance\pdfdatectr by \@dtctr +\ifnum\pdftimectr<100000 +\edef\pdfdate{\the\pdfdatectr0\the\pdftimectr} +\else +\edef\pdfdate{\the\pdfdatectr\the\pdftimectr} +\fi +\endinput +%% +%% End of file `datetime.sty'. diff --git a/usermanual/fmtcount.sty b/usermanual/fmtcount.sty new file mode 100644 index 0000000..304bacb --- /dev/null +++ b/usermanual/fmtcount.sty @@ -0,0 +1,587 @@ +%% +%% This is file `fmtcount.sty', +%% generated with the docstrip utility. +%% +%% The original source files were: +%% +%% datetime.dtx (with options: `fmtcount') +%% Copyright (C) 2000 Nicola Talbot, all rights reserved. +%% If you modify this file, you must change its name first. +%% You are NOT ALLOWED to distribute this file alone. You are NOT +%% ALLOWED to take money for the distribution or use of either this +%% file or a changed version, except for a nominal charge for copying +%% etc. 
+%% \CharacterTable +%% {Upper-case \A\B\C\D\E\F\G\H\I\J\K\L\M\N\O\P\Q\R\S\T\U\V\W\X\Y\Z +%% Lower-case \a\b\c\d\e\f\g\h\i\j\k\l\m\n\o\p\q\r\s\t\u\v\w\x\y\z +%% Digits \0\1\2\3\4\5\6\7\8\9 +%% Exclamation \! Double quote \" Hash (number) \# +%% Dollar \$ Percent \% Ampersand \& +%% Acute accent \' Left paren \( Right paren \) +%% Asterisk \* Plus \+ Comma \, +%% Minus \- Point \. Solidus \/ +%% Colon \: Semicolon \; Less than \< +%% Equals \= Greater than \> Question mark \? +%% Commercial at \@ Left bracket \[ Backslash \\ +%% Right bracket \] Circumflex \^ Underscore \_ +%% Grave accent \` Left brace \{ Vertical bar \| +%% Right brace \} Tilde \~} +\NeedsTeXFormat{LaTeX2e} +\ProvidesPackage{fmtcount}[2004/10/22 v1.0] +\RequirePackage{ifthen} +\newcount\@DT@modctr +\def\@modulo#1#2{% +\@DT@modctr=#1\relax +\divide \@DT@modctr by #2\relax +\multiply \@DT@modctr by #2\relax +\advance #1 by -\@DT@modctr} +\providecommand{\fmtord}[1]{#1} +\newcount\@ordinalctr +\newcount\@orgargctr +\def\@ordinal#1{% +\@orgargctr=#1\relax +\@ordinalctr=#1% +\@modulo{\@ordinalctr}{100}% +\ifnum\@ordinalctr=11 +\the\@orgargctr\fmtord{th}% +\else +\ifnum\@ordinalctr=12 +\the\@orgargctr\fmtord{th}% +\else +\ifnum\@ordinalctr=13 +\the\@orgargctr\fmtord{th}% +\else +\@modulo{\@ordinalctr}{10}% +\ifcase\@ordinalctr +\the\@orgargctr\fmtord{th}% case 0 +\or \the\@orgargctr\fmtord{st}% case 1 +\or \the\@orgargctr\fmtord{nd}% case 2 +\or \the\@orgargctr\fmtord{rd}% case 3 +\else +\the\@orgargctr\fmtord{th}% default case +\fi +\fi +\fi +\fi +} +\newcommand{\@@unitstring}[1]{% +\ifcase#1\relax +zero% +\or one% +\or two% +\or three% +\or four% +\or five% +\or six% +\or seven% +\or eight% +\or nine% +\fi +} + +\newcommand{\@@tenstring}[1]{% +\ifcase#1\relax +\or ten% +\or twenty% +\or thirty% +\or fourty% +\or fifty% +\or sixty% +\or seventy% +\or eighty% +\or ninety% +\fi +} + +\newcommand{\@@teenstring}[1]{% +\ifcase#1\relax +ten% +\or eleven% +\or twelve% +\or thirteen% +\or fourteen% +\or fifteen% +\or sixteen% +\or seventeen% +\or eighteen% +\or nineteen% +\fi +} + +\newcommand{\@@Unitstring}[1]{% +\ifcase#1\relax +Zero% +\or One% +\or Two% +\or Three% +\or Four% +\or Five% +\or Six% +\or Seven% +\or Eight% +\or Nine% +\fi +} + +\newcommand{\@@Tenstring}[1]{% +\ifcase#1\relax +\or Ten% +\or Twenty% +\or Thirty% +\or Fourty% +\or Fifty% +\or Sixty% +\or Seventy% +\or Eighty% +\or Ninety% +\fi +} + +\newcommand{\@@Teenstring}[1]{% +\ifcase#1\relax +Ten% +\or Eleven% +\or Twelve% +\or Thirteen% +\or Fourteen% +\or Fifteen% +\or Sixteen% +\or Seventeen% +\or Eighteen% +\or Nineteen% +\fi +} + +\newcount\strctr +\newcommand{\@@numberstring}[1]{% +\ifnum#1>99000 +\PackageError{fmtcount}{Out of range}% +{This macro only works for values less than 100000}% +\else +\ifnum#1<0 +\PackageError{fmtcount}{Negative numbers not permitted}% +{This macro does not work for negative numbers, however +you can try typing "minus" first, and then pass the modulus of +this number}% +\fi +\fi +\strctr=#1\relax \divide\strctr by 1000\relax +\ifnum\strctr>9 +\divide\strctr by 10 +\ifnum\strctr>1 +\@tenstring{\strctr}% +\strctr=#1 \divide\strctr by 10000 +\ifnum\strctr>0 -\@unitstring{\strctr}\fi +\else +\strctr=#1 \divide\strctr by 1000 +\@teenstring{\strctr}% +\fi +\ \@thousand% +\else +\ifnum\strctr>0 \@unitstring{\strctr}\ \@thousand\fi +\fi +\strctr=#1\relax \@modulo{\strctr}{1000}% +\divide\strctr by 100 +\ifnum\strctr>0 +\ifnum#1>1000 \ \fi\@unitstring{\strctr}\ \@hundred% +\fi +\strctr=#1\relax \@modulo{\strctr}{100}% +\ifnum#1>100 
\ifnum\strctr>0 \ and \fi\fi +\ifnum\strctr>19 +\divide\strctr by 10 +\@tenstring{\strctr}% +\strctr=#1\relax \@modulo{\strctr}{10}% +\ifnum\strctr>0 +-\@unitstring{\strctr}% +\fi +\else +\ifnum\strctr<10 +\ifnum\strctr=0 +\ifnum#1<100 \@unitstring{\strctr}\fi +\else +\@unitstring{\strctr}% +\fi +\else +\@modulo{\strctr}{10}% +\@teenstring{\strctr}% +\fi +\fi +} + +\newcommand{\@numberstring}[1]{% +\let\@unitstring=\@@unitstring \let\@teenstring=\@@teenstring \let\@tenstring=\@@tenstring +\def\@hundred{hundred}\def\@thousand{thousand}% +\@@numberstring{#1}} + +\newcommand{\@Numberstring}[1]{% +\let\@unitstring=\@@Unitstring \let\@teenstring=\@@Teenstring \let\@tenstring=\@@Tenstring +\def\@hundred{Hundred}\def\@thousand{Thousand}% +\@@numberstring{#1}} +\newcommand{\@@unitthstring}[1]{% +\ifcase#1\relax +zeroth% +\or first% +\or second% +\or third% +\or fourth% +\or fifth% +\or sixth% +\or seventh% +\or eighth% +\or nineth% +\fi +} + +\newcommand{\@@tenthstring}[1]{% +\ifcase#1\relax +\or tenth% +\or twentieth% +\or thirtieth% +\or fourtieth% +\or fiftieth% +\or sixtieth% +\or seventieth% +\or eightieth% +\or ninetieth% +\fi +} + +\newcommand{\@@teenthstring}[1]{% +\ifcase#1\relax +tenth% +\or eleventh% +\or twelfth% +\or thirteenth% +\or fourteenth% +\or fifteenth% +\or sixteenth% +\or seventeenth% +\or eighteenth% +\or nineteenth% +\fi +} + +\newcommand{\@@Unitthstring}[1]{% +\ifcase#1\relax +Zeroth% +\or First% +\or Second% +\or Third% +\or Fourth% +\or Fifth% +\or Sixth% +\or Seventh% +\or Eighth% +\or Nineth% +\fi +} + +\newcommand{\@@Tenthstring}[1]{% +\ifcase#1\relax +\or Tenth% +\or Twentieth% +\or Thirtieth% +\or Fourtieth% +\or Fiftieth% +\or Sixtieth% +\or Seventieth% +\or Eightieth% +\or Ninetieth% +\fi +} + +\newcommand{\@@Teenthstring}[1]{% +\ifcase#1\relax +Tenth% +\or Eleventh% +\or Twelfth% +\or Thirteenth% +\or Fourteenth% +\or Fifteenth% +\or Sixteenth% +\or Seventeenth% +\or Eighteenth% +\or Nineteenth% +\fi +} + +\newcommand{\@@ordinalstring}[1]{% +\ifnum#1>99000 +\PackageError{fmtcount}{Out of range}% +{This macro only works for values less than 100000}% +\else +\ifnum#1<0 +\PackageError{fmtcount}{Negative numbers not permitted}% +{This macro does not work for negative numbers, however +you can try typing "minus" first, and then pass the modulus of +this number}% +\fi +\fi +\strctr=#1\relax \divide\strctr by 1000\relax +\ifnum\strctr>9 +\divide\strctr by 10 +\ifnum\strctr>1 +\@tenstring{\strctr}% +\strctr=#1 \divide\strctr by 10000 +\ifnum\strctr>0 -\@unitstring{\strctr}\fi +\else +\strctr=#1 \divide\strctr by 1000 +\@teenstring{\strctr}% +\fi +\strctr=#1\relax \@modulo{\strctr}{1000}% +\ifnum\strctr=0\ \@thousandth\else\ \@thousand \fi +\else +\ifnum\strctr>0\relax +\@unitstring{\strctr}% +\strctr=#1\relax \@modulo{\strctr}{1000}% +\ifnum\strctr=0\ \@thousandth\else\ \@thousand\fi +\fi +\fi +\strctr=#1\relax \@modulo{\strctr}{1000}% +\divide\strctr by 100 +\ifnum\strctr>0 +\ifnum#1>1000 \ \fi\@unitstring{\strctr}% +\strctr=#1\relax \@modulo{\strctr}{100}% +\ifnum\strctr=0\ \@hundredth\else\ \@hundred\fi +\fi +\strctr=#1\relax \@modulo{\strctr}{100}% +\ifnum#1>100 \ifnum\strctr>0\ and \fi\fi +\ifnum\strctr>19 +\@dtctr=\strctr +\divide\strctr by 10 +\@modulo{\@dtctr}{10}% +\ifnum\@dtctr=0 \@tenthstring{\strctr}\else \@tenstring{\strctr}\fi +\strctr=#1\relax \@modulo{\strctr}{10}% +\ifnum\strctr>0 +-\@unitthstring{\strctr}% +\fi +\else +\ifnum\strctr<10 +\ifnum\strctr=0 +\ifnum#1<100 \@unitthstring{\strctr}\fi +\else +\@unitthstring{\strctr}% +\fi +\else 
+\@modulo{\strctr}{10}% +\@teenthstring{\strctr}% +\fi +\fi +} + +\newcommand{\@ordinalstring}[1]{% +\let\@unitthstring=\@@unitthstring \let\@teenthstring=\@@teenthstring \let\@tenthstring=\@@tenthstring +\let\@unitstring=\@@unitstring \let\@teenstring=\@@teenstring \let\@tenstring=\@@tenstring +\def\@hundred{hundred}\def\@thousand{thousand}% +\def\@hundredth{hundredth}\def\@thousandth{thousandth}% +\@@ordinalstring{#1}} + +\newcommand{\@Ordinalstring}[1]{% +\let\@unitthstring=\@@Unitthstring \let\@teenthstring=\@@Teenthstring \let\@tenthstring=\@@Tenthstring +\let\@unitstring=\@@Unitstring \let\@teenstring=\@@Teenstring \let\@tenstring=\@@Tenstring +\def\@hundred{Hundred}\def\@thousand{Thousand}% +\def\@hundredth{Hundredth}\def\@thousandth{Thousandth}% +\@@ordinalstring{#1}} +\newcount\c@padzeroesN +\c@padzeroesN=1\relax +\providecommand{\padzeroes}[1][17]{\c@padzeroesN=#1} + +\newif\if@DT@padzeroes +\newcount\@DT@loopN +\newcount\@DT@X +\newcommand{\@binary}[1]{% +\@DT@padzeroestrue +\@DT@loopN=17\relax +\strctr=65536\relax +\@DT@X=#1\relax +\loop +\@DT@modctr=\@DT@X +\divide\@DT@modctr by \strctr +\ifthenelse{\boolean{@DT@padzeroes} \and \(\@DT@modctr=0\) \and \(\@DT@loopN>\c@padzeroesN\)}{}{\the\@DT@modctr}% +\ifnum\@DT@modctr=0\else\@DT@padzeroesfalse\fi +\multiply\@DT@modctr by \strctr +\advance\@DT@X by -\@DT@modctr +\divide\strctr by 2\relax +\advance\@DT@loopN by -1\relax +\ifnum\strctr>1 +\repeat +\the\@DT@X} +\newcommand{\@octal}[1]{% +\ifnum#1>32768 +\PackageError{fmtcount}{Value of counter too large for \protect\@octal}{Maximum value 32768} +\else +\@DT@padzeroestrue +\@DT@loopN=6\relax +\strctr=32768\relax +\@DT@X=#1\relax +\loop +\@DT@modctr=\@DT@X +\divide\@DT@modctr by \strctr +\ifthenelse{\boolean{@DT@padzeroes} \and \(\@DT@modctr=0\) \and \(\@DT@loopN>\c@padzeroesN\)}{}{\the\@DT@modctr}% +\ifnum\@DT@modctr=0\else\@DT@padzeroesfalse\fi +\multiply\@DT@modctr by \strctr +\advance\@DT@X by -\@DT@modctr +\divide\strctr by 8\relax +\advance\@DT@loopN by -1\relax +\ifnum\strctr>1 +\repeat +\the\@DT@X +\fi} +\newcommand{\@@hexadecimal}[1]{\ifcase#10\or1\or2\or3\or4\or5\or6\or7\or8\or9\or a\or b\or c\or d\or e\or f\fi} + +\newcommand{\@hexadecimal}[1]{% +\@DT@padzeroestrue +\@DT@loopN=5\relax +\strctr=65536\relax +\@DT@X=#1\relax +\loop +\@DT@modctr=\@DT@X +\divide\@DT@modctr by \strctr +\ifthenelse{\boolean{@DT@padzeroes} \and \(\@DT@modctr=0\) \and \(\@DT@loopN>\c@padzeroesN\)}{}{\@@hexadecimal\@DT@modctr}% +\ifnum\@DT@modctr=0\else\@DT@padzeroesfalse\fi +\multiply\@DT@modctr by \strctr +\advance\@DT@X by -\@DT@modctr +\divide\strctr by 16\relax +\advance\@DT@loopN by -1\relax +\ifnum\strctr>1 +\repeat +\@@hexadecimal\@DT@X} + +\newcommand{\@@Hexadecimal}[1]{\ifcase#10\or1\or2\or3\or4\or5\or6\or7\or8\or9\or A\or B\or C\or D\or E\or F\fi} + +\newcommand{\@Hexadecimal}[1]{% +\@DT@padzeroestrue +\@DT@loopN=5\relax +\strctr=65536\relax +\@DT@X=#1\relax +\loop +\@DT@modctr=\@DT@X +\divide\@DT@modctr by \strctr +\ifthenelse{\boolean{@DT@padzeroes} \and \(\@DT@modctr=0\) \and \(\@DT@loopN>\c@padzeroesN\)}{}{\@@Hexadecimal\@DT@modctr}% +\ifnum\@DT@modctr=0\else\@DT@padzeroesfalse\fi +\multiply\@DT@modctr by \strctr +\advance\@DT@X by -\@DT@modctr +\divide\strctr by 16\relax +\advance\@DT@loopN by -1\relax +\ifnum\strctr>1 +\repeat +\@@Hexadecimal\@DT@X} + +\newcommand{\@aaalph}[1]{% +\@DT@loopN=#1\relax +\advance\@DT@loopN by -1\relax +\divide\@DT@loopN by 26\relax +\@DT@modctr=\@DT@loopN +\multiply\@DT@modctr by 26\relax +\@DT@X=#1\relax +\advance\@DT@X by -1\relax +\advance\@DT@X by 
-\@DT@modctr +\advance\@DT@loopN by 1\relax +\advance\@DT@X by 1\relax +\loop +\@alph\@DT@X +\advance\@DT@loopN by -1\relax +\ifnum\@DT@loopN>0 +\repeat +} + +\newcommand{\@AAAlph}[1]{% +\@DT@loopN=#1\relax +\advance\@DT@loopN by -1\relax +\divide\@DT@loopN by 26\relax +\@DT@modctr=\@DT@loopN +\multiply\@DT@modctr by 26\relax +\@DT@X=#1\relax +\advance\@DT@X by -1\relax +\advance\@DT@X by -\@DT@modctr +\advance\@DT@loopN by 1\relax +\advance\@DT@X by 1\relax +\loop +\@Alph\@DT@X +\advance\@DT@loopN by -1\relax +\ifnum\@DT@loopN>0 +\repeat +} + +\newcommand{\@abalph}[1]{% +\ifnum#1>17576 +\PackageError{fmtcount}{Value of counter too large for \protect\@abalph}{Maximum value 17576} +\else +\@DT@padzeroestrue +\strctr=17576\relax +\@DT@X=#1\relax +\advance\@DT@X by -1\relax +\loop +\@DT@modctr=\@DT@X +\divide\@DT@modctr by \strctr +\ifthenelse{\boolean{@DT@padzeroes} \and \(\@DT@modctr=1\)}{}{\@alph\@DT@modctr}% +\ifnum\@DT@modctr=1\else\@DT@padzeroesfalse\fi +\multiply\@DT@modctr by \strctr +\advance\@DT@X by -\@DT@modctr +\divide\strctr by 26\relax +\ifnum\strctr>1 +\repeat +\advance\@DT@X by 1\relax +\@alph\@DT@X +\fi} + +\newcommand{\@ABAlph}[1]{% +\ifnum#1>17576 +\PackageError{fmtcount}{Value of counter too large for \protect\@ABAlph}{Maximum value 17576} +\else +\@DT@padzeroestrue +\strctr=17576\relax +\@DT@X=#1\relax +\advance\@DT@X by -1\relax +\loop +\@DT@modctr=\@DT@X +\divide\@DT@modctr by \strctr +\ifthenelse{\boolean{@DT@padzeroes} \and \(\@DT@modctr=1\)}{}{\@Alph\@DT@modctr}% +\ifnum\@DT@modctr=1\else\@DT@padzeroesfalse\fi +\multiply\@DT@modctr by \strctr +\advance\@DT@X by -\@DT@modctr +\divide\strctr by 26\relax +\ifnum\strctr>1 +\repeat +\advance\@DT@X by 1\relax +\@Alph\@DT@X +\fi} + +\newcommand{\@decimal}[1]{% +\ifnum#1>10000 +\PackageError{fmtcount}{Value of counter too large for \protect\@decimal}{Maximum value 10000} +\else +\@DT@padzeroestrue +\@DT@loopN=6\relax +\strctr=10000\relax +\@DT@X=#1\relax +\loop +\@DT@modctr=\@DT@X +\divide\@DT@modctr by \strctr +\ifthenelse{\boolean{@DT@padzeroes} \and \(\@DT@modctr=0\) \and \(\@DT@loopN>\c@padzeroesN\)}{}{\the\@DT@modctr}% +\ifnum\@DT@modctr=0\else\@DT@padzeroesfalse\fi +\multiply\@DT@modctr by \strctr +\advance\@DT@X by -\@DT@modctr +\divide\strctr by 10\relax +\advance\@DT@loopN by -1\relax +\ifnum\strctr>1 +\repeat +\the\@DT@X +\fi} +\providecommand{\ordinal}[1]{\expandafter\protect\expandafter\@ordinal{\expandafter\the\csname c@#1\endcsname}} +\providecommand{\ordinalstring}[1]{\expandafter\protect\expandafter\@ordinalstring{\expandafter\the\csname c@#1\endcsname}} +\providecommand{\Ordinalstring}[1]{\expandafter\protect\expandafter\@Ordinalstring{\expandafter\the\csname c@#1\endcsname}} +\providecommand{\numberstring}[1]{\expandafter\protect\expandafter\@numberstring{\expandafter\the\csname c@#1\endcsname}} +\providecommand{\Numberstring}[1]{\expandafter\protect\expandafter\@Numberstring{\expandafter\the\csname c@#1\endcsname}} +\providecommand{\binary}[1]{\expandafter\protect\expandafter\@binary{\expandafter\the\csname c@#1\endcsname}} +\providecommand{\aaalph}[1]{\expandafter\protect\expandafter\@aaalph{\expandafter\the\csname c@#1\endcsname}} +\providecommand{\AAAlph}[1]{\expandafter\protect\expandafter\@AAAlph{\expandafter\the\csname c@#1\endcsname}} +\providecommand{\abalph}[1]{\expandafter\protect\expandafter\@abalph{\expandafter\the\csname c@#1\endcsname}} +\providecommand{\ABAlph}[1]{\expandafter\protect\expandafter\@ABAlph{\expandafter\the\csname c@#1\endcsname}} 
+\providecommand{\hexadecimal}[1]{\expandafter\protect\expandafter\@hexadecimal{\expandafter\the\csname c@#1\endcsname}} +\providecommand{\Hexadecimal}[1]{\expandafter\protect\expandafter\@Hexadecimal{\expandafter\the\csname c@#1\endcsname}} +\providecommand{\octal}[1]{\expandafter\protect\expandafter\@octal{\expandafter\the\csname c@#1\endcsname}} +\providecommand{\decimal}[1]{\expandafter\protect\expandafter\@decimal{\expandafter\the\csname c@#1\endcsname}} +\endinput +%% +%% End of file `fmtcount.sty'. diff --git a/usermanual/graphics/Makefile b/usermanual/graphics/Makefile new file mode 100644 index 0000000..d34f02e --- /dev/null +++ b/usermanual/graphics/Makefile @@ -0,0 +1 @@ +all: diff --git a/usermanual/graphics/concordialogo.png b/usermanual/graphics/concordialogo.png new file mode 100644 index 0000000..8019bd1 Binary files /dev/null and b/usermanual/graphics/concordialogo.png differ diff --git a/usermanual/graphics/genomicslogogreen.jpg b/usermanual/graphics/genomicslogogreen.jpg new file mode 100644 index 0000000..27e5692 Binary files /dev/null and b/usermanual/graphics/genomicslogogreen.jpg differ diff --git a/usermanual/graphics/genozymeslogo.jpg b/usermanual/graphics/genozymeslogo.jpg new file mode 100644 index 0000000..7f8a2e1 Binary files /dev/null and b/usermanual/graphics/genozymeslogo.jpg differ diff --git a/usermanual/usefulsymbols.sty b/usermanual/usefulsymbols.sty new file mode 100644 index 0000000..68d50b2 --- /dev/null +++ b/usermanual/usefulsymbols.sty @@ -0,0 +1,55 @@ +% packages +\usepackage{float} % for algorithm float +\usepackage{graphicx} % for figures, plots, etc +\graphicspath{{./data/}} % '' +\usepackage{amssymb} % special math fonts +\usepackage{amsmath} +\usepackage{epsf,amsfonts,amsmath,amssymb} % defaults +\usepackage{url} +% for footnotes in the author fields +%\newcommand{\footnoteremember}[2]{ +% \footnote{#2} +% \newcounter{#1} +% \setcounter{#1}{\value{footnote}} +%} +%\newcommand{\footnoterecall}[1]{ +% \footnotemark[\value{#1}] +%} +\usepackage[usenames]{color} % for colour +\newcommand{\blue}{\color{blue}} % " +% math +\renewcommand{\vec}{\mathbf} +\newcommand{\mat}[1]{\boldsymbol#1} +\newcommand{\s}{\vec{s}} +\newcommand{\w}{\vec{w}} +\newcommand{\x}{\vec{x}} +\newcommand{\xtest}{\vec{\tilde{x}}} +\newcommand{\y}{\vec{y}} +\newcommand{\z}{\vec{z}} +\renewcommand{\a}{\vec{a}} +\renewcommand{\b}{\vec{b}} +\renewcommand{\c}{\vec{c}} +\renewcommand{\o}{\vec{o}} +\newcommand{\p}{\vec{p}} +\newcommand{\argmax}{\operatornamewithlimits{argmax}} +\newcommand{\R}{\mathbb{R}} +\newcommand{\N}{\mathbb{N}} +\newcommand{\X}{\mathcal{X}} +\newcommand{\Y}{\mathcal{Y}} +\renewcommand{\L}{\mathcal{L}} +\newcommand{\tup}[1]{\langle#1\rangle} +% refs +\newcommand{\code}[1]{Algorithm~\ref{#1}} +\newcommand{\Fig}[1]{Figure~\ref{#1}} +\newcommand{\tab}[1]{Table~\ref{#1}} +\newcommand{\Sec}[1]{Section~\ref{#1}} +% custom float +\floatstyle{ruled} +\newfloat{algorithm}{thp}{lop} +\floatname{algorithm}{Algorithm} +% special definitions +\newcommand{\keyword}[1]{{\it #1}} +\newcommand{\framework}[1]{{\sc #1}} +% ref styles +% text +\newcommand{\ie}{i.e., } diff --git a/usermanual/usermanual.bib b/usermanual/usermanual.bib new file mode 100644 index 0000000..8141f6d --- /dev/null +++ b/usermanual/usermanual.bib @@ -0,0 +1,349 @@ +@article{Mathes2004, +author = {Mathes, Adam}, +journal = {Computer Mediated Communication}, +pages = {1--13}, +title = {{Folksonomies-cooperative classification and communication through shared metadata}}, +year = {2004} +} + 
+@book{manning2008introduction, + title={Introduction to information retrieval}, + author={Manning, Christopher D and Raghavan, Prabhakar and Sch{\"u}tze, Hinrich}, + volume={1}, + year={2008}, + publisher={Cambridge University Press Cambridge} +} + +@article{Macgregor2006, +author = {Macgregor, G and McCulloch, E}, +journal = {Library review}, +pages = {291--300}, +title = {{Collaborative tagging as a knowledge organisation and resource discovery tool}}, +volume = {55}, +year = {2006} +} + +@INPROCEEDINGS{sebastiani2005, +author = {Fabrizio Sebastiani}, +title = {Text categorization}, +booktitle = {Text Mining and its Applications to Intelligence, CRM and Knowledge Management}, +pages = {109--129}, +publisher = {WIT Press}, +year = {2005} +} + +@article{Voss2004, +author = {Voss, Jakob}, +journal = {arXiv preprint cs/0604036}, +keywords = {classification,ddc,ontology,tagging,thesaurus,wikipedia}, +number = {1}, +pages = {1--7}, +title = {Collaborative thesaurus tagging the Wikipedia way}, +volume = {1}, +year = {2006} +} + +@inproceedings{Charton2010a, +author = {Charton, Eric and Torres-Moreno, J.M.}, +publisher = {Proceedings of LREC 2010, the International Conference on Language Resources and Evaluation}, +title = {{NLGbAse: a free linguistic resource for Natural Language Processing systems}}, +year = {2010} +} + +@InProceedings{zhang2012, +author = {Ziqi Zhang and Philip Webster and Victoria Uren and Andrea Varga and Fabio Ciravegna}, +title = {Automatically Extracting Procedural Knowledge from Instructional Texts using Natural Language Processing}, +booktitle = {Proceedings of the Eight International Conference on Language Resources and Evaluation (LREC'12)}, +year = {2012}, +month = {may}, +date = {23-25}, +address = {Istanbul, Turkey}, +isbn = {978-2-9517408-7-7}, +language = {english} +} + +@inproceedings{schumacher2012, +address = {Lyon, France}, +title = {Extraction of Procedural Knowledge from the Web}, +booktitle = {Workshop Proceedings: {WWW'12}}, +author = {Schumacher, Pol and Minor, Mirjam and Walter, Kirstin and Bergmann, Ralph}, +year = {2012} +} + +@article{Schein2002, +author = {Schein, AI and Popescul, Alexandrin}, +isbn = {1581135610}, +journal = {Proceedings of the 25th annual international ACM SIGIR conference on Research and development in information retrieval}, +number = {Sigir}, +pages = {253--260}, +url = {http://dl.acm.org/citation.cfm?id=564421}, +title = {{Methods and metrics for cold-start recommendations}}, +year = {2002} +} + +@article{Dave2003, +author = {Dave, Kushal and Lawrence, Steve and Pennock, DM}, +isbn = {1581136803}, +journal = {WWW '03 Proceedings of the 12th international conference on World Wide Web }, +title = {{Mining the peanut gallery: Opinion extraction and semantic classification of product reviews}}, +url = {http://dl.acm.org/citation.cfm?id=775226}, +year = {2003} +} + +@inproceedings{Groin2007, +author = {Groin, Cyril and Berthelin, Jean-Baptiste and Ayari, Sarra El and Heitz, Thomas and Hurault-plantet, Martine and Jardino, Michele}, +booktitle = {AFIA 2007}, +pages = {1--8}, +title = {{Pr\'{e}sentation de DEFT'07}}, +year = {2007} +} + +@article{pang2008, +author = {Pang, Bo and Lee, Lillian}, +doi = {10.1561/1500000001}, +issn = {1554-0669}, +journal = {Foundations and Trends in Information Retrieval}, +number = {2}, +pages = {91--231}, +title = {{Opinion Mining and Sentiment Analysis}}, +volume = {1}, +year = {2008} +} + +@article{Koppel2006, +author = {Koppel, Moshe and Shtrimberg, Itai}, +journal = {Computing attitude and 
affect in text: Theory and Application, The Information Retrieval Series}, +keywords = {automated labeling,financial analysis,sentiment analysis}, +title = {{Good news or bad news? let the market decide}}, +volume = {20}, +pages = {297--301}, +url = {http://link.springer.com/chapter/10.1007/1-4020-4102-0\_22}, +year = {2006} +} + +@article{Wu2004, +archivePrefix = {arXiv}, +author = {Wu, Fang and Huberman, BA}, +journal = {arXiv preprint cond-mat/0407252}, +title = {{Social structure and opinion formation}}, +year = {2004} +} + +@article{Grouin2013, +author = {Grouin, Cyril and Zweigenbaum, Pierre and Paroubek, Patrick}, +journal = {Actes du neuvi\`{e}me D\'{E}fi Fouille de Texte}, +number = {June}, +pages = {3--16}, +title = {{DEFT2013 se met \`{a} table: pr\'{e}sentation du d\'{e}fi et r\'{e}sultats}}, +url = {http://deft.limsi.fr/actes/actes\_deft2013.pdf\#page=13}, +year = {2013} +} + +@inproceedings{wang2008, + author = {Wang, Liping and Li, Qing and Li, Na and Dong, Guozhu and Yang, Yu}, + title = {Substructure similarity measurement in chinese recipes}, + booktitle = {Proceedings of the 17th international conference on World Wide Web}, + series = {WWW '08}, + year = {2008}, + isbn = {978-1-60558-085-2}, + location = {Beijing, China}, + pages = {979--988}, + numpages = {10}, + url = {http://doi.acm.org/10.1145/1367497.1367629}, + doi = {10.1145/1367497.1367629}, + publisher = {ACM} +} + +@inproceedings{wang2006, +author = {Wang, Liping and Li, Qing and Li, Yu and Meng, Xiaofeng}, +booktitle = {Semantics, Knowledge and Grid, 2006. SKG '06. Second International Conference on}, + pages = 6, + publisher = {IEEE Computer Society}, + title = {Dish Master: an Intelligent and Adaptive Manager for a Web-based Recipe Database System.}, + year = 2006 +} + +@inproceedings{blatak2004, + author = {Blat\'{a}k, Jan and Mr\'{a}kov\'{a}, Eva and Popel\'{\i}nsk\'{y}, Lubo\v{s}}, + title = {Fragments and text categorization}, + booktitle = {Proceedings of the ACL 2004 on Interactive poster and demonstration sessions}, + series = {ACLdemo '04}, + year = {2004}, + location = {Barcelona, Spain}, + articleno = {34}, + url = {http://dx.doi.org/10.3115/1219044.1219078}, + doi = {10.3115/1219044.1219078}, + acmid = {1219078}, + publisher = {Association for Computational Linguistics}, + address = {Stroudsburg, PA, USA}, +} + + +@phdthesis{hall1999correlation, + title={Correlation-based feature selection for machine learning}, + author={Hall, Mark A}, + year={1999}, + school={The University of Waikato} +} + +@ARTICLE{Pearl1986, + author = {Pearl, J}, + title = {Fusion, propagation, and structuring in belief networks}, + journal = {Artificial Intelligence}, + year = {1986}, + volume = {29}, + pages = {241--288}, + number = {3}, + address = {Essex, UK}, + doi = {http://dx.doi.org/10.1016/0004-3702(86)90072-X}, + issn = {0004-3702}, + publisher = {Elsevier Science Publishers Ltd.} +} + +@BOOK{Pearl1998, + title = {Bayesian networks}, + publisher = {MIT Press}, + year = {1998}, + author = {Pearl, Judea}, + pages = {149--153}, + address = {Cambridge, MA, USA}, + book = {The handbook of brain theory and neural networks}, + isbn = {0-262-51102-9} +} + +@article{cooper1992bayesian, + title={A Bayesian method for the induction of probabilistic networks from data}, + author={Cooper, Gregory F and Herskovits, Edward}, + journal={Machine learning}, + volume={9}, + number={4}, + pages={309--347}, + year={1992}, + publisher={Springer} +} + +@article{cooper1992bayesian, + title={A Bayesian method for the induction of 
+
+@article{hall2009weka,
+ title={The WEKA data mining software: an update},
+ author={Hall, Mark and Frank, Eibe and Holmes, Geoffrey and Pfahringer, Bernhard and Reutemann, Peter and Witten, Ian H},
+ journal={ACM SIGKDD Explorations Newsletter},
+ volume={11},
+ number={1},
+ pages={10--18},
+ year={2009},
+ publisher={ACM}
+}
+
+@article{lmt2005,
+ title={Logistic model trees},
+ author={Landwehr, Niels and Hall, Mark and Frank, Eibe},
+ journal={Machine Learning},
+ volume={59},
+ number={1-2},
+ pages={161--205},
+ year={2005},
+ publisher={Springer}
+}
+
+@article{collins2002logistic,
+ title={Logistic regression, AdaBoost and Bregman distances},
+ author={Collins, Michael and Schapire, Robert E and Singer, Yoram},
+ journal={Machine Learning},
+ volume={48},
+ number={1-3},
+ pages={253--285},
+ year={2002},
+ publisher={Springer}
+}
+
+@incollection{lmtspeeding2005,
+ title={Speeding up logistic model tree induction},
+ author={Sumner, Marc and Frank, Eibe and Hall, Mark},
+ booktitle={Knowledge Discovery in Databases: PKDD 2005},
+ pages={675--683},
+ year={2005},
+ publisher={Springer}
+}
+
+@inproceedings{Charton2007b,
+author = {Charton, Eric and Acuna-Agost, Rodrigo},
+booktitle = {DEFT},
+title = {{Quel mod\`{e}le pour d\'{e}tecter une opinion? Trois propositions pour g\'{e}n\'{e}raliser l'extraction d'une id\'{e}e dans un corpus}},
+year = {2007}
+}
+
+@BOOK{Vapnik1995,
+ title = {The Nature of Statistical Learning Theory},
+ publisher = {Springer-Verlag},
+ year = {1995},
+ author = {Vapnik, Vladimir}
+}
+
+@ARTICLE{multiclasssvm2002,
+author={Chih-Wei Hsu and Chih-Jen Lin},
+journal={IEEE Transactions on Neural Networks},
+title={A comparison of methods for multiclass support vector machines},
+year={2002},
+volume={13},
+number={2},
+pages={415--425}
+}
+
+@article{chang2011libsvm,
+ title={LIBSVM: a library for support vector machines},
+ author={Chang, Chih-Chung and Lin, Chih-Jen},
+ journal={ACM Transactions on Intelligent Systems and Technology (TIST)},
+ volume={2},
+ number={3},
+ pages={27},
+ year={2011},
+ publisher={ACM}
+}
+
+@article{el2005wlsvm,
+ title={WLSVM: Integrating libsvm into WEKA environment},
+ author={El-Manzalawy, Yasser and Honavar, Vasant},
+ journal={Software available at http://www.cs.iastate.
edu/yasser/wlsvm},
+ year={2005}
+}
+
+@inproceedings{Charton2013,
+address = {Sables d'Olonnes},
+author = {Charton, Eric and Jean-Louis, Ludovic and Meurs, Marie-Jean and Gagnon, Michel},
+booktitle = {Actes de DEFT2013},
+editor = {Grouin, Cyril},
+publisher = {ACLWeb},
+title = {{Trois recettes d'apprentissage automatique pour un syst\`{e}me d'extraction d'information et de classification de recettes de cuisines}},
+url = {http://deft.limsi.fr/2013/},
+year = {2013}
+}
+
+@book{quinlan1993c4,
+  title={C4.5: programs for machine learning},
+  author={Quinlan, John Ross},
+  volume={1},
+  year={1993},
+  publisher={Morgan Kaufmann}
+}
+
+@Article{chartoninformatics2013,
+AUTHOR = {Eric Charton and Marie-Jean Meurs and Ludovic Jean-Louis and Michel Gagnon},
+TITLE = {Using Collaborative Tagging for Text Classification: From Text Classification to Opinion Mining},
+JOURNAL = {Informatics},
+VOLUME = {1},
+YEAR = {2014},
+NUMBER = {1},
+PAGES = {32--51},
+URL = {https://www.mdpi.com/2227-9709/1/1/32},
+DOI = {10.3390/informatics1010032}
+}
\ No newline at end of file
diff --git a/usermanual/usermanual.pdf b/usermanual/usermanual.pdf
new file mode 100644
index 0000000..3649ce7
Binary files /dev/null and b/usermanual/usermanual.pdf differ
diff --git a/usermanual/usermanual.tex b/usermanual/usermanual.tex
new file mode 100644
index 0000000..2d7ab9b
--- /dev/null
+++ b/usermanual/usermanual.tex
@@ -0,0 +1,600 @@
+\documentclass[11pt]{article}
+\usepackage[margin=1.1in]{geometry}
+\usepackage{usefulsymbols}
+\usepackage{hyperref}
+\usepackage{graphicx} % required by \graphicspath and \includegraphics, unless already loaded by usefulsymbols
+\newcommand{\mycos}{{\bf{mycoSORT{ }}}}
+\newcommand{\homefolder}{\texttt{mycosort-pck-\version{ }}}
+\newcommand{\configfile}{\texttt{config.cfg{ }}}
+\newcommand{\configsample}{\texttt{config-sample.cfg{ }}}
+
+\def\version{{\tt 1.0}}
+\usepackage{listings}
+\usepackage{courier}
+\lstset{basicstyle=\small\ttfamily,breaklines=true,frame=L,xleftmargin=\parindent}
+%\lstset{framextopmargin=50pt,frame=bottomline}
+%\lstset{breaklines=true}
+%\lstset{breakatwhitespace=true}
+
+% if you just need a simple heading
+% Usage:
+% \heading{the text of the heading}
+\newcommand{\heading}[1]{
+ \vspace{0.3cm} \noindent \textbf{#1} \newline
+}
+
+\usepackage{datetime}
+\newdateformat{mydate}{\monthname[\THEMONTH] \THEYEAR}
+
+
+\graphicspath{{./graphics/}}
+
+\begin{document}
+
+\title{\mycos{} ~ \version \\~\\~\\User Manual\\~\\}
+
+\author{Hayda Almeida\\Marie-Jean Meurs\\~\\~\\Tsang Lab}
+
+\date{\mydate\today}
+
+\maketitle
+
+\begin{center}
+ \includegraphics[width=0.2\textwidth]{genomicslogogreen}$\qquad$\includegraphics[width=0.2\textwidth]{genozymeslogo}$\qquad$\includegraphics[width=0.25\textwidth]{concordialogo}\\
+\end{center}
+
+\pagestyle{empty}
+
+\pagebreak
+\tableofcontents
+
+% \pagestyle{empty}
+\pagebreak
+
+\section{Introduction to mycoSORT}
+
+\mycos{} is an open-source text classification program written in Java.
+The software is based on data sampling and machine learning methods.
+\mycos{} was primarily developed to classify scientific literature related to fungal enzymes;
+however, it can also classify literature related to other topics.
+The tool can support scientific researchers in selecting relevant documents when performing literature reviews.
+
+\mycos{} learns from a labeled document collection
+and generates a classification model through supervised learning,
+which is then used to predict a label for new scientific papers.
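+
+Schematically, the workflow can be summarized as follows
+(a simplified view; the modules in brackets are described in the next sections):
+\begin{lstlisting}
+labeled corpus (.XML)    -> extracted features (.TXT)     [NgramExtractor, FeatureExtractor]
+extracted features (.TXT) -> document vectors (.ARFF)     [BuildModel]
+document vectors (.ARFF)  -> trained model + predictions  [Trainer]
+\end{lstlisting}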
+
+In order to obtain a classification prediction for a given document,
+\mycos{} must first train or load a classification model.
+
+To generate the classification models, \mycos{} makes use of the standard implementations
+of classification algorithms provided by the Weka workbench~\cite{hall2009weka},
+which is also developed in Java.
+In addition, \mycos{} utilizes the following packages:
+% commons-lang3-3.2.1
+% jsoup-1.7.3
+\begin{itemize}
+ \item Apache Commons Lang\footnote{\url{https://commons.apache.org}} (version 3.2.1 or above), a Java utility package;
+ \item jsoup\footnote{\url{http://jsoup.org/}} (version 1.7.3 or above), a Java HTML/XML parser;
+ \item LIBSVM\footnote{\url{http://www.csie.ntu.edu.tw/~cjlin/libsvm/}} (version 3.2 or above), a library for Support Vector Machine (SVM) classification;
+ \item Apache Ant\footnote{\url{http://ant.apache.org/}} (version 1.9.3 or above), a build tool for Java applications.
+\end{itemize}
+
+In the following sections, you will find instructions on how to access
+the system source code, and how to install and run the software.
+The \mycos{} toolkit is available at \url{https://github.com/TsangLab/mycosort}.
+This user manual describes \mycos{} version {\version}.
+
+\section{Getting Started}
+\label{sec:start}
+To download the \mycos{} toolkit, please access \url{https://github.com/TsangLab/mycosort}.
+This user manual is written for version \version{}.
+It assumes that you have downloaded and extracted \homefolder{} as a folder,
+and that this folder is currently in your working directory.
+
+\subsection{Package Content}
+\label{subsec:pckcontent}
+The \mycos{} toolkit contains several folders and files in its root folder.
+These items are used to provide inputs and store outputs of the system subtasks.
+Their usage and content are explained below.
+
+\paragraph{Folders list}
+\textit{\texttt{arff} $\rightarrow$} contains the .ARFF files representing the data as vector matrices. These files are used to generate the classification models. \\
+\textit{\texttt{corpus} $\rightarrow$} keeps the training and test sets in .XML format, used to build and apply the classification models. \\
+\textit{\texttt{executables} $\rightarrow$} holds the .JAR files that compose the system and are used to perform the system tasks. \\
+\textit{\texttt{features} $\rightarrow$} contains the features extracted from the training sets, saved in .TXT format. \\
+\textit{\texttt{jars} $\rightarrow$} contains the external .JAR packages which are bundled with the system.\\
+\textit{\texttt{src} $\rightarrow$} holds the system .JAVA source files.
+
+\paragraph{Files list}
+\textit{\texttt{build.xml} $\rightarrow$} master file used by Apache Ant to build the \mycos{} executables. \\
+\textit{\configsample $\rightarrow$} a sample of the configuration file used to set specific parameters for the system's different tasks. \\
+\textit{\texttt{entities.txt} $\rightarrow$} a list of bioentities that are annotated in the dataset. \\
+\textit{\texttt{stopList.txt} $\rightarrow$} a list of stop-words to be considered for feature extraction.
+
+\subsection{Requirements}
+\mycos{} requires Java JDK (version 1.8.0 or above) and Apache Ant (version 1.9.3 or above) to be installed.\\
+For further information on how to install the Java JDK, please refer to: \\ \url{http://www.oracle.com/technetwork/java/javase/downloads/index.html}. \\
+For information on how to install Apache Ant, please refer to: \\ \url{http://ant.apache.org/manual/install.html}.
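+
+Before proceeding, you can check that suitable versions are installed by running
+\texttt{java -version} and \texttt{ant -version} in a command line interface
+(the output shown below is illustrative):
+\begin{lstlisting}
+user@machine $ java -version
+java version "1.8.0"
+user@machine $ ant -version
+Apache Ant(TM) version 1.9.3
+\end{lstlisting}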
+
+% \section{Dataset Format}
+% \label{sec:format}
+%
+% \mycos{} uses the following file formats in its processing:
+% \begin{itemize}
+% \item XML - eXtensible Markup Language - dataset format containing annotated papers
+% \item TXT - Text File - \mycos{} features extraction format
+% \item ARFF - Attribute-Relation File Format\footnote{\url{http://cs.waikato.ac.nz/~ml/weka/arff}} - instance and features list format
+% \end{itemize}
+
+\section{Configuration Setup}
+\label{sec:configuration}
+The general working environment and configurations of \mycos{} are defined in a file named \configfile{}.
+To generate a \configfile{} file, create a copy of \configsample{} and rename it to \configfile{}.
+Before compiling and running \mycos{}, you must edit the \configfile{} file.
+
+\subsection{Directory Setting}
+\label{directory}
+To set up the main directory used by \mycos{}, first update the \texttt{HOME\_DIR} variable to point to your own folder, as in the following example.
+The \texttt{HOME\_DIR} should contain the path of the system's main folder, where the \mycos{} toolkit was extracted.
+\begin{lstlisting}
+HOME_DIR=/home/usr/mycosort-pck-version/
+\end{lstlisting}
+
+The following directories are set by default, and generally should not be changed,
+since they refer to folder paths inside your \texttt{HOME\_DIR}.
+The corpus directory contains the dataset .XML files, as well as the training and testing files.
+\begin{lstlisting}
+CORPUS_DIR=corpus/
+\end{lstlisting}
+
+The positives and negatives folders contain the positive and negative .XML instances, and are found inside \texttt{CORPUS\_DIR}.
+\begin{lstlisting}
+POS_DIR=positives/
+NEG_DIR=negatives/
+\end{lstlisting}
+
+The train and test folders will contain the train and test .XML instances, and are found inside \texttt{CORPUS\_DIR}.
+\begin{lstlisting}
+TRAIN_DIR=train/
+TEST_DIR=test/
+\end{lstlisting}
+
+The arff directory contains the .ARFF files, used to feed the classification algorithms.
+\begin{lstlisting}
+OUTPUT_MODEL=arff/
+\end{lstlisting}
+
+The feature directory contains the .TXT files listing all feature types extracted from the training sets.
+\begin{lstlisting}
+FEATURE_DIR=features/
+\end{lstlisting}
+
+The duplicates directory is the folder in which the system looks for duplicates.
+Set its value to the name of the desired folder, which should be located inside \texttt{CORPUS\_DIR}.
+\begin{lstlisting}
+DUP_DIR=test/
+\end{lstlisting}
+
+\subsection{Corpus Sampling Setting}
+\label{subsec:corpussamp}
+Data sampling can be used to split the document collection into training and test collections,
+as well as to generate several training collections with different class distributions.
+To enable the training or test sampling, set the following variables to true:
+\begin{lstlisting}
+SAMPLE_TRAIN=false
+SAMPLE_TEST=false
+\end{lstlisting}
+The following variables control the data sampling settings.
+To determine the size of the test set relative to the entire document collection, use \texttt{PERCT\_TEST} to set the percentage of the test collection.
+\begin{lstlisting}
+PERCT_TEST=15
+\end{lstlisting}
+To generate a training collection, first define the percentage of positive instances to be sampled for this corpus.
+This variable is also used when generating .ARFF files.
+\begin{lstlisting}
+PERCT_POS_TRAIN=50
+\end{lstlisting}
+To generate a test collection, first determine its percentage of positive instances.
+\begin{lstlisting}
+PERCT_POS_TEST=10
+\end{lstlisting}
+
+\subsection{File Setting}
+\label{subsec:fileset}
+We describe here the files used as input for \mycos{}.
+The \texttt{TRAINING\_FILE} and \texttt{TEST\_FILE} variables should contain the names of the .XML files generated as training and test sets.
+The training file is used by the extractors to extract features, and to build the .ARFF files.
+\begin{lstlisting}
+TRAINING_FILE=triage0.xml
+TEST_FILE=triage1.xml
+\end{lstlisting}
+
+The .ARFF files are used to feed the classification models.
+To re-train a model, \texttt{ARFF\_TRAIN} should contain the name of the .ARFF file used for training.
+To test new instances, \texttt{ARFF\_TEST} should contain the name of the .ARFF file used for testing.
+\begin{lstlisting}
+ARFF_TRAIN=triage0.arff
+ARFF_TEST=triage1.arff
+\end{lstlisting}
+
+The stopword list used by the extractors is defined here.
+We recommend keeping this variable as it is defined in the \configsample.
+\begin{lstlisting}
+STOP_LIST=stopList.txt
+\end{lstlisting}
+
+When executing subtasks, \mycos{} produces the following files as output,
+which are later used as input for other subtasks.
+These files contain the features extracted from a given training set.
+\begin{lstlisting}
+ECNUM_FEATURES=ecnumbers.txt
+JOURNAL_TITLE_FEATURES=journaltitles.txt
+ANNOTATION_FEATURES=annotations.txt
+TITLE_FEATURES=titleAnnotations.txt
+NGRAM_FEATURES=ngrams_features.txt
+TITLE_NGRAMS=titleGrams.txt
+DOC_IDS=docIDs.txt
+\end{lstlisting}
+
+\subsection{Feature Setting}
+\label{subsec:featureset}
+The feature configuration is taken into account when generating .ARFF files.
+To choose a feature type to be used when creating an .ARFF file, simply set its value to ``true'', as in the examples below. \\
+\paragraph{General features}
+More than one feature type can be combined when generating .ARFF files.
+The following variables load general features: the size of a paper abstract,
+the name of the publication journal, and the EC numbers found in a paper.
+\begin{lstlisting}
+USE_TEXT_SIZE=true
+USE_JOURNAL_TITLE_FEATURE=true
+USE_ECNUM_FEATURE=true
+\end{lstlisting}
+
+The \texttt{USE\_DOC\_ID} variable triggers the extraction of the paper PMID.
+This variable must be kept set to ``true'',
+since it is needed to output the classification predictions according to the document ID.
+\begin{lstlisting}
+USE_DOC_ID=true
+\end{lstlisting}
+
+The following variables set specific conditions on feature frequency (the number of times a feature was found in the training set)
+and feature length (the number of characters in a feature) to be taken into account when extracting the feature list.
+The default parameters are defined below, but they can be adjusted according to the user's needs.
+\begin{lstlisting}
+FEATURE_MIN_FREQ=2
+FEATURE_MIN_LENGTH=3
+\end{lstlisting}
+
+\paragraph{Annotation features}
+The following variables provide annotation features to generate .ARFF files.
+To load the bioentity annotations extracted from the training set, the value of \texttt{USE\_ANNOTATION\_FEATURE} must be set to true.
+When setting \texttt{USE\_ANNOTATION\_TYPE} to true, the bioentity types will be loaded to generate .ARFFs.
+Finally, when setting \texttt{USE\_TITLE\_FEATURE} to true, the bioentities annotated in paper titles will be
+considered separately from the annotations found in abstracts.
+\begin{lstlisting}
+USE_ANNOTATION_FEATURE=true
+USE_ANNOTATION_TYPE=true
+USE_TITLE_FEATURE=true
+\end{lstlisting}
+
+\paragraph{N-Gram features}
+The following variables provide n-gram features to generate .ARFF files.
+To load the n-grams extracted from the training set, the value of \texttt{USE\_NGRAM\_FEATURE} must be set to true.
+When setting \texttt{USE\_TITLE\_NGRAMS} to true, the n-grams found in paper titles will be
+considered separately from the n-grams found in abstracts.
+Use \texttt{NGRAM\_STOP} to remove stopwords from the feature list.
+\begin{lstlisting}
+USE_NGRAM_FEATURE=true
+USE_TITLE_NGRAMS=false
+NGRAM_STOP=true
+\end{lstlisting}
+The variable \texttt{NGRAM\_SIZE} determines the number of words used to
+form n-grams. The default value is 1; however, the system can also generate
+bigrams (\texttt{NGRAM\_SIZE=2}) and trigrams (\texttt{NGRAM\_SIZE=3}).
+\begin{lstlisting}
+NGRAM_SIZE=1
+\end{lstlisting}
+
+To apply a weight to an n-gram, set the following variable to true and
+determine the value of the weight.
+This configuration simply multiplies the current n-gram frequency by the value provided in \texttt{WEIGHT}.
+\begin{lstlisting}
+USE_WEIGHTED_NGRAM=false
+WEIGHT=3
+\end{lstlisting}
+
+\subsection{Feature Selection Setting}
+\label{subsec:featselec}
+The feature selection configuration is taken into account before feeding .ARFF files to the classification algorithms.
+To enable Odds Ratio (OR) or Inverse Document Frequency (IDF) filtering, set one of the following variables to true:
+\begin{lstlisting}
+USE_ODDS_RATIO=true
+USE_IDF=false
+\end{lstlisting}
+It is recommended to apply Odds Ratio or IDF, but not both together.
+To determine the minimum threshold for keeping a feature, adjust the following variables (the default is set to 1):
+\begin{lstlisting}
+OR_THRESHOLD=1
+IDF_THRESHOLD=1
+\end{lstlisting}
+
+\subsection{Experiment}
+The experiment type is used to generate .XML and .ARFF files.
+To generate training files, set \texttt{EXP\_TYPE=0}, and to generate test files, set \texttt{EXP\_TYPE=1}.
+\begin{lstlisting}
+EXP_TYPE=0
+\end{lstlisting}
+
+%
+% \subsubsection{N-Grams}
+% \label{ngrams}
+% To determine the size of N-Grams features, please set the number of \texttt{NGRAM\_SIZE} variable on the file to \texttt{1}, \texttt{2} or \texttt{3}.
+% In order to have a single relation of all the N-Grams from both paper abstract and title, the features should be configured as the following:
+% \begin{lstlisting}
+% USE_NGRAM_FEATURE=true
+% USE_TITLE_NGRAMS=false
+% \end{lstlisting}
+% If you require the title N-Grams as separated features from the abstract N-Grams, please define its value also as \texttt{true}.
+%
+% Yet, if you require that N-Grams from the paper abstract should not be considered and only the title text must be taken into account, use the following configuration:
+% \begin{lstlisting}
+% USE_NGRAM_FEATURE=false
+% USE_TITLE_NGRAMS=true
+% \end{lstlisting}
+%
+% \subsubsection{Annotations}
+% \label{annotations}
+% The same configuration set for abstract and title is valid for the annotations. To have a single relation from both paper abstract and title, use:
+% \begin{lstlisting}
+% USE_ANNOTATION_FEATURE=true
+% USE_TITLE_FEATURE=false
+% \end{lstlisting}
+% If separated lists of annotation features from abstract and title are needed, please define both values as \texttt{true}.
+% +% However, if you wish to have only the annotations found on the paper title, but not on the paper abstract, just apply the variables value as the following: +% \begin{lstlisting} +% USE_ANNOTATION_FEATURE=false +% USE_TITLE_FEATURE=true +% \end{lstlisting} + + +\section{Using \mycos{}} +\mycos{} can be used from a command line interface. +The system utilizes Apache Ant to build the five different modules (.JAR files), +which are available in the \texttt{executables} folder. +To execute \mycos{} modules, it is necessary to access the system home folder. +In a command line interface (a terminal in Linux OS, or a prompt in Microsoft Windows), +navigate until the \homefolder{} folder, such as: + +\begin{lstlisting} + user@machine $ cd /home/usr/mycosort-pck-version +\end{lstlisting} + +On a Microsoft Windows system, the forward slashes should be replaced by back slashes +(e.g. \texttt{home\textbackslash usr\textbackslash ...}). +From now on, the instructions will assume that a Linux OS is being used. + +\paragraph{Compiling} +After accessing the system home folder, it is necessary to first compile \mycos{} modules. +To do so, simply type \texttt{"ant"} in the command line, as the example below: +\begin{lstlisting} +user@home/usr/mycosort-pck-version $ ant +\end{lstlisting} +The system should be re-compiled if any parameter is changed or edited in the \configfile file. +Following we describe the usage and configuration for each of the five \mycos{} modules: +\begin{itemize} +\item SampleCorpus +\item CorpusHandler +\item FeatureExtractor +\item NgramExtractor +\item BuildModel +\item Trainer +\end{itemize} + + +\subsection{SampleCorpus} +The \texttt{SampleCorpus} module allows the user to generate training and test collections. +It utilizes all .XML documents contained in the \texttt{corpus/positive} and \texttt{corpus/negative} folders. +\paragraph{Example 1} +When generating the test collection, the .XML instances randomly selected will be moved +from the \texttt{corpus/positive} and \texttt{corpus/negative} folders to the +\texttt{corpus/test} folder. +To execute the test sampling and generate a test collection that represents +15\% of the entire document collection, and that contains 10\% of positive instances, +edit the following variables in the \configfile: +\begin{lstlisting} +SAMPLE_TEST=true +PERCT_TEST=15 +PERCT_POS_TEST=10 +\end{lstlisting} + +\paragraph{Example 2} +When generating the training collection, the .XML instances randomly selected will be +copied from \texttt{corpus/positive} and \texttt{corpus/negative} folders to the +\texttt{corpus/train\_ (PERCT\_POS\_TRAIN)} folder. +To execute the training sampling and generate a trainng collection that contains +50\% of negative instances and 50\% of positive instances, +edit the following variables in the \configfile: +\begin{lstlisting} +SAMPLE_TRAIN=true +PERCT_POS_TRAIN=50 +\end{lstlisting} + +To execute \texttt{SampleCorpus}, run the following instruction in the command line interface: +\begin{lstlisting} +user@home/usr/mycosort-pck-version $ ant +user@home/usr/mycosort-pck-version $ ant sample-corpus +\end{lstlisting} +After running the instruction, the selected sampling (training or test) will be executed. +The training collection will then be copied to a \texttt{corpus/train\_50} folder. +The training collection can be generated multiple times, with different class distributions. 
+
+\subsection{CorpusHandler}
+The \texttt{CorpusHandler} module is used to create the training and test corpora,
+by generating a combined .XML file containing all .XML instances either in the
+\texttt{corpus/test} folder or in the \texttt{corpus/train\_(PERCT\_POS\_TRAIN)} folder.
+\paragraph{Example 3}
+Besides generating the training and test corpora, this module can also perform a check for duplicates.
+To check for duplicates between an existing training file
+and a given \texttt{DUP\_DIR} folder containing several .XML files,
+edit the following variables in the \configfile file:
+\begin{lstlisting}
+TRAINING_FILE=triage0.xml
+DUP_DIR=test/
+\end{lstlisting}
+To execute \texttt{CorpusHandler} and check for duplicates between the training file and a given folder,
+run the following instructions in the command line interface:
+\begin{lstlisting}
+user@home/usr/mycosort-pck-version $ ant
+user@home/usr/mycosort-pck-version $ ant -Doptions=df corpus-handler
+\end{lstlisting}
+\paragraph{Example 4}
+To check for duplicates between the train or test folders containing all .XMLs and
+a given \texttt{DUP\_DIR} folder containing several other .XML files,
+edit the following variables in the \configfile file:
+\begin{lstlisting}
+DUP_DIR=test/
+EXP_TYPE=0
+\end{lstlisting}
+In this case, set \texttt{EXP\_TYPE=0} if the train .XMLs must be considered,
+or \texttt{EXP\_TYPE=1} if the test .XMLs must be considered.
+To execute \texttt{CorpusHandler} and check for duplicates between the train or test folder and a given folder,
+run the following instructions in the command line interface:
+\begin{lstlisting}
+user@home/usr/mycosort-pck-version $ ant
+user@home/usr/mycosort-pck-version $ ant -Doptions=dc corpus-handler
+\end{lstlisting}
+When checking for duplicates, the duplicates found will, by default, be renamed only in the \texttt{DUP\_DIR} folder. \\
+
+\paragraph{Example 5}
+To generate a training corpus, the following variables must be edited in the \configfile file:
+\begin{lstlisting}
+PERCT_POS_TRAIN=50
+EXP_TYPE=0
+\end{lstlisting}
+To generate a testing corpus, edit the following variables:
+\begin{lstlisting}
+PERCT_POS_TEST=10
+EXP_TYPE=1
+\end{lstlisting}
+In order to generate the corpora, it is first required to clean (\texttt{-Doptions=cl}),
+and only then concatenate (\texttt{-Doptions=cc}) all .XMLs in a given folder.
+Thus, when creating the training or test corpora,
+run the following instructions in the command line interface:
+\begin{lstlisting}
+user@home/usr/mycosort-pck-version $ ant
+user@home/usr/mycosort-pck-version $ ant -Doptions=cl,cc corpus-handler
+\end{lstlisting}
+
+\subsection{NgramExtractor}
+The \texttt{NgramExtractor} is a feature extraction module, used to extract n-grams
+(small units of text) from the paper title and abstract.
+N-grams can be generated in three different sizes: unigrams (one word), bigrams (two words), and trigrams (three words).
+The default n-gram size is one word, since this extraction already results in a long list of features.
+It is also recommended to discard stopwords when extracting n-grams,
+which can be done by keeping the value of \texttt{NGRAM\_STOP} as \texttt{true}.
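+
+For instance, for the title fragment ``machine learning methods'',
+the units extracted for each n-gram size would be the following (an illustrative example):
+\begin{lstlisting}
+NGRAM_SIZE=1  machine | learning | methods
+NGRAM_SIZE=2  machine learning | learning methods
+NGRAM_SIZE=3  machine learning methods
+\end{lstlisting}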
+\paragraph{Example 6}
+To perform the n-gram extraction, the following variables must be edited in the \configfile file:
+\begin{lstlisting}
+TRAINING_FILE=triage0.xml
+NGRAM_STOP=true
+NGRAM_SIZE=1
+FEATURE_MIN_FREQ=2
+FEATURE_MIN_LENGTH=3
+\end{lstlisting}
+To execute \texttt{NgramExtractor}, run the following instructions in the command line interface:
+\begin{lstlisting}
+user@home/usr/mycosort-pck-version $ ant
+user@home/usr/mycosort-pck-version $ ant ngram-extractor
+\end{lstlisting}
+
+\subsection{FeatureExtractor}
+The \texttt{FeatureExtractor} is a feature extraction module, used to extract domain annotations
+(specific XML tags) from the paper title and abstract.
+To specify the list of annotations (tags) to be considered by the \texttt{FeatureExtractor} module,
+please refer to the \texttt{entities.txt} file in the root of the \homefolder folder.
+To consider a new annotation type (tag) besides the ones provided in the file,
+simply add a new line containing the annotation type name and its level (sentence or entity).
+To exclude a given type, simply add a \# at the beginning of its line.
+
+\paragraph{Example 7}
+To perform the feature extraction, the following variables must be edited in the \configfile file:
+\begin{lstlisting}
+TRAINING_FILE=triage0.xml
+FEATURE_MIN_FREQ=2
+FEATURE_MIN_LENGTH=3
+\end{lstlisting}
+To execute \texttt{FeatureExtractor}, run the following instructions in the command line interface:
+\begin{lstlisting}
+user@home/usr/mycosort-pck-version $ ant
+user@home/usr/mycosort-pck-version $ ant feature-extractor
+\end{lstlisting}
+
+
+\subsection{BuildModel}
+The \texttt{BuildModel} module is used to represent the training and test sets as
+matrices of document vectors, which will later be fed to a classification algorithm.
+Models are saved in the .ARFF file format, and can be generated with several different configurations of features.
+All generated models are saved in the \texttt{arff} folder.
+
+\paragraph{Example 8}
+To determine the feature configuration used in a given model,
+the chosen options, as described in~\ref{subsec:featureset},
+must be set to \texttt{true} in the \configfile file.
+As an example, if the user wants to generate a model based only on unigram features,
+the setup of n-gram features must be set to true, as described in~\ref{subsec:featureset},
+while the annotation features setup must be set to false.
+
+In addition, the following variables must also be edited,
+to indicate whether the model should be generated based on the training set or the test set,
+as well as to indicate which percentage of positives is currently used in the training set.
+\begin{lstlisting}
+PERCT_POS_TRAIN=50
+EXP_TYPE=1
+\end{lstlisting}
+
+To execute \texttt{BuildModel}, run the following instructions in the command line interface:
+\begin{lstlisting}
+user@home/usr/mycosort-pck-version $ ant
+user@home/usr/mycosort-pck-version $ ant build-model
+\end{lstlisting}
+
+\subsection{Trainer}
+The \texttt{Trainer} module processes the training .ARFF files and uses a classification algorithm to
+learn a decision function and output predictions for the instances in the test .ARFF files.
+The corresponding training and test .ARFF files must be indicated
+in the \configfile file before executing the \texttt{Trainer} module.
+In order to specify the correct files, please refer to these two items:
+\begin{lstlisting}
+ARFF_TRAIN=triage0.arff
+ARFF_TEST=triage1.arff
+\end{lstlisting}
+
+While training and testing the models, feature selection methods can also be set up.
+To perform IDF or Odds Ratio filtering, please refer to the items described in~\ref{subsec:featselec}.
+It is recommended to apply only one filtering method per execution, either IDF or Odds Ratio,
+rather than both in the same execution.
+
+\paragraph{Example 9}
+A model can be trained using three different classification algorithms:
+{Na\"{\i}ve} Bayes (\texttt{-Dclassifier=nb}), Support Vector Machine (\texttt{-Dclassifier=svm}), or Logistic Model Tree (\texttt{-Dclassifier=lmt}).
+
+To execute \texttt{Trainer} using LMT, run the following instructions in the command line interface:
+\begin{lstlisting}
+user@home/usr/mycosort-pck-version $ ant
+user@home/usr/mycosort-pck-version $ ant -Dclassifier=lmt trainer
+\end{lstlisting}
+
+
+\section{Contacts}
+Should you have any questions, comments or bug reports, the authors can be reached at the following addresses:\\
+\url{hayda.almeida@concordia.ca} \\
+\url{marie-jean.meurs@concordia.ca}
+
+
+\appendix
+
+
+\bibliographystyle{acm}
+% \renewcommand{\baselinestretch}{0.0}
+\bibliography{usermanual}
+
+\end{document}