diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1cde967
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+# Project source #
+###################
+*.project
+*.classpath
+
+# Package Files #
+#################
+*.jar
diff --git a/arff/triage0_50_ngrams_size1_stopwords.arff b/arff/triage0_50_ngrams_size1_stopwords.arff
new file mode 100644
index 0000000..c7525b4
--- /dev/null
+++ b/arff/triage0_50_ngrams_size1_stopwords.arff
@@ -0,0 +1,38 @@
+% Weka training file - mycoSORT triage - 2015
+
+@RELATION triage
+@ATTRIBUTE docID REAL %PMID of paper
+@ATTRIBUTE Ngram0trees REAL %trees
+@ATTRIBUTE Ngram1model REAL %model
+@ATTRIBUTE Ngram2triage REAL %triage
+@ATTRIBUTE Ngram3genes REAL %genes
+@ATTRIBUTE Ngram4sampling REAL %sampling
+@ATTRIBUTE Ngram5classification REAL %classification
+@ATTRIBUTE Ngram6processing REAL %processing
+@ATTRIBUTE Ngram7fungal REAL %fungal
+@ATTRIBUTE Ngram8enzymes REAL %enzymes
+@ATTRIBUTE Ngram9manual REAL %manual
+@ATTRIBUTE Ngram10literature REAL %literature
+@ATTRIBUTE Ngram11annotation REAL %annotation
+@ATTRIBUTE Ngram12mycoclapfungalgenomicsca REAL %mycoclapfungalgenomicsca
+@ATTRIBUTE Ngram13machine REAL %machine
+@ATTRIBUTE Ngram14first REAL %first
+@ATTRIBUTE Ngram15features REAL %features
+@ATTRIBUTE Ngram16mycoclap REAL %mycoclap
+@ATTRIBUTE Ngram17results REAL %results
+@ATTRIBUTE Ngram18abstracttext REAL %abstracttext
+@ATTRIBUTE Ngram19task REAL %task
+@ATTRIBUTE Ngram20http REAL %http
+@ATTRIBUTE Ngram21support REAL %support
+@ATTRIBUTE Ngram22learning REAL %learning
+@ATTRIBUTE Ngram23database REAL %database
+@ATTRIBUTE Ngram24curation REAL %curation
+@ATTRIBUTE Ngram25logistic REAL %logistic
+@ATTRIBUTE Ngram26applications REAL %applications
+@ATTRIBUTE Ngram27articletitle REAL %articletitle
+@ATTRIBUTE class {positive, negative}
+@DATA
+
+25754864,0,0,0,2,0,0,2,5,3,1,1,2,2,0,1,0,6,1,0,0,2,2,0,4,2,0,2,0,negative
+25551575,2,4,3,0,2,2,0,0,0,1,2,0,0,4,1,2,0,1,0,2,0,2,3,0,1,2,0,0,negative
+
diff --git a/arff/triage1_50_ngrams_size1_stopwords.arff b/arff/triage1_50_ngrams_size1_stopwords.arff
new file mode 100644
index 0000000..ddfc07d
--- /dev/null
+++ b/arff/triage1_50_ngrams_size1_stopwords.arff
@@ -0,0 +1,38 @@
+% Weka test file - mycoSORT triage - 2015
+
+@RELATION triage
+@ATTRIBUTE docID REAL %PMID of paper
+@ATTRIBUTE Ngram0trees REAL %trees
+@ATTRIBUTE Ngram1model REAL %model
+@ATTRIBUTE Ngram2triage REAL %triage
+@ATTRIBUTE Ngram3genes REAL %genes
+@ATTRIBUTE Ngram4sampling REAL %sampling
+@ATTRIBUTE Ngram5classification REAL %classification
+@ATTRIBUTE Ngram6processing REAL %processing
+@ATTRIBUTE Ngram7fungal REAL %fungal
+@ATTRIBUTE Ngram8enzymes REAL %enzymes
+@ATTRIBUTE Ngram9manual REAL %manual
+@ATTRIBUTE Ngram10literature REAL %literature
+@ATTRIBUTE Ngram11annotation REAL %annotation
+@ATTRIBUTE Ngram12mycoclapfungalgenomicsca REAL %mycoclapfungalgenomicsca
+@ATTRIBUTE Ngram13machine REAL %machine
+@ATTRIBUTE Ngram14first REAL %first
+@ATTRIBUTE Ngram15features REAL %features
+@ATTRIBUTE Ngram16mycoclap REAL %mycoclap
+@ATTRIBUTE Ngram17results REAL %results
+@ATTRIBUTE Ngram18abstracttext REAL %abstracttext
+@ATTRIBUTE Ngram19task REAL %task
+@ATTRIBUTE Ngram20http REAL %http
+@ATTRIBUTE Ngram21support REAL %support
+@ATTRIBUTE Ngram22learning REAL %learning
+@ATTRIBUTE Ngram23database REAL %database
+@ATTRIBUTE Ngram24curation REAL %curation
+@ATTRIBUTE Ngram25logistic REAL %logistic
+@ATTRIBUTE Ngram26applications REAL %applications
+@ATTRIBUTE Ngram27articletitle REAL %articletitle
+@ATTRIBUTE class {positive, negative}
+@DATA
+
+25754864,0,0,0,2,0,0,2,5,3,1,1,2,2,0,1,0,6,1,0,0,2,2,0,4,2,0,2,0,negative
+25551575,2,4,3,0,2,2,0,0,0,1,2,0,0,4,1,2,0,1,0,2,0,2,3,0,1,2,0,0,negative
+
diff --git a/build.xml b/build.xml
new file mode 100644
index 0000000..efbea36
--- /dev/null
+++ b/build.xml
@@ -0,0 +1,153 @@
+<!-- 153 lines of Ant build XML: the markup was stripped when this diff was captured, leaving only blank lines; no content is recoverable -->
\ No newline at end of file
diff --git a/corpus/mycoSORTSampleTriagecorpus_test.xml b/corpus/mycoSORTSampleTriagecorpus_test.xml
new file mode 100644
index 0000000..bdd6cf2
--- /dev/null
+++ b/corpus/mycoSORTSampleTriagecorpus_test.xml
@@ -0,0 +1,468 @@
+
+
+
+ 25754864
+
+ 2015
+ 03
+ 10
+
+
+ 2015
+ 03
+ 18
+
+ + 1758-0463 + + 2015 + + 2015 + + + Database : the journal of biological databases and curation + Database (Oxford) + + mycoCLAP, the database for characterized lignocellulose-active proteins of fungal origin: resource and text mining curation support. + + + + 10.1093/database/bav008 + bav008 + + Enzymes active on components of lignocellulosic biomass are used for industrial applications ranging from food processing to biofuels production. These include a diverse array of glycoside hydrolases, carbohydrate esterases, polysaccharide lyases and oxidoreductases. Fungi are prolific producers of these enzymes, spurring fungal genome sequencing efforts to identify and catalogue the genes that encode them. To facilitate the functional annotation of these genes, biochemical data on over 800 fungal lignocellulose-degrading enzymes have been collected from the literature and organized into the searchable database, mycoCLAP (http://mycoclap.fungalgenomics.ca). First implemented in 2011, and updated as described here, mycoCLAP is capable of ranking search results according to closest biochemically characterized homologues: this improves the quality of the annotation, and significantly decreases the time required to annotate novel sequences. The database is freely available to the scientific community, as are the open source applications based on natural language processing developed to support the manual curation of mycoCLAP. Database URL: http://mycoclap.fungalgenomics.ca. + © The Author(s) 2015. Published by Oxford University Press. + + + + Strasser + Kimchi + K + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + McDonnell + Erin + E + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Nyaga + Carol + C + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Wu + Min + M + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Wu + Sherry + S + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Almeida + Hayda + H + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Meurs + Marie-Jean + MJ + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Kosseim + Leila + L + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. 
+ + + + Powlowski + Justin + J + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Butler + Greg + G + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA gregb@encs.concordia.ca. + + + + Tsang + Adrian + A + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + eng + + Journal Article + Research Support, Non-U.S. Gov't + + + 2015 + 03 + 08 + +
+ + England + Database (Oxford) + 101517697 + 1758-0463 + + IM + + + Nat Biotechnol. 2004 Jun;22(6):695-700 + 15122302 + + + Appl Environ Microbiol. 2013 Aug;79(15):4620-34 + 23709508 + + + J Mol Biol. 1990 Oct 5;215(3):403-10 + 2231712 + + + Nature. 2008 Sep 4;455(7209):47-50 + 18769432 + + + Nucleic Acids Res. 2009 Jan;37(Database issue):D233-8 + 18838391 + + + Nucleic Acids Res. 2009 Jan;37(Database issue):D588-92 + 18984617 + + + Database (Oxford). 2011;2011:bar020 + 21622642 + + + Genome Res. 2011 Jun;21(6):885-97 + 21543515 + + + Nat Methods. 2011;8(10):785-6 + 21959131 + + + Nat Biotechnol. 2011 Oct;29(10):922-7 + 21964414 + + + Enzyme Microb Technol. 2011 Apr 7;48(4-5):397-403 + 22112956 + + + BMC Med Inform Decis Mak. 2012;12 Suppl 1:S5 + 22595090 + + + Science. 2012 Jun 29;336(6089):1715-9 + 22745431 + + + Nucleic Acids Res. 2013 Jan;41(Database issue):D43-7 + 23161681 + + + Nucleic Acids Res. 2013 Jan;41(Database issue):D36-42 + 23193287 + + + Nat Genet. 2000 May;25(1):25-9 + 10802651 + + + PLoS One. 2014;9(12):e115892 + 25551575 + + + PMC4352688 +
+ + + + 2015 + + + + + 2015 + 3 + 11 + 6 + 0 + + + 2015 + 3 + 11 + 6 + 0 + + + 2015 + 3 + 11 + 6 + 0 + + + epublish + + bav008 + 10.1093/database/bav008 + 25754864 + PMC4352688 + + +
+ + + + 25551575 + + 2015 + 01 + 01 + + + 2015 + 01 + 13 + +
+ + 1932-6203 + + 9 + 12 + + 2014 + + + PloS one + PLoS ONE + + Machine learning for biomedical literature triage. + + e115892 + + 10.1371/journal.pone.0115892 + + This paper presents a machine learning system for supporting the first task of the biological literature manual curation process, called triage. We compare the performance of various classification models, by experimenting with dataset sampling factors and a set of features, as well as three different machine learning algorithms (Naive Bayes, Support Vector Machine and Logistic Model Trees). The results show that the most fitting model to handle the imbalanced datasets of the triage classification task is obtained by using domain relevant features, an under-sampling technique, and the Logistic Model Trees algorithm. + + + + Almeida + Hayda + H + + Department of Computer Science and Software Engineering, Concordia University, Montreal, QC, Canada. + + + + Meurs + Marie-Jean + MJ + + Centre for Structural and Functional Genomics, Concordia University, Montreal, QC, Canada. + + + + Kosseim + Leila + L + + Department of Computer Science and Software Engineering, Concordia University, Montreal, QC, Canada. + + + + Butler + Greg + G + + Department of Computer Science and Software Engineering, Concordia University, Montreal, QC, Canada; Centre for Structural and Functional Genomics, Concordia University, Montreal, QC, Canada. + + + + Tsang + Adrian + A + + Centre for Structural and Functional Genomics, Concordia University, Montreal, QC, Canada. + + + + eng + + Journal Article + Research Support, Non-U.S. Gov't + + + 2014 + 12 + 31 + +
+ + United States + PLoS One + 101285081 + 1932-6203 + + IM + + + Proc AMIA Symp. 2001;:17-21 + 11825149 + + + Artif Intell Med. 2005 Sep-Oct;35(1-2):121-34 + 16024240 + + + Mol Cell. 2006 Mar 3;21(5):589-94 + 16507357 + + + Bioinformatics. 2006 Mar 15;22(6):658-64 + 16287934 + + + Artif Intell Med. 2006 May;37(1):7-18 + 16233974 + + + Nature. 2008 Sep 4;455(7209):47-50 + 18769432 + + + IEEE Trans Syst Man Cybern B Cybern. 2009 Feb;39(1):281-8 + 19068445 + + + Database (Oxford). 2011;2011:bar020 + 21622642 + + + J Integr Bioinform. 2011;8(3):176 + 21926439 + + + Database (Oxford). 2012;2012:bas020 + 22513129 + + + BMC Med Inform Decis Mak. 2012;12 Suppl 1:S5 + 22595090 + + + PLoS One. 2013;8(6):e65848 + 23785456 + + + PLoS One. 2013;8(12):e80503 + 24312478 + + + PLoS One. 2014;9(4):e91315 + 24705246 + + + PLoS One. 2014;9(7):e102039 + 25036529 + + + PMC4281078 +
+ + + + 2014 + + + + + 2014 + 9 + 4 + + + 2014 + 11 + 27 + + + 2014 + 12 + 31 + + + 2015 + 1 + 1 + 6 + 0 + + + 2015 + 1 + 1 + 6 + 0 + + + 2015 + 1 + 1 + 6 + 0 + + + epublish + + 10.1371/journal.pone.0115892 + PONE-D-14-39858 + 25551575 + PMC4281078 + + +
+ +
diff --git a/corpus/mycoSORTSampleTriagecorpus_train_50.xml b/corpus/mycoSORTSampleTriagecorpus_train_50.xml new file mode 100644 index 0000000..bdd6cf2 --- /dev/null +++ b/corpus/mycoSORTSampleTriagecorpus_train_50.xml @@ -0,0 +1,468 @@ + + + + 25754864 + + 2015 + 03 + 10 + + + 2015 + 03 + 18 + +
+ + 1758-0463 + + 2015 + + 2015 + + + Database : the journal of biological databases and curation + Database (Oxford) + + mycoCLAP, the database for characterized lignocellulose-active proteins of fungal origin: resource and text mining curation support. + + + + 10.1093/database/bav008 + bav008 + + Enzymes active on components of lignocellulosic biomass are used for industrial applications ranging from food processing to biofuels production. These include a diverse array of glycoside hydrolases, carbohydrate esterases, polysaccharide lyases and oxidoreductases. Fungi are prolific producers of these enzymes, spurring fungal genome sequencing efforts to identify and catalogue the genes that encode them. To facilitate the functional annotation of these genes, biochemical data on over 800 fungal lignocellulose-degrading enzymes have been collected from the literature and organized into the searchable database, mycoCLAP (http://mycoclap.fungalgenomics.ca). First implemented in 2011, and updated as described here, mycoCLAP is capable of ranking search results according to closest biochemically characterized homologues: this improves the quality of the annotation, and significantly decreases the time required to annotate novel sequences. The database is freely available to the scientific community, as are the open source applications based on natural language processing developed to support the manual curation of mycoCLAP. Database URL: http://mycoclap.fungalgenomics.ca. + © The Author(s) 2015. Published by Oxford University Press. + + + + Strasser + Kimchi + K + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + McDonnell + Erin + E + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Nyaga + Carol + C + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Wu + Min + M + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Wu + Sherry + S + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Almeida + Hayda + H + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Meurs + Marie-Jean + MJ + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Kosseim + Leila + L + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. 
+ + + + Powlowski + Justin + J + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + Butler + Greg + G + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA gregb@encs.concordia.ca. + + + + Tsang + Adrian + A + + Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA Centre for Structural and Functional Genomics, Department of Computer Science and Software Engineering, Department of Chemistry and Biochemistry, and Department of Biology Concordia University, Montréal, QC, USA. + + + + eng + + Journal Article + Research Support, Non-U.S. Gov't + + + 2015 + 03 + 08 + +
+ + England + Database (Oxford) + 101517697 + 1758-0463 + + IM + + + Nat Biotechnol. 2004 Jun;22(6):695-700 + 15122302 + + + Appl Environ Microbiol. 2013 Aug;79(15):4620-34 + 23709508 + + + J Mol Biol. 1990 Oct 5;215(3):403-10 + 2231712 + + + Nature. 2008 Sep 4;455(7209):47-50 + 18769432 + + + Nucleic Acids Res. 2009 Jan;37(Database issue):D233-8 + 18838391 + + + Nucleic Acids Res. 2009 Jan;37(Database issue):D588-92 + 18984617 + + + Database (Oxford). 2011;2011:bar020 + 21622642 + + + Genome Res. 2011 Jun;21(6):885-97 + 21543515 + + + Nat Methods. 2011;8(10):785-6 + 21959131 + + + Nat Biotechnol. 2011 Oct;29(10):922-7 + 21964414 + + + Enzyme Microb Technol. 2011 Apr 7;48(4-5):397-403 + 22112956 + + + BMC Med Inform Decis Mak. 2012;12 Suppl 1:S5 + 22595090 + + + Science. 2012 Jun 29;336(6089):1715-9 + 22745431 + + + Nucleic Acids Res. 2013 Jan;41(Database issue):D43-7 + 23161681 + + + Nucleic Acids Res. 2013 Jan;41(Database issue):D36-42 + 23193287 + + + Nat Genet. 2000 May;25(1):25-9 + 10802651 + + + PLoS One. 2014;9(12):e115892 + 25551575 + + + PMC4352688 +
+ + + + 2015 + + + + + 2015 + 3 + 11 + 6 + 0 + + + 2015 + 3 + 11 + 6 + 0 + + + 2015 + 3 + 11 + 6 + 0 + + + epublish + + bav008 + 10.1093/database/bav008 + 25754864 + PMC4352688 + + +
+ + + + 25551575 + + 2015 + 01 + 01 + + + 2015 + 01 + 13 + +
+ + 1932-6203 + + 9 + 12 + + 2014 + + + PloS one + PLoS ONE + + Machine learning for biomedical literature triage. + + e115892 + + 10.1371/journal.pone.0115892 + + This paper presents a machine learning system for supporting the first task of the biological literature manual curation process, called triage. We compare the performance of various classification models, by experimenting with dataset sampling factors and a set of features, as well as three different machine learning algorithms (Naive Bayes, Support Vector Machine and Logistic Model Trees). The results show that the most fitting model to handle the imbalanced datasets of the triage classification task is obtained by using domain relevant features, an under-sampling technique, and the Logistic Model Trees algorithm. + + + + Almeida + Hayda + H + + Department of Computer Science and Software Engineering, Concordia University, Montreal, QC, Canada. + + + + Meurs + Marie-Jean + MJ + + Centre for Structural and Functional Genomics, Concordia University, Montreal, QC, Canada. + + + + Kosseim + Leila + L + + Department of Computer Science and Software Engineering, Concordia University, Montreal, QC, Canada. + + + + Butler + Greg + G + + Department of Computer Science and Software Engineering, Concordia University, Montreal, QC, Canada; Centre for Structural and Functional Genomics, Concordia University, Montreal, QC, Canada. + + + + Tsang + Adrian + A + + Centre for Structural and Functional Genomics, Concordia University, Montreal, QC, Canada. + + + + eng + + Journal Article + Research Support, Non-U.S. Gov't + + + 2014 + 12 + 31 + +
+ + United States + PLoS One + 101285081 + 1932-6203 + + IM + + + Proc AMIA Symp. 2001;:17-21 + 11825149 + + + Artif Intell Med. 2005 Sep-Oct;35(1-2):121-34 + 16024240 + + + Mol Cell. 2006 Mar 3;21(5):589-94 + 16507357 + + + Bioinformatics. 2006 Mar 15;22(6):658-64 + 16287934 + + + Artif Intell Med. 2006 May;37(1):7-18 + 16233974 + + + Nature. 2008 Sep 4;455(7209):47-50 + 18769432 + + + IEEE Trans Syst Man Cybern B Cybern. 2009 Feb;39(1):281-8 + 19068445 + + + Database (Oxford). 2011;2011:bar020 + 21622642 + + + J Integr Bioinform. 2011;8(3):176 + 21926439 + + + Database (Oxford). 2012;2012:bas020 + 22513129 + + + BMC Med Inform Decis Mak. 2012;12 Suppl 1:S5 + 22595090 + + + PLoS One. 2013;8(6):e65848 + 23785456 + + + PLoS One. 2013;8(12):e80503 + 24312478 + + + PLoS One. 2014;9(4):e91315 + 24705246 + + + PLoS One. 2014;9(7):e102039 + 25036529 + + + PMC4281078 +
+ + + + 2014 + + + + + 2014 + 9 + 4 + + + 2014 + 11 + 27 + + + 2014 + 12 + 31 + + + 2015 + 1 + 1 + 6 + 0 + + + 2015 + 1 + 1 + 6 + 0 + + + 2015 + 1 + 1 + 6 + 0 + + + epublish + + 10.1371/journal.pone.0115892 + PONE-D-14-39858 + 25551575 + PMC4281078 + + +
+ +
diff --git a/features/docIDs.txt b/features/docIDs.txt new file mode 100644 index 0000000..8da8a8a --- /dev/null +++ b/features/docIDs.txt @@ -0,0 +1,1198 @@ +14565843 positive +23073100 negative +11501467 negative +20208428 positive +9074500 negative +986853 negative +8787388 positive +20826217 positive +11471729 positive +16059706 negative +11298744 positive +21168763 negative +10424099 negative +18415096 positive +15866877 positive +2396985 negative +7838157 negative +44415 negative +11170563 negative +16128806 positive +12435269 positive +19756584 positive +21382036 negative +15830675 negative +7579664 negative +9114071 negative +21635140 negative +20070371 positive +19505579 positive +14987996 positive +12882162 positive +6767680 negative +963703 negative +15278289 positive +17119968 positive +9486422 negative +8698653 positive +16278932 positive +16488199 positive +15466516 positive +21829351 negative +17302745 negative +1479358 positive +15450181 positive +9199426 negative +14976875 negative +2703464 negative +15290142 negative +21972816 negative +16431275 positive +15541296 positive +19060407 negative +18499583 negative +22260051 negative +17027758 positive +12209794 negative +1400249 positive +21466636 negative +11916668 positive +1452095 negative +9508797 negative +19277742 positive +22226198 positive +18175902 negative +20803138 negative +17341093 negative +8948426 negative +9011379 negative +14595695 negative +22906713 negative +8524797 positive +2226842 positive +19590866 positive +9485595 positive +2332056 negative +10957961 negative +9013549 positive +10923795 positive +24212486 negative +10713452 positive +13306714 negative +22031024 positive +8978090 negative +22067437 negative +19205049 positive +16926418 negative +19393602 positive +8597544 negative +3595987 negative +19690850 positive +21306947 positive +23790084 negative +8597548 positive +15686849 positive +9931476 positive +1398098 positive +18936994 negative +18045411 positive +17229143 positive +22350290 negative +8135518 positive +1588915 negative +13031603 negative +16789551 negative +23218368 positive +14766566 negative +11930943 negative +8647098 positive +18806001 negative +21360092 negative +10802187 positive +12788920 positive +17910720 negative +1368680 positive +16404950 positive +21897016 positive +9468803 negative +23071108 positive +8647081 positive +10434062 negative +8250548 positive +23226882 negative +12162562 positive +19809198 negative +9140529 negative +21575132 negative +15054207 negative +11064202 negative +18800599 positive +23589840 negative +1392588 negative +7603444 positive +18388475 negative +17381511 positive +8804409 negative +22132148 positive +8765754 negative +9140977 positive +23625219 negative +17351093 positive +22075023 negative +16462863 positive +19556747 positive +15944854 negative +15054209 positive +8856078 positive +17662982 negative +16473771 positive +8243636 negative +8810077 positive +22795531 negative +23497862 negative +3114236 positive +15449305 negative +12591897 positive +22733825 negative +19996679 positive +7763357 negative +9803534 negative +13637982 negative +19479322 negative +18555305 negative +18500632 negative +16232740 positive +1368254 positive +10583968 negative +16133102 positive +12882555 negative +16041128 positive +16761182 negative +20109094 positive +10952011 positive +11358516 positive +20518356 negative +6169675 negative +19645671 negative +9802217 positive +18587856 negative +21237221 positive +12485115 positive +23625216 negative +9872754 
positive +8503847 positive +9463945 positive +17928699 positive +1768103 negative +7614556 negative +14586108 positive +12363086 negative +12543554 positive +3117961 negative +21402188 negative +17043824 positive +7811079 positive +16343463 positive +8486628 positive +10908793 negative +8964516 negative +9648215 positive +11849507 positive +3080320 negative +12513977 negative +23508400 negative +16662849 negative +12406766 positive +23725035 negative +2337347 positive +2560409 negative +22961332 negative +2510150 positive +14716497 negative +23412069 positive +1735428 negative +21168322 negative +8987884 positive +9830097 negative +15809023 negative +15892742 negative +12908861 negative +22496740 negative +19205687 positive +9492270 positive +1429462 positive +7191044 negative +12889 negative +17111131 positive +9430631 negative +9438354 negative +17021873 positive +20562284 positive +10101286 positive +20552260 positive +24113511 negative +23170978 positive +21510637 negative +8509335 positive +9611196 negative +18388455 negative +15756814 positive +17168300 negative +23666150 negative +8987855 positive +23236275 positive +22776993 positive +22835655 positive +1644702 negative +1567377 negative +21369980 negative +17665191 positive +10606774 positive +18490176 positive +10508057 positive +11988501 negative +24077704 negative +8987622 positive +15500988 positive +12909730 negative +9813313 negative +10672446 negative +18460787 negative +16822232 positive +5861997 negative +2629784 negative +11136466 negative +8640604 positive +12147340 positive +12555575 negative +24274505 negative +10939261 negative +22425351 negative +1490609 negative +12172603 negative +15998406 negative +12073090 positive +16349883 negative +11178973 negative +1368193 positive +12455695 positive +9330667 negative +2331322 positive +17376674 positive +22624316 negative +1878999 positive +8960907 negative +19507068 positive +7859305 positive +8299175 positive +12892894 positive +8575021 positive +2146364 negative +8575023 positive +2126511 negative +16524914 negative +9732526 positive +11523809 positive +19761044 negative +8724139 negative +8589415 positive +15291818 positive +16361785 negative +23124346 positive +19500674 positive +988467 negative +22432613 negative +18068392 negative +12945177 negative +20652740 positive +8276068 negative +10091328 positive +7640003 positive +21971070 negative +18414798 negative +12743761 negative +7984103 positive +10215597 positive +16637705 negative +8589407 positive +8945534 positive +8589408 negative +12325291 negative +19527927 positive +17838811 negative +16233798 positive +18668421 negative +20129093 positive +15294290 positive +23240568 positive +20823521 negative +16244441 positive +7896713 positive +15870328 positive +22579450 negative +16614858 positive +16039872 positive +19288093 positive +18694928 negative +19189377 negative +15651 negative +19575195 positive +7488173 positive +12409103 positive +6358191 negative +16284933 negative +7574556 negative +9830143 negative +18845181 negative +15362290 negative +16233531 positive +17955189 positive +19507018 positive +16233536 positive +23298573 negative +18704748 positive +24186432 negative +12715256 negative +18550352 positive +16380244 negative +15090228 negative +18944813 positive +8598062 positive +14323029 negative +16232670 positive +19054103 positive +16232432 positive +6406022 negative +826291 negative +11829749 positive +19513709 positive +7906649 positive +9058977 negative +18023045 positive +21181156 positive 
+21243443 positive +8381338 negative +15136043 negative negative +18378599 negative +14685768 positive +22407682 negative +2707445 positive +18083533 positive +22132219 negative +12226497 negative +14763977 negative +14532063 positive +18563407 positive +19545999 negative +19967375 negative +16233515 positive +1787790 positive +22705517 negative +8975597 positive +23470758 negative +23728162 negative +9309656 negative +10779688 positive +1447290 negative +11257513 positive +11115392 positive +10049844 negative +19088319 positive +15950056 negative +12754825 negative +20569406 negative +21307589 positive +23844185 negative +6541478 negative +18408068 positive +20077114 positive +9169610 positive +16677342 negative +23508952 positive +12602898 negative +17433483 positive +10675564 positive +18443829 negative +17625262 positive +17599813 positive +32175 positive +7574590 positive +20212162 negative +8781176 positive +18923909 negative +10491168 positive +18264680 negative +23500559 negative +6863431 negative +9805384 positive +14524699 positive +8400376 positive +2135869 positive +14523125 positive +16129506 positive +8400378 negative +18943122 negative +16275128 positive +10493932 positive +23199732 positive +17651154 negative +12665550 positive +12224649 positive +7439182 negative +8959766 negative +9608522 negative +23199738 positive +19734721 positive +20143777 positive +23306879 positive +11376609 negative negative +9334183 negative +10049864 negative +16697997 positive +23489323 negative +21040747 positive +21442271 positive +24372593 negative +15746364 negative +9212440 negative +1952931 negative +11179652 negative +8955395 negative +12843664 positive +2158993 positive +23299456 positive +18512263 positive +14665735 positive +20043150 positive +21626020 positive +23836384 negative +7487028 negative +7961884 positive +22653604 positive +12619666 positive +3111887 negative +19107534 positive +32833 positive +15580593 positive +16901567 positive +7824933 positive +10499260 negative +9506837 positive +19527524 positive +11061997 negative +22373601 negative +9758774 positive +22074954 negative +19473250 positive +9758775 positive +17977149 negative +12845603 negative +2506439 negative +10553664 positive +22150279 negative +18456943 positive +15270720 negative +18675351 positive +10385327 negative +7629010 negative +1592808 positive +17505783 negative +18722542 negative +12843680 positive +8593683 positive +16980715 positive +16734792 positive +7487009 negative +23285046 positive +22524557 negative +16520923 positive +16107755 positive +19912637 positive +3936420 positive +7626800 negative +9464371 positive +19269961 positive +2152162 positive +22444635 positive +16666407 negative +3268297 negative +8900004 positive +10659715 negative +2760033 positive +1612414 positive +1368837 positive +23931690 negative +8756392 positive +20424835 negative +18548669 positive +10514255 negative +19922433 negative +16233124 positive +15174310 positive +17043085 negative +16140328 positive +8669913 negative +20429042 negative +3125847 negative +20592022 positive +24212538 negative +1425667 positive +7262712 negative +10525153 positive +21710260 positive +7926830 positive +14674022 negative +14735222 positive +18975142 negative +18935968 positive +20014432 negative +23184220 negative +18722595 positive +4779294 negative +15838031 positive +15025429 negative +9464399 negative +23129650 positive +7493964 positive +27428 negative +3561490 positive +8436950 positive +9761741 positive +21945415 negative +1367522 
positive +20382376 negative +4269377 negative +9118231 negative +7012186 negative +19756576 negative +22360347 negative +15288024 negative +10586505 positive +9805373 positive +22442229 positive +15782637 negative +8905923 positive +15246667 negative +8688436 positive +14988022 positive +18850325 negative +7549103 positive +8901566 negative +8595661 positive +9547139 negative +15668816 negative +1368843 positive +12702357 positive +16874542 positive +23326459 positive +8935788 negative +16374635 positive +1368603 positive +9153431 positive +9987124 positive +16478498 negative +10029988 positive +8595669 positive +22080345 positive +22754023 positive +22080343 negative +8065265 positive +8961569 negative +9165762 positive +20573014 positive +22940311 positive +23303647 negative +12726996 positive +24479319 negative +9450333 positive +20727822 positive +17922847 positive +16664778 negative +16134120 negative +22072708 negative +9371889 positive +8948110 negative +20619350 positive +16474906 positive +11754346 negative +2579525 negative +20734107 negative +8464071 negative +22805919 negative +22709462 negative +9841776 negative +23100915 negative +7788716 positive +7788717 positive +8654984 positive +4040855 negative +45611 negative +12233746 negative +17614952 negative +22685137 positive +18233 negative +21948841 positive +19202090 positive +16137662 negative +21726361 negative +1368777 positive +3527986 negative +9654123 positive +18307762 positive +12597025 negative +2508563 negative +8000538 positive +560223 negative +2063624 negative +1632643 negative +21193820 positive +15194814 positive +11856 negative +9301101 negative +24020787 negative +8190078 positive +19835139 negative +22712405 negative +18377882 positive +9000377 positive +21622 negative +21364303 negative +16186619 negative +7987261 positive +8997712 negative +22940347 negative +16523351 negative +24085297 negative +1654681 negative negative +8358833 positive +8358835 positive +19725536 positive +23897210 negative +15757176 positive +2187435 positive +8616259 negative +9370370 positive +8358830 positive +17503147 positive +2509432 positive +4281647 negative +1781689 negative +22349190 positive +10508113 positive +24035805 negative +21748379 positive +1815765 negative +20541633 negative +11494757 negative +12668107 negative +10376824 positive +21490699 positive +20851958 positive +15519295 positive +24128930 negative +10773459 negative +23268348 positive +11768539 positive +16233094 positive +8709949 negative +18595320 negative +10725538 positive +11193399 positive +17646981 positive +11217409 negative +15280013 positive +15006424 positive +22309761 positive +24316358 negative +16896601 positive +24528642 negative +17115208 positive +22584433 positive +22437835 positive +23094334 negative +10517025 positive +2076554 positive +8768520 positive +23619241 negative +20623432 negative +10347026 positive +11722900 positive +15921894 negative +23190610 positive +17345128 positive +15130150 negative +8514419 negative +9990729 positive +8941946 negative +22796724 negative +19669931 positive +22160328 positive +1889394 negative +12427996 positive +9324248 positive +16233072 positive +23508399 negative +9797312 positive +9694679 negative +9128738 positive +3907189 negative +6184962 negative +3128741 negative +8431310 negative +8433972 positive +23537284 positive +22846889 positive +9929401 negative +19784554 positive +21848609 negative +18668373 negative +19239548 positive +23990297 negative +8837440 positive +8020743 negative +19736001 
positive +23356577 positive +16283301 negative +2688929 positive +23858710 negative +16844780 positive +234905 negative +16121227 negative +23959893 negative +16333341 negative +11357511 positive +23261999 positive +18040681 negative +8390128 negative +20579868 negative +1172175 negative +106849 negative +20102533 negative +1511691 positive +12723619 positive +16461639 positive +11722552 positive +17905460 negative +7670182 positive +20406672 positive +17623028 positive +1748872 negative +9682473 positive +15564668 positive +16272431 negative +23263965 positive +20735824 negative +18938241 negative +2070799 positive +7750151 positive +6791629 negative +10395989 negative +10052139 negative +11925050 negative +11376040 negative +10052135 positive +15135402 positive +10933800 negative negative +12670686 negative +9019140 negative +7763458 negative +7764306 positive +14642815 positive +21532326 negative +14648113 positive +7586029 positive +23615741 positive +10974100 negative +19174189 positive +8670100 positive +8947054 negative +10588045 positive +12356463 positive +20512738 negative +1406248 positive +23844364 negative +12450128 positive +8057846 positive +19934038 negative +2834092 negative +5661593 negative +1814275 positive +20039188 negative +18553693 negative +18524918 negative +19777823 positive +16232837 positive +8297343 positive +20680265 positive +18483792 negative +22860913 negative +9535817 positive +8390581 negative +1369024 positive +7646037 positive +10092840 positive +18490069 negative +18020405 positive +22033931 positive +7708682 positive +8590631 positive +15262228 positive +6787335 negative +11157256 positive +9797333 negative +16556727 negative +12649442 positive +12501406 negative +22689149 positive +24237246 negative +6984129 negative +3265327 negative +18347828 negative +1369269 negative +19060392 positive +8452520 positive +12657297 negative +12297320 negative +11396904 negative +24416614 negative +9167273 negative +15649508 negative +20169200 negative +19039584 positive +18720841 positive +20464942 negative +12623067 positive +15716038 positive +23330392 positive +22859955 positive +23241981 negative +9023952 negative +16349528 positive +8590658 negative +6765603 positive +6440004 positive +11911612 negative +17067546 negative +18998121 positive +9514754 positive +1761224 positive +17928959 positive +1369161 positive +11692674 negative +7764056 positive +7766233 negative +15502357 positive +12137954 positive +12922166 positive +15342117 negative +20645085 positive +16887562 negative +12018245 negative +16650812 positive +6420649 negative +9756616 positive +7574642 positive +3012284 negative +10422230 negative +21124049 negative +22906186 positive +15691940 positive +9742698 positive +9169553 negative +7747967 negative +21708265 positive +12619703 negative +22225502 positive +18716810 negative +6049382 negative +10642523 positive +8477731 positive +15322773 positive +18725302 positive +10931904 positive +16657425 negative +19110429 negative +18346891 positive +19809200 positive +14633044 negative +11997095 positive +18944759 negative +23399248 positive +12949620 negative +10906956 negative +11255010 positive +3932329 negative +9043114 negative +10993164 positive +22365717 negative +21801352 negative +19502758 positive +9466262 negative +10386374 positive +20510474 negative +11368016 positive +8988359 negative +23204424 positive +21564548 positive +6248742 negative +24315640 negative +22684857 positive +19156406 positive +11272822 negative negative +21161225 
positive +19661691 positive +17651209 positive +8474449 negative +18620557 negative +8437 positive +23824666 negative +23525113 positive +4255900 negative +11254576 positive +9791893 positive +11281712 negative +22120123 negative +1970434 positive +13278322 negative +23252695 positive +23583262 negative +12567244 negative +23333949 negative +23180124 positive +7864815 positive +8824176 negative +17627774 negative +9008887 positive +2308855 positive +22448043 negative +7789795 positive +12767807 positive +10377251 positive +23171402 negative +16833 negative +405328 negative +1624111 negative +15607743 positive +9758835 positive +23833180 negative +11166820 negative +18551552 negative +4661766 negative +6409895 positive +17363438 positive +15917612 negative +20936239 negative +7670194 positive +1339327 positive +16653055 negative +8440481 positive +22073551 negative +8806739 negative +15998305 negative +10665422 negative +17988729 negative +17419071 positive +20879842 negative +23332834 positive +20879840 negative +1934116 positive +22112956 positive +19453169 positive +20591661 positive +1896470 negative +19000618 positive +19580870 negative +9406381 positive +17002602 positive +6771030 negative +15555940 positive +15555941 negative +1320186 positive +8836148 positive +23101390 positive +21531609 negative +3384334 positive +23624166 negative +24313660 negative +17551789 positive +21632240 negative +15555935 positive +23318568 negative +16121561 negative +8929394 positive +22203550 negative +3440521 positive +12466887 positive +8815461 negative +8837470 positive +19924304 negative +12630320 negative +17194495 positive +1606968 positive +21498763 negative +9335167 positive +3153146 negative +15116339 negative +17277884 positive +23850557 negative +8017902 negative +15784980 positive +18661293 negative +8085821 positive +20222446 positive +22048567 negative +20336338 positive +956129 negative +1934135 positive +20652693 positive +14704857 negative +9884411 positive +24329860 negative +16988781 positive +20400566 negative +20400560 negative +16701547 negative +18072936 positive +12167544 positive +12094738 negative +19336219 negative +9145525 positive +11795847 positive +573117 negative +12237858 positive +16887700 negative +16366715 negative +17884661 negative +23107704 positive +3314909 negative +15469730 positive +16202538 positive +15280646 positive +23365723 negative +18479937 positive +9546185 positive +3596237 positive +9766241 negative +3290051 negative +11553760 negative +10584016 positive +10586675 negative +11319115 positive +23898996 negative +16694 negative +12506981 positive +15749766 negative +16362326 negative +2450787 negative +15821912 positive +15808943 positive +18633609 positive +8577701 negative +20414741 negative +507620 negative +15715951 negative +8939815 positive +23869387 negative +3246351 positive +15988573 positive +7121328 negative +10636904 positive +16572843 negative +15715941 positive +18839231 negative +17487548 positive +1366983 negative +16330537 positive +9756469 positive +8529895 positive +23768357 negative +10427736 negative +16944135 positive +16614901 positive +15850449 negative +7592488 positive +23463247 positive +16668702 negative +17851776 negative +19044008 negative +16328626 positive +16739943 negative +8455560 negative +19129654 positive +15629130 positive +12374797 positive +9325167 negative +12850270 positive +15666544 negative +22353731 positive +9002269 negative +20419375 positive +1366537 negative +17964183 positive +20235799 negative +560243 
negative +3124870 negative +15316684 negative +12145937 negative +4736235 negative +2113524 positive +20850111 negative +24232491 negative +7805053 positive +4796770 negative +23292745 negative +6540443 negative +18942586 negative +10572260 positive +9003585 negative +2925681 positive +17455791 positive +16407250 negative +12664153 positive +1899374 negative +16776296 positive +2785629 negative +15819855 negative +21350668 negative +23280774 negative +9177963 negative +12489121 negative +12383257 positive +16535476 negative +11402645 negative +10830498 positive +7961928 negative +22738957 negative +11601607 negative +17876815 positive +1930835 negative +8572698 positive +23454546 negative +20675115 negative +18060506 positive +3240864 positive +2762318 negative +20807550 negative +21920035 positive +9546178 positive +9546179 positive +23956415 negative +22036533 positive +22750808 positive +23077275 positive +1643283 negative +12387863 negative +11955286 negative +3912145 negative +14558143 negative +16233469 positive +8419289 positive +2227420 positive +12400688 positive +1979298 positive +19941023 positive +10222181 positive +18327544 negative +6068063 negative +22579385 negative +6068064 negative \ No newline at end of file diff --git a/src/analyse/.gitignore b/src/analyse/.gitignore new file mode 100644 index 0000000..6b468b6 --- /dev/null +++ b/src/analyse/.gitignore @@ -0,0 +1 @@ +*.class diff --git a/src/analyse/Extractor.java b/src/analyse/Extractor.java new file mode 100644 index 0000000..dfb2d48 --- /dev/null +++ b/src/analyse/Extractor.java @@ -0,0 +1,455 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */
+
+package analyse;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+
+import configure.ConfigConstants;
+
+/**
+ * Implements common tools for the FeatureExtractor
+ * and NgramExtractor classes that are used to
+ * extract features from doc instances
+ *
+ * @author halmeida
+ *
+ */
+public class Extractor {
+
+    String id;
+    String endId;
+    String openFile;
+    String endFile;
+    String openAbst;
+    String closeAbst;
+    String abstractLabel;
+    String openEC;
+    String closeEC;
+    String classTag;
+    String openTitle;
+    String closeTitle;
+    String openJournal;
+    String closeJournal;
+    String copyR;
+    String closeCopyR;
+
+    /**
+     * Replaces special characters to clean
+     * text for tokenizing.
+     *
+     * @param str text to be cleaned
+     * @return string with cleaned text
+     */
+    public String removeSpecialChar(String str){
+        //XML entities must be removed before the single characters
+        //below, otherwise stripping "&" and ">" first would break them
+        str = str.replace("&quot;", "");
+        str = str.replace("&apos;", "");
+        str = str.replace("&gt;", "");
+        str = str.replace("}", "");
+        str = str.replace("{", "");
+        str = str.replace("]", "");
+        str = str.replace("[", "");
+        str = str.replace("\"", "");
+        str = str.replace("<", "");
+        str = str.replace(">", "");
+        str = str.replace("/", " ");
+        str = str.replace("\\", " ");
+        str = str.replace("#", "");
+        str = str.replace("*", "");
+        str = str.replace("%", "");
+        str = str.replace("&", "");
+        str = str.replace("=", "");
+        str = str.replace("?", "");
+        str = str.replace("!", "");
+        str = str.replace(";", "");
+        str = str.replace(":", "");
+//        str = str.replace(",", "");
+//        str = str.replace(".", "");
+        str = str.replace(")", "");
+        str = str.replace("(", "");
+        str = str.replace("\t\t", "\t");
+        //losing ngrams because of hyphens between names
+        str = str.replace("-", " ");
+        str = str.replace("  ", " ");
+
+        return str;
+    }
+
+    /**
+     * Handles external tags (and multiple abstract
+     * text tags) present in a single paper
+     * @param str abstract content
+     * @return string without external tags
+     */
+    public String processAbstract(String str){
+        str = str.replace("  ", " ");
+
+        //splitting the abstract into single characters so that
+        //copyright statements can be skipped in place
+        String[] remove = str.split("");
+        StringBuilder sb = new StringBuilder();
+
+        for(int i = 0; i < remove.length; i++){
+            //Handling the word "Copyright" before the end of abstract:
+            //skip everything from the word up to the next period
+            if(str.startsWith("Copyright ", i)){
+                do{
+                    i++;
+                    //stopping at the end of the text means a copyright
+                    //statement did not end with a period
+                }while(i < remove.length - 1 && !(remove[i]).equalsIgnoreCase("."));
+            }
+            else sb.append(remove[i]);
+        }
+
+        String abstrac = sb.toString();
+        abstrac = removeAbstractTags(abstrac);
+
+        return abstrac;
+    }
+
+
+    /**
+     * Removes specific tags encountered on Abstract texts.
+     * This is used to clean the abstract text before
+     * processing the feature count on the model.
+     * @param str
+     * @return
+     */
+    public String removeAbstractTags(String str){
+        //this order of removing tags matters to
+        //exclude the first tag from the abstracts.
+ + str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("copyright", ""); + str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("", ""); + + return str; + } + + + /** + * Removes the markup annotations of a + * text field, and keeps its content + * + * @param str text containing markups + * @return string with cleaned text + */ + public String removeTags(String str) { + String[] remove = str.split(""); + StringBuilder sb = new StringBuilder(); + + for(int i = 0; i < remove.length; i++){ + + //iterating over the text until finding opening tag + if(remove[i].equalsIgnoreCase("<")){ + do{ + i++; + } + //skipping the content until finding closing tag + while(!(remove[i].equalsIgnoreCase(">"))); + } + else sb.append(remove[i]); + } + + return sb.toString(); + } + + + /** + * Displays the keys and values of the + * maps created. + * + * @param hash HashMap containing list, + * values, counts + */ + public void displayList(HashMap hash){ + Iterator itr = hash.keySet().iterator(); + int sum = 0; + while(itr.hasNext()){ + Object str = itr.next(); + System.out.println("key: "+str+"\t value: "+hash.get(str)); + } + } + + + /** + * Exports hashmap of values extracted + * from dataset to external file + * + * @param location folder, file name and file extension + * @param list values to be exported + */ + public void exportFile(String location, HashMap list){ + + String SEPARATOR = "\t"; + StringBuffer line = new StringBuffer(); + Iterator itr = list.keySet().iterator(); + + try{ + BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(location), "UTF-8")); + + while(itr.hasNext()){ + Object str = itr.next(); + if(str != null){ + line.append(str).append(SEPARATOR).append(list.get(str)); + if(line.toString().contains("=")) + line.replace(line.indexOf("="), line.indexOf("=")+1,SEPARATOR); + //handling specificities from title content extraction + if(line.toString().contains(",")) + line.replace(line.indexOf(","), line.indexOf(",")+1,SEPARATOR); + } + if(itr.hasNext()){ + line.append(System.getProperty("line.separator")); + + } + writer.write(removeSpecialChar(line.toString())); + line.replace(0, line.length(), ""); + } + writer.flush(); + writer.close(); + } + catch(UnsupportedEncodingException e){ + e.printStackTrace(); + } + catch(FileNotFoundException e){ + e.printStackTrace(); + } + catch(IOException e){ + e.printStackTrace(); + } + + + //} + } + + + /** + * Exports list of values extracted + * from dataset to a string variable + * + * @param list list of values to be exported + * @return string containing values on list + * @deprecated + */ + public String exportContent(HashMap list){ + String SEPARATOR = "\t"; + Iterator itr = list.keySet().iterator(); + StringBuffer export = new StringBuffer(); + //try{ + while(itr.hasNext()){ + String str = itr.next(); + if(str != null){ + export.append(str).append(SEPARATOR).append(list.get(str)); + + if(export.toString().contains("=")) + export.replace(export.indexOf("="), export.indexOf("=")+1,SEPARATOR); + } + + if(itr.hasNext()){ + export.append("\n"); + } + } + return removeSpecialChar(export.toString()); + } + + + /** + * Exports list of values extracted + * from dataset to external file + * + * @param location folder, file name and file extension + * @param list list of values to be exported + * + */ + public void exportList(String location, ArrayList list){ + + String SEPARATOR = "\n"; + StringBuffer line = new 
StringBuffer(); + + try{ + BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(location), "UTF-8")); + + for(int i = 0; i < list.size(); i++){ + String str = list.get(i); + if(str != null){ + line.append(str).append(SEPARATOR); + } + } + writer.write(removeSpecialChar(line.toString())); + + writer.flush(); + writer.close(); + } + catch(UnsupportedEncodingException e){ + e.printStackTrace(); + } + catch(FileNotFoundException e){ + e.printStackTrace(); + } + catch(IOException e){ + e.printStackTrace(); + } + + } + + + public void initialize(File featureDir, ConfigConstants pathVars){ + try{ + featureDir.mkdir(); + + }catch(Exception e){ + System.out.println("Error creating" + featureDir + "folder."); + System.exit(0); + } + } + + + /** + * Accessors and mutators methods + * for Extractor variables. + * @return + */ + + public String getid() { + return id; + } + public void setid(String id) { + this.id = id; + } + public String getendId() { + return endId; + } + public void setendId(String endId) { + this.endId = endId; + } + public String getOpenFile() { + return openFile; + } + public void setOpenFile(String openFile) { + this.openFile = openFile; + } + public String getendFile() { + return endFile; + } + public void setendFile(String endFile) { + this.endFile = endFile; + } + public String getopenAbst() { + return openAbst; + } + public void setopenAbst(String openAbst) { + this.openAbst = openAbst; + } + public String getcloseAbst() { + return closeAbst; + } + public void setcloseAbst(String closeAbst) { + this.closeAbst = closeAbst; + } + public String getOpenEC() { + return openEC; + } + public void setOpenEC(String openEC) { + this.openEC = openEC; + } + public String getCloseEC() { + return closeEC; + } + public void setCloseEC(String closeEC) { + this.closeEC = closeEC; + } + public String getAbstractLabel() { + return abstractLabel; + } + public void setAbstractLabel(String abstractLabel) { + this.abstractLabel = abstractLabel; + } + public String getClassTag() { + return classTag; + } + public void setClassTag(String classTag) { + this.classTag = classTag; + } + public String getOpenTitle() { + return openTitle; + } + public void setOpenTitle(String titleTag) { + this.openTitle = titleTag; + } + public String getCloseTitle() { + return closeTitle; + } + public void setCloseTitle(String closeTitle) { + this.closeTitle = closeTitle; + } + public String getOpenJournal() { + return openJournal; + } + public void setOpenJournal(String openJournal) { + this.openJournal = openJournal; + } + public String getCloseJournal() { + return closeJournal; + } + public void setCloseJournal(String closeJournal) { + this.closeJournal = closeJournal; + } + +} \ No newline at end of file diff --git a/src/analyse/FeatureExtractor.java b/src/analyse/FeatureExtractor.java new file mode 100644 index 0000000..1593c01 --- /dev/null +++ b/src/analyse/FeatureExtractor.java @@ -0,0 +1,544 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice 
and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package analyse; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import configure.ConfigConstants; +import filter.NaiveFilter; + + +/** + * This class extracts and parses domain + * annotation features from doc instances + * + * @author Hayda Almeida + * @since 2014 + * + */ + +public class FeatureExtractor extends Extractor{ + + public FeatureExtractor(){ + + this.id = "PMID"; + this.openAbst = "AbstractText"; + this.abstractLabel = "AbstractText "; + this.openEC = "RegistryNumber"; + this.classTag = "TRIAGE"; + this.openJournal = "Title"; + this.openTitle = "ArticleTitle"; + } + + + public static void main(String[] args) { + + ConfigConstants pathVars = new ConfigConstants(); + boolean verbose = false; + + String AnCorpus = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAINING_FILE; + FeatureExtractor fextrac = new FeatureExtractor(); + NaiveFilter featFilter = new NaiveFilter(); + + File featureDir = new File(pathVars.HOME_DIR + pathVars.FEATURE_DIR + "/"); + + fextrac.initialize(featureDir, pathVars); + featFilter.loadStopWords(pathVars.HOME_DIR + pathVars.STOP_LIST); + + //store all features, type and count + HashMap,Integer> abstract_count = new HashMap,Integer>(); + //store title features, type and count + HashMap, Integer> title_count = new HashMap, Integer>(); + //store title features, whole journal title content and classification + HashMap,String> title_content = new HashMap,String>(); + //store title content and EC numbers + ArrayList ec_numbers = new ArrayList(); + + //store ID, class and features + HashMap PMIDs = new HashMap(); + + + int jTitle = 0; + + try + { + //Loading file + File input = new File(AnCorpus); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); + + Elements corpus = doc.body().getElementsByTag("pubmedarticle"); + + //Fetching elements + + for(Element paper : corpus ){ + + //Fetching elements + Elements journalTitle = paper.getElementsByTag(fextrac.getOpenJournal()); + Elements title = paper.getElementsByTag(fextrac.getOpenTitle()); + Elements abstractC = paper.getElementsByTag(fextrac.getopenAbst()); + Elements ECnumber = paper.getElementsByTag(fextrac.getOpenEC()); + Elements classDoc = paper.getElementsByTag(fextrac.getClassTag()); + + String journal = ""; + String docID = ""; + String label = ""; + ArrayList tempList = new ArrayList(); + StringBuffer sb = new StringBuffer(); + + //fetching the paper ID - + //for all items in a paper, retrieve only PMIDs + for(Element e : paper.select(fextrac.getid())){ + //only consider the ID if the parent is medline citation + 
if(e.parentNode().nodeName().contains("medline")){ + docID = e.text(); + } + } + //fetch the doc label as well + if(classDoc.hasText()){ + label = classDoc.text(); + } + + PMIDs.put(docID, label); + + if(journalTitle.hasText()){ + + jTitle++; + journal = journalTitle.toString(); + journal = fextrac.removeSpecialChar(journal); + journal = fextrac.removeTags(journal); + } + + String title_annotation = ""; + if(title.hasText()){ + title_annotation = title.toString(); + // title_annotation = fextrac.removeSpecialChar(title_annotation); + + tempList.addAll(fextrac.annotations(title_annotation, title_count, featFilter, pathVars)); + fextrac.addContent(title_annotation, journal, title_content, featFilter); + } + + String abstrac = ""; + if(abstractC.hasText()){ + abstrac = abstractC.toString(); + //abstrac = fextrac.removeSpecialChar(abstrac); + //abstrac = fextrac.removeAbstractTags(abstrac); + + tempList.addAll(fextrac.annotations(abstrac, abstract_count, featFilter, pathVars)); + } + + String ecnum = ""; + if(ECnumber.hasText()){ + for(Element number : ECnumber){ + ecnum = number.toString(); + if(ecnum.contains("EC")){ + ecnum = fextrac.removeSpecialChar(ecnum); + ecnum = fextrac.removeTags(ecnum); + ec_numbers.add(ecnum); + } + } + } + + String triage = ""; + if(classDoc.hasText()){ + triage = classDoc.toString(); + triage = fextrac.removeSpecialChar(triage); + triage = fextrac.removeTags(triage); + } + + } + + } + + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + if(verbose){ + //print list of extracted features + System.out.println("\n===========TITLE==ANNOTATIONS============="); + fextrac.displayList(title_count);; + fextrac.displayList(title_content); + System.out.println("\n========ABSTRACT==ANNOTATIONS============="); + fextrac.displayList(abstract_count); + + } + + //filter features by occurence + featFilter.considerAnnotationOccurence(abstract_count, pathVars); + featFilter.considerAnnotationOccurence(title_count, pathVars); + + System.out.println("\n===========FEATURE==EXPORT==============="); + fextrac.exportFile(featureDir + "/" + pathVars.DOC_IDS, PMIDs); + System.out.println("..."+ PMIDs.size()+" document IDs listed."); + fextrac.exportList(featureDir + "/" + pathVars.ECNUM_FEATURES, ec_numbers); + System.out.println("..."+ ec_numbers.size()+" EC numbers saved."); + fextrac.exportFile(featureDir + "/" + pathVars.ANNOTATION_FEATURES, abstract_count); + System.out.println("..."+ abstract_count.size()+" unique Abstract annotations saved."); + fextrac.exportFile(featureDir + "/" + pathVars.TITLE_FEATURES, title_count); + System.out.println("..."+ title_count.size() +" unique Title annotations saved."); + fextrac.exportFile(featureDir + "/" + pathVars.JOURNAL_TITLE_FEATURES, title_content); + System.out.println("..."+jTitle+" Journal titles saved."); + System.out.println("\n=========================================\n"); + + } + + /** + * Identifies the classification on doc + * + * @param clas text containing classification (after char removal) + * @return classification of doc + */ + private String getClassif(String clas) { + + //parsing the not edited text into HTML using Jsoup + Document doc = Jsoup.parseBodyFragment(clas); + //saving the text as an Jsoup element, with a main tag (the HTML body), + //attributes and child nodes (TRIAGE tags) + Element text = doc.body(); + + Elements classification = text.getElementsByTag("TRIAGE"); + + return classification.text(); + } + + /** + * Inserts the classification + * on 
the list of features + * + * @param class information to insert on list + * @param list list of features used + */ + private void addClass(String element, HashMap, String> list){ + //going over list to insert + //classif on document instances + Iterator>it = list.keySet().iterator(); + + while(it.hasNext()){ + Map str = it.next(); + + if(list.get(str).contains(element)){ + //if(list.get(str).contains("positive") || list.get(str).contains("negative")){ + + } + else list.put(str, element); + } + } + + + /** + * Extract the annotations from a determined section + * of the document and add them to the specified lists. + * + * @param annotation cleaned and splitted line with annotation + * @param count list that holds annotation, its type and its count + * @param type list that holds annotation, its type and its classification + */ + private ArrayList annotations(String annot, HashMap, Integer> count, NaiveFilter filter, ConfigConstants pathVars) { + HashMap features = loadAnnotationEntities(); + ConfigConstants pathVar = new ConfigConstants(); + NgramExtractor nextrac = new NgramExtractor(); + ArrayList content = new ArrayList(); + + //parsing the not edited text into HTML using Jsoup + Document doc = Jsoup.parseBodyFragment(annot); + //saving the text as an Jsoup element, with a main tag (the HTML body), + //attributes and child nodes (annotation tags) + Element annotations = doc.body(); + + //iterating over list of entities + for(Map.Entry value : features.entrySet()){ + + String an_type = value.getKey(); + String an_level = value.getValue(); + + //for each entity, find the annotations on abstract + Elements annots = annotations.getElementsByTag(an_type); + + //for each annotation found, + for(Element an : annots){ + + //grabbing annotation content: + //if the annotation is made on the sentence level: + if(an_level.contains("sentence")){ + + //checking if sentence contains inner annotations + if(an.childNodeSize() != 0){ + + //going over list of inner annotations + for(Element child : an.children()){ + + //if child is sentence (sentence inside of sentence), + //then add annotations as ngrams on this + if(features.get(child.nodeName()).contains("sentence")) { + content.addAll(nextrac.nGrams(child.text(), filter, pathVar)); + insertAnnotation(content, an.nodeName(), count, pathVars); + } + //adding annotations on sentence as they are - no ngrams on this + else { + content.add(child.text()); + insertAnnotation(content, an.nodeName(), count, pathVars); + } + } + + //removing inner annotations from sentence, they are already added + Element tempAnnot = an.clone(); + tempAnnot.children().remove(); + + //splitting content in ngrams to whats left on the sentence + content.addAll(nextrac.nGrams(tempAnnot.text(), filter, pathVar)); + insertAnnotation(content, an.nodeName(), count, pathVars); + } + + } + else { + //keeping original annotation content for other cases + content.add(an.text()); + insertAnnotation(content, an.nodeName(), count, pathVars); + } + } + + } + return content; + + } + + + /** + * Insert annotation (or ngram list of annotation) + * on lists, used on @annotations method + * @param content content of annotation + * @param an_type type extracted from text (entity) + * @param count list of annotations and their count + */ + private void insertAnnotation(ArrayList content, String an_type, HashMap, Integer> count, ConfigConstants pathVars){ + + //iterating over list of annotations + for(int i = 0; i < content.size(); i++){ + + String current_content = content.get(i); + current_content = 
removeSpecialChar(current_content); + + if(current_content.length() >= Integer.parseInt(pathVars.FEATURE_MIN_LENGTH)){ + + //creating the list key as: content - type mapping + Map an_content = new HashMap(); + an_content.put(current_content, an_type); + + //for each annotation (or ngram on annotation) + //insert content and related type + if(count.containsKey(an_content)){ + try{ + int cnt = count.get(an_content); + count.put(an_content, cnt+1); + + }catch(Exception e){ + count.put(an_content, 1); + } + } + else{ + count.put(an_content, 1); + } + } + } + + content.clear(); + + } + + + /** + * Inserts the text (e.g.title) content into + * a list of features (e.g.title features) + * + * @param annot text with the annotations to be handled + * @param wContent whole field to be added on the list of features + * @param list features used + * + */ + private void addContent(String annot, String wContent, HashMap,String> list, NaiveFilter filter) { + + HashMap features = loadAnnotationEntities(); + ArrayList content = new ArrayList(); + NgramExtractor nextrac = new NgramExtractor(); + ConfigConstants pathVar = new ConfigConstants(); + + //parsing not edited text into HTML using Jsoup + Document doc = Jsoup.parseBodyFragment(annot); + //saving the text as an Jsoup element, with a main tag (the HTML body), + //attributes and child nodes (annotation tags) + Element annotations = doc.body(); + + //iterating over annotation types + for(Map.Entry value : features.entrySet()){ + + String an_type = value.getKey(); + String an_level = value.getValue(); + + //for each annotation type, find all related annotations on the abstract + Elements annots = annotations.getElementsByTag(an_type); + + //for each annotation type, + for(Element an : annots){ + + //grab annotation content + if(an_level.contains("sentence")) + //splitting in ngrams for sentence level annotations + content = nextrac.nGrams(an.text(), filter, pathVar); + else + //keeping original annotation for other cases + content.add(an.text()); + + //iterating over list of annotations + for(int i = 0; i < content.size(); i++){ + + String current_content = content.get(i); + current_content = removeSpecialChar(current_content); + + Map an_content = new HashMap(); + an_content.put(current_content, wContent); + + //populating list of feature_an_types, with: + //feature--an_type--class + list.put(an_content, ""); + } + content.clear(); + } + } + } + + + /** + * Loads list of entities from external file + * + * @param str list of entities + * @param pathVar constants from + * @return + */ + public HashMap loadAnnotationEntities(){ + + String pathEntities = "entities.txt"; + HashMap values = new HashMap(); + + try{ + BufferedReader reader = new BufferedReader(new FileReader(pathEntities)); + + String line = null; + + while((line = reader.readLine()) != null){ + + if(!line.contains("#")){ + String[] value = StringUtils.split(line, " "); + values.put(value[0].toLowerCase(), value[1].toLowerCase()); + } + } + + reader.close(); + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + //String[] entities = values.toArray(new String[values.size()]); + + return values; + } + + @Override + public void initialize(File featureDir, ConfigConstants pathVars){ + + try{ + + if(!featureDir.exists()) + featureDir.createNewFile(); + + File ecnumbers = new File(featureDir + "/" + pathVars.ECNUM_FEATURES); + ecnumbers.createNewFile(); + + File annotations = new File(featureDir + "/" + pathVars.ANNOTATION_FEATURES); + 
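// Note (added): pre-creating these (possibly empty) feature files here
// presumably guards the later FileReader-based loading in CreateVector
// against FileNotFoundException when a feature type yields no output.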
annotations.createNewFile(); + + File titleAnnotations = new File(featureDir + "/" + pathVars.TITLE_FEATURES); + titleAnnotations.createNewFile(); + + File journaltitles = new File(featureDir + "/" + pathVars.JOURNAL_TITLE_FEATURES); + journaltitles.createNewFile(); + + }catch(Exception e){ + System.out.println(e.getMessage()); + System.exit(0); + } + } + + + /** + * Handles the content of annotations; when + * there is multiple elements, they are + * concatenated after extracted + * + * @param str list of annotation elements + * @return single string with all elements + */ + public String contentToString(String[] str){ + String cont = ""; + + for(int i = 0; i < str.length; i++){ + if(cont.contentEquals("")){ + cont = cont + str[i]; + } + else cont = cont+" "+ str[i]; + + } + + return cont; + } + + + +} diff --git a/src/analyse/NgramExtractor.java b/src/analyse/NgramExtractor.java new file mode 100644 index 0000000..b11be56 --- /dev/null +++ b/src/analyse/NgramExtractor.java @@ -0,0 +1,309 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package analyse; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import configure.ConfigConstants; +import filter.NaiveFilter; + +/** + * This class extracts and parses n-grams + * from XML doc instances. 
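 *
 * Usage sketch (added for illustration; assumes the filter's stop list and
 * the NGRAM_SIZE/NGRAM_STOP settings are configured as in {@code main}):
 * <pre>{@code
 * NgramExtractor nex = new NgramExtractor();
 * // with NGRAM_SIZE = "1", returns ["fungal", "enzymes"]
 * ArrayList<String> grams = nex.nGrams("fungal enzymes", filter, pathVars);
 * }</pre>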
+ * + * @author Hayda Almeida + * @since 2014 + * + */ + +public class NgramExtractor extends Extractor{ + + public NgramExtractor(){ + + //defining relevant paper text fields + this.id = "PMID"; + this.openJournal = "Title"; + this.openAbst = "AbstractText"; + this.openEC = "RegistryNumber"; + this.classTag = "TRIAGE"; + this.openTitle = "ArticleTitle"; + } + + + public static void main(String[] args) { + + ConfigConstants pathVars = new ConfigConstants(); + boolean verbose = false; + + String AnCorpus = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAINING_FILE; + NgramExtractor nextrac = new NgramExtractor(); + NaiveFilter featFilter = new NaiveFilter(); + File featureDir = new File(pathVars.HOME_DIR + pathVars.FEATURE_DIR); + + featFilter.loadStopWords(pathVars.HOME_DIR + pathVars.STOP_LIST); + + //store abstract ngrams and its count + HashMap ngram_count = new HashMap(); + //store abstract ngrams and doc ID + HashMap ngram_ID = new HashMap(); + //store title ngrams and its count + HashMap ngram_title_count = new HashMap(); + //store title ngrams, count and "relevance(TBD)" + HashMap,Integer> ngram_title = new HashMap,Integer>(); + //store ID and label of documents + HashMap PMIDs = new HashMap(); + + nextrac.initialize(featureDir, pathVars); + + try + { + + //Loading file + File input = new File(AnCorpus); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); + + Elements corpus = doc.body().getElementsByTag("pubmedarticle"); + + //Fetching elements + + for(Element paper : corpus ){ + + Elements journalTitle = paper.getElementsByTag(nextrac.getOpenJournal()); + Elements title = paper.getElementsByTag(nextrac.getOpenTitle()); + Elements abstractC = paper.getElementsByTag(nextrac.getopenAbst()); + Elements ECnumber = paper.getElementsByTag(nextrac.getOpenEC()); + Elements classDoc = paper.getElementsByTag(nextrac.getClassTag()); + + String journal = ""; + String docID = ""; + String label = ""; + int jTitle = 0; + + //fetching the paper ID - + //for all items in a paper, retrieve only PMIDs + for(Element e : paper.select(nextrac.getid())){ + //only consider the ID if the parent is medline citation + if(e.parentNode().nodeName().contains("medline")){ + docID = e.text(); + } + } + //fetch the doc label as well + if(classDoc.hasText()){ + label = classDoc.text(); + } + + PMIDs.put(docID, label); + + //Extracting the Journal Title + if(journalTitle.hasText()){ + jTitle++; + journal = journalTitle.toString(); + journal = nextrac.removeSpecialChar(journal); + journal = nextrac.removeTags(journal); + } + + String tit_content = ""; + //Extracting the Paper Title + if(title.hasText()){ + tit_content = title.toString(); + tit_content = nextrac.removeSpecialChar(tit_content); + tit_content = nextrac.removeTags(tit_content); + + ArrayList title_c = nextrac.nGrams(tit_content, featFilter, pathVars); + nextrac.addNGram(title_c, ngram_title_count, pathVars); + } + + String abstrac = ""; + //Extracting the Paper abstract + if(abstractC.hasText()){ + abstrac = abstractC.toString(); + //abstrac = nextrac.removeTags(abstrac); + abstrac = nextrac.removeSpecialChar(abstrac); + abstrac = nextrac.removeAbstractTags(abstrac); + + ArrayList abstract_c = nextrac.nGrams(abstrac, featFilter, pathVars); + nextrac.addNGram(abstract_c, ngram_count, pathVars); + } + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + + if(verbose){ + //print list of extracted n-grams + nextrac.displayList(PMIDs); + 
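// displayList (inherited from Extractor) prints one "key: <k>\t value: <v>"
// line per map entry, so this verbose block dumps every PMID with its
// triage label before the n-gram maps printed below.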
System.out.println("\n========ABSTRACT==NGRAMS============="); + nextrac.displayList(ngram_count); + nextrac.displayList(ngram_title); + System.out.println("\n===========TITLE==NGRAMS============="); + nextrac.displayList(ngram_title_count); + } + + //filter features by occurence + featFilter.considerOccurence(ngram_count, pathVars); + featFilter.considerOccurence(ngram_title_count, pathVars); + + System.out.println("\n===========NGRAMS==EXPORT===============\n"); + nextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.DOC_IDS, PMIDs); + System.out.println("..."+ PMIDs.size()+" document IDs listed."); + nextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.NGRAM_FEATURES, ngram_count); + System.out.println("..."+ ngram_count.size()+" unique Abstract ngrams saved."); + nextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.TITLE_NGRAMS, ngram_title_count); + System.out.println("... "+ ngram_title_count.size() +" unique Title ngrams saved."); + System.out.println("\n========================================\n"); + + } + + + /** + * Inserts ngrams into list of features + * with a mapping for ngram count + * @param str relation of ngrams extracted + * @param list_count mapping for ngram counts + * @param pathVars + */ + + private void addNGram(ArrayList str, HashMap list_count, ConfigConstants pathVars){ + + //iterating over ngram list + for(int i = 0; i < str.size(); i++){ + String currentNGram = str.get(i); + + //checking existence of current ngram on list mapping + if(list_count.containsKey(currentNGram)){ + //retrieve the amount of current ngrams on mapping + int count = list_count.get(currentNGram); + //insert the updated count of ngrams + list_count.put(currentNGram, count+1); + } + else { + //insert ngram on mapping list + if(currentNGram.length() >= Integer.parseInt(pathVars.FEATURE_MIN_LENGTH)){ + list_count.put(currentNGram, 1); + } + } + } + } + + /** + * Extracts n-grams from a given content field + * + * @param str text to extract ngrams + * @return list of extracted grams + */ + public ArrayList nGrams(String str, NaiveFilter filter, ConfigConstants pathVar){ + + //removing ASCII special characters + str = str.replace("/", ""); + str = str.replace("\\", ""); + str = str.replace(" ", "-"); + str = str.replaceAll("\\s+"," "); + str = str.replace(" ", "-"); + + //Tokenizing the sentence + String[] words = StringUtils.split(str,"-"); + ArrayList ngramList = new ArrayList(); + + int ngram =Integer.parseInt(pathVar.NGRAM_SIZE); + + //Stop-words removal + if(Boolean.valueOf(pathVar.NGRAM_STOP)){ + words = StringUtils.split(filter.removeStopList(words)," "); + } + + //extracting ngrams according to gram size (1, 2, 3) + for(int i=0; i < words.length - (ngram - 1); i++){ + switch(pathVar.NGRAM_SIZE){ + case "1": + ngramList.add(words[i].toLowerCase()); + break; + case "2": + ngramList.add(words[i].toLowerCase()+" "+words[i+1].toLowerCase()); + break; + case "3": + ngramList.add(words[i].toLowerCase()+" "+words[i+1].toLowerCase()+" "+words[i+2].toLowerCase()); + break; + } + } + + return ngramList; + } + + + @Override + public void initialize(File featureDir, ConfigConstants pathVars){ + try{ + featureDir.mkdir(); + File ngrams = new File(featureDir + pathVars.NGRAM_FEATURES); + ngrams.createNewFile(); + + File titlengrams = new File(featureDir + pathVars.TITLE_NGRAMS); + titlengrams.createNewFile(); + + }catch(Exception e){ + System.out.println(e.getMessage()); + System.exit(0); + } + } + + + /** + * Displays the keys and values of the + * 
maps created with n-grams and counts. + * @param hash HashMap containing n-grams + */ + @Override + public void displayList(HashMap hash){ + super.displayList(hash); + //sum = sum + hash.get(str); + System.out.println("\n=======================================\n"); + System.out.println("Number of unique n-grams: "+hash.size()); + System.out.println("\n=======================================\n"); + } + + + +} diff --git a/src/arffmatrix/.gitignore b/src/arffmatrix/.gitignore new file mode 100644 index 0000000..ec5761d --- /dev/null +++ b/src/arffmatrix/.gitignore @@ -0,0 +1,2 @@ +/buildmodel.class +/buildtest.class diff --git a/src/arffmatrix/BuildModel.class b/src/arffmatrix/BuildModel.class new file mode 100644 index 0000000..022e81f Binary files /dev/null and b/src/arffmatrix/BuildModel.class differ diff --git a/src/arffmatrix/BuildModel.java b/src/arffmatrix/BuildModel.java new file mode 100644 index 0000000..c94cbf3 --- /dev/null +++ b/src/arffmatrix/BuildModel.java @@ -0,0 +1,317 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +*** +* This class re-uses https://code.google.com/p/deft2013/source/browse/trunk/src/corpus/buildmodel.java +* The code authors: Eric Charton http://www.echarton.com twitter.com/ericcharton +* Marie-Jean Meurs http://mjmrsc.com/research/ twitter.com/mjmrsc +* +* This software is free to use, modify and redistribute under Creative Commons by-nc/3.0 License Term +* http://creativecommons.org/licenses/by-nc/3.0/ +*/ + +package arffmatrix; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.text.SimpleDateFormat; +import java.util.Date; + +import com.sun.org.apache.xerces.internal.impl.xs.identity.Selector.Matcher; + +import analyse.Extractor; +import arffvector.CreateVector; +import configure.ConfigConstants; + +/** + * This class reads the corpus instances and uses + * the CreateVector class to generate a model file (ARFF) * + * + * @author Hayda Almeida, Marie-Jean Meurs + * @since 2014 + * + */ + +public class BuildModel { + + + public static void main(String[] args) { + + //----------------------------------- + // instantiate classes of constants + // and configuration file. 
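// An illustrative sketch (added; assumed, not part of this patch) of the
// kind of key/value configuration ConfigConstants is expected to expose,
// inferred from the fields referenced below:
//
//   HOME_DIR=/path/to/project/
//   CORPUS_DIR=corpus/
//   TRAINING_FILE=training_corpus.xml   (hypothetical file name)
//   TEST_FILE=test_corpus.xml           (hypothetical file name)
//   OUTPUT_MODEL=arff/
//   EXP_TYPE=0   (0 = build the training ARFF, 1 = build the test ARFF)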
+ //----------------------------------- + + ConfigConstants pathVars = new ConfigConstants(); + + Extractor model = new Extractor(); + File outputDir = new File(pathVars.HOME_DIR + pathVars.OUTPUT_MODEL); + + model.initialize(outputDir, pathVars); + + CreateVector vectorgenerator = new CreateVector(pathVars); + String attributes = vectorgenerator.informFeatures(pathVars); + System.out.println("Features loaded ..."); + + // name output ARFF files + String timeStamp = new SimpleDateFormat("yyyyMMdd_hh:mm").format(new Date()); + String arffFileName = "triage" + pathVars.EXP_TYPE + "_"+ pathVars.PERCT_POS_TRAIN + attributes +"_"+ timeStamp + ".arff"; + + try + { + //by default + String sortarffFileName = pathVars.HOME_DIR + pathVars.OUTPUT_MODEL + arffFileName; // default + + // create file + BufferedWriter out = new BufferedWriter(new FileWriter(sortarffFileName)); + + // load ARFF header and write it + String outHeaderArff = vectorgenerator.genArffHeader(pathVars,Integer.parseInt(pathVars.EXP_TYPE)); + //System.out.println(outHeaderArff); // verbose + out.write(outHeaderArff + "\n"); + + // reader for corpus + BufferedReader reader = null; + //train corpus + if(Integer.parseInt(pathVars.EXP_TYPE) == 0) + reader = new BufferedReader(new FileReader(pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAINING_FILE)); + //test corpus + else if(Integer.parseInt(pathVars.EXP_TYPE) ==1) + reader = new BufferedReader(new FileReader(pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TEST_FILE)); + + //-------------------------------------------- + // repeat until all lines have been read + // from the file + //-------------------------------------------- + String text = null; + String content = null; + + String abstracttext = ""; + String journaltitle = ""; + String title = ""; + String ecnumber = ""; + String classtriage = ""; + int hasText = 0; + int journaltitlecount = 0; + int abstracttitlecount = 0; + int abstracttextcount = 0; + int positivecount = 0; + int negativecount = 0; + + + while ((text = reader.readLine()) != null) { + + // detect a PubMed abstract + if (text.contains("", ""); + pmid = pmid.replace("", "").trim(); + //System.out.println("PMID : " + pmid); + + // continue to read + content = reader.readLine(); + content = content.replaceAll("\t", ""); + content = content.replaceFirst("\\s+", ""); + + while ( ! 
content.contains("TRIAGE")) { + + if (content.contains("")){ + + journaltitlecount++; + System.out.println("#: " + journaltitlecount + "\t PMID : " + pmid); + + content = content.replace("<Title>", ""); + content = content.replace("", ""); + journaltitle = content; + //System.out.println("Journal title : " + content); + } + + if (content.contains("")){ + + abstracttitlecount++; + + content = content.replace("", ""); + content = content.replace("", ""); + title = content; + //System.out.println("Paper title : " + content); + } + + + if (content.contains("")){ + + abstracttextcount++; + hasText = 1; // use it to indicate if the abstract has some text or not + + content = content.replace("", ""); + + //checks if there are empty lines after AbstractText tag + //and keeps reading until finds the abstract content + while(content.isEmpty()){ + content = reader.readLine(); + } + abstracttext = abstracttext + content; + // clean + abstracttext = model.removeAbstractTags(abstracttext); + + + content = reader.readLine(); + // converting toLowerCase is not relevant in bio context + // because it introduces ambiguities (ie Gene name / Enzyme alias) + // abstracttext = abstracttext.toLowerCase(); + } + + if (content.contains("")){ + temp = temp + model.processAbstract(content); + } + else{ + do{ + temp = temp + model.processAbstract(content); + content = reader.readLine(); + }while(!(content.contains(""))); + } + + newAbs = newAbs + temp; + content = newAbs + ""; + + abstracttext = content; + abstracttext = model.removeAbstractTags(abstracttext); + + content = reader.readLine(); + + } + + if (content.contains("EC ")){ + content = content.substring(content.indexOf("EC ")); + content = content.replace("", ""); + ecnumber = content; + } + +// if (content.contains("")); +// content = content.replace("", ""); +// classtriage = content; +// if(content.contains("positive")){ +// positivecount++; +// } +// if(content.contains("negative")){ +// negativecount++; +// } +// +// //System.out.println("Triage classification : " + content); +// } + + content = reader.readLine(); + content = content.replaceAll("\t", ""); + content = content.replaceFirst("\\s+", ""); + } + + if (content.contains("")); + content = content.replace("", ""); + classtriage = content; + if(content.contains("positive")){ + positivecount++; + } + if(content.contains("negative")){ + negativecount++; + } + + //System.out.println("Triage classification : " + content); + } + + //System.out.println("Abstract : " + abstracttext.toString() + "\n\n"); + + // end of if: collect data and write ARFF + String Arffline = vectorgenerator.getArffLine(pmid, + journaltitle, + title, + abstracttext, + ecnumber, + classtriage, + Integer.parseInt(pathVars.EXP_TYPE) + ); + + Arffline = Arffline + "\n"; + // write line on disc + out.write(Arffline); + // out.write(id + " " + Arffline + "\n"); // + } + + } + + System.out.println( + "Abstracts processed: " + abstracttitlecount + + "\t with text content: " + abstracttextcount + + "\t from " + journaltitlecount + " journals" + + "\nTotal of: \n" + positivecount + " positive" + + "\t and " + negativecount + " negative documents"); + out.write("\n"); + out.close(); + + reader.close(); + + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + + } + +} + + + diff --git a/src/arffvector/.gitignore b/src/arffvector/.gitignore new file mode 100644 index 0000000..bdc0ba3 --- /dev/null +++ b/src/arffvector/.gitignore @@ -0,0 +1,7 @@ +/buildvector.class 
+/FeatureVector.class +/CreateVector.class +/CreateWeightedVector.class +/ArbitraryWeight.class +/CountsWeightedVector.class +/ArbitraryWeightedVector.class diff --git a/src/arffvector/CreateVector.java b/src/arffvector/CreateVector.java new file mode 100644 index 0000000..66aee23 --- /dev/null +++ b/src/arffvector/CreateVector.java @@ -0,0 +1,894 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +*** +* This class re-uses https://code.google.com/p/deft2013/source/browse/trunk/src/vector/buildvector.java +* The code authors: Eric Charton http://www.echarton.com twitter.com/ericcharton +* Marie-Jean Meurs http://mjmrsc.com/research/ twitter.com/mjmrsc +* +* This software is free to use, modify and redistribute under Creative Commons by-nc/3.0 License Term +* http://creativecommons.org/licenses/by-nc/3.0/ +*/ + + + + +package arffvector; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import org.apache.commons.lang3.StringUtils; +import configure.ConfigConstants; + +/** + * Uses the features extracted and the + * generated corpus to create a feature vector + * (a matrix representation of the corpus) + * + * @author Hayda Almeida, Marie-Jean Meurs + * @since 2014 + * + */ +public class CreateVector { + + ArrayList annotations = new ArrayList(); + ArrayList annotationsType = new ArrayList(); + ArrayList journalTitles = new ArrayList(); + ArrayList ecnumbers = new ArrayList(); + ArrayList titleGrams = new ArrayList(); + ArrayList titleAnnot = new ArrayList(); + ArrayList nGrams = new ArrayList(); + ArrayList docID = new ArrayList(); + + ConfigConstants pathVars = null; + + /** + * Constructor to load all features extracted + * from training files. These features will be + * used to generate the ARFF header and the + * ARFF vector lines. 
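 *
 * The feature files read below are assumed to follow the tab-separated
 * layout written by {@code Extractor.exportFile}, one feature per line,
 * e.g. (hypothetical values):
 * <pre>{@code
 * cellulase	12
 * }</pre>
 * Each line is split on the tab; for most feature types the first column
 * is kept (journal titles keep the second), and duplicates are skipped.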
+ * + * @param extVars Variables holding system paths + */ + + public CreateVector(ConfigConstants extVars) { + + pathVars = extVars; + + String pathJournalT = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.JOURNAL_TITLE_FEATURES; + try{ + String journalT = ""; + + //receiving journal title + BufferedReader reader = new BufferedReader(new FileReader(pathJournalT)); + int featcount = 0; + while (( journalT = reader.readLine()) != null) { + + if (Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)){ + + String[] features = StringUtils.split(journalT,"\n"); + + for(int i = 0; i < features.length; i++){ + + String[] featurename = StringUtils.split(features[i],"\t"); + + //checking for journal titles duplicates + if(featurename[1] != "" && !(journalTitles.contains(featurename[1]))){ + journalTitles.add(featurename[1]); + } + } + } + if ( featcount >= Integer.parseInt(pathVars.NB_PARAMS) && Integer.parseInt(pathVars.NB_PARAMS) != -1 ) { break;} + + } + reader.close(); + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + String pathAnnotations = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.ANNOTATION_FEATURES; + String pathTitleAnnot = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.TITLE_FEATURES; + + try{ + String abstAnnot = ""; + String tAnnot = ""; + + //receiving abstract annotations (features) + BufferedReader reader = new BufferedReader(new FileReader(pathAnnotations)); + BufferedReader readerT = new BufferedReader(new FileReader(pathTitleAnnot)); + + int featcount = 0; + + while (( abstAnnot = reader.readLine()) != null) { + + if (Boolean.valueOf(pathVars.USE_ANNOTATION_FEATURE)){ + String[] features = StringUtils.split(abstAnnot,"\n"); + + for(int i = 0; i < features.length; i++){ + + String[] featurename = StringUtils.split(features[i],"\t"); + + //checking for duplicate abstract annotations + if(featurename[0] != "" && !(annotations.contains(featurename[0]))){ + annotations.add(featurename[0]); + } + } + } + if ( featcount >= Integer.parseInt(pathVars.NB_PARAMS) && Integer.parseInt(pathVars.NB_PARAMS) != -1 ) { break;} + } + + + if(!(Boolean.valueOf(pathVars.USE_TITLE_FEATURE))){ + while((tAnnot = readerT.readLine()) != null){ + + String[] features = StringUtils.split(tAnnot,"\n"); + + for(int i = 0; i < features.length; i++){ + + String[] featurename = StringUtils.split(features[i],"\t"); + + //checking for duplicate annotations + if(featurename[0] != "" && !(annotations.contains(featurename[0]))){ + annotations.add(featurename[0]); + } + } + + } + + } + + reader.close(); + readerT.close(); + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + try{ + String abstAnType = ""; + + //receiving abstract annotation types + BufferedReader reader = new BufferedReader(new FileReader(pathAnnotations)); + int featcount = 0; + while (( abstAnType = reader.readLine()) != null) { + + if (Boolean.valueOf(pathVars.USE_ANNOTATION_TYPE)){ + + String[] features = StringUtils.split(abstAnType,"\n"); + + for(int i = 0; i < features.length; i++){ + + String[] featurename = StringUtils.split(features[i],"\t"); + + //checking for duplicate abstract annotation types + if(featurename[1] != "" && !(annotationsType.contains(featurename[1]))){ + annotationsType.add(featurename[1]); + } + + } + } + if ( featcount >= Integer.parseInt(pathVars.NB_PARAMS) && Integer.parseInt(pathVars.NB_PARAMS) != -1 ) { break;} + + } + reader.close(); + } + catch 
(FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + + + try{ + String titAnnot = ""; + + //receiving title annotations (features) + BufferedReader reader = new BufferedReader(new FileReader(pathTitleAnnot)); + // int featcount = 0; + while (( titAnnot = reader.readLine()) != null) { + + if(Boolean.valueOf(pathVars.USE_TITLE_FEATURE)){ + + //String titAnnot = FeatureExtractor.getTitCount(); + + String[] features = StringUtils.split(titAnnot,"\n"); + + for(int i = 0; i < features.length; i++){ + String[] featurename = StringUtils.split(features[i],"\t"); + + //checking for duplicate title annotations + if(!(titleAnnot.contains(featurename[0]))){ + titleAnnot.add(featurename[0]); + } + } + } + } + reader.close(); + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + + String pathECNumFeatures = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.ECNUM_FEATURES; + + try{ + String ECNum = ""; + + //receiving EC numbers (features) + BufferedReader reader = new BufferedReader(new FileReader(pathECNumFeatures)); + // int featcount = 0; + while ((ECNum = reader.readLine()) != null) { + + if(Boolean.valueOf(pathVars.USE_ECNUM_FEATURE)){ + + //String titAnnot = FeatureExtractor.getTitCount(); + + String[] features = StringUtils.split(ECNum,"\n"); + + for(int i = 0; i < features.length; i++){ + String[] featurename = StringUtils.split(features[i],"\t"); + + //checking for duplicate EC numbers + if(!(ecnumbers.contains(featurename[0]))){ + ecnumbers.add(featurename[0]); + } + } + } + } + reader.close(); + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + + String pathTitleGrams = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.TITLE_NGRAMS; + + + try{ + String titCont = ""; + // String grams = ""; + + //receiving title ngrams + BufferedReader reader = new BufferedReader(new FileReader(pathTitleGrams)); + + int featcount = 0; + while (( titCont = reader.readLine()) != null) { + + if(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)){ + + String[] content = StringUtils.split(titCont,"\n"); + + for(int i = 0; i < content.length; i++){ + String[] featurename = StringUtils.split(content[i],"\t"); + + //check for duplicate title ngrams + if(!(titleGrams.contains(featurename[0]))){ + titleGrams.add(featurename[0]); + } + } + } + } + + reader.close(); + + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + String pathNgrams = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.NGRAM_FEATURES; + try{ + String grams = ""; + String tgrams = ""; + + //receiving ngrams + BufferedReader reader = new BufferedReader(new FileReader(pathNgrams)); + BufferedReader readerT = new BufferedReader(new FileReader(pathTitleGrams)); + + // int featcount = 0; + while (( grams = reader.readLine()) != null) { + + if(Boolean.valueOf(pathVars.USE_NGRAM_FEATURE)){ + + String[] features = StringUtils.split(grams,"\n"); + + for(int i = 0; i < features.length; i++){ + String[] featurename = StringUtils.split(features[i],"\t"); + + //check for duplicate abstract ngrams + if(!(nGrams.contains(featurename[0]))){ + nGrams.add(featurename[0]); + } + } + } + + } + + //if not using title grams separately, + // then insert title grams with abstract grams. 
+ if (!(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS))){ + while (( tgrams = readerT.readLine()) != null) { + + String[] features = StringUtils.split(tgrams,"\n"); + + for(int i = 0; i < features.length; i++){ + String[] featurename = StringUtils.split(features[i],"\t"); + + //check for duplicate ngrams + if(!(nGrams.contains(featurename[0]))){ + nGrams.add(featurename[0]); + } + } + } + } + + reader.close(); + readerT.close(); + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + } + + /** + * Gathers the list of features, according to + * experimental configurations. The list of + * features will be written on the ARFF header. + * + * @param pathVars Variables holding system paths + * @param exp experiment type: train or test + * @return a String containing the ARFF header + */ + + public String genArffHeader(ConfigConstants pathVars, int exp){ + + StringBuilder headerArff = new StringBuilder(); + + switch(exp){ + case 0: + headerArff.append("% Weka training file - mycoCLAP triage - CSFG 2015\n\n"); + break; + case 1: + headerArff.append("% Weka test file - mycoCLAP triage - CSFG 2015\n\n"); + break; + } + + headerArff.append("@RELATION triage\n"); + + if(Boolean.valueOf(pathVars.USE_TEXT_SIZE)){ + // writing the list of text sizes + headerArff.append("@ATTRIBUTE sizeoftitle \tREAL \t\t%size of title\n"); + headerArff.append("@ATTRIBUTE sizeoftext \tREAL \t\t%size of text\n"); + } + + if(Boolean.valueOf(pathVars.USE_DOC_ID)){ + //writing the docIDs + headerArff.append("@ATTRIBUTE docID \tREAL \t\t%PMID of paper\n"); + + } + + if(Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)){ + for(int i = 0; i < journalTitles.size(); i++){ + // writing list of journal titles + String feature = journalTitles.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + namefeature = namefeature.replaceAll("<|>", ""); + String ref = "journalTitle" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + + } + } + + if (Boolean.valueOf(pathVars.USE_ANNOTATION_FEATURE)){ + // writing list of annotation features + for(int i = 0; i < annotations.size(); i++){ + + String feature = annotations.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + namefeature = namefeature.replaceAll("<|>", ""); + String ref = "annotation" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + + } + } + + if(Boolean.valueOf(pathVars.USE_ANNOTATION_TYPE)){ + // writing list of annotation entities + for(int i = 0; i < annotationsType.size(); i++){ + String feature = annotationsType.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + namefeature = namefeature.replaceAll("<|>", ""); + String ref = "annotationType" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + + } + } + + if(Boolean.valueOf(pathVars.USE_TITLE_FEATURE)){ + // write list of title features + for( int i = 0; i < titleAnnot.size(); i++){ + + String feature = titleAnnot.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + namefeature = namefeature.replaceAll("<|>", ""); + String ref = "titleAnnot" + String.valueOf(i) + namefeature; + + 
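// Illustration (added): a hypothetical title feature "protein kinase" at
// index 3 yields the attribute name "titleAnnot3protein-kinase"; the raw
// feature text is preserved in the trailing % comment of the @ATTRIBUTE line.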
headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + + } + + } + + if(Boolean.valueOf(pathVars.USE_ECNUM_FEATURE)){ + // writing list of EC numbers + for(int i = 0; i < ecnumbers.size(); i++){ + String feature = ecnumbers.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + namefeature = namefeature.replaceAll("<|>", ""); + String ref = "ECnumber" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + } + } + + if (Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)){ + // writing list of ngrams on titles + for( int i = 0; i < titleGrams.size(); i++){ + + String feature = titleGrams.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + namefeature = namefeature.replaceAll("<|>", ""); + String ref = "titleNgram" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + + } + } + + if (Boolean.valueOf(pathVars.USE_NGRAM_FEATURE)){ + // write list of ngrams + for(int i = 0; i < nGrams.size(); i++){ + + String feature = nGrams.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + String ref = "Ngram" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + + } + } + + // writing the dataset classes + headerArff.append("@ATTRIBUTE class {positive, negative}\n"); + headerArff.append("@DATA\n"); + + return headerArff.toString(); + } + + /** + * Iterates over the list of features and + * counts number of features containing + * on a given document. + * + * @param jTitle title of journal + * @param title title of paper + * @param text abstract content + * @param ecnum paper EC numbers + * @param classTriage triage classification: positive or negative + * @param exp experiment type: train or test + * @return String holding counts for all features found in a document + */ + + public String getArffLine(String paperID, String jTitle, String title, String text, String ecnum, String classTriage, int exp){ + //String vectorArff = ""; + StringBuilder vectorArff = new StringBuilder(); + + paperID = removeSpecialChar(paperID.toLowerCase()); + text = removeSpecialChar(text.toLowerCase()); + title = removeSpecialChar(title.toLowerCase()); + jTitle = removeSpecialChar(jTitle.toLowerCase()); + ecnum = removeSpecialChar(ecnum); + + int emptyabs = 0; + + // fill title and text sizes (number of words) + // annotation markups do not matter because + // they do not introduce blank spaces hence + // they do not modify the number of words found + if (Boolean.valueOf(pathVars.USE_TEXT_SIZE)){ + + String[] titleGrams = StringUtils.split(title," "); + int titlesize = titleGrams.length; + + String[] abstractcontent = StringUtils.split(text," "); + int abstractsize = abstractcontent.length; + + if(abstractsize == 1){ + emptyabs++; + } + + vectorArff.append(titlesize).append(",").append(abstractsize).append(","); + } + + //fill ID of documents + if(Boolean.valueOf(pathVars.USE_DOC_ID)){ + + if(paperID.length()>0){ + vectorArff.append(paperID).append(","); + } + else{ + vectorArff.append("0,"); + } + } + + //fill values of journal titles + if(Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)){ + + for(int i = 0; i < journalTitles.size(); i++){ + String jfeat = ""; + int jfeatcount = 0; + jfeat = journalTitles.get(i).replaceFirst(" ", ""); + 
+ if(jTitle.contains(jfeat)){ + jfeatcount = StringUtils.countMatches(jTitle, jfeat); + vectorArff.append(jfeatcount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + } + + // fill values of annotation types taken into account + // either only the abstract or abstract and title + // adds on vector the count of occurrences + if (Boolean.valueOf(pathVars.USE_ANNOTATION_FEATURE)){ + + for(int i = 0; i < annotations.size(); i++){ + String anfeat = ""; + int anfeatcount = 0; + anfeat = annotations.get(i).replaceFirst(" ", "").toLowerCase(); + + //in case the text has current annotation + if (text.contains(anfeat)){ + //check the count of the annotation + if((Boolean.valueOf(pathVars.USE_TITLE_FEATURE))){ + anfeatcount = StringUtils.countMatches(text, anfeat); + } + //adding title annot count to annotations + else if (!(Boolean.valueOf(pathVars.USE_TITLE_FEATURE))){ + anfeatcount = StringUtils.countMatches(text, anfeat); + //in case title has annotation, add to count + if(title.contains(anfeat)){ + anfeatcount = anfeatcount + StringUtils.countMatches(title, anfeat); + } + } + vectorArff.append(anfeatcount).append(","); + } + //handles the case that only the title (but not abstract) has current annotation + else if((!(Boolean.valueOf(pathVars.USE_TITLE_FEATURE)))){ + if(title.contains(anfeat)){ + anfeatcount = StringUtils.countMatches(title, anfeat); + } + vectorArff.append(anfeatcount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + } + + //fill values of abstract annotation types + if(Boolean.valueOf(pathVars.USE_ANNOTATION_TYPE)){ + + for(int i = 0; i < annotationsType.size(); i++){ + String antype = ""; + int antypecount = 0; + antype = annotationsType.get(i).replaceFirst(" ", "").toLowerCase(); + + if (text.contains(antype)){ + //divided by 2 to match occurance + //(count considers open and close tags) + antypecount = (StringUtils.countMatches(text, antype))/2; + vectorArff.append(antypecount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + + } + + //fill values of title annotations + if (Boolean.valueOf(pathVars.USE_TITLE_FEATURE)){ + + for( int i =0; i < titleAnnot.size(); i++){ + String titfeat = ""; + int titfeatcount = 0; + titfeat = titleAnnot.get(i).replaceFirst(" ", "").toLowerCase(); + + if (title.contains(titfeat)){ + titfeatcount = StringUtils.countMatches(title, titfeat); + vectorArff.append(titfeatcount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + } + + if(Boolean.valueOf(pathVars.USE_ECNUM_FEATURE)){ + + for(int i = 0; i < ecnumbers.size(); i++){ + String ecfeat = ""; + int ecnumcount = 0; + ecfeat = ecnumbers.get(i); + + if(ecnum.contains(ecfeat)){ + ecnumcount = StringUtils.countMatches(ecnum, ecfeat); + vectorArff.append(ecnumcount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + } + + // fill only values of title ngrams + if(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)){ + + String cleanTitle = removeTags(title.toLowerCase()); + + for( int i =0; i < titleGrams.size(); i++){ + String titgram = ""; + int titgramcount = 0; + titgram = titleGrams.get(i).toLowerCase(); + + //in case the title has current ngram + if (cleanTitle.contains(titgram)){ + //check the count of the ngram + titgramcount = StringUtils.countMatches(cleanTitle, titgram); + + //adding weight to current ngram count + if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)){ + titgramcount = applyWeight(titgramcount, Integer.parseInt(pathVars.WEIGHT)); + } + vectorArff.append(titgramcount).append(","); + } + else{ + vectorArff.append("0,"); + } + } 
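// Worked example (added): with USE_WEIGHTED_NGRAM=true and WEIGHT=3, a
// title ngram counted twice is written as 2 * 3 = 6 (see applyWeight below);
// with weighting disabled the raw count 2 is written instead.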
+ } + + // fill values of ngrams + if (Boolean.valueOf(pathVars.USE_NGRAM_FEATURE)){ + String cleanText = removeTags(text.toLowerCase()); + String cleanTitle = removeTags(title.toLowerCase()); + + for( int i = 0; i < nGrams.size(); i++){ + String ngramfeat = ""; + int ngramcount = 0; + ngramfeat = nGrams.get(i).toLowerCase(); + + //in case the text has current ngram + if (cleanText.contains(ngramfeat)){ + //check the count of the ngram + if(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)){ + ngramcount = StringUtils.countMatches(cleanText, ngramfeat); + + //adding weight to current ngram count + if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)){ + ngramcount = applyWeight(ngramcount, Integer.parseInt(pathVars.WEIGHT)); + } + } + //checking if title ngrams should be added to the count + else if(!(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS))){ + ngramcount = StringUtils.countMatches(cleanText, ngramfeat); + + //in case title has ngram, add to count + if(cleanTitle.contains(ngramfeat)){ + ngramcount += StringUtils.countMatches(cleanTitle, ngramfeat); + } + + //adding weight to current ngram count + if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)){ + ngramcount = applyWeight(ngramcount, Integer.parseInt(pathVars.WEIGHT)); + } + } + + vectorArff.append(ngramcount).append(","); + } + ////handles the case that only the title (but not abstract) has current ngram + else if (!(cleanText.contains(ngramfeat))){ + //in case only the title has the ngram, add to count + if(cleanTitle.contains(ngramfeat)){ + ngramcount = StringUtils.countMatches(cleanTitle, ngramfeat); + + //adding weight to ngram count + if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)){ + ngramcount = applyWeight(ngramcount, Integer.parseInt(pathVars.WEIGHT)); + } + } + vectorArff.append(ngramcount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + } + + + //if(exp == 0){ + if (classTriage.contains("positive")){ + vectorArff.append("positive"); + //vectorArff.append("?"); + } + else { + vectorArff.append("negative"); + //vectorArff.append("?"); + } + //} + + /*else if (exp == 1){ + vectorArff.append("?"); + } */ + + return vectorArff.toString(); + } + + /** + * Cleans a given String from special characters + * + * @param str String to be cleaned + * @return String without special characters + */ + + public String removeSpecialChar(String str){ + str = str.replace("}", ""); + str = str.replace("{", ""); + str = str.replace("]", ""); + str = str.replace("[", ""); + str = str.replace("#", ""); + str = str.replace("*", ""); + str = str.replace(">", ""); + str = str.replace(""", ""); + str = str.replace("&apos", ""); + str = str.replace("&", ""); + str = str.replace("%", ""); + str = str.replace("/", ""); + str = str.replace("\\", ""); + str = str.replace("&", ""); + str = str.replace("=", ""); + str = str.replace("?", ""); + str = str.replace(",", ""); + str = str.replace(":", ""); + str = str.replace(";", ""); + str = str.replace(".", ""); + str = str.replace(")", ""); + str = str.replace("(", ""); + str = str.replace("\t\t", "\t"); + str = str.replace("-", ""); + str = str.replace(" ", ""); + + return str; + } + + /** + * + * @param str + * @return + */ + public String removeTags(String str){ + String[] remove = StringUtils.split(str,""); + StringBuilder sb = new StringBuilder(); + + for(int i = 0; i < remove.length; i++){ + + if(remove[i].equalsIgnoreCase("<")){ + do{ + i++; + } + while(!(remove[i].equalsIgnoreCase(">"))); + } + else sb.append(remove[i]); + } + + return sb.toString(); + } + + public int applyWeight(int count, int 
weight){ + + if(weight > 0){ + count = count * weight; + } + return count; + } + + + public String informFeatures(ConfigConstants pathVars){ + String value = ""; + if(Boolean.valueOf(pathVars.USE_ANNOTATION_FEATURE)) + value = value + "_annotations"; + if(Boolean.valueOf(pathVars.USE_ANNOTATION_TYPE)) + value = value + "_types"; + if(Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)) + value = value + "_journal"; + if(Boolean.valueOf(pathVars.USE_TITLE_FEATURE) || Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)) + value = value + "_title"; + if(Boolean.valueOf(pathVars.USE_ECNUM_FEATURE)) + value = value + "_ecnum"; + if(Boolean.valueOf(pathVars.USE_NGRAM_FEATURE)) + value = value + "_ngrams_size"+ pathVars.NGRAM_SIZE; + if(Boolean.valueOf(pathVars.USE_NGRAM_FEATURE) && Boolean.valueOf(pathVars.NGRAM_STOP)) + value = value + "_stopwords"; + if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)) + value = value + "_weight"+ pathVars.WEIGHT; + + return value; + } + + +} diff --git a/src/classifier/.gitignore b/src/classifier/.gitignore new file mode 100644 index 0000000..b92cc15 --- /dev/null +++ b/src/classifier/.gitignore @@ -0,0 +1,3 @@ +/test.class +/train.class +/Trainer.class diff --git a/src/classifier/Trainer.java b/src/classifier/Trainer.java new file mode 100644 index 0000000..5bb317c --- /dev/null +++ b/src/classifier/Trainer.java @@ -0,0 +1,483 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + +package classifier; +import java.util.ArrayList; +import weka.classifiers.Classifier; +import weka.classifiers.Evaluation; +import weka.classifiers.bayes.NaiveBayes; +import weka.classifiers.evaluation.Prediction; +import weka.classifiers.evaluation.output.prediction.PlainText; +import weka.classifiers.functions.LibSVM; +import weka.classifiers.trees.LMT; +import weka.core.Instances; +import weka.core.Range; +import weka.core.converters.ConverterUtils.DataSource; +import weka.filters.Filter; +import weka.filters.unsupervised.attribute.Remove; +import configure.ConfigConstants; +import filter.InformedFilter; + +/** + * Trains and tests a classifier, + * executes k-fold cross validation on train data + * and outputs the classification results. 
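+ *
+ * Typical invocation (a sketch; data locations are resolved from
+ * config.cfg rather than the command line):
+ *
+ *   java classifier.Trainer -svm
+ *
+ * The single argument selects the learner: -lmt, -svm or -nb.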
+ * + * @author Hayda Almeida + * @since 2014 + * + */ + +public class Trainer { + + public static int SEED = 1; //the seed for randomizing the data + public static int FOLDS = 5; //the # of folds to generate + double[][] ranking; + String rank; + + boolean verbose = true; + + + /** + * @param args + * @throws Exception + */ + public static void main(String[] args) throws Exception { + + + String classifier = ""; + + //for(int i = 0; i < args.length; i++){ + try{ + + classifier = args[0]; + + if(classifier.length() > 1){ + if(classifier.contains("lmt")) + classifier = "lmt"; + else if(classifier.contains("svm")) + classifier = "svm"; + else classifier = "nb"; + } + + } + catch(Exception e){ + // else{ + System.out.println("A classifier must be given as argument. Use: \n" + + "-lmt -> a LMT classifier; \n" + + "-svm -> a SVM classifier; \n" + + "-nb -> a Naive Bayes classifier. "); + System.exit(0); + } + // } + + ConfigConstants pathVars = new ConfigConstants(); + Trainer evaluator = new Trainer(); + InformedFilter filter = new InformedFilter(); + Classifier cls; + + //Creating classifier + if(classifier.contains("lmt")) + cls = (Classifier) new LMT(); + else if (classifier.contains("svm")) + cls = (Classifier) new LibSVM(); + else + cls = (Classifier) new NaiveBayes(); + + //Loading train data + DataSource sourceTrain = new DataSource(pathVars.HOME_DIR + pathVars.OUTPUT_MODEL + pathVars.ARFF_TRAIN); + Instances trainData = sourceTrain.getDataSet(); + + //Flagging the class index on the training data + trainData.setClassIndex(trainData.numAttributes()-1); + System.out.println("Class index set on training data."); + + System.out.println("Training data loaded. Number of instances: " + trainData.numInstances() + "\n"); + + + //Loading test data + DataSource sourceTest = new DataSource(pathVars.HOME_DIR + pathVars.OUTPUT_MODEL + pathVars.ARFF_TEST); + Instances testData = sourceTest.getDataSet(); + + //Flagging the class index on the training data + testData.setClassIndex(trainData.numAttributes()-1); + System.out.println("Class index set on testing data."); + + System.out.println("Test data loaded. Number of instances: " + testData.numInstances() + "\n"); + + + //filter the file IDs, consider the new training set + Instances filteredTrainData = evaluator.filteredIDs(trainData); + Instances filteredTestData = evaluator.filteredIDs(testData); + + if(Boolean.valueOf(pathVars.USE_ODDS_RATIO)){ + //Calculate OddsRatio for all instances + double[] OR = evaluator.loadFeatureFilter(filteredTrainData, filter, 1, Integer.parseInt(pathVars.OR_THRESHOLD)); + + //Apply Odds Ratio filtering in instances + filteredTrainData = evaluator.applyFilter(pathVars.OR_THRESHOLD, OR, filteredTrainData); + filteredTestData = evaluator.applyFilter(pathVars.OR_THRESHOLD, OR, filteredTestData); + } + + if(Boolean.valueOf(pathVars.USE_IDF)){ + //Calculate idf for all instances + double[] idf = evaluator.loadFeatureFilter(filteredTrainData, filter, 2, Integer.parseInt(pathVars.IDF_THRESHOLD)); + + //Apply idf filtering in instances + filteredTrainData = evaluator.applyFilter(pathVars.IDF_THRESHOLD, idf, filteredTrainData); + filteredTestData = evaluator.applyFilter(pathVars.IDF_THRESHOLD, idf, filteredTestData); + } + + //Training and testing classifier + evaluator.classify(filteredTrainData, filteredTestData, cls, testData); + + } + + /** + * Loads evaluation of attributes according + * to feature selection method provided. 
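+	 * 
+	 * Example (a sketch mirroring the calls in main; the threshold
+	 * variables stand in for the parsed OR_THRESHOLD / IDF_THRESHOLD):
+	 * 
+	 *   double[] OR  = evaluator.loadFeatureFilter(filteredTrainData, filter, 1, orThreshold);
+	 *   double[] idf = evaluator.loadFeatureFilter(filteredTrainData, filter, 2, idfThreshold);
+	 * 
+	 * A returned score of 0 marks that attribute for removal in applyFilter.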
+	 * 
+	 * @param data data instances
+	 * @param filter informed filter instance
+	 * @param method identifier for selection method (1 = odds ratio, 2 = idf)
+	 * @param threshold selection method threshold
+	 * @return attribute scores under the chosen method; a score of 0 marks the attribute for removal
+	 */
+	private double[] loadFeatureFilter(Instances data, InformedFilter filter, int method, int threshold){
+
+		double[] values = new double[data.numAttributes()];
+
+		switch(method){
+
+			case 1:
+				values = filter.oddsRatio(data, threshold);
+				break;
+			case 2:
+				values = filter.idf(data, threshold);
+				break;
+		}
+
+		return values;
+	}
+
+	/**
+	 * Uses the attribute scores produced by a
+	 * selection method to remove attributes from
+	 * the dataset before the training phase.
+	 * 
+	 * @param threshold selection method threshold
+	 * @param values evaluation of attributes according to method
+	 * @param data dataset instances
+	 * @return filtered dataset instances
+	 * @throws Exception
+	 */
+	private Instances applyFilter(String threshold, double[] values, Instances data) throws Exception{
+		int numberRemoved = 0;
+
+		String indexRemove = "";
+
+		for(int i = 0; i < values.length; i++){
+			if(values[i] == 0){
+
+				int ind = i+1;
+
+				if(indexRemove.length()==0) indexRemove = ind + "";
+				else indexRemove = indexRemove + "," + ind;
+
+				numberRemoved++;
+			}
+		}
+
+		//nothing was marked for removal: hand the dataset back unchanged
+		if(numberRemoved == 0){
+			System.out.println("\n = = = = => Filter threshold did not remove any attribute.");
+			return data;
+		}
+
+		//if(verbose)
+		System.out.println("\n = = = = => Filter removed " + numberRemoved + " attributes: " + indexRemove);
+
+		Remove remove = new Remove();
+		remove.setAttributeIndices(indexRemove);
+		remove.setInvertSelection(false);
+		remove.setInputFormat(data);
+
+		Instances dataSubset = Filter.useFilter(data, remove);
+		return dataSubset;
+	}
+
+
+	/**
+	 * Removes the ID attribute (index 1)
+	 * from a given dataset
+	 * 
+	 * @param data instances
+	 * @return filtered dataset
+	 * @throws Exception
+	 */
+	private Instances filteredIDs(Instances data) throws Exception {
+		Remove remove = new Remove();
+		//setting index to be removed
+		remove.setAttributeIndices("1");
+		remove.setInvertSelection(false);
+		remove.setInputFormat(data);
+
+		Instances dataSubset = Filter.useFilter(data, remove);
+		return dataSubset;
+	}
+
+
+	/**
+	 * Trains and tests a classifier when two separate
+	 * datasets are provided.
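+	 * 
+	 * Sketch of the expected call (mirrors main; names are illustrative):
+	 * 
+	 *   Classifier cls = new NaiveBayes();
+	 *   evaluator.classify(filteredTrainData, filteredTestData, cls, testData);
+	 * 
+	 * The unfiltered test set travels along only so that predictions can be
+	 * printed next to the PMIDs that the ID filter removed.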
+	 * 
+	 * @param filteredTrain training data used to build the classifier
+	 * @param filteredTest test data used to evaluate the classifier
+	 * @param classif type of classifier applied
+	 * @param test unfiltered test data, kept so document IDs can be printed
+	 * @throws Exception
+	 */
+	public void classify(Instances filteredTrain, Instances filteredTest, Classifier classif, Instances test) throws Exception{
+
+		StringBuffer sb = new StringBuffer();
+		PlainText prediction = new PlainText();
+		Range attributesToShow = null;
+		prediction.setBuffer(sb);
+		prediction.setHeader(test);
+		prediction.setOutputDistribution(true);
+
+		classif.buildClassifier(filteredTrain);
+
+		Evaluation evaluateClassifier = new Evaluation(filteredTrain);
+		evaluateClassifier.evaluateModel(classif, filteredTest, prediction, attributesToShow, true);
+		//evaluateClassifier.evaluateModel(classif, filteredTest);
+
+		stats(evaluateClassifier, classif);
+
+		ArrayList<Prediction> output = evaluateClassifier.predictions();
+
+		if(verbose){
+			for(int i = 0; i < output.size(); i++){
+				double act = output.get(i).actual();
+				String actual;
+				if(act == 1.0) actual = "negative"; else actual = "positive";
+
+				double pred = output.get(i).predicted();
+				String predicted;
+				if(pred == 1.0) predicted = "negative"; else predicted = "positive";
+
+				String value = test.instance(i).toString(0);
+
+				System.out.println("PMID: "+ value + "\t"
+						+ "Actual: " + actual + "\t"
+						+ "Predicted: " + predicted
+						);
+			}
+		}
+	}
+
+
+	/**
+	 * Outputs classifier results.
+	 * 
+	 * @param eval Evaluation model built by a classifier
+	 * @param classif type of classifier applied
+	 * @throws Exception
+	 */
+	public void stats(Evaluation eval, Classifier classif) throws Exception{
+		System.out.println("Number of attributes: " + eval.getHeader().numAttributes());
+		System.out.println(eval.toSummaryString("\n======== RESULTS ========\n", false));
+		System.out.println(eval.toClassDetailsString("\n\n======== Detailed accuracy by class ========\n"));
+		System.out.println(eval.toMatrixString("\n\n======== Confusion Matrix ========\n"));
+	}
+
+
+	//Training and testing costSensitive classifier
+	//evaluator.classify(trainData, testData, evaluator.classifySensitive(cls));
+
+//	/**
+//	 * Trains and tests a classifier using a
+//	 * provided Cost matrix
+//	 *
+//	 * @param classif type of classifier to be trained
+//	 * @return a CostSensitiveClassifier wrapping the given classifier and cost matrix
+//	 * @throws Exception
+//	 */
+//	public CostSensitiveClassifier classifySensitive(Classifier classif) throws Exception{
+//		CostSensitiveClassifier costSensitive = new CostSensitiveClassifier();
+//		CostMatrix matrix = new CostMatrix(2);
+//		matrix.setElement(0, 1, 4);
+//		matrix.setElement(1, 0, 1);
+//		costSensitive.setClassifier(classif);
+//		costSensitive.setCostMatrix(matrix);
+//
+//		return costSensitive;
+//	}
+
+	//Executing k-fold cross validation on filtered classifiers
+	//evaluator.crossFold(trainData, PCAclassifier);
+	//evaluator.crossFold(trainData, LSAclassifier);
+
+//	/**
+//	 * Executes k-fold cross validation
+//	 * on a given dataset
+//	 * @param data training data provided
+//	 * @param classif type of classifier used
+//	 * @throws Exception
+//	 */
+//	public void crossFold(Instances data, Classifier classif) throws Exception{
+//
+//		Random random = new Random(SEED); //creating seed number generator
+//		Evaluation evaluateClassifier = new Evaluation(data);
+//
+//		System.out.println("Classifier working...\n\n");
+//		//The classifier should not be trained before cross-validation is executed,
+//		//because subsequent calls to the buildClassifier method would otherwise return the same results.
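+//		//(crossValidateModel makes a fresh copy of the classifier for each of
+//		// the FOLDS folds, and SEED fixes the shuffle so runs are reproducible.)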
+// evaluateClassifier.crossValidateModel(classif, data, FOLDS, random); +// +// stats(evaluateClassifier, classif); +// } + + + //Creating filtered classifiers + //AttributeSelectedClassifier PCAclassifier = evaluator.setPCAFilter(cls); + //AttributeSelectedClassifier LSAclassifier = evaluator.setLSAFilter(cls); + //AttributeSelectedClassifier GRclassifier = evaluator.setGRFilter(cls); + //AttributeSelectedClassifier Corrclassifier = evaluator.setCorrFilter(cls); + +// /** +// * Implements a Filtered GainRatio classifier, +// * using the ranker as a search method. +// * +// * @param classif type of classifier to be used +// * @return filtered classif with Correlation analysis +// */ +// public AttributeSelectedClassifier setGRFilter(Classifier classif){ +// AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); +// +// //Creating evaluator and search method +// GainRatioAttributeEval GR = new GainRatioAttributeEval(); +// Ranker rank = new Ranker(); +// //return the attributes with evaluation greater than 0 +// double threshold = 0.0; +// rank.setThreshold(threshold); +// +// //Setting GainRatio filtered classifier +// fClassif.setClassifier(classif); +// fClassif.setEvaluator(GR); +// fClassif.setSearch(rank); +// +// return fClassif; +// +// } +// +// /** +// * Implements a Filtered Correlation classifier, +// * using the ranker as a search method. +// * +// * @param classif type of classifier to be used +// * @return filtered classif with Correlation analysis +// */ +// public AttributeSelectedClassifier setCorrFilter(Classifier classif){ +// AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); +// +// //Creating evaluator and search method +// CorrelationAttributeEval Corr = new CorrelationAttributeEval(); +// Ranker rank = new Ranker(); +// +// //return the attributes with evaluation greater than 0 +// double threshold = 0.03; +// rank.setThreshold(threshold); +// +// //Setting GainRatio filtered classifier +// fClassif.setClassifier(classif); +// fClassif.setEvaluator(Corr); +// fClassif.setSearch(rank); +// +// return fClassif; +// +// } +// +// /** +// * Implements a Filtered PCA classifier, +// * using the ranker as a search method. 
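+//	 * 
+//	 * (All of these wrappers follow the same sketch:
+//	 *    AttributeSelectedClassifier asc = new AttributeSelectedClassifier();
+//	 *    asc.setClassifier(base);
+//	 *    asc.setEvaluator(eval);    //GainRatio, Correlation, PCA or LSA
+//	 *    asc.setSearch(new Ranker());
+//	 *  only the evaluator and the ranker threshold change per method.)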
+// * +// * @param classif type of classifier to be used +// * @return filtered classif with PCA analysis config +// */ +// public AttributeSelectedClassifier setPCAFilter(Classifier classif){ +// AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); +// +// //Creating evaluator and search method +// PrincipalComponents PCA = new PrincipalComponents(); +// PCA.setMaximumAttributeNames(-1); +// Ranker rank = new Ranker(); +// //return the attributes with evaluation greater than 0 +// rank.setThreshold(0); +// +// //Setting the PCA classifier configurations +// fClassif.setClassifier(classif); +// fClassif.setEvaluator(PCA); +// fClassif.setSearch(rank); +// +// return fClassif; +// } +// +// /** +// * Implements a Filtered LSA classifier, +// * using the ranker as a search method +// * @param classif +// * @return +// */ +// private AttributeSelectedClassifier setLSAFilter(Classifier classif) { +// AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); +// +// //Creating evaluator +// LatentSemanticAnalysis LSA = new LatentSemanticAnalysis(); +// LSA.setMaximumAttributeNames(-1); +// //value between 0 and 1 includes proportion of total latent variables +// //greater than 1 = exact # of variables to include; +// //less than or equal zero = include all; +// //default = 0.95 (proportional) +// double defaul = 0; +// LSA.setRank(defaul); +// //Creating search method +// Ranker rank = new Ranker(); +// rank.setThreshold(0); +// +// //Setting the LSA classifier configurations +// fClassif.setClassifier(classif); +// fClassif.setEvaluator(LSA); +// fClassif.setSearch(rank); +// +// return fClassif; +// } + + + +} diff --git a/src/configure/.gitignore b/src/configure/.gitignore new file mode 100644 index 0000000..26ecd44 --- /dev/null +++ b/src/configure/.gitignore @@ -0,0 +1,2 @@ +/DeprecatedVariables.class +/PathConstants.class diff --git a/src/configure/ConfigConstants.java b/src/configure/ConfigConstants.java new file mode 100644 index 0000000..eb6b602 --- /dev/null +++ b/src/configure/ConfigConstants.java @@ -0,0 +1,220 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*** +* This class re-uses https://code.google.com/p/semlinker/source/browse/trunk/src/configure/NistKBPConfiguration.java +* The code authors: Eric Charton http://www.echarton.com twitter.com/ericcharton +* Marie-Jean Meurs http://mjmrsc.com/research/ twitter.com/mjmrsc +* +* This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License +* as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. +*/ + +package configure; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.HashMap; +import java.util.logging.Level; +import java.util.logging.Logger; + +/** + * + * Variables used by the software + * + * @author Hayda Almeida, Marie-Jean Meurs + * @since 2014 + * + */ +public class ConfigConstants { + + public static String CONFIG_FILE = "config.cfg"; + + /** + * Default constructor + */ + public ConfigConstants() { + initVars(); + } + + /** + * Constructor with custom parameter file. + * @param configfile + */ +// public ConfigConstants(String configfile) { +// CONFIG_FILE = configfile; +// initVars(); +// } + + + //public static String CONFIG_FILE = "config.cfg"; + public HashMap CONFIG_MAP = new HashMap(); + + //Input files + public String HOME_DIR; + public String CORPUS_DIR; + public String DUP_DIR; + public String POS_DIR; + public String NEG_DIR; + public String TRAIN_DIR; + public String TEST_DIR; + public String FEATURE_DIR; + public String OUTPUT_MODEL; + + public String TRAINING_FILE; + public String TEST_FILE; + public String ARFF_TRAIN; + public String ARFF_TEST; + public String STOP_LIST; + + //corpus sampling + public String SAMPLE_TRAIN; + public String SAMPLE_TEST; + public String PERCT_TEST; + public String PERCT_POS_TRAIN; + public String PERCT_POS_TEST; + + //Output files + public String JOURNAL_TITLE_FEATURES; + public String ECNUM_FEATURES; + public String ANNOTATION_FEATURES; + public String TITLE_FEATURES; + public String NGRAM_FEATURES; + public String TITLE_NGRAMS; + public String DOC_IDS; + + //Feature setup + public String USE_TEXT_SIZE; + public String USE_JOURNAL_TITLE_FEATURE; + public String USE_ECNUM_FEATURE; + public String FEATURE_MIN_FREQ; + public String FEATURE_MIN_LENGTH; + + //Feature setup - Annotations + public String USE_ANNOTATION_FEATURE; + public String USE_ANNOTATION_TYPE; + public String USE_TITLE_FEATURE; + public String USE_DOC_ID; + + //Feature setup - Ngrams + public String USE_NGRAM_FEATURE; + public String USE_TITLE_NGRAMS; + public String NGRAM_STOP; + public String NGRAM_SIZE; + public String USE_WEIGHTED_NGRAM; + public String WEIGHT; + + //Feature filtering + public String USE_ODDS_RATIO; + public String OR_THRESHOLD; + public String USE_IDF; + public String IDF_THRESHOLD; + + //Task setup + public String EXP_TYPE; + public String NB_PARAMS; + + + private void initVars() { + String text = null; + + try { + BufferedReader reader = new BufferedReader(new InputStreamReader( + this.getClass().getClassLoader().getResourceAsStream(CONFIG_FILE))); + //BufferedReader reader = new BufferedReader(new InputStreamReader(in)); + while ((text = reader.readLine()) != null) { + if (! 
text.startsWith("#")) { + String label = text.split("=")[0]; + String value = text.split("=")[1]; + CONFIG_MAP.put(label, value); + } + } + reader.close(); + } catch (IOException ex) { + Logger.getLogger(ConfigConstants.class.getName()).log(Level.SEVERE, null, ex); + } + HOME_DIR = CONFIG_MAP.get("HOME_DIR"); + CORPUS_DIR = CONFIG_MAP.get("CORPUS_DIR"); + DUP_DIR = CONFIG_MAP.get("DUP_DIR"); + POS_DIR = CONFIG_MAP.get("POS_DIR"); + NEG_DIR = CONFIG_MAP.get("NEG_DIR"); + TRAIN_DIR = CONFIG_MAP.get("TRAIN_DIR"); + TEST_DIR = CONFIG_MAP.get("TEST_DIR"); + FEATURE_DIR = CONFIG_MAP.get("FEATURE_DIR"); + OUTPUT_MODEL = CONFIG_MAP.get("OUTPUT_MODEL"); + + TRAINING_FILE = CONFIG_MAP.get("TRAINING_FILE"); + TEST_FILE = CONFIG_MAP.get("TEST_FILE"); + ARFF_TRAIN = CONFIG_MAP.get("ARFF_TRAIN"); + ARFF_TEST = CONFIG_MAP.get("ARFF_TEST"); + STOP_LIST = CONFIG_MAP.get("STOP_LIST"); + + SAMPLE_TRAIN = CONFIG_MAP.get("SAMPLE_TRAIN"); + SAMPLE_TEST = CONFIG_MAP.get("SAMPLE_TEST"); + PERCT_TEST = CONFIG_MAP.get("PERCT_TEST"); + PERCT_POS_TRAIN = CONFIG_MAP.get("PERCT_POS_TRAIN"); + PERCT_POS_TEST = CONFIG_MAP.get("PERCT_POS_TEST"); + + JOURNAL_TITLE_FEATURES = CONFIG_MAP.get("JOURNAL_TITLE_FEATURES"); + ECNUM_FEATURES = CONFIG_MAP.get("ECNUM_FEATURES"); + ANNOTATION_FEATURES = CONFIG_MAP.get("ANNOTATION_FEATURES"); + TITLE_FEATURES = CONFIG_MAP.get("TITLE_FEATURES"); + NGRAM_FEATURES = CONFIG_MAP.get("NGRAM_FEATURES"); + TITLE_NGRAMS = CONFIG_MAP.get("TITLE_NGRAMS"); + DOC_IDS = CONFIG_MAP.get("DOC_IDS"); + + USE_TEXT_SIZE = CONFIG_MAP.get("USE_TEXT_SIZE"); + USE_JOURNAL_TITLE_FEATURE = CONFIG_MAP.get("USE_JOURNAL_TITLE_FEATURE"); + USE_ECNUM_FEATURE = CONFIG_MAP.get("USE_ECNUM_FEATURE"); + FEATURE_MIN_FREQ = CONFIG_MAP.get("FEATURE_MIN_FREQ"); + FEATURE_MIN_LENGTH = CONFIG_MAP.get("FEATURE_MIN_LENGTH"); + + USE_ANNOTATION_FEATURE = CONFIG_MAP.get("USE_ANNOTATION_FEATURE"); + USE_ANNOTATION_TYPE = CONFIG_MAP.get("USE_ANNOTATION_TYPE"); + USE_TITLE_FEATURE = CONFIG_MAP.get("USE_TITLE_FEATURE"); + USE_DOC_ID = CONFIG_MAP.get("USE_DOC_ID"); + + USE_NGRAM_FEATURE = CONFIG_MAP.get("USE_NGRAM_FEATURE"); + USE_TITLE_NGRAMS = CONFIG_MAP.get("USE_TITLE_NGRAMS"); + NGRAM_STOP = CONFIG_MAP.get("NGRAM_STOP"); + NGRAM_SIZE = CONFIG_MAP.get("NGRAM_SIZE"); + USE_WEIGHTED_NGRAM = CONFIG_MAP.get("USE_WEIGHTED_NGRAM"); + WEIGHT = CONFIG_MAP.get("WEIGHT"); + + USE_ODDS_RATIO = CONFIG_MAP.get("USE_ODDS_RATIO"); + OR_THRESHOLD = CONFIG_MAP.get("OR_THRESHOLD"); + USE_IDF = CONFIG_MAP.get("USE_IDF"); + IDF_THRESHOLD = CONFIG_MAP.get("IDF_THRESHOLD"); + + EXP_TYPE = CONFIG_MAP.get("EXP_TYPE"); + NB_PARAMS = CONFIG_MAP.get("NB_PARAMS"); + + } +} diff --git a/src/filter/InformedFilter.java b/src/filter/InformedFilter.java new file mode 100644 index 0000000..4b125db --- /dev/null +++ b/src/filter/InformedFilter.java @@ -0,0 +1,182 @@ +package filter; + +import weka.core.Attribute; +import weka.core.Instances; + +/** + * This class implements informed feature selection + * methods, to be used as filters after vector + * generation and pre-model building + * + * @author Hayda Almeida + * @since 2015 + * + */ +public class InformedFilter { + + private boolean verbose = true; + + /** + * Calculates oddsRatio of each feature + * in a given set of Instances + * + * @param data set of instances, read from ARFF file + * @return oddsRatio for each attribute in the matrix + */ + public double[] oddsRatio(Instances data, int threshold){ + + double[] oddsRatio = new double[data.numAttributes()]; + + + for(int i = 0; i < data.numAttributes()-1; 
i++ ){ + + double OR = 0; + + Attribute current = data.attribute(i); + double pos_docs = 0, //number of documents in class C + pos_oc = 0, //number of times term t occured in class C + pos_term_docs = 0, //number of docs in class C that have term + pos_not_docs = 0, //number of docs in class C that do not have term + neg_term_docs = 0, //number of docs not in class C with term + neg_not_docs = 0, //number of docs not in class C nor with term + neg_docs = 0; //number of documents not in class C + + for(int j = 0; j < data.size(); j++){ + + double current_value = data.instance(j).value(current); + double current_class = data.instance(j).classValue(); + + //class is positive + if(current_class < 1){ + pos_docs = pos_docs + 1; + + //the feature occurred in the document + if(current_value > 0){ + pos_oc = pos_oc + current_value; + pos_term_docs = pos_term_docs +1; + } + //the feature did not occur in positive docs + else pos_not_docs = pos_not_docs + 1; + } + //class is negative + else{ + neg_docs = neg_docs+1; + + //the feature occurred in the document + if(current_value > 0){ + neg_term_docs = neg_term_docs +1; + } + //the feature did not occur in negative docs + else neg_not_docs = neg_not_docs + 1; + } + + } + + OR = ( ( (pos_term_docs / pos_docs) / (pos_not_docs/ pos_docs) ) / + ( (neg_term_docs / neg_docs) / (neg_not_docs / neg_docs) ) ); + + // OR = (pos_term_docs / pos_not_docs) / (neg_term_docs / neg_not_docs); + + + //99% confidence: 2.575 + //95% confidence: 1.96 + double confidenceLow = Math.exp(Math.log(OR) - (1.96 * Math.sqrt((1/pos_term_docs) + (1/pos_not_docs) + (1/neg_term_docs) + (1/neg_not_docs)))); + double confidenceHigh = Math.exp(Math.log(OR) + (1.96 * Math.sqrt((1/pos_term_docs) + (1/pos_not_docs) + (1/neg_term_docs) + (1/neg_not_docs)))); + + //checking if OR value is within the confidence interval + //and if it satisfies the threshold + if( ((OR <= confidenceHigh) && (OR >= confidenceLow) + && !(OR == threshold)) + //checking if the confidence interval holds the null hypothesis (i.e., spans 1.0) + && !(confidenceLow <=1 && confidenceHigh >=1)) + oddsRatio[i] = OR; + else + oddsRatio[i] = 0; + + if(verbose){ + System.out.println("Attribute: "+ data.attribute(i).toString() +"\t\t OddsRatio: " + oddsRatio[i] + + "\tConfidenceLow: " + confidenceLow + "\tConfidenceHigh: "+ confidenceHigh); + } + } + + return oddsRatio; + } + + /** + * Calculates the inverse document frequency + * for each attribute in the dataset. 
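+	 * 
+	 * As implemented below, the score is computed over the whole collection:
+	 * 
+	 *   idf(t) = ln( (pos_docs + neg_docs) / (pos_term_docs + neg_term_docs) )
+	 * 
+	 * For instance, a term present in 10 of 100 documents scores ln(10) ~ 2.30;
+	 * attributes scoring at or below the threshold are zeroed out for removal.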
+ * + * @param data instances + * @param threshold + * @return list of idfs for each attribute + */ + public double[] idf(Instances data, int threshold){ + + double[] idf = new double[data.numAttributes()]; + + for(int i = 0; i < data.numAttributes()-1; i++ ){ + + double idf_at = 0; + double idf_at2 = 0; + + Attribute current = data.attribute(i); + double pos_docs = 0, //number of documents in class C + pos_term_docs = 0, //number of docs in class C that have term + neg_term_docs = 0, //number of docs not in class C with term + neg_docs = 0; //number of documents not in class C + + for(int j = 0; j < data.size(); j++){ + + double current_value = data.instance(j).value(current); + double current_class = data.instance(j).classValue(); + + //class is positive + if(current_class < 1){ + pos_docs = pos_docs + 1; + + //the feature occurred in the document + if(current_value > 0){ + pos_term_docs = pos_term_docs +1; + } + } + else{ + //class is negative + neg_docs = neg_docs+1; + + //the feature occurred in the document + if(current_value > 0){ + neg_term_docs = neg_term_docs +1; + } + } + } + +// double idf_pos = Math.log((pos_docs)/(pos_term_docs)); +// double idf_neg = Math.log((neg_docs)/(neg_term_docs)); + + //check if the idf in the "positive" collection + //is greater than the idf in the "negative" collection +// if (idf_pos > idf_neg) +// idf_at = idf_pos; +// +// else idf_at = 0; + + idf_at = Math.log((pos_docs + neg_docs)/(pos_term_docs + neg_term_docs)); + + if(idf_at <= threshold) + idf[i] = 0; + else + idf[i] = idf_at; + } + + if(verbose){ + for(int i = 0; i < idf.length; i++){ + if(idf[i]>0) + System.out.println("Attribute: "+ data.attribute(i).toString()+ "\t\t\t IDF: " + idf[i]); + } + } + + return idf; + } + + +} diff --git a/src/filter/NaiveFilter.java b/src/filter/NaiveFilter.java new file mode 100644 index 0000000..761787c --- /dev/null +++ b/src/filter/NaiveFilter.java @@ -0,0 +1,139 @@ +package filter; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; + +import configure.ConfigConstants; + +/** + * + * This class implements naive feature filtering methods + * to be used by the extractor processes pre-vector building + * + * @author Hayda Almeida + * @since 2015 + * + */ +public class NaiveFilter { + + + private boolean verbose = true; + private String[] stopWords; + + + /** + * Removes from feature list all features with + * frequency not statistically relevant (2 or less) + * @param list to be cleaned + */ + public void considerAnnotationOccurence(HashMap,Integer> list, ConfigConstants vars){ + //going over the list of annotations and removing the + //features with occurance lower than specified. 
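+
+		//(Iterator.remove() is used below because removing entries through the
+		// map itself while iterating would throw ConcurrentModificationException.)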
+ + Iterator> iterator = list.keySet().iterator(); + + while(iterator.hasNext()){ + Map key = iterator.next(); + int valor = list.get(key).intValue(); + + if(valor < Integer.parseInt(vars.FEATURE_MIN_FREQ)){ + iterator.remove(); + } + } + } + + /** + * Removes from feature list all features with + * frequency not statistically relevant (2 or less) + * @param list to be cleaned + */ + public void considerOccurence(HashMap list, ConfigConstants vars){ + //going over the list of annotations and removing the + //statistically not significant features - frequency less than 2 + Iterator iterator = list.values().iterator(); + + while(iterator.hasNext()){ + Integer key = iterator.next(); + + if(key < Integer.parseInt(vars.FEATURE_MIN_FREQ)){ + iterator.remove(); + } + } + } + + /** + * Load the list of PubMed stopwords + * @param path file with stopwords list + * @return list of stopwords + */ + public void loadStopWords(String path){ + + StringBuilder cleaned = new StringBuilder(); + + try{ + + BufferedReader reader = new BufferedReader(new FileReader(path)); + + String line = null; + //loading stop-words list + while((line = reader.readLine()) != null){ + this.stopWords = StringUtils.split(line,","); + System.out.println(""); + //line = reader.readLine(); + } + + reader.close(); + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + + } + + /** + * Removes stopwords from ngrams list + * + * @param str list of ngrams + * @param constants + * @return cleaned list of ngrams + */ + public String removeStopList(String[] str){ + + StringBuilder cleaned = new StringBuilder(); + + //iteraing over text to be cleaned + for(int i = 0; i < str.length; i++){ + //iterating over stop-words list + for(int j = 0; j < this.stopWords.length; j++){ + + //when stop-word is encountered, replace it + if(str[i].equalsIgnoreCase(this.stopWords[j])){ + str[i] = str[i].replace(str[i],"*"); + } + } + //retrieve the text without stop-words replacements + if(!(str[i].contentEquals("*"))){ + cleaned.append(str[i]).append(" "); + } + } + return cleaned.toString().replace(" ", " "); + } + + public String[] getStopWords() { + return stopWords; + } + + public void setStopWords(String[] stopWords) { + this.stopWords = stopWords; + } + +} diff --git a/src/preprocessing/ConcatXML.java b/src/preprocessing/ConcatXML.java new file mode 100644 index 0000000..89e255f --- /dev/null +++ b/src/preprocessing/ConcatXML.java @@ -0,0 +1,717 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + +package preprocessing; + +import java.io.BufferedOutputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.FilenameFilter; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.PrintWriter; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.Date; +import java.util.List; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import configure.PathConstants; + +/** + * Generates a corpus from raw XML doc instances, + * so that features can be extracted from it + * + * @author Hayda Almeida + * @since 2014 + * + */ +public class ConcatXML{ + + private String tag1; + private String tag2; + private String tag3; + private String tag4; + private String id; + private String corpusTag; + private String corpusTagC; + + + public ConcatXML(){ + + this.setId("PMID"); + this.setTag1("(?s)<.*?xml.*?>"); + this.setTag2("(?s)<.*?!DOCTYPE.*?>"); + this.setTag3("(?s)<.*?corpus.*?>"); + this.seTag4("(?s)<.*?/corpus.*?>"); + this.setCorpusTag(""); + this.setCorpusTag(""); + } + + + + public static void main(String[] args) throws Exception { + + PathConstants pathVars = new PathConstants(); + + String xmlDir = ""; + if(Integer.parseInt(pathVars.EXP_TYPE)== 1) + xmlDir = "test"; + else xmlDir = "train"; + + String sourceDir = "", duplicatesDir = ""; + + Boolean dc = false, df = false, cl = false, cc = false; + + for(int i = 0; i < args.length; i++){ + try{ + if(args[i].matches("-dc")) dc = true; + if(args[i].matches("-df")) df = true; + if(args[i].matches("-cl")) cl = true; + if(args[i].matches("-cc")) cc = true; + } + catch(Exception e){ + System.out.println("Use: \n" + + "-tr -> train, -ts -> test; \n " + + "-dc -> check duplicates in corpus vs. 
folder; \n " + + "-df -> check duplicates in two folders; \n" + + "-cl -> clean a source folder; \n" + + "-cc -> concatenate files in a folder "); + System.exit(0); + }; + } + + String timeStamp = new SimpleDateFormat("yyyyMMdd_hh:mm").format(new Date()); + String trainCorpusPath = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAINING_FILE; + + sourceDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + xmlDir; + duplicatesDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.DUP_DIR; + + String concatCorpus = pathVars.HOME_DIR + pathVars.CORPUS_DIR +"triagecorpus_"+ xmlDir +"_"+timeStamp+".xml"; + String tagCorpus = concatCorpus; + + ConcatXML concat = new ConcatXML(); + + //================= Checking for duplicates =====================// + if(dc) concat.checkDupCorpus(trainCorpusPath, sourceDir); + if(df) concat.checkDupFolder(sourceDir, duplicatesDir); + + //================== Creating corpus ==========================// + if(cl){ + concat.cleanXML(sourceDir); + if(duplicatesDir.length()>1) + concat.cleanXML(duplicatesDir); + } + if(cc){ + concat.concatenateXML(sourceDir, "", concatCorpus); + concat.tagCorpus(tagCorpus); + } + } + + /** + * Returns the ID of a XML jsoup document + * @param doc a XML doc parsed by jsoup + * @return ID string + * @throws IOException + */ + public String returnID(Document doc) throws IOException{ + + String id = ""; + + Elements paper = doc.body().getElementsByTag("pubmedarticleset"); + + //fetching the paper ID - + //for all items in a paper, retrieve only PMIDs + for(Element e : paper.select(getId())){ + //only consider the ID if the parent is medline citation + if(e.parentNode().nodeName().contains("medline")){ + id = e.text(); + } + } + return id; + } + + /** + * Reads the file IDs in a folder and + * checks a second folder for duplicates. 
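+	 * 
+	 * Sketch of the intended call (paths are illustrative):
+	 * 
+	 *   concat.checkDupFolder("corpus/train", "corpus/duplicates");
+	 * 
+	 * Files in the second folder whose PMIDs already occur in the first are
+	 * renamed in place with a ".duplicated" suffix.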
+ * + * @param dirSrc source folder + * @param dirDup folder to check for duplicates + */ + + public void checkDupFolder(String dirSrc, String dirDup){ + ArrayList sourceIDs = new ArrayList(); + ArrayList duplicated = new ArrayList(); + ArrayList dupIDs = new ArrayList(); + int ids = 0; + + if(dirSrc.contentEquals(dirDup)){ + System.out.println("Source and duplicates directories are the same.\n\n========================\n"); + } + else { + + File sourceDir = new File(dirSrc); + File[] srcXMLs = sourceDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + try{ + //for each file on the source dir + for (File xml : srcXMLs){ + + try{ + + String id = ""; + //Loading file + File input = new File(xml.getPath()); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); + + //fetching the document ID + id = returnID(doc); + + if(!id.isEmpty()){ + sourceIDs.add(id); + ids++; + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(Exception e){ + throw new RuntimeException(e); + } + + System.out.println(ids + " source file IDs encountered."); + ids = 0; + + File dupDir = new File(dirDup); + + File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + try{ + //for each file on the possibly duplicated dir + for (File xml : dupXMLs){ + + try{ + String id = ""; + //Loading file + File input = new File(xml.getPath()); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); + + //fetching the document ID + id = returnID(doc); + + if(!id.isEmpty()){ + dupIDs.add(id); + String dupFileID = id; + ids++; + + for(int j = 0; j < sourceIDs.size(); j++){ + if(sourceIDs.get(j).equalsIgnoreCase(dupFileID)){ + + //add ID to duplicated list + duplicated.add(dupFileID); + + //rename the original file + Path from = xml.toPath(); //convert from File to Path + Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path + Files.move(from, to, StandardCopyOption.REPLACE_EXISTING); + } + } + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(Exception e){ + throw new RuntimeException(e); + } + + //count number of existing papers on possibly duplicated folder + //just to make sure we are gathering all IDs + System.out.println(ids + " new file IDs encountered."); + ids = 0; + //sorting the list of duplicated IDs + Collections.sort(duplicated, new Comparator(){ + @Override + public int compare(String one, String two){ + return one.compareTo(two); + } + }); + + System.out.println("\nReaded source files: " + sourceIDs.size()); + System.out.println("Readed new files: " + dupIDs.size()); + + System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n"); + + System.out.println("\nDuplicated files IDs: "); + for(int i = 0; i < duplicated.size(); i++){ + System.out.println(duplicated.get(i)); + } + + System.out.println("\n========================\n"); + } + + + } + + /** + * Reads the corpus and checks the papers IDs + * to identify duplicates in case new papers + * are being concatenated to corpus. 
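+	 * 
+	 * Sketch (paths illustrative): checkDupCorpus("corpus/triagecorpus_train.xml",
+	 * "corpus/train") renames any new file whose PMID already appears in the
+	 * corpus, so only unseen papers survive to the concatenation step.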
+ * + * @param corpus path to current corpora to check + * @param dir path to folder with new files to be concatenated + */ + + public void checkDupCorpus(String corpus, String dir){ + ArrayList trainingIDs = new ArrayList(); + ArrayList duplicated = new ArrayList(); + ArrayList newFiles = new ArrayList(); + + int ids = 0; + + try + { + File input = new File(corpus); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); + Elements corp = doc.body().getElementsByTag("pubmedarticleset"); + + String id = ""; + + for(Element paper : corp){ + Document thisDoc = Jsoup.parseBodyFragment(paper.toString()); + + //fetching the document ID + id = returnID(thisDoc); + + if(!id.isEmpty()){ + trainingIDs.add(id); + ids++; + } + } + }catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + + System.out.println(ids + " training file IDs encountered."); + ids = 0; + + File corpusDir = new File(dir); + File[] newXMLs = corpusDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + try{ + //for each file on the corpus dir + for (File xml : newXMLs){ + + try{ + String id = ""; + //Loading file + File input = new File(xml.getPath()); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); + + //fetching the document ID + id = returnID(doc); + + if(!id.isEmpty()){ + + newFiles.add(id); + String newFileID = id; + ids++; + + + for(int j = 0; j < trainingIDs.size(); j++){ + if(trainingIDs.get(j).equalsIgnoreCase(newFileID)){ + + //add ID to duplicated list + duplicated.add(newFileID); + + //moving the original file + Path from = xml.toPath(); //convert from File to Path + Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path + Files.move(from, to, StandardCopyOption.REPLACE_EXISTING); + } + } + } + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(Exception e){ + throw new RuntimeException(e); + } + + //count number of existing papers on the training file + //just to make sure we are gathering all IDs + System.out.println(ids + " new file IDs encountered."); + ids = 0; + + + //sorting the list of duplicated IDs + Collections.sort(duplicated, new Comparator(){ + @Override + public int compare(String one, String two){ + return one.compareTo(two); + } + }); + + System.out.println("\nReaded training files: " + trainingIDs.size()); + System.out.println("Readed new files: " + newFiles.size()); + + System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n"); + + System.out.println("\nDuplicated files IDs: "); + for(int i = 0; i < duplicated.size(); i++){ + System.out.println(duplicated.get(i)); + } + + System.out.println("\n========================\n"); + + } + + + /** + * Reads and edits a list of XMLs files in a folder + * to remove XML and previous corpus tags, + * preparing the files to be concatenated. + * + * @param dir string with folder path + */ + + public void cleanXML(String dir){ + + //listing files on corpus dir + File sourceDir = new File(dir); + + File[] newXMLs = sourceDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + System.out.println("... 
Files list loaded."); + + try{ + //for each file on the corpus dir + for (File xml : newXMLs){ + + try{ + BufferedReader reader = new BufferedReader(new FileReader(xml.getPath())); + + String line = null; + ArrayList allLines = new ArrayList(); + String content = null; + + while((line = reader.readLine()) != null){ + content = line; + + //cleaning XML markups + content = content.replaceFirst(getTag1(), ""); + content = content.replaceFirst(getTag2(), ""); + //cleaning previous corpus tags + content = content.replaceFirst(getTag3(), ""); + content = content.replaceFirst(getTag4(), ""); + allLines.add(content); + } + + PrintWriter writer = new PrintWriter(xml.getPath()); + + for (String l : allLines){ + writer.println(l); + } + reader.close(); + writer.close(); + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(Exception e){ + throw new RuntimeException(e); + } + + System.out.println("... Files cleaned and saved."); + System.out.println("Ready for concatenation."); + System.out.println("\n========================\n"); + } + + + + /** + * Concatenates all XMLs in one folder or between two folders. + * @param sourceDir main directory with XML files. + * @param duplicDir second directory with duplicated XML files + * @param concatFile path name to saved concatenated corpus + */ + + public void concatenateXML(String sourceDir, String duplicDir, String concatFile){ + + final int BUFFER = 1024 << 8; + byte[] buffer = new byte[BUFFER]; + + //listing files on corpus dir + File srcDir = new File(sourceDir); + File[] srcXMLs = srcDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + File dupDir = new File(duplicDir); + File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name) { + return name.endsWith(".xml"); + } + }); + + System.out.println("... Files list loaded."); + + //defining the output file (concatenated) + File newCorpus = new File(concatFile); + + try{ + OutputStream output = new BufferedOutputStream(new FileOutputStream(newCorpus)); + + + //for each file on the corpus dir + for (File xmls : srcXMLs){ + InputStream input = new FileInputStream(xmls); + int count; + + //if the file is not empty/finished + try{ + while((count = input.read(buffer)) >= 0){ + + //write it on the concatenated final file + output.write(buffer, 0, count); + } + }finally{ + input.close(); + } + } + + if(dupXMLs != null){ + for(File xmld : dupXMLs){ + InputStream input = new FileInputStream(xmld); + int count; + + //if the file is not empty/finished + try{ + while((count = input.read(buffer)) >= 0){ + + //write it on the concatenated final file + output.write(buffer, 0, count); + } + }finally{ + input.close(); + } + } + } + output.flush(); + output.close(); + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(Exception e){ + throw new RuntimeException(e); + } + + System.out.println("... 
File concatenated and saved."); + System.out.println("Ready for corpus tagging."); + System.out.println("\n========================\n"); + } + + /** + * Inserts corpus tag on XML file + * + * @param pathToCorpus path to + * concatenated corpus + */ + + public void tagCorpus(String pathToCorpus){ + + //tagging as corpus + try{ + BufferedReader reader = new BufferedReader(new FileReader(pathToCorpus)); + + String line = null; + String edit = null; + List allLines = new ArrayList(); + + //adds tag at beggining of corpus + allLines.add(getCorpusTag()); + + while((line = reader.readLine()) != null){ + + allLines.add(line); + } + //adds tag at the end of corpus + allLines.add(getCorpusTagC()); + + System.out.println("... Corpus loaded and tagged."); + //re-writting the file + PrintWriter writer = new PrintWriter(pathToCorpus); + + for (String l : allLines){ + writer.println(l); + } + reader.close(); + writer.close(); + + System.out.println("... File saved as tagged corpus."); + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(IOException e){ + e.printStackTrace(); + } + } + + private String getCorpusTagC() { + return corpusTagC; + } + + private String getCorpusTag() { + // TODO Auto-generated method stub + return corpusTag; + } + + public String getTag1() { + return tag1; + } + + public void setTag1(String tag1) { + this.tag1 = tag1; + } + + public String getTag2() { + return tag2; + } + + public void setTag2(String tag2) { + this.tag2 = tag2; + } + + private String getTag4() { + // TODO Auto-generated method stub + return tag4; + } + + private String getTag3() { + // TODO Auto-generated method stub + return tag3; + } + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + private void setCorpusTag(String string) { + this.corpusTag = string; + + } + + private void seTag4(String string) { + this.tag4 = string; + + } + + private void setTag3(String string) { + this.tag3 = string; + + } + +} + + diff --git a/src/preprocessing/CorpusHandler.java b/src/preprocessing/CorpusHandler.java new file mode 100644 index 0000000..94b5786 --- /dev/null +++ b/src/preprocessing/CorpusHandler.java @@ -0,0 +1,754 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + + +package preprocessing; + +import java.io.BufferedOutputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.FilenameFilter; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.PrintWriter; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.Date; +import java.util.List; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import configure.ConfigConstants; + +/** + * Generates a corpus from raw XML doc instances, + * so that features can be extracted from it + * + * @author Hayda Almeida + * @since 2014 + * + */ +public class CorpusHandler{ + + private String tag1; + private String tag2; + private String tag3; + private String tag4; + private String id; + private String corpusTag; + private String corpusTagC; + + + public CorpusHandler(){ + + this.setId("PMID"); + this.setTag1("(?s)<.*?xml.*?>"); + this.setTag2("(?s)<.*?!DOCTYPE.*?>"); + this.setTag3("(?s)<.*?corpus.*?>"); + this.seTag4("(?s)<.*?/corpus.*?>"); + this.setCorpusTag(""); + this.setCorpusTag(""); + } + + + + public static void main(String[] args) throws Exception { + + ConfigConstants pathVars = new ConfigConstants(); + + String xmlDir = ""; + if(Integer.parseInt(pathVars.EXP_TYPE)== 1) + xmlDir = pathVars.TEST_DIR.substring(0, pathVars.TEST_DIR.length()-1); + else xmlDir = pathVars.TRAIN_DIR.substring(0, pathVars.TRAIN_DIR.length()-1) + "_" + pathVars.PERCT_POS_TRAIN; + + String sourceDir = "", duplicatesDir = ""; + + Boolean dc = false, df = false, cl = false, cc = false; + + String param = ""; + + try{ + param = args[0]; + + if(param.length() > 1){ + if(param.contains("dc")) + dc = true; + if(param.contains("df")) + df = true; + if(param.contains("cl")) + cl = true; + if(param.contains("cc")) + cc = true; + } + } + catch(Exception e){ + System.out.println("Use: \n" + + "-dc -> check duplicates in corpus vs. 
folder; \n " + + "-df -> check duplicates in two folders; \n" + + "-cl -> clean a source folder; \n" + + "-cc -> concatenate files in a folder "); + System.exit(0); + }; + + String timeStamp = new SimpleDateFormat("yyyyMMdd_hh:mm").format(new Date()); + String trainCorpusPath = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAINING_FILE; + + sourceDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + xmlDir; + duplicatesDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.DUP_DIR; + + String concatCorpus = pathVars.HOME_DIR + pathVars.CORPUS_DIR +"triagecorpus_"+ xmlDir +"_"+timeStamp+".xml"; + String tagCorpus = concatCorpus; + + CorpusHandler concat = new CorpusHandler(); + + //================= Checking for duplicates =====================// + // + //Check for duplicates between training file and a specific folder + if(dc) concat.checkDupCorpus(trainCorpusPath, sourceDir); + // + //---------------------------------------------------- + // + //Check for duplicates between two folders (duplicates found being sinalized in duplicatesDir) + if(df) concat.checkDupFolder(sourceDir, duplicatesDir); + // + //==================== Creating corpus ==========================// + // + //Removing XML tags from files + if(cl){ + concat.cleanXML(sourceDir, xmlDir); + if(duplicatesDir.length()>1 && (dc || df)) + concat.cleanXML(duplicatesDir, xmlDir); + } + // + //------------------------------------ + // + //Concatenating files from folders and outputting a corpus file + //Inserting tag in file + if(cc){ + concat.concatenateXML(sourceDir, "", concatCorpus, xmlDir); + concat.tagCorpus(tagCorpus, xmlDir); + } + // + //===============================================================// + } + + /** + * Returns the ID of a XML jsoup document + * @param doc a XML doc parsed by jsoup + * @return ID string + * @throws IOException + */ + public String returnID(Document doc) throws IOException{ + + String id = ""; + + Elements paper = doc.body().getElementsByTag("pubmedarticleset"); + + //fetching the paper ID - + //for all items in a paper, retrieve only PMIDs + for(Element e : paper.select(getId())){ + //only consider the ID if the parent is medline citation + if(e.parentNode().nodeName().contains("medline")){ + id = e.text(); + } + } + return id; + } + + /** + * Reads the file IDs in a folder and + * checks a second folder for duplicates. 
+ * + * @param dirSrc source folder + * @param dirDup folder to check for duplicates + */ + + public void checkDupFolder(String dirSrc, String dirDup){ + ArrayList sourceIDs = new ArrayList(); + ArrayList duplicated = new ArrayList(); + ArrayList dupIDs = new ArrayList(); + int ids = 0; + + if(dirSrc.contentEquals(dirDup)){ + System.out.println("Source and duplicates directories are the same.\n\n========================\n"); + } + else { + + System.out.println("Source directory: "+ dirSrc + " \n"); + System.out.println("Duplicates directory: " + dirDup + " \n"); + + //Loading files in the source folder + File sourceDir = new File(dirSrc); + File[] srcXMLs = sourceDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + try{ + //for each file on the source dir + for (File xml : srcXMLs){ + + try{ + + String id = ""; + //Loading file + File input = new File(xml.getPath()); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); + + //fetching the document ID + id = returnID(doc); + + if(!id.isEmpty()){ + sourceIDs.add(id); + ids++; + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(Exception e){ + throw new RuntimeException(e); + } + + System.out.println(ids + " source file IDs encountered."); + ids = 0; + + File dupDir = new File(dirDup); + + //Loading files in the duplicated folder + File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + try{ + //for each file on the possibly duplicated dir + for (File xml : dupXMLs){ + + try{ + String id = ""; + //Loading file + File input = new File(xml.getPath()); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); + + //fetching the document ID + id = returnID(doc); + + if(!id.isEmpty()){ + dupIDs.add(id); + String dupFileID = id; + ids++; + + for(int j = 0; j < sourceIDs.size(); j++){ + if(sourceIDs.get(j).equalsIgnoreCase(dupFileID)){ + + //add ID to duplicated list + duplicated.add(dupFileID); + + //rename the original file + Path from = xml.toPath(); //convert from File to Path + Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path + Files.move(from, to, StandardCopyOption.REPLACE_EXISTING); + } + } + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(Exception e){ + throw new RuntimeException(e); + } + + //count number of existing papers on possibly duplicated folder + //just to make sure we are gathering all IDs + System.out.println(ids + " new file IDs encountered."); + ids = 0; + //sorting the list of duplicated IDs + Collections.sort(duplicated, new Comparator(){ + @Override + public int compare(String one, String two){ + return one.compareTo(two); + } + }); + + System.out.println("\nReaded source files: " + sourceIDs.size()); + System.out.println("Readed new files: " + dupIDs.size()); + + System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n"); + + System.out.println("\nDuplicated files IDs: "); + for(int i = 0; i < duplicated.size(); i++){ + System.out.println(duplicated.get(i)); + } + + System.out.println("\n========================\n"); + } + + + } + + /** + * Reads the corpus and checks the papers IDs + * to identify duplicates in case new papers + * are being concatenated to corpus. 
+     *
+     * @param corpus path to the current corpus to check
+     * @param dir path to the folder with new files to be concatenated
+     */
+    public void checkDupCorpus(String corpus, String dir){
+        ArrayList<String> trainingIDs = new ArrayList<String>();
+        ArrayList<String> duplicated = new ArrayList<String>();
+        ArrayList<String> newFiles = new ArrayList<String>();
+
+        int ids = 0;
+
+        try
+        {
+            System.out.println("Corpus directory: "+ corpus + " \n");
+            System.out.println("Duplicates directory: " + dir + " \n");
+
+            File input = new File(corpus);
+            //Jsoup parse
+            Document doc = Jsoup.parse(input, "UTF-8");
+            Elements corp = doc.body().getElementsByTag("pubmedarticleset");
+
+            String id = "";
+
+            for(Element paper : corp){
+                Document thisDoc = Jsoup.parseBodyFragment(paper.toString());
+
+                //fetching the document ID
+                id = returnID(thisDoc);
+
+                if(!id.isEmpty()){
+                    trainingIDs.add(id);
+                    ids++;
+                }
+            }
+        }catch (FileNotFoundException e) {
+            e.printStackTrace();
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+
+        System.out.println(ids + " training file IDs encountered.");
+        ids = 0;
+
+        File corpusDir = new File(dir);
+        File[] newXMLs = corpusDir.listFiles(new FilenameFilter(){
+            @Override
+            public boolean accept(File dir, String name){
+                return name.endsWith(".xml");
+            }
+        });
+
+        try{
+            //for each file in the corpus dir
+            for (File xml : newXMLs){
+
+                try{
+                    String id = "";
+                    //Loading file
+                    File input = new File(xml.getPath());
+                    //Jsoup parse
+                    Document doc = Jsoup.parse(input, "UTF-8");
+
+                    //fetching the document ID
+                    id = returnID(doc);
+
+                    if(!id.isEmpty()){
+
+                        newFiles.add(id);
+                        String newFileID = id;
+                        ids++;
+
+                        for(int j = 0; j < trainingIDs.size(); j++){
+                            if(trainingIDs.get(j).equalsIgnoreCase(newFileID)){
+
+                                //add ID to duplicated list
+                                duplicated.add(newFileID);
+
+                                //renaming the duplicated file
+                                Path from = xml.toPath(); //convert from File to Path
+                                Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path
+                                Files.move(from, to, StandardCopyOption.REPLACE_EXISTING);
+                            }
+                        }
+                    }
+                }catch (FileNotFoundException e) {
+                    e.printStackTrace();
+                }
+            }
+
+        }catch (FileNotFoundException e) {
+            e.printStackTrace();
+        }
+        catch(Exception e){
+            throw new RuntimeException(e);
+        }
+
+        //count the number of papers listed in the training file,
+        //just to make sure we are gathering all IDs
+        System.out.println(ids + " new file IDs encountered.");
+        ids = 0;
+
+        //sorting the list of duplicated IDs
+        Collections.sort(duplicated, new Comparator<String>(){
+            @Override
+            public int compare(String one, String two){
+                return one.compareTo(two);
+            }
+        });
+
+        System.out.println("\nTraining files read: " + trainingIDs.size());
+        System.out.println("New files read: " + newFiles.size());
+
+        System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n");
+
+        System.out.println("\nDuplicated file IDs: ");
+        for(int i = 0; i < duplicated.size(); i++){
+            System.out.println(duplicated.get(i));
+        }
+
+        System.out.println("\n========================\n");
+    }
+
+    /**
+     * Reads and edits a list of XML files in a folder
+     * to remove XML and previous corpus tags,
+     * preparing the files to be concatenated.
+     *
+     * @param dir string with folder path
+     * @param xmlDir corpus source label used in log messages
+     */
+    public void cleanXML(String dir, String xmlDir){
+
+        //listing files in the corpus dir
+        File sourceDir = new File(dir);
+
+        File[] newXMLs = sourceDir.listFiles(new FilenameFilter(){
+            @Override
+            public boolean accept(File dir, String name){
+                return name.endsWith(".xml");
+            }
+        });
+
+        System.out.println("... Files list loaded: "+ dir);
+
+        try{
+            //for each file in the corpus dir
+            for (File xml : newXMLs){
+
+                try{
+                    BufferedReader reader = new BufferedReader(new FileReader(xml.getPath()));
+
+                    String line = null;
+                    ArrayList<String> allLines = new ArrayList<String>();
+                    String content = null;
+
+                    while((line = reader.readLine()) != null){
+                        content = line;
+
+                        //cleaning XML markup
+                        content = content.replaceFirst(getTag1(), "");
+                        content = content.replaceFirst(getTag2(), "");
+                        //cleaning previous corpus tags
+                        content = content.replaceFirst(getTag3(), "");
+                        content = content.replaceFirst(getTag4(), "");
+                        allLines.add(content);
+                    }
+
+                    PrintWriter writer = new PrintWriter(xml.getPath());
+
+                    for (String l : allLines){
+                        writer.println(l);
+                    }
+                    reader.close();
+                    writer.close();
+
+                }catch (FileNotFoundException e) {
+                    e.printStackTrace();
+                }
+
+            }
+
+        }catch (FileNotFoundException e) {
+            e.printStackTrace();
+        }
+        catch(Exception e){
+            throw new RuntimeException(e);
+        }
+
+        System.out.println("... Files cleaned and saved for " + xmlDir + ".");
+        System.out.println("Ready for concatenation.");
+        System.out.println("\n========================\n");
+
+    }
+
+    /**
+     * Concatenates all XML files in one folder or between two folders.
+     * @param sourceDir main directory with XML files
+     * @param duplicDir second directory with duplicated XML files
+     * @param concatFile path name of the saved concatenated corpus
+     * @param xmlDir corpus source label used in log messages
+     */
+    public void concatenateXML(String sourceDir, String duplicDir, String concatFile, String xmlDir){
+
+        final int BUFFER = 1024 << 8;
+        byte[] buffer = new byte[BUFFER];
+
+        //listing files in the corpus dir
+        File srcDir = new File(sourceDir);
+        File[] srcXMLs = srcDir.listFiles(new FilenameFilter(){
+            @Override
+            public boolean accept(File dir, String name){
+                return name.endsWith(".xml");
+            }
+        });
+
+        File dupDir = new File(duplicDir);
+        File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){
+            @Override
+            public boolean accept(File dir, String name) {
+                return name.endsWith(".xml");
+            }
+        });
+
+        System.out.println("... Files list loaded: "+ xmlDir + ".");
+
+        //defining the output file (concatenated)
+        File newCorpus = new File(concatFile);
+
+        try{
+            OutputStream output = new BufferedOutputStream(new FileOutputStream(newCorpus));
+
+            //for each file in the corpus dir
+            for (File xmls : srcXMLs){
+                InputStream input = new FileInputStream(xmls);
+                int count;
+
+                //copy the file contents in buffered chunks
+                try{
+                    while((count = input.read(buffer)) >= 0){
+
+                        //append the chunk to the concatenated file
+                        output.write(buffer, 0, count);
+                    }
+                }finally{
+                    input.close();
+                }
+            }
+
+            if(dupXMLs != null){
+                for(File xmld : dupXMLs){
+                    InputStream input = new FileInputStream(xmld);
+                    int count;
+
+                    //copy the file contents in buffered chunks
+                    try{
+                        while((count = input.read(buffer)) >= 0){
+
+                            //append the chunk to the concatenated file
+                            output.write(buffer, 0, count);
+                        }
+                    }finally{
+                        input.close();
+                    }
+                }
+            }
+            output.flush();
+            output.close();
+
+        }catch (FileNotFoundException e) {
+            e.printStackTrace();
+        }
+        catch(Exception e){
+            throw new RuntimeException(e);
+        }
+
+        System.out.println("... File concatenated and saved for "+ xmlDir+ ".");
+        System.out.println("Ready for corpus tagging.");
+        System.out.println("\n========================\n");
+    }
+
+    /**
+     * Inserts the corpus tags into an XML file.
+     *
+     * @param pathToCorpus path to the
+     * concatenated corpus
+     * @param xmlDir corpus source label used in log messages
+     */
+    public void tagCorpus(String pathToCorpus, String xmlDir){
+
+        //tagging as corpus
+        try{
+            BufferedReader reader = new BufferedReader(new FileReader(pathToCorpus));
+
+            String line = null;
+            List<String> allLines = new ArrayList<String>();
+
+            //adds tag at the beginning of the corpus
+            allLines.add(getCorpusTag());
+
+            while((line = reader.readLine()) != null){
+
+                allLines.add(line);
+            }
+            //adds tag at the end of the corpus
+            allLines.add(getCorpusTagC());
+
+            System.out.println("... Corpus loaded and tagged.");
+            //rewriting the file
+            PrintWriter writer = new PrintWriter(pathToCorpus);
+
+            for (String l : allLines){
+                writer.println(l);
+            }
+            reader.close();
+            writer.close();
+
+            System.out.println("... File saved as tagged " + xmlDir + " corpus.");
+            System.out.println("... DONE!");
+        }
+        catch (FileNotFoundException e) {
+            e.printStackTrace();
+        }
+        catch(IOException e){
+            e.printStackTrace();
+        }
+    }
+
+    private String getCorpusTagC() {
+        return corpusTagC;
+    }
+
+    private String getCorpusTag() {
+        return corpusTag;
+    }
+
+    public String getTag1() {
+        return tag1;
+    }
+
+    public void setTag1(String tag1) {
+        this.tag1 = tag1;
+    }
+
+    public String getTag2() {
+        return tag2;
+    }
+
+    public void setTag2(String tag2) {
+        this.tag2 = tag2;
+    }
+
+    private String getTag4() {
+        return tag4;
+    }
+
+    private String getTag3() {
+        return tag3;
+    }
+
+    public String getId() {
+        return id;
+    }
+
+    public void setId(String id) {
+        this.id = id;
+    }
+
+    private void setCorpusTag(String string) {
+        this.corpusTag = string;
+    }
+
+    private void setCorpusTagC(String string) {
+        this.corpusTagC = string;
+    }
+
+    private void seTag4(String string) {
+        this.tag4 = string;
+    }
+
+    private void setTag3(String string) {
+        this.tag3 = string;
+    }
+
+}
diff --git a/src/preprocessing/SampleCorpus.java b/src/preprocessing/SampleCorpus.java
new file mode 100644
index 0000000..bc65331
--- /dev/null
+++ b/src/preprocessing/SampleCorpus.java
@@ -0,0 +1,233 @@
+package preprocessing;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.nio.file.Files;
+import java.nio.file.StandardCopyOption;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+import configure.ConfigConstants;
+
+/**
+ * Performs sampling of document instances,
+ * generating training and test files with a
+ * class balance specified by the user.
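+ * For instance, with a collection of 1,000 documents, PERCT_TEST=20
+ * and PERCT_POS_TEST=50 would yield a 200-document test set containing
+ * 100 positives (an illustrative scenario, not a shipped default).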
+ *
+ * @author Hayda Almeida
+ * @since 2015
+ *
+ */
+public class SampleCorpus {
+
+    public static void main(String[] args) throws Exception {
+
+        ConfigConstants pathVars = new ConfigConstants();
+        SampleCorpus sampling = new SampleCorpus();
+
+        String pathToLiteratureFolder = pathVars.HOME_DIR + pathVars.CORPUS_DIR;
+
+        String positiveDir = pathToLiteratureFolder + pathVars.POS_DIR;
+        List<File> positives = new LinkedList<File>();
+
+        String negativeDir = pathToLiteratureFolder + pathVars.NEG_DIR;
+        List<File> negatives = new LinkedList<File>();
+
+        //train or test sampling
+        Boolean training = Boolean.valueOf(pathVars.SAMPLE_TRAIN);
+        Boolean testing = Boolean.valueOf(pathVars.SAMPLE_TEST);
+
+        //% of test corpus WRT the collection, % positive on test set, % positive on training set
+        int percTs = Integer.parseInt(pathVars.PERCT_TEST);
+        int percPosTr = Integer.parseInt(pathVars.PERCT_POS_TRAIN);
+        int percPosTs = Integer.parseInt(pathVars.PERCT_POS_TEST);
+
+        if(!(training || testing)){
+            System.out.println("Training or Test sampling: not set up.\n" +
+                    "Please define the sampling parameters in file: \n" +
+                    "@ config.cfg.");
+            System.exit(0);
+        }
+
+        positives = sampling.loadFiles(positiveDir);
+        negatives = sampling.loadFiles(negativeDir);
+
+        if(testing) sampling.sampleTest(pathToLiteratureFolder + pathVars.TEST_DIR, positives, negatives, percTs, percPosTs);
+
+        if(training) sampling.sampleTrain(pathToLiteratureFolder + pathVars.TRAIN_DIR, positives, negatives, percPosTr);
+
+    }
+
+    /**
+     * Lists XML files within a folder
+     * @param dirSrc folder path
+     * @return list of XML files found in the folder
+     */
+    public List<File> loadFiles(String dirSrc){
+
+        List<File> fileIDs = new LinkedList<File>();
+
+        File sourceDir = new File(dirSrc);
+        File[] srcXMLs = sourceDir.listFiles(new FilenameFilter(){
+            @Override
+            public boolean accept(File dir, String name){
+                return name.endsWith(".xml");
+            }
+        });
+
+        fileIDs = new LinkedList<File>(Arrays.asList(srcXMLs));
+
+        return fileIDs;
+    }
+
+    /**
+     * Moves a specific number of files
+     * in a list from the origin folder to a test folder
+     * @param path destination test folder path
+     * @param files list of files
+     * @param numFiles number of files to be moved
+     */
+    public void moveFile(String path, List<File> files, int numFiles){
+
+        Iterator<File> filesList = files.iterator();
+        File testDir = new File(path);
+
+        if(!testDir.exists()){
+            try{
+                testDir.mkdir();
+            }catch(Exception e){
+                System.out.println("Error creating Test folder.");
+                System.exit(0);
+            }
+        }
+
+        while(filesList.hasNext() && numFiles > 0){
+            try{
+                File file = (File) filesList.next();
+                File newFile = new File(testDir + "/" + file.getName());
+
+                Files.move(file.toPath(), newFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
+
+                filesList.remove();
+                numFiles--;
+            }
+            catch(Exception e){
+                System.out.println("Error moving files.");
+                System.exit(0);
+            }
+        }
+
+    }
+
+    /**
+     * Copies a specific number of files
+     * in a list from the origin folder to a train folder
+     * @param path destination training folder path
+     * @param files list of files
+     * @param numFiles number of files to be copied
+     */
+    public void copyFile(String path, List<File> files, int numFiles, int percPos){
+
+        Iterator<File> filesList = files.iterator();
+        String trainPath = path.substring(0, path.length()-1) + "_" + percPos + "/";
+        File trainDir = new File(trainPath);
+
+        if(!trainDir.exists())
+            try{
+                trainDir.mkdir();
+            }catch(Exception e){
+                System.out.println("Error creating Training folder.");
+                System.exit(0);
+            }
+
+        while(filesList.hasNext() && numFiles > 0){
+            try{
+                File file = (File) filesList.next();
+                File newFile = new File(trainDir + "/"+ file.getName());
+
+                Files.copy(file.toPath(), newFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
+
+                //decrement the counter so that only the requested number
+                //of files is copied (the loop otherwise copies every
+                //file in the list)
+                numFiles--;
+            }
+            catch(Exception e){
+                System.out.println("Error copying files.");
+                System.exit(0);
+            }
+        }
+
+    }
+
+    /**
+     * Samples document instances from the collection
+     * to generate a test set.
+     *
+     * @param path destination test folder path
+     * @param positives list of positive document files
+     * @param negatives list of negative document files
+     * @param total percentage of the document collection used for test
+     * @param pos percentage of positive documents in the test set
+     */
+    public void sampleTest(String path, List<File> positives, List<File> negatives, int total, int pos){
+
+        int instances = positives.size() + negatives.size();
+        int testSize = (instances * total) / 100;
+        int posSize = (testSize * pos) / 100;
+        int negSize = testSize - posSize;
+
+        Collections.shuffle(negatives);
+        System.out.println("===== Test > Negative instances shuffled for test set.");
+        moveFile(path, negatives, negSize);
+        System.out.println("===== Test > Negative instances moved to test folder. \n");
+
+        Collections.shuffle(positives);
+        System.out.println("===== Test > Positive instances shuffled for test set.");
+        moveFile(path, positives, posSize);
+        System.out.println("===== Test > Positive instances moved to test folder. \n");
+
+    }
+
+    /**
+     * Samples document instances from the collection
+     * to generate a training set.
+     *
+     * @param path destination training folder path
+     * @param positives list of positive document files
+     * @param negatives list of negative document files
+     * @param percPos percentage of positive documents in the training set
+     */
+    public void sampleTrain(String path, List<File> positives, List<File> negatives, int percPos){
+
+        //all positives are kept; the training set size follows
+        //from the requested positive percentage
+        int posSize = positives.size();
+        int trainSize = (100 * posSize) / percPos;
+
+        int negSize = trainSize - posSize;
+
+        if(negatives.size() < negSize){
+            System.out.println("Not enough negative instances for training set.");
+            System.exit(0);
+        }
+        else{
+            Collections.shuffle(negatives);
+            System.out.println("===== Training > Negative instances shuffled for training set.");
+            copyFile(path, negatives, negSize, percPos);
+            System.out.println("===== Training > Negative instances copied to training folder. \n");
+
+            Collections.shuffle(positives);
+            System.out.println("===== Training > Positive instances shuffled for training set.");
+            copyFile(path, positives, posSize, percPos);
+            System.out.println("===== Training > Positive instances copied to training folder. \n");
+        }
+
+    }
+
+}
diff --git a/usermanual/.gitignore b/usermanual/.gitignore
new file mode 100644
index 0000000..5190f77
--- /dev/null
+++ b/usermanual/.gitignore
@@ -0,0 +1,12 @@
+# User manual files #
+###################
+*.aux
+*.bbl
+*.blg
+*.log
+*.out
+*.backup
+*.toc
+*.*~
+
+
diff --git a/usermanual/Makefile b/usermanual/Makefile
new file mode 100644
index 0000000..822e19f
--- /dev/null
+++ b/usermanual/Makefile
@@ -0,0 +1,31 @@
+DOC=usermanual
+BIB=usermanual
+PDFLATEX=pdflatex
+BIBTEX=bibtex
+GRAPHICSIDR=graphics
+
+all: paper
+#all: images paper
+
+paper: $(DOC).tex $(BIB).bib
+	$(PDFLATEX) $(DOC).tex
+	$(BIBTEX) $(DOC)
+	$(PDFLATEX) $(DOC).tex
+	$(PDFLATEX) $(DOC).tex
+	$(BIBTEX) $(DOC)
+
+#images:
+#	cd $(GRAPHICSIDR) && make
+
+check:
+	$(TEXIDATE) $(DOC).tex
+
+wc:
+	@echo "Paper has:" `pdftotext $(DOC).pdf - | wc -w 2> /dev/null` "words!"
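+
+# Typical usage, assuming pdflatex, bibtex and pdftotext are on the PATH:
+#   make         # build usermanual.pdf
+#   make wc      # report the word count of the generated PDF
+#   make clean   # remove LaTeX build artifacts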
+ +clean: + -rm $(DOC).log $(DOC).aux $(DOC).blg $(DOC).bbl $(DOC).dvi $(DOC).ps $(DOC).out $(DOC).toc $(DOC).lof $(DOC).lot +# cd $(GRAPHICSIDR) && make clean + +hyperclean: clean + -rm *~ *.backup diff --git a/usermanual/datetime.sty b/usermanual/datetime.sty new file mode 100644 index 0000000..cc6580a --- /dev/null +++ b/usermanual/datetime.sty @@ -0,0 +1,487 @@ +%% +%% This is file `datetime.sty', +%% generated with the docstrip utility. +%% +%% The original source files were: +%% +%% datetime.dtx (with options: `datetime') +%% Copyright (C) 2000 Nicola Talbot, all rights reserved. +%% If you modify this file, you must change its name first. +%% You are NOT ALLOWED to distribute this file alone. You are NOT +%% ALLOWED to take money for the distribution or use of either this +%% file or a changed version, except for a nominal charge for copying +%% etc. +%% \CharacterTable +%% {Upper-case \A\B\C\D\E\F\G\H\I\J\K\L\M\N\O\P\Q\R\S\T\U\V\W\X\Y\Z +%% Lower-case \a\b\c\d\e\f\g\h\i\j\k\l\m\n\o\p\q\r\s\t\u\v\w\x\y\z +%% Digits \0\1\2\3\4\5\6\7\8\9 +%% Exclamation \! Double quote \" Hash (number) \# +%% Dollar \$ Percent \% Ampersand \& +%% Acute accent \' Left paren \( Right paren \) +%% Asterisk \* Plus \+ Comma \, +%% Minus \- Point \. Solidus \/ +%% Colon \: Semicolon \; Less than \< +%% Equals \= Greater than \> Question mark \? +%% Commercial at \@ Left bracket \[ Backslash \\ +%% Right bracket \] Circumflex \^ Underscore \_ +%% Grave accent \` Left brace \{ Vertical bar \| +%% Right brace \} Tilde \~} +\NeedsTeXFormat{LaTeX2e} +\ProvidesPackage{datetime}[2004/11/01 Date Time Package version 2.42] +\RequirePackage{fmtcount} +\newif\if@dt@nodate +\@dt@nodatefalse +\newif\ifshowdow % show the day of week if true + +\providecommand{\formatdate}[3]{} + +\newcount\@day +\newcount\@month +\newcount\@year + +\providecommand{\longdate}{% +\renewcommand{\formatdate}[3]{\ifshowdow\dayofweekname{##1}{##2}{##3} \fi +\@day=##1\relax\@month=##2\relax\@year=##3\relax +\@ordinal{\@day}\ \monthname[\@month], \the\@year}} + +\providecommand{\shortdate}{% +\renewcommand{\formatdate}[3]{\ifshowdow\shortdayofweekname{##1}{##2}{##3} \fi +\@day=##1\relax\@month=##2\relax\@year=##3\relax +\@ordinal{\@day}\ \shortmonthname[\@month], \the\@year}} + +\providecommand{\twodigit}[1]{{\@dtctr=#1\relax\ifnum\@dtctr<10 0\fi\the\@dtctr}} + +\providecommand{\ddmmyyyydate}{% +\renewcommand{\formatdate}[3]{\@day=##1\relax\@month=##2\relax\@year=##3\relax +\twodigit\@day/\twodigit\@month/\the\@year}} + +\providecommand{\dmyyyydate}{% +\renewcommand{\formatdate}[3]{\@day=##1\relax\@month=##2\relax\@year=##3\relax +\the\@day/\the\@month/\the\@year}} + +\providecommand{\ddmmyydate}{\renewcommand{\formatdate}[3]{% +\@day=##1\relax\@month=##2\relax\@year=##3\relax +\@dtctr=\@year% +\@modulo{\@dtctr}{100}% +\twodigit\@day/\twodigit\@month/\twodigit\@dtctr}} + +\providecommand{\dmyydate}{\renewcommand{\formatdate}[3]{% +\@day=##1\relax\@month=##2\relax\@year=##3\relax +\@dtctr=\@year% +\@modulo{\@dtctr}{100}% +\the\@day/\the\@month/\twodigit\@dtctr}} + +\providecommand{\textdate}{% +\renewcommand{\formatdate}[3]{\ifshowdow\dayofweekname{##1}{##2}{##3} the \fi +\@day=##1\relax\@month=##2\relax\@year=##3\relax +\@Ordinalstring\@day\ of \monthname[\@month], \@Numberstring\@year}} + +\providecommand{\usdate}{% +\renewcommand{\formatdate}[3]{\@day=##1\relax\@month=##2\relax\@year=##3\relax +\monthname[\@month]\ \the\@day, \the\@year}} + +\providecommand{\mmddyyyydate}{% 
+\renewcommand{\formatdate}[3]{\@day=##1\relax\@month=##2\relax\@year=##3\relax +\twodigit\@month/\twodigit\@day/\the\@year}} + +\providecommand{\mdyyyydate}{% +\renewcommand{\formatdate}[3]{\@day=##1\relax\@month=##2\relax\@year=##3\relax +\the\@month/\the\@day/\the\@year}} + +\providecommand{\mmddyydate}{\renewcommand{\formatdate}[3]{% +\@day=##1\relax\@month=##2\relax\@year=##3\relax +\@dtctr=\@year% +\@modulo{\@dtctr}{100}% +\twodigit\@month/\twodigit\@day/\twodigit\@dtctr}} + +\providecommand{\mdyydate}{\renewcommand{\formatdate}[3]{% +\@day=##1\relax\@month=##2\relax\@year=##3\relax +\@dtctr=\@year% +\@modulo{\@dtctr}{100}% +\the\@month/\the\@day/\twodigit\@dtctr}} + +\providecommand{\currenttime}{\xxivtime} + +\providecommand{\settimeformat}[1]{\def\currenttime{\csname#1\endcsname}} +\longdate\renewcommand{\fmtord}[1]{\textsuperscript{#1}}\showdowtrue +\InputIfFileExists{datetime.cfg}{\typeout{Loading local datetime configurations}}{\typeout{No datetime.cfg file found}} +\RequirePackage{fmtcount} +\DeclareOption{long}{\longdate} +\DeclareOption{short}{\shortdate} +\DeclareOption{ddmmyyyy}{\ddmmyyyydate} +\DeclareOption{dmyyyy}{\dmyyyydate} +\DeclareOption{ddmmyy}{\ddmmyydate} +\DeclareOption{dmyy}{\dmyydate} +\DeclareOption{text}{\textdate} +\DeclareOption{us}{\usdate} +\DeclareOption{mmddyyyy}{\mmddyyyydate} +\DeclareOption{mdyyyy}{\mdyyyydate} +\DeclareOption{mmddyy}{\mmddyydate} +\DeclareOption{mdyy}{\mdyydate} +\DeclareOption{level}{\renewcommand{\fmtord}[1]{#1}} +\DeclareOption{raise}{\renewcommand{\fmtord}[1]{\textsuperscript{#1}}} +\DeclareOption{dayofweek}{\showdowtrue} +\DeclareOption{nodayofweek}{\showdowfalse} +\DeclareOption{nodate}{\@dt@nodatetrue} +\DeclareOption{24hr}{\settimeformat{xxivtime}} +\DeclareOption{12hr}{\settimeformat{ampmtime}} +\DeclareOption{oclock}{\settimeformat{oclock}} + + +\DeclareOption{austrian}{\input{dt-austrian.def}} +\DeclareOption{bahasa}{\input{dt-bahasa.def}} +\DeclareOption{basque}{\input{dt-basque.def}} +\DeclareOption{breton}{\input{dt-breton.def}} +\DeclareOption{bulgarian}{\input{dt-bulgarian.def}} +\DeclareOption{catalan}{\input{dt-catalan.def}} +\DeclareOption{croatian}{\input{dt-croatian.def}} +\DeclareOption{czech}{\input{dt-czech.def}} +\DeclareOption{danish}{\input{dt-danish.def}} +\DeclareOption{dutch}{\input{dt-dutch.def}} +\DeclareOption{esperanto}{\input{dt-esperanto.def}} +\DeclareOption{estonian}{\input{dt-estonian.def}} +\DeclareOption{finnish}{\input{dt-finnish.def}} +\DeclareOption{french}{\input{dt-french.def}} +\DeclareOption{galician}{\input{dt-galician.def}} +\DeclareOption{german}{\input{dt-german.def}} +\DeclareOption{greek}{\input{dt-greek.def}} +\DeclareOption{hebrew}{\input{dt-hebrew.def}} +\DeclareOption{icelandic}{\input{dt-icelandic.def}} +\DeclareOption{irish}{\input{dt-irish.def}} +\DeclareOption{italian}{\input{dt-italian.def}} +\DeclareOption{latin}{\input{dt-latin.def}} +\DeclareOption{lsorbian}{\input{dt-lsorbian.def}} +\DeclareOption{magyar}{\input{dt-magyar.def}} +\DeclareOption{naustrian}{\input{dt-naustrian.def}} +\DeclareOption{ngerman}{\input{dt-ngerman.def}} +\DeclareOption{norsk}{\input{dt-norsk.def}} +\DeclareOption{polish}{\input{dt-polish.def}} +\DeclareOption{portuges}{\input{dt-portuges.def}} +\DeclareOption{romanian}{\input{dt-romanian.def}} +\DeclareOption{russian}{\input{dt-russian.def}} +\DeclareOption{samin}{\input{dt-samin.def}} +\DeclareOption{scottish}{\input{dt-scottish.def}} +\DeclareOption{serbian}{\input{dt-serbian.def}} +\DeclareOption{slovak}{\input{dt-slovak.def}} 
+\DeclareOption{slovene}{\input{dt-slovene.def}} +\DeclareOption{spanish}{\input{dt-spanish.def}} +\DeclareOption{swedish}{\input{dt-swedish.def}} +\DeclareOption{turkish}{\input{dt-turkish.def}} +\DeclareOption{ukraineb}{\input{dt-ukraineb.def}} +\DeclareOption{usorbian}{\input{dt-usorbian.def}} +\DeclareOption{welsh}{\input{dt-welsh.def}} + +\ProcessOptions +\RequirePackage{ifthen} +\if@dt@nodate +\typeout{datetime package message : option "nodate" used, so not defining \protect\monthname} +\else +\providecommand{\monthnameenglish}[1][\month]{% +\@orgargctr=#1\relax +\ifcase\@orgargctr +\PackageError{datetime}{Invalid Month number \the\@orgargctr}{Month numbers should go from 1 (January) to 12 (December)}% +\or January% +\or February% +\or March% +\or April% +\or May% +\or June% +\or July% +\or August% +\or September% +\or October% +\or November% +\or December% +\else \PackageError{datetime}{Invalid Month number \the\@orgargctr}{Month numbers should go from 1 (January) to 12 (December)}% +\fi} + +\let\monthname=\monthnameenglish +\fi +\if@dt@nodate +\typeout{datetime package message : option "nodate" used, so not defining \protect\shortmonthname} +\else +\providecommand{\shortmonthnameenglish}[1][\month]{% +\@orgargctr=#1\relax +\ifcase\@orgargctr +\PackageError{datetime}{Invalid Month number \the\@orgargctr}{Month numbers should go from 1 (jan) to 12 (dec)}% +\or Jan% +\or Feb% +\or Mar% +\or Apr% +\or May% +\or Jun% +\or Jul% +\or Aug% +\or Sept% +\or Oct% +\or Nov% +\or Dec% +\else% +\PackageError{datetime}{Invalid Month number \the\@orgargctr}{Month numbers should go from 1 (jan) to 12 (dec)}% +\fi} + +\let\shortmonthname=\shortmonthnameenglish + +\fi +\newif\ifleapyear + +\newcount\@dtctr + +\if@dt@nodate +\typeout{datetime package message : option "nodate" used, so not defining \protect\dayofweek \space or \protect\shortdayofweek} +\else + +\providecommand{\testifleapyear}[1]{% +\leapyearfalse +\@year=#1\relax +\@dtctr=\@year +\@modulo{\@dtctr}{400}% +\ifnum\@dtctr=0\relax +\leapyeartrue % year mod 400 = 0 => leap year +\else +\@dtctr=\@year +\@modulo{\@dtctr}{100}% +\ifnum\@dtctr=0\relax +\leapyearfalse % year mod 100 = 0 && year mod 400 != 0 => not a leap year +\else +\@dtctr=\@year +\@modulo{\@dtctr}{4}% +\ifnum\@dtctr=0\relax +\leapyeartrue % year mod 4 = 0 && year mod 100 != 0 => leap year +\fi +\fi +\fi +} + +\newcount\dayofyear + + +\providecommand{\computedayofyear}[3]{% +\testifleapyear{#3}% +\dayofyear=0\relax +\@day=#1\relax \@month=#2\relax \@year=#3\relax +\ifcase\@month +\or +\or \advance\dayofyear by 31\relax +\or \advance\dayofyear by 59\relax +\or \advance\dayofyear by 90\relax +\or \advance\dayofyear by 120\relax +\or \advance\dayofyear by 151\relax +\or \advance\dayofyear by 181\relax +\or \advance\dayofyear by 212\relax +\or \advance\dayofyear by 243\relax +\or \advance\dayofyear by 273\relax +\or \advance\dayofyear by 304\relax +\or \advance\dayofyear by 334\relax +\else +\PackageError{datetime}{Invalid month number}{The second argument to \protect\computedayofyear +should lie in the range 1-12} +\fi +\ifnum\@month>2\relax +\ifleapyear\advance\dayofyear by 1\relax\fi +\fi +\advance\dayofyear by \@day\relax +} + +\newcount\dayofweek + + +\providecommand{\computedayofweek}[3]{% +\computedayofyear{#1}{#2}{#3}% +\@dtctr=#3\relax +\advance\@dtctr by -1901\relax +\@modulo{\@dtctr}{28}% +\dayofweek=\@dtctr +\divide\dayofweek by 4\relax +\advance\dayofweek by \@dtctr +\advance\dayofweek by 2\relax +\@modulo{\dayofweek}{7}% +\advance\dayofweek by \dayofyear 
+\advance\dayofweek by -1\relax +\@modulo{\dayofweek}{7}% +\advance\dayofweek by 1\relax} + +\providecommand{\dayofweekname}[3]{% +\computedayofweek{#1}{#2}{#3}% +\ifcase\dayofweek +\or Sunday% +\or Monday% +\or Tuesday% +\or Wednesday% +\or Thursday% +\or Friday% +\or Saturday% +\fi} + +\providecommand{\shortdayofweekname}[3]{% +\computedayofweek{#1}{#2}{#3}% +\ifcase\dayofweek +\or Sun% +\or Mon% +\or Tue% +\or Wed% +\or Thu% +\or Fri% +\or Sat% +\fi} +\fi +\if@dt@nodate +\else +\def\today{\formatdate{\day}{\month}{\year}} +\fi +\if@dt@nodate +\else +\@ifundefined{dateenglish}{}{\let\dateenglish\longdate} +\@ifundefined{dateaustrian}{}{\input{dt-austrian.def}} +\@ifundefined{datebahasa}{}{\input{dt-bahasa.def}} +\@ifundefined{datebasque}{}{\input{dt-basque.def}} +\@ifundefined{datebreton}{}{\input{dt-breton.def}} +\@ifundefined{datebulgarian}{}{\input{dt-bulgarian.def}} +\@ifundefined{datecatalan}{}{\input{dt-catalan.def}} +\@ifundefined{datecroatian}{}{\input{dt-croatian.def}} +\@ifundefined{dateczech}{}{\input{dt-czech.def}} +\@ifundefined{datedanish}{}{\input{dt-danish.def}} +\@ifundefined{datedutch}{}{\input{dt-dutch.def}} +\@ifundefined{dateesperanto}{}{\input{dt-esperanto.def}} +\@ifundefined{dateestonian}{}{\input{dt-estonian.def}} +\@ifundefined{datefinnish}{}{\input{dt-finnish.def}} +\@ifundefined{datefrench}{}{\input{dt-french.def}} +\@ifundefined{dategalician}{}{\input{dt-galician.def}} +\@ifundefined{dategerman}{}{\input{dt-german.def}} +\@ifundefined{dategreek}{}{\input{dt-greek.def}} +\@ifundefined{datehebrew}{}{\input{dt-hebrew.def}} +\@ifundefined{dateicelandic}{}{\input{dt-icelandic.def}} +\@ifundefined{dateirish}{}{\input{dt-irish.def}} +\@ifundefined{dateitalian}{}{\input{dt-italian.def}} +\@ifundefined{datelatin}{}{\input{dt-latin.def}} +\@ifundefined{datelsorbian}{}{\input{dt-lsorbian.def}} +\@ifundefined{datemagyar}{}{\input{dt-magyar.def}} +\@ifundefined{datenaustrian}{}{\input{dt-naustrian.def}} +\@ifundefined{datengerman}{}{\input{dt-ngerman.def}} +\@ifundefined{datenorsk}{}{\input{dt-norsk.def}} +\@ifundefined{datepolish}{}{\input{dt-polish.def}} +\@ifundefined{dateportuges}{}{\input{dt-portuges.def}} +\@ifundefined{dateromanian}{}{\input{dt-romanian.def}} +\@ifundefined{daterussian}{}{\input{dt-russian.def}} +\@ifundefined{datesamin}{}{\input{dt-samin.def}} +\@ifundefined{datescottish}{}{\input{dt-scottish.def}} +\@ifundefined{dateserbian}{}{\input{dt-serbian.def}} +\@ifundefined{dateslovak}{}{\input{dt-slovak.def}} +\@ifundefined{dateslovene}{}{\input{dt-slovene.def}} +\@ifundefined{datespanish}{}{\input{dt-spanish.def}} +\@ifundefined{dateswedish}{}{\input{dt-swedish.def}} +\@ifundefined{dateturkish}{}{\input{dt-turkish.def}} +\@ifundefined{dateukraineb}{}{\input{dt-ukraineb.def}} +\@ifundefined{dateusorbian}{}{\input{dt-usorbian.def}} +\@ifundefined{datewelsh}{}{\input{dt-welsh.def}} +\fi +\if@dt@nodate +\typeout{datetime package message : option "nodate" used, so not defining \protect\newdateformat} +\else + +\providecommand\THEDAY{\the\@day} +\providecommand\THEMONTH{\the\@month} +\providecommand\THEYEAR{\the\@year} +\let\c@DAY=\@day +\let\c@MONTH=\@month +\let\c@YEAR=\@year + +\providecommand{\dateformat}[4]{\@day=#2\relax\@month=#3\relax\@year=#4\relax#1} + +\providecommand{\newdateformat}[2]{% +\@ifundefined{#1}{\expandafter\def\csname#1\endcsname{\renewcommand{\formatdate}{\dateformat{#2}}}}{% +\PackageError{datetime}{Can't create new date format, command \textbackslash#1 already defined}{You will need to +give you new date format a different name}}} 
+\fi + +\newcount\c@HOUR +\newcount\c@HOURXII +\newcount\c@MINUTE +\newcount\c@TOHOUR +\newcount\c@TOMINUTE +\def\THEHOUR{\the\c@HOUR} +\def\THEHOURXII{\the\c@HOURXII} +\def\THEMINUTE{\the\c@MINUTE} +\def\THETOHOUR{\the\c@TOHOUR} +\def\THETOMINUTE{\the\c@TOMINUTE} + +\providecommand{\newtimeformat}[2]{% +\@ifundefined{#1}{% +\expandafter\def\csname#1\endcsname{% +\c@HOUR=\time% +\divide\c@HOUR by 60\relax +\c@HOURXII=\c@HOUR +\ifnum\c@HOURXII>12 +\advance\c@HOURXII by -12\relax +\fi +\c@MINUTE=\time% +\@modulo{\c@MINUTE}{60}% +\c@TOHOUR=\c@HOURXII +\advance\c@TOHOUR by 1\relax +\@modulo{\c@TOHOUR}{12}% +\c@TOMINUTE=\c@MINUTE +\advance\c@TOMINUTE by -60\relax +\multiply\c@TOMINUTE by -1\relax +#2 +}}{% +\PackageError{datetime}{Command \textbackslash#1 already defined}{% +You can't create a new time format called "#1" as the command \textbackslash#1 already exists}}} +\newtimeformat{xxivtime}{\twodigit\THEHOUR:\twodigit\THEMINUTE} + +\newtimeformat{ampmtime}{\THEHOURXII:\twodigit\THEMINUTE\ifthenelse{\value{HOUR}<12}{\amname}{\ifthenelse{\time=720}{ \noon}{\pmname}}} + +\newtimeformat{oclock}{\ifthenelse{\time=0 \or \time=720}{% +\ifthenelse{\time=0}{\midnight}{\noon}}{% +\ifthenelse{\value{MINUTE}=0}{% +\Numberstring{HOUR} \oclock}{% +\ifthenelse{\value{MINUTE}=15}{% +\quarterpast\ \Numberstring{HOUR}}{% +\ifthenelse{\value{MINUTE}=30}{% +\halfpast\ \Numberstring{HOUR}}{% +\ifthenelse{\value{MINUTE}=45}{% +\quarterto\ \Numberstring{TOHOUR}}{% +\ifthenelse{\value{MINUTE}<30}{% +\Numberstring{MINUTE} \ifthenelse{\value{MINUTE}=1}{minute}{minutes} past \Numberstring{HOURXII}}{% +\Numberstring{TOMINUTE} \ifthenelse{\value{TOMINUTE}=1}{minute}{minutes} to \Numberstring{TOHOUR}}}}}} +\ifthenelse{\value{HOUR}<12}{% +\amstring}{% +\pmstring}}} + +\providecommand{\amname}{am} +\providecommand{\pmname}{pm} +\providecommand{\amstring}{in the morning} +\providecommand{\pmstring}{in the afternoon} +\providecommand{\halfpast}{Half past} +\providecommand{\quarterpast}{Quarter past} +\providecommand{\quarterto}{Quarter to} +\providecommand{\noon}{Noon} +\providecommand{\midnight}{Midnight} +\providecommand{\oclockstring}{O'Clock} + +\newcount\pdftimectr +\newcount\pdfdatectr + +\pdftimectr=0\relax +\@dtctr=\time% +\divide\@dtctr by 60\relax +\multiply\@dtctr by 10000\relax +\pdftimectr=\@dtctr +\@dtctr=\time% +\@modulo{\@dtctr}{60}% +\multiply\@dtctr by 100\relax +\advance\pdftimectr by \@dtctr +\pdfdatectr=\day +\@dtctr=\month +\multiply\@dtctr by 100\relax +\advance\pdfdatectr by \@dtctr +\@dtctr=\year +\multiply\@dtctr by 10000\relax +\advance\pdfdatectr by \@dtctr +\ifnum\pdftimectr<100000 +\edef\pdfdate{\the\pdfdatectr0\the\pdftimectr} +\else +\edef\pdfdate{\the\pdfdatectr\the\pdftimectr} +\fi +\endinput +%% +%% End of file `datetime.sty'. diff --git a/usermanual/fmtcount.sty b/usermanual/fmtcount.sty new file mode 100644 index 0000000..304bacb --- /dev/null +++ b/usermanual/fmtcount.sty @@ -0,0 +1,587 @@ +%% +%% This is file `fmtcount.sty', +%% generated with the docstrip utility. +%% +%% The original source files were: +%% +%% datetime.dtx (with options: `fmtcount') +%% Copyright (C) 2000 Nicola Talbot, all rights reserved. +%% If you modify this file, you must change its name first. +%% You are NOT ALLOWED to distribute this file alone. You are NOT +%% ALLOWED to take money for the distribution or use of either this +%% file or a changed version, except for a nominal charge for copying +%% etc. 
+%% \CharacterTable +%% {Upper-case \A\B\C\D\E\F\G\H\I\J\K\L\M\N\O\P\Q\R\S\T\U\V\W\X\Y\Z +%% Lower-case \a\b\c\d\e\f\g\h\i\j\k\l\m\n\o\p\q\r\s\t\u\v\w\x\y\z +%% Digits \0\1\2\3\4\5\6\7\8\9 +%% Exclamation \! Double quote \" Hash (number) \# +%% Dollar \$ Percent \% Ampersand \& +%% Acute accent \' Left paren \( Right paren \) +%% Asterisk \* Plus \+ Comma \, +%% Minus \- Point \. Solidus \/ +%% Colon \: Semicolon \; Less than \< +%% Equals \= Greater than \> Question mark \? +%% Commercial at \@ Left bracket \[ Backslash \\ +%% Right bracket \] Circumflex \^ Underscore \_ +%% Grave accent \` Left brace \{ Vertical bar \| +%% Right brace \} Tilde \~} +\NeedsTeXFormat{LaTeX2e} +\ProvidesPackage{fmtcount}[2004/10/22 v1.0] +\RequirePackage{ifthen} +\newcount\@DT@modctr +\def\@modulo#1#2{% +\@DT@modctr=#1\relax +\divide \@DT@modctr by #2\relax +\multiply \@DT@modctr by #2\relax +\advance #1 by -\@DT@modctr} +\providecommand{\fmtord}[1]{#1} +\newcount\@ordinalctr +\newcount\@orgargctr +\def\@ordinal#1{% +\@orgargctr=#1\relax +\@ordinalctr=#1% +\@modulo{\@ordinalctr}{100}% +\ifnum\@ordinalctr=11 +\the\@orgargctr\fmtord{th}% +\else +\ifnum\@ordinalctr=12 +\the\@orgargctr\fmtord{th}% +\else +\ifnum\@ordinalctr=13 +\the\@orgargctr\fmtord{th}% +\else +\@modulo{\@ordinalctr}{10}% +\ifcase\@ordinalctr +\the\@orgargctr\fmtord{th}% case 0 +\or \the\@orgargctr\fmtord{st}% case 1 +\or \the\@orgargctr\fmtord{nd}% case 2 +\or \the\@orgargctr\fmtord{rd}% case 3 +\else +\the\@orgargctr\fmtord{th}% default case +\fi +\fi +\fi +\fi +} +\newcommand{\@@unitstring}[1]{% +\ifcase#1\relax +zero% +\or one% +\or two% +\or three% +\or four% +\or five% +\or six% +\or seven% +\or eight% +\or nine% +\fi +} + +\newcommand{\@@tenstring}[1]{% +\ifcase#1\relax +\or ten% +\or twenty% +\or thirty% +\or fourty% +\or fifty% +\or sixty% +\or seventy% +\or eighty% +\or ninety% +\fi +} + +\newcommand{\@@teenstring}[1]{% +\ifcase#1\relax +ten% +\or eleven% +\or twelve% +\or thirteen% +\or fourteen% +\or fifteen% +\or sixteen% +\or seventeen% +\or eighteen% +\or nineteen% +\fi +} + +\newcommand{\@@Unitstring}[1]{% +\ifcase#1\relax +Zero% +\or One% +\or Two% +\or Three% +\or Four% +\or Five% +\or Six% +\or Seven% +\or Eight% +\or Nine% +\fi +} + +\newcommand{\@@Tenstring}[1]{% +\ifcase#1\relax +\or Ten% +\or Twenty% +\or Thirty% +\or Fourty% +\or Fifty% +\or Sixty% +\or Seventy% +\or Eighty% +\or Ninety% +\fi +} + +\newcommand{\@@Teenstring}[1]{% +\ifcase#1\relax +Ten% +\or Eleven% +\or Twelve% +\or Thirteen% +\or Fourteen% +\or Fifteen% +\or Sixteen% +\or Seventeen% +\or Eighteen% +\or Nineteen% +\fi +} + +\newcount\strctr +\newcommand{\@@numberstring}[1]{% +\ifnum#1>99000 +\PackageError{fmtcount}{Out of range}% +{This macro only works for values less than 100000}% +\else +\ifnum#1<0 +\PackageError{fmtcount}{Negative numbers not permitted}% +{This macro does not work for negative numbers, however +you can try typing "minus" first, and then pass the modulus of +this number}% +\fi +\fi +\strctr=#1\relax \divide\strctr by 1000\relax +\ifnum\strctr>9 +\divide\strctr by 10 +\ifnum\strctr>1 +\@tenstring{\strctr}% +\strctr=#1 \divide\strctr by 10000 +\ifnum\strctr>0 -\@unitstring{\strctr}\fi +\else +\strctr=#1 \divide\strctr by 1000 +\@teenstring{\strctr}% +\fi +\ \@thousand% +\else +\ifnum\strctr>0 \@unitstring{\strctr}\ \@thousand\fi +\fi +\strctr=#1\relax \@modulo{\strctr}{1000}% +\divide\strctr by 100 +\ifnum\strctr>0 +\ifnum#1>1000 \ \fi\@unitstring{\strctr}\ \@hundred% +\fi +\strctr=#1\relax \@modulo{\strctr}{100}% +\ifnum#1>100 
\ifnum\strctr>0 \ and \fi\fi +\ifnum\strctr>19 +\divide\strctr by 10 +\@tenstring{\strctr}% +\strctr=#1\relax \@modulo{\strctr}{10}% +\ifnum\strctr>0 +-\@unitstring{\strctr}% +\fi +\else +\ifnum\strctr<10 +\ifnum\strctr=0 +\ifnum#1<100 \@unitstring{\strctr}\fi +\else +\@unitstring{\strctr}% +\fi +\else +\@modulo{\strctr}{10}% +\@teenstring{\strctr}% +\fi +\fi +} + +\newcommand{\@numberstring}[1]{% +\let\@unitstring=\@@unitstring \let\@teenstring=\@@teenstring \let\@tenstring=\@@tenstring +\def\@hundred{hundred}\def\@thousand{thousand}% +\@@numberstring{#1}} + +\newcommand{\@Numberstring}[1]{% +\let\@unitstring=\@@Unitstring \let\@teenstring=\@@Teenstring \let\@tenstring=\@@Tenstring +\def\@hundred{Hundred}\def\@thousand{Thousand}% +\@@numberstring{#1}} +\newcommand{\@@unitthstring}[1]{% +\ifcase#1\relax +zeroth% +\or first% +\or second% +\or third% +\or fourth% +\or fifth% +\or sixth% +\or seventh% +\or eighth% +\or nineth% +\fi +} + +\newcommand{\@@tenthstring}[1]{% +\ifcase#1\relax +\or tenth% +\or twentieth% +\or thirtieth% +\or fourtieth% +\or fiftieth% +\or sixtieth% +\or seventieth% +\or eightieth% +\or ninetieth% +\fi +} + +\newcommand{\@@teenthstring}[1]{% +\ifcase#1\relax +tenth% +\or eleventh% +\or twelfth% +\or thirteenth% +\or fourteenth% +\or fifteenth% +\or sixteenth% +\or seventeenth% +\or eighteenth% +\or nineteenth% +\fi +} + +\newcommand{\@@Unitthstring}[1]{% +\ifcase#1\relax +Zeroth% +\or First% +\or Second% +\or Third% +\or Fourth% +\or Fifth% +\or Sixth% +\or Seventh% +\or Eighth% +\or Nineth% +\fi +} + +\newcommand{\@@Tenthstring}[1]{% +\ifcase#1\relax +\or Tenth% +\or Twentieth% +\or Thirtieth% +\or Fourtieth% +\or Fiftieth% +\or Sixtieth% +\or Seventieth% +\or Eightieth% +\or Ninetieth% +\fi +} + +\newcommand{\@@Teenthstring}[1]{% +\ifcase#1\relax +Tenth% +\or Eleventh% +\or Twelfth% +\or Thirteenth% +\or Fourteenth% +\or Fifteenth% +\or Sixteenth% +\or Seventeenth% +\or Eighteenth% +\or Nineteenth% +\fi +} + +\newcommand{\@@ordinalstring}[1]{% +\ifnum#1>99000 +\PackageError{fmtcount}{Out of range}% +{This macro only works for values less than 100000}% +\else +\ifnum#1<0 +\PackageError{fmtcount}{Negative numbers not permitted}% +{This macro does not work for negative numbers, however +you can try typing "minus" first, and then pass the modulus of +this number}% +\fi +\fi +\strctr=#1\relax \divide\strctr by 1000\relax +\ifnum\strctr>9 +\divide\strctr by 10 +\ifnum\strctr>1 +\@tenstring{\strctr}% +\strctr=#1 \divide\strctr by 10000 +\ifnum\strctr>0 -\@unitstring{\strctr}\fi +\else +\strctr=#1 \divide\strctr by 1000 +\@teenstring{\strctr}% +\fi +\strctr=#1\relax \@modulo{\strctr}{1000}% +\ifnum\strctr=0\ \@thousandth\else\ \@thousand \fi +\else +\ifnum\strctr>0\relax +\@unitstring{\strctr}% +\strctr=#1\relax \@modulo{\strctr}{1000}% +\ifnum\strctr=0\ \@thousandth\else\ \@thousand\fi +\fi +\fi +\strctr=#1\relax \@modulo{\strctr}{1000}% +\divide\strctr by 100 +\ifnum\strctr>0 +\ifnum#1>1000 \ \fi\@unitstring{\strctr}% +\strctr=#1\relax \@modulo{\strctr}{100}% +\ifnum\strctr=0\ \@hundredth\else\ \@hundred\fi +\fi +\strctr=#1\relax \@modulo{\strctr}{100}% +\ifnum#1>100 \ifnum\strctr>0\ and \fi\fi +\ifnum\strctr>19 +\@dtctr=\strctr +\divide\strctr by 10 +\@modulo{\@dtctr}{10}% +\ifnum\@dtctr=0 \@tenthstring{\strctr}\else \@tenstring{\strctr}\fi +\strctr=#1\relax \@modulo{\strctr}{10}% +\ifnum\strctr>0 +-\@unitthstring{\strctr}% +\fi +\else +\ifnum\strctr<10 +\ifnum\strctr=0 +\ifnum#1<100 \@unitthstring{\strctr}\fi +\else +\@unitthstring{\strctr}% +\fi +\else 
+\@modulo{\strctr}{10}% +\@teenthstring{\strctr}% +\fi +\fi +} + +\newcommand{\@ordinalstring}[1]{% +\let\@unitthstring=\@@unitthstring \let\@teenthstring=\@@teenthstring \let\@tenthstring=\@@tenthstring +\let\@unitstring=\@@unitstring \let\@teenstring=\@@teenstring \let\@tenstring=\@@tenstring +\def\@hundred{hundred}\def\@thousand{thousand}% +\def\@hundredth{hundredth}\def\@thousandth{thousandth}% +\@@ordinalstring{#1}} + +\newcommand{\@Ordinalstring}[1]{% +\let\@unitthstring=\@@Unitthstring \let\@teenthstring=\@@Teenthstring \let\@tenthstring=\@@Tenthstring +\let\@unitstring=\@@Unitstring \let\@teenstring=\@@Teenstring \let\@tenstring=\@@Tenstring +\def\@hundred{Hundred}\def\@thousand{Thousand}% +\def\@hundredth{Hundredth}\def\@thousandth{Thousandth}% +\@@ordinalstring{#1}} +\newcount\c@padzeroesN +\c@padzeroesN=1\relax +\providecommand{\padzeroes}[1][17]{\c@padzeroesN=#1} + +\newif\if@DT@padzeroes +\newcount\@DT@loopN +\newcount\@DT@X +\newcommand{\@binary}[1]{% +\@DT@padzeroestrue +\@DT@loopN=17\relax +\strctr=65536\relax +\@DT@X=#1\relax +\loop +\@DT@modctr=\@DT@X +\divide\@DT@modctr by \strctr +\ifthenelse{\boolean{@DT@padzeroes} \and \(\@DT@modctr=0\) \and \(\@DT@loopN>\c@padzeroesN\)}{}{\the\@DT@modctr}% +\ifnum\@DT@modctr=0\else\@DT@padzeroesfalse\fi +\multiply\@DT@modctr by \strctr +\advance\@DT@X by -\@DT@modctr +\divide\strctr by 2\relax +\advance\@DT@loopN by -1\relax +\ifnum\strctr>1 +\repeat +\the\@DT@X} +\newcommand{\@octal}[1]{% +\ifnum#1>32768 +\PackageError{fmtcount}{Value of counter too large for \protect\@octal}{Maximum value 32768} +\else +\@DT@padzeroestrue +\@DT@loopN=6\relax +\strctr=32768\relax +\@DT@X=#1\relax +\loop +\@DT@modctr=\@DT@X +\divide\@DT@modctr by \strctr +\ifthenelse{\boolean{@DT@padzeroes} \and \(\@DT@modctr=0\) \and \(\@DT@loopN>\c@padzeroesN\)}{}{\the\@DT@modctr}% +\ifnum\@DT@modctr=0\else\@DT@padzeroesfalse\fi +\multiply\@DT@modctr by \strctr +\advance\@DT@X by -\@DT@modctr +\divide\strctr by 8\relax +\advance\@DT@loopN by -1\relax +\ifnum\strctr>1 +\repeat +\the\@DT@X +\fi} +\newcommand{\@@hexadecimal}[1]{\ifcase#10\or1\or2\or3\or4\or5\or6\or7\or8\or9\or a\or b\or c\or d\or e\or f\fi} + +\newcommand{\@hexadecimal}[1]{% +\@DT@padzeroestrue +\@DT@loopN=5\relax +\strctr=65536\relax +\@DT@X=#1\relax +\loop +\@DT@modctr=\@DT@X +\divide\@DT@modctr by \strctr +\ifthenelse{\boolean{@DT@padzeroes} \and \(\@DT@modctr=0\) \and \(\@DT@loopN>\c@padzeroesN\)}{}{\@@hexadecimal\@DT@modctr}% +\ifnum\@DT@modctr=0\else\@DT@padzeroesfalse\fi +\multiply\@DT@modctr by \strctr +\advance\@DT@X by -\@DT@modctr +\divide\strctr by 16\relax +\advance\@DT@loopN by -1\relax +\ifnum\strctr>1 +\repeat +\@@hexadecimal\@DT@X} + +\newcommand{\@@Hexadecimal}[1]{\ifcase#10\or1\or2\or3\or4\or5\or6\or7\or8\or9\or A\or B\or C\or D\or E\or F\fi} + +\newcommand{\@Hexadecimal}[1]{% +\@DT@padzeroestrue +\@DT@loopN=5\relax +\strctr=65536\relax +\@DT@X=#1\relax +\loop +\@DT@modctr=\@DT@X +\divide\@DT@modctr by \strctr +\ifthenelse{\boolean{@DT@padzeroes} \and \(\@DT@modctr=0\) \and \(\@DT@loopN>\c@padzeroesN\)}{}{\@@Hexadecimal\@DT@modctr}% +\ifnum\@DT@modctr=0\else\@DT@padzeroesfalse\fi +\multiply\@DT@modctr by \strctr +\advance\@DT@X by -\@DT@modctr +\divide\strctr by 16\relax +\advance\@DT@loopN by -1\relax +\ifnum\strctr>1 +\repeat +\@@Hexadecimal\@DT@X} + +\newcommand{\@aaalph}[1]{% +\@DT@loopN=#1\relax +\advance\@DT@loopN by -1\relax +\divide\@DT@loopN by 26\relax +\@DT@modctr=\@DT@loopN +\multiply\@DT@modctr by 26\relax +\@DT@X=#1\relax +\advance\@DT@X by -1\relax +\advance\@DT@X by 
-\@DT@modctr +\advance\@DT@loopN by 1\relax +\advance\@DT@X by 1\relax +\loop +\@alph\@DT@X +\advance\@DT@loopN by -1\relax +\ifnum\@DT@loopN>0 +\repeat +} + +\newcommand{\@AAAlph}[1]{% +\@DT@loopN=#1\relax +\advance\@DT@loopN by -1\relax +\divide\@DT@loopN by 26\relax +\@DT@modctr=\@DT@loopN +\multiply\@DT@modctr by 26\relax +\@DT@X=#1\relax +\advance\@DT@X by -1\relax +\advance\@DT@X by -\@DT@modctr +\advance\@DT@loopN by 1\relax +\advance\@DT@X by 1\relax +\loop +\@Alph\@DT@X +\advance\@DT@loopN by -1\relax +\ifnum\@DT@loopN>0 +\repeat +} + +\newcommand{\@abalph}[1]{% +\ifnum#1>17576 +\PackageError{fmtcount}{Value of counter too large for \protect\@abalph}{Maximum value 17576} +\else +\@DT@padzeroestrue +\strctr=17576\relax +\@DT@X=#1\relax +\advance\@DT@X by -1\relax +\loop +\@DT@modctr=\@DT@X +\divide\@DT@modctr by \strctr +\ifthenelse{\boolean{@DT@padzeroes} \and \(\@DT@modctr=1\)}{}{\@alph\@DT@modctr}% +\ifnum\@DT@modctr=1\else\@DT@padzeroesfalse\fi +\multiply\@DT@modctr by \strctr +\advance\@DT@X by -\@DT@modctr +\divide\strctr by 26\relax +\ifnum\strctr>1 +\repeat +\advance\@DT@X by 1\relax +\@alph\@DT@X +\fi} + +\newcommand{\@ABAlph}[1]{% +\ifnum#1>17576 +\PackageError{fmtcount}{Value of counter too large for \protect\@ABAlph}{Maximum value 17576} +\else +\@DT@padzeroestrue +\strctr=17576\relax +\@DT@X=#1\relax +\advance\@DT@X by -1\relax +\loop +\@DT@modctr=\@DT@X +\divide\@DT@modctr by \strctr +\ifthenelse{\boolean{@DT@padzeroes} \and \(\@DT@modctr=1\)}{}{\@Alph\@DT@modctr}% +\ifnum\@DT@modctr=1\else\@DT@padzeroesfalse\fi +\multiply\@DT@modctr by \strctr +\advance\@DT@X by -\@DT@modctr +\divide\strctr by 26\relax +\ifnum\strctr>1 +\repeat +\advance\@DT@X by 1\relax +\@Alph\@DT@X +\fi} + +\newcommand{\@decimal}[1]{% +\ifnum#1>10000 +\PackageError{fmtcount}{Value of counter too large for \protect\@decimal}{Maximum value 10000} +\else +\@DT@padzeroestrue +\@DT@loopN=6\relax +\strctr=10000\relax +\@DT@X=#1\relax +\loop +\@DT@modctr=\@DT@X +\divide\@DT@modctr by \strctr +\ifthenelse{\boolean{@DT@padzeroes} \and \(\@DT@modctr=0\) \and \(\@DT@loopN>\c@padzeroesN\)}{}{\the\@DT@modctr}% +\ifnum\@DT@modctr=0\else\@DT@padzeroesfalse\fi +\multiply\@DT@modctr by \strctr +\advance\@DT@X by -\@DT@modctr +\divide\strctr by 10\relax +\advance\@DT@loopN by -1\relax +\ifnum\strctr>1 +\repeat +\the\@DT@X +\fi} +\providecommand{\ordinal}[1]{\expandafter\protect\expandafter\@ordinal{\expandafter\the\csname c@#1\endcsname}} +\providecommand{\ordinalstring}[1]{\expandafter\protect\expandafter\@ordinalstring{\expandafter\the\csname c@#1\endcsname}} +\providecommand{\Ordinalstring}[1]{\expandafter\protect\expandafter\@Ordinalstring{\expandafter\the\csname c@#1\endcsname}} +\providecommand{\numberstring}[1]{\expandafter\protect\expandafter\@numberstring{\expandafter\the\csname c@#1\endcsname}} +\providecommand{\Numberstring}[1]{\expandafter\protect\expandafter\@Numberstring{\expandafter\the\csname c@#1\endcsname}} +\providecommand{\binary}[1]{\expandafter\protect\expandafter\@binary{\expandafter\the\csname c@#1\endcsname}} +\providecommand{\aaalph}[1]{\expandafter\protect\expandafter\@aaalph{\expandafter\the\csname c@#1\endcsname}} +\providecommand{\AAAlph}[1]{\expandafter\protect\expandafter\@AAAlph{\expandafter\the\csname c@#1\endcsname}} +\providecommand{\abalph}[1]{\expandafter\protect\expandafter\@abalph{\expandafter\the\csname c@#1\endcsname}} +\providecommand{\ABAlph}[1]{\expandafter\protect\expandafter\@ABAlph{\expandafter\the\csname c@#1\endcsname}} 
+\providecommand{\hexadecimal}[1]{\expandafter\protect\expandafter\@hexadecimal{\expandafter\the\csname c@#1\endcsname}} +\providecommand{\Hexadecimal}[1]{\expandafter\protect\expandafter\@Hexadecimal{\expandafter\the\csname c@#1\endcsname}} +\providecommand{\octal}[1]{\expandafter\protect\expandafter\@octal{\expandafter\the\csname c@#1\endcsname}} +\providecommand{\decimal}[1]{\expandafter\protect\expandafter\@decimal{\expandafter\the\csname c@#1\endcsname}} +\endinput +%% +%% End of file `fmtcount.sty'. diff --git a/usermanual/graphics/Makefile b/usermanual/graphics/Makefile new file mode 100644 index 0000000..d34f02e --- /dev/null +++ b/usermanual/graphics/Makefile @@ -0,0 +1 @@ +all: diff --git a/usermanual/graphics/concordialogo.png b/usermanual/graphics/concordialogo.png new file mode 100644 index 0000000..8019bd1 Binary files /dev/null and b/usermanual/graphics/concordialogo.png differ diff --git a/usermanual/graphics/genomicslogogreen.jpg b/usermanual/graphics/genomicslogogreen.jpg new file mode 100644 index 0000000..27e5692 Binary files /dev/null and b/usermanual/graphics/genomicslogogreen.jpg differ diff --git a/usermanual/graphics/genozymeslogo.jpg b/usermanual/graphics/genozymeslogo.jpg new file mode 100644 index 0000000..7f8a2e1 Binary files /dev/null and b/usermanual/graphics/genozymeslogo.jpg differ diff --git a/usermanual/usefulsymbols.sty b/usermanual/usefulsymbols.sty new file mode 100644 index 0000000..68d50b2 --- /dev/null +++ b/usermanual/usefulsymbols.sty @@ -0,0 +1,55 @@ +% packages +\usepackage{float} % for algorithm float +\usepackage{graphicx} % for figures, plots, etc +\graphicspath{{./data/}} % '' +\usepackage{amssymb} % special math fonts +\usepackage{amsmath} +\usepackage{epsf,amsfonts,amsmath,amssymb} % defaults +\usepackage{url} +% for footnotes in the author fields +%\newcommand{\footnoteremember}[2]{ +% \footnote{#2} +% \newcounter{#1} +% \setcounter{#1}{\value{footnote}} +%} +%\newcommand{\footnoterecall}[1]{ +% \footnotemark[\value{#1}] +%} +\usepackage[usenames]{color} % for colour +\newcommand{\blue}{\color{blue}} % " +% math +\renewcommand{\vec}{\mathbf} +\newcommand{\mat}[1]{\boldsymbol#1} +\newcommand{\s}{\vec{s}} +\newcommand{\w}{\vec{w}} +\newcommand{\x}{\vec{x}} +\newcommand{\xtest}{\vec{\tilde{x}}} +\newcommand{\y}{\vec{y}} +\newcommand{\z}{\vec{z}} +\renewcommand{\a}{\vec{a}} +\renewcommand{\b}{\vec{b}} +\renewcommand{\c}{\vec{c}} +\renewcommand{\o}{\vec{o}} +\newcommand{\p}{\vec{p}} +\newcommand{\argmax}{\operatornamewithlimits{argmax}} +\newcommand{\R}{\mathbb{R}} +\newcommand{\N}{\mathbb{N}} +\newcommand{\X}{\mathcal{X}} +\newcommand{\Y}{\mathcal{Y}} +\renewcommand{\L}{\mathcal{L}} +\newcommand{\tup}[1]{\langle#1\rangle} +% refs +\newcommand{\code}[1]{Algorithm~\ref{#1}} +\newcommand{\Fig}[1]{Figure~\ref{#1}} +\newcommand{\tab}[1]{Table~\ref{#1}} +\newcommand{\Sec}[1]{Section~\ref{#1}} +% custom float +\floatstyle{ruled} +\newfloat{algorithm}{thp}{lop} +\floatname{algorithm}{Algorithm} +% special definitions +\newcommand{\keyword}[1]{{\it #1}} +\newcommand{\framework}[1]{{\sc #1}} +% ref styles +% text +\newcommand{\ie}{i.e., } diff --git a/usermanual/usermanual.bib b/usermanual/usermanual.bib new file mode 100644 index 0000000..8141f6d --- /dev/null +++ b/usermanual/usermanual.bib @@ -0,0 +1,349 @@ +@article{Mathes2004, +author = {Mathes, Adam}, +journal = {Computer Mediated Communication}, +pages = {1--13}, +title = {{Folksonomies-cooperative classification and communication through shared metadata}}, +year = {2004} +} + 
+@book{manning2008introduction, + title={Introduction to information retrieval}, + author={Manning, Christopher D and Raghavan, Prabhakar and Sch{\"u}tze, Hinrich}, + volume={1}, + year={2008}, + publisher={Cambridge University Press Cambridge} +} + +@article{Macgregor2006, +author = {Macgregor, G and McCulloch, E}, +journal = {Library review}, +pages = {291--300}, +title = {{Collaborative tagging as a knowledge organisation and resource discovery tool}}, +volume = {55}, +year = {2006} +} + +@INPROCEEDINGS{sebastiani2005, +author = {Fabrizio Sebastiani}, +title = {Text categorization}, +booktitle = {Text Mining and its Applications to Intelligence, CRM and Knowledge Management}, +pages = {109--129}, +publisher = {WIT Press}, +year = {2005} +} + +@article{Voss2004, +author = {Voss, Jakob}, +journal = {arXiv preprint cs/0604036}, +keywords = {classification,ddc,ontology,tagging,thesaurus,wikipedia}, +number = {1}, +pages = {1--7}, +title = {Collaborative thesaurus tagging the Wikipedia way}, +volume = {1}, +year = {2006} +} + +@inproceedings{Charton2010a, +author = {Charton, Eric and Torres-Moreno, J.M.}, +publisher = {Proceedings of LREC 2010, the International Conference on Language Resources and Evaluation}, +title = {{NLGbAse: a free linguistic resource for Natural Language Processing systems}}, +year = {2010} +} + +@InProceedings{zhang2012, +author = {Ziqi Zhang and Philip Webster and Victoria Uren and Andrea Varga and Fabio Ciravegna}, +title = {Automatically Extracting Procedural Knowledge from Instructional Texts using Natural Language Processing}, +booktitle = {Proceedings of the Eight International Conference on Language Resources and Evaluation (LREC'12)}, +year = {2012}, +month = {may}, +date = {23-25}, +address = {Istanbul, Turkey}, +isbn = {978-2-9517408-7-7}, +language = {english} +} + +@inproceedings{schumacher2012, +address = {Lyon, France}, +title = {Extraction of Procedural Knowledge from the Web}, +booktitle = {Workshop Proceedings: {WWW'12}}, +author = {Schumacher, Pol and Minor, Mirjam and Walter, Kirstin and Bergmann, Ralph}, +year = {2012} +} + +@article{Schein2002, +author = {Schein, AI and Popescul, Alexandrin}, +isbn = {1581135610}, +journal = {Proceedings of the 25th annual international ACM SIGIR conference on Research and development in information retrieval}, +number = {Sigir}, +pages = {253--260}, +url = {http://dl.acm.org/citation.cfm?id=564421}, +title = {{Methods and metrics for cold-start recommendations}}, +year = {2002} +} + +@article{Dave2003, +author = {Dave, Kushal and Lawrence, Steve and Pennock, DM}, +isbn = {1581136803}, +journal = {WWW '03 Proceedings of the 12th international conference on World Wide Web }, +title = {{Mining the peanut gallery: Opinion extraction and semantic classification of product reviews}}, +url = {http://dl.acm.org/citation.cfm?id=775226}, +year = {2003} +} + +@inproceedings{Groin2007, +author = {Groin, Cyril and Berthelin, Jean-Baptiste and Ayari, Sarra El and Heitz, Thomas and Hurault-plantet, Martine and Jardino, Michele}, +booktitle = {AFIA 2007}, +pages = {1--8}, +title = {{Pr\'{e}sentation de DEFT'07}}, +year = {2007} +} + +@article{pang2008, +author = {Pang, Bo and Lee, Lillian}, +doi = {10.1561/1500000001}, +issn = {1554-0669}, +journal = {Foundations and Trends in Information Retrieval}, +number = {2}, +pages = {91--231}, +title = {{Opinion Mining and Sentiment Analysis}}, +volume = {1}, +year = {2008} +} + +@article{Koppel2006, +author = {Koppel, Moshe and Shtrimberg, Itai}, +journal = {Computing attitude and 
affect in text: Theory and Application, The Information Retrieval Series}, +keywords = {automated labeling,financial analysis,sentiment analysis}, +title = {{Good news or bad news? let the market decide}}, +volume = {20}, +pages = {297--301}, +url = {http://link.springer.com/chapter/10.1007/1-4020-4102-0\_22}, +year = {2006} +} + +@article{Wu2004, +archivePrefix = {arXiv}, +author = {Wu, Fang and Huberman, BA}, +journal = {arXiv preprint cond-mat/0407252}, +title = {{Social structure and opinion formation}}, +year = {2004} +} + +@article{Grouin2013, +author = {Grouin, Cyril and Zweigenbaum, Pierre and Paroubek, Patrick}, +journal = {Actes du neuvi\`{e}me D\'{E}fi Fouille de Texte}, +number = {June}, +pages = {3--16}, +title = {{DEFT2013 se met \`{a} table: pr\'{e}sentation du d\'{e}fi et r\'{e}sultats}}, +url = {http://deft.limsi.fr/actes/actes\_deft2013.pdf\#page=13}, +year = {2013} +} + +@inproceedings{wang2008, + author = {Wang, Liping and Li, Qing and Li, Na and Dong, Guozhu and Yang, Yu}, + title = {Substructure similarity measurement in chinese recipes}, + booktitle = {Proceedings of the 17th international conference on World Wide Web}, + series = {WWW '08}, + year = {2008}, + isbn = {978-1-60558-085-2}, + location = {Beijing, China}, + pages = {979--988}, + numpages = {10}, + url = {http://doi.acm.org/10.1145/1367497.1367629}, + doi = {10.1145/1367497.1367629}, + publisher = {ACM} +} + +@inproceedings{wang2006, +author = {Wang, Liping and Li, Qing and Li, Yu and Meng, Xiaofeng}, +booktitle = {Semantics, Knowledge and Grid, 2006. SKG '06. Second International Conference on}, + pages = 6, + publisher = {IEEE Computer Society}, + title = {Dish Master: an Intelligent and Adaptive Manager for a Web-based Recipe Database System.}, + year = 2006 +} + +@inproceedings{blatak2004, + author = {Blat\'{a}k, Jan and Mr\'{a}kov\'{a}, Eva and Popel\'{\i}nsk\'{y}, Lubo\v{s}}, + title = {Fragments and text categorization}, + booktitle = {Proceedings of the ACL 2004 on Interactive poster and demonstration sessions}, + series = {ACLdemo '04}, + year = {2004}, + location = {Barcelona, Spain}, + articleno = {34}, + url = {http://dx.doi.org/10.3115/1219044.1219078}, + doi = {10.3115/1219044.1219078}, + acmid = {1219078}, + publisher = {Association for Computational Linguistics}, + address = {Stroudsburg, PA, USA}, +} + + +@phdthesis{hall1999correlation, + title={Correlation-based feature selection for machine learning}, + author={Hall, Mark A}, + year={1999}, + school={The University of Waikato} +} + +@ARTICLE{Pearl1986, + author = {Pearl, J}, + title = {Fusion, propagation, and structuring in belief networks}, + journal = {Artificial Intelligence}, + year = {1986}, + volume = {29}, + pages = {241--288}, + number = {3}, + address = {Essex, UK}, + doi = {http://dx.doi.org/10.1016/0004-3702(86)90072-X}, + issn = {0004-3702}, + publisher = {Elsevier Science Publishers Ltd.} +} + +@BOOK{Pearl1998, + title = {Bayesian networks}, + publisher = {MIT Press}, + year = {1998}, + author = {Pearl, Judea}, + pages = {149--153}, + address = {Cambridge, MA, USA}, + book = {The handbook of brain theory and neural networks}, + isbn = {0-262-51102-9} +} + +@article{cooper1992bayesian, + title={A Bayesian method for the induction of probabilistic networks from data}, + author={Cooper, Gregory F and Herskovits, Edward}, + journal={Machine learning}, + volume={9}, + number={4}, + pages={309--347}, + year={1992}, + publisher={Springer} +} + +@article{cooper1992bayesian, + title={A Bayesian method for the induction of 
+
+@article{hall2009weka,
+ title={The WEKA data mining software: an update},
+ author={Hall, Mark and Frank, Eibe and Holmes, Geoffrey and Pfahringer, Bernhard and Reutemann, Peter and Witten, Ian H},
+ journal={ACM SIGKDD Explorations Newsletter},
+ volume={11},
+ number={1},
+ pages={10--18},
+ year={2009},
+ publisher={ACM}
+}
+
+@article{lmt2005,
+ title={Logistic model trees},
+ author={Landwehr, Niels and Hall, Mark and Frank, Eibe},
+ journal={Machine Learning},
+ volume={59},
+ number={1-2},
+ pages={161--205},
+ year={2005},
+ publisher={Springer}
+}
+
+@article{collins2002logistic,
+ title={Logistic regression, AdaBoost and Bregman distances},
+ author={Collins, Michael and Schapire, Robert E and Singer, Yoram},
+ journal={Machine Learning},
+ volume={48},
+ number={1-3},
+ pages={253--285},
+ year={2002},
+ publisher={Springer}
+}
+
+@incollection{lmtspeeding2005,
+ title={Speeding up logistic model tree induction},
+ author={Sumner, Marc and Frank, Eibe and Hall, Mark},
+ booktitle={Knowledge Discovery in Databases: PKDD 2005},
+ pages={675--683},
+ year={2005},
+ publisher={Springer}
+}
+
+@inproceedings{Charton2007b,
+author = {Charton, Eric and Acuna-Agost, Rodrigo},
+booktitle = {DEFT},
+title = {{Quel mod\`{e}le pour d\'{e}tecter une opinion? Trois propositions pour g\'{e}n\'{e}raliser l'extraction d'une id\'{e}e dans un corpus}},
+year = {2007}
+}
+
+@BOOK{Vapnik1995,
+ title = {The Nature of Statistical Learning Theory},
+ publisher = {Springer-Verlag},
+ year = {1995},
+ author = {Vapnik, Vladimir}
+}
+
+@ARTICLE{multiclasssvm2002,
+author={Chih-Wei Hsu and Chih-Jen Lin},
+journal={IEEE Transactions on Neural Networks},
+title={A comparison of methods for multiclass support vector machines},
+year={2002},
+volume={13},
+number={2},
+pages={415--425}
+}
+
+@article{chang2011libsvm,
+ title={LIBSVM: a library for support vector machines},
+ author={Chang, Chih-Chung and Lin, Chih-Jen},
+ journal={ACM Transactions on Intelligent Systems and Technology (TIST)},
+ volume={2},
+ number={3},
+ pages={27},
+ year={2011},
+ publisher={ACM}
+}
+
+@article{el2005wlsvm,
+ title={WLSVM: Integrating libsvm into WEKA environment},
+ author={El-Manzalawy, Yasser and Honavar, Vasant},
+ journal={Software available at http://www.cs.iastate.
edu/yasser/wlsvm},
+ year={2005}
+}
+
+@inproceedings{Charton2013,
+address = {Sables d'Olonnes},
+author = {Charton, Eric and Jean-Louis, Ludovic and Meurs, Marie-Jean and Gagnon, Michel},
+booktitle = {Actes de DEFT2013},
+editor = {Grouin, Cyril},
+publisher = {ACLWeb},
+title = {{Trois recettes d'apprentissage automatique pour un syst\`{e}me d'extraction d'information et de classification de recettes de cuisines}},
+url = {http://deft.limsi.fr/2013/},
+year = {2013}
+}
+
+@book{quinlan1993c4,
+  title={C4.5: programs for machine learning},
+  author={Quinlan, John Ross},
+  volume={1},
+  year={1993},
+  publisher={Morgan Kaufmann}
+}
+
+@Article{chartoninformatics2013,
+AUTHOR = {Eric Charton and Marie-Jean Meurs and Ludovic Jean-Louis and Michel Gagnon},
+TITLE = {Using Collaborative Tagging for Text Classification: From Text Classification to Opinion Mining},
+JOURNAL = {Informatics},
+VOLUME = {1},
+YEAR = {2014},
+NUMBER = {1},
+PAGES = {32--51},
+URL = {https://www.mdpi.com/2227-9709/1/1/32},
+DOI = {10.3390/informatics1010032}
+}
\ No newline at end of file
diff --git a/usermanual/usermanual.pdf b/usermanual/usermanual.pdf
new file mode 100644
index 0000000..3649ce7
Binary files /dev/null and b/usermanual/usermanual.pdf differ
diff --git a/usermanual/usermanual.tex b/usermanual/usermanual.tex
new file mode 100644
index 0000000..2d7ab9b
--- /dev/null
+++ b/usermanual/usermanual.tex
@@ -0,0 +1,600 @@
+\documentclass[11pt]{article}
+\usepackage[margin=1.1in]{geometry}
+\usepackage{usefulsymbols}
+\usepackage{hyperref}
+\usepackage{graphicx} % required by \graphicspath and \includegraphics, unless already loaded by usefulsymbols
+\newcommand{\mycos}{{\bf{mycoSORT{ }}}}
+\newcommand{\homefolder}{\texttt{mycosort-pck-\version{ }}}
+\newcommand{\configfile}{\texttt{config.cfg{ }}}
+\newcommand{\configsample}{\texttt{config-sample.cfg{ }}}
+
+\def\version{{\tt 1.0}}
+\usepackage{listings}
+\usepackage{courier}
+\lstset{basicstyle=\small\ttfamily,breaklines=true,frame=L,xleftmargin=\parindent}
+%\lstset{framextopmargin=50pt,frame=bottomline}
+%\lstset{breaklines=true}
+%\lstset{breakatwhitespace=true}
+
+% if you just need a simple heading
+% Usage:
+% \heading{the text of the heading}
+\newcommand{\heading}[1]{
+ \vspace{0.3cm} \noindent \textbf{#1} \newline
+}
+
+\usepackage{datetime}
+\newdateformat{mydate}{\monthname[\THEMONTH] \THEYEAR}
+
+
+\graphicspath{{./graphics/}}
+
+\begin{document}
+
+\title{\mycos{} ~ \version \\~\\~\\User Manual\\~\\}
+
+\author{Hayda Almeida\\Marie-Jean Meurs\\~\\~\\Tsang Lab}
+
+\date{\mydate\today}
+
+\maketitle
+
+\begin{center}
+ \includegraphics[width=0.2\textwidth]{genomicslogogreen}$\qquad$\includegraphics[width=0.2\textwidth]{genozymeslogo}$\qquad$\includegraphics[width=0.25\textwidth]{concordialogo}\\
+\end{center}
+
+\pagestyle{empty}
+
+\pagebreak
+\tableofcontents
+
+% \pagestyle{empty}
+\pagebreak
+
+\section{Introduction to mycoSORT}
+
+\mycos{} is an open-source text classification program written in Java.
+The software is based on data sampling and machine learning methods.
+\mycos{} was primarily developed to classify scientific literature related to fungal enzymes;
+however, it can also classify literature related to other topics.
+The tool can support scientific researchers in selecting relevant documents when performing literature reviews.
+
+\mycos{} learns from a labeled document collection
+and generates a classification model through supervised learning,
+which is then used to predict a label for new scientific papers.
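+
+Schematically, the workflow can be summarized as follows
+(a simplified view; the modules in brackets are described in the next sections):
+\begin{lstlisting}
+labeled corpus (.XML)    -> extracted features (.TXT)     [NgramExtractor, FeatureExtractor]
+extracted features (.TXT) -> document vectors (.ARFF)     [BuildModel]
+document vectors (.ARFF)  -> trained model + predictions  [Trainer]
+\end{lstlisting}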
+
+In order to obtain a classification prediction for a given document,
+\mycos{} must first train or load a classification model.
+
+To generate the classification models, \mycos{} makes use of the standard implementations
+of classification algorithms provided by the Weka workbench~\cite{hall2009weka},
+which is also developed in Java.
+In addition, \mycos{} utilizes the following packages:
+% commons-lang3-3.2.1
+% jsoup-1.7.3
+\begin{itemize}
+ \item Apache Commons Lang\footnote{\url{https://commons.apache.org}} (version 3.2.1 or above), a Java utility package;
+ \item jsoup\footnote{\url{http://jsoup.org/}} (version 1.7.3 or above), a Java HTML/XML parser;
+ \item LIBSVM\footnote{\url{http://www.csie.ntu.edu.tw/~cjlin/libsvm/}} (version 3.2 or above), a library for Support Vector Machine (SVM) classification;
+ \item Apache Ant\footnote{\url{http://ant.apache.org/}} (version 1.9.3 or above), a build tool for Java applications.
+\end{itemize}
+
+In the following sections, you will find instructions on how to access
+the system source code, and how to install and run the software.
+The \mycos{} toolkit is available at \url{https://github.com/TsangLab/mycosort}.
+This user manual describes \mycos{} version {\version}.
+
+\section{Getting Started}
+\label{sec:start}
+To download the \mycos{} toolkit, please access \url{https://github.com/TsangLab/mycosort}.
+This user manual is written for version \version{}.
+It assumes that you have downloaded and extracted \homefolder{} as a folder,
+and that this folder is currently in your working directory.
+
+\subsection{Package Content}
+\label{subsec:pckcontent}
+The \mycos{} toolkit contains several folders and files in its root folder.
+These items are used to provide inputs and store outputs of the system subtasks.
+Their usage and content are explained below.
+
+\paragraph{Folders list}
+\textit{\texttt{arff} $\rightarrow$} contains the .ARFF files representing the data as vector matrices. These files are used to generate the classification models. \\
+\textit{\texttt{corpus} $\rightarrow$} keeps the training and test sets in .XML format, used to build and apply the classification models. \\
+\textit{\texttt{executables} $\rightarrow$} holds the .JAR files that compose the system and are used to perform the system tasks. \\
+\textit{\texttt{features} $\rightarrow$} contains the features extracted from the training sets, saved in .TXT format. \\
+\textit{\texttt{jars} $\rightarrow$} contains the external .JAR packages which are bundled with the system.\\
+\textit{\texttt{src} $\rightarrow$} holds the system .JAVA source files.
+
+\paragraph{Files list}
+\textit{\texttt{build.xml} $\rightarrow$} master file used by Apache Ant to build the \mycos{} executables. \\
+\textit{\configsample $\rightarrow$} a sample of the configuration file used to set specific parameters for the system's different tasks. \\
+\textit{\texttt{entities.txt} $\rightarrow$} a list of bioentities that are annotated in the dataset. \\
+\textit{\texttt{stopList.txt} $\rightarrow$} a list of stop-words to be considered for feature extraction.
+
+\subsection{Requirements}
+\mycos{} requires Java JDK (version 1.8.0 or above) and Apache Ant (version 1.9.3 or above) to be installed.\\
+For further information on how to install the Java JDK, please refer to: \\ \url{http://www.oracle.com/technetwork/java/javase/downloads/index.html}. \\
+For information on how to install Apache Ant, please refer to: \\ \url{http://ant.apache.org/manual/install.html}.
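+
+Before proceeding, you can check that suitable versions are installed by running
+\texttt{java -version} and \texttt{ant -version} in a command line interface
+(the output shown below is illustrative):
+\begin{lstlisting}
+user@machine $ java -version
+java version "1.8.0"
+user@machine $ ant -version
+Apache Ant(TM) version 1.9.3
+\end{lstlisting}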
+
+% \section{Dataset Format}
+% \label{sec:format}
+%
+% \mycos{} uses the following file formats in its processing:
+% \begin{itemize}
+% \item XML - eXtensible Markup Language - dataset format containing annotated papers
+% \item TXT - Text File - \mycos{} features extraction format
+% \item ARFF - Attribute-Relation File Format\footnote{\url{http://cs.waikato.ac.nz/~ml/weka/arff}} - instance and features list format
+% \end{itemize}
+
+\section{Configuration Setup}
+\label{sec:configuration}
+The general working environment and configurations of \mycos{} are defined in a file named \configfile{}.
+To generate a \configfile{} file, create a copy of \configsample{} and rename it to \configfile{}.
+Before compiling and running \mycos{}, you must edit the \configfile{} file.
+
+\subsection{Directory Setting}
+\label{directory}
+To set up the main directory used by \mycos{}, first update the \texttt{HOME\_DIR} variable to point to your own folder, as in the following example.
+The \texttt{HOME\_DIR} should contain the path of the system's main folder, where the \mycos{} toolkit was extracted.
+\begin{lstlisting}
+HOME_DIR=/home/usr/mycosort-pck-version/
+\end{lstlisting}
+
+The following directories are set by default, and generally should not be changed,
+since they refer to folder paths inside your \texttt{HOME\_DIR}.
+The corpus directory contains the dataset .XML files, as well as the training and testing files.
+\begin{lstlisting}
+CORPUS_DIR=corpus/
+\end{lstlisting}
+
+The positives and negatives folders contain the positive and negative .XML instances, and are found inside \texttt{CORPUS\_DIR}.
+\begin{lstlisting}
+POS_DIR=positives/
+NEG_DIR=negatives/
+\end{lstlisting}
+
+The train and test folders will contain the train and test .XML instances, and are found inside \texttt{CORPUS\_DIR}.
+\begin{lstlisting}
+TRAIN_DIR=train/
+TEST_DIR=test/
+\end{lstlisting}
+
+The arff directory contains the .ARFF files, used to feed the classification algorithms.
+\begin{lstlisting}
+OUTPUT_MODEL=arff/
+\end{lstlisting}
+
+The feature directory contains the .TXT files listing all feature types extracted from the training sets.
+\begin{lstlisting}
+FEATURE_DIR=features/
+\end{lstlisting}
+
+The duplicates directory is the folder in which the system looks for duplicates.
+Set its value to the name of the desired folder, which should be located inside \texttt{CORPUS\_DIR}.
+\begin{lstlisting}
+DUP_DIR=test/
+\end{lstlisting}
+
+\subsection{Corpus Sampling Setting}
+\label{subsec:corpussamp}
+Data sampling can be used to split the document collection into training and test collections,
+as well as to generate several training collections with different class distributions.
+To enable the training or test sampling, set the following variables to true:
+\begin{lstlisting}
+SAMPLE_TRAIN=false
+SAMPLE_TEST=false
+\end{lstlisting}
+The following variables control the data sampling settings.
+To determine the size of the test set relative to the entire document collection, use \texttt{PERCT\_TEST} to set the percentage of the test collection.
+\begin{lstlisting}
+PERCT_TEST=15
+\end{lstlisting}
+To generate a training collection, first define the percentage of positive instances to be sampled for this corpus.
+This variable is also used when generating .ARFF files.
+\begin{lstlisting}
+PERCT_POS_TRAIN=50
+\end{lstlisting}
+To generate a test collection, first determine its percentage of positive instances.
+\begin{lstlisting}
+PERCT_POS_TEST=10
+\end{lstlisting}
+
+\subsection{File Setting}
+\label{subsec:fileset}
+We describe here the files used as input for \mycos{}.
+The \texttt{TRAINING\_FILE} and \texttt{TEST\_FILE} variables should contain the names of the .XML files generated as training and test sets.
+The training file is used by the extractors to extract features, and to build the .ARFF files.
+\begin{lstlisting}
+TRAINING_FILE=triage0.xml
+TEST_FILE=triage1.xml
+\end{lstlisting}
+
+The .ARFF files are used to feed the classification models.
+To re-train a model, \texttt{ARFF\_TRAIN} should contain the name of the .ARFF file used for training.
+To test new instances, \texttt{ARFF\_TEST} should contain the name of the .ARFF file used for testing.
+\begin{lstlisting}
+ARFF_TRAIN=triage0.arff
+ARFF_TEST=triage1.arff
+\end{lstlisting}
+
+The stopword list used by the extractors is defined here.
+We recommend keeping this variable as it is defined in the \configsample.
+\begin{lstlisting}
+STOP_LIST=stopList.txt
+\end{lstlisting}
+
+When executing subtasks, \mycos{} produces the following files as output,
+which are later used as input for other subtasks.
+These files contain the features extracted from a given training set.
+\begin{lstlisting}
+ECNUM_FEATURES=ecnumbers.txt
+JOURNAL_TITLE_FEATURES=journaltitles.txt
+ANNOTATION_FEATURES=annotations.txt
+TITLE_FEATURES=titleAnnotations.txt
+NGRAM_FEATURES=ngrams_features.txt
+TITLE_NGRAMS=titleGrams.txt
+DOC_IDS=docIDs.txt
+\end{lstlisting}
+
+\subsection{Feature Setting}
+\label{subsec:featureset}
+The feature configuration is taken into account when generating .ARFF files.
+To choose a feature type to be used when creating an .ARFF file, simply set its value to ``true'', as in the examples below. \\
+\paragraph{General features}
+More than one feature type can be combined when generating .ARFF files.
+The following variables load general features: the size of a paper abstract,
+the name of the publication journal, and the EC numbers found in a paper.
+\begin{lstlisting}
+USE_TEXT_SIZE=true
+USE_JOURNAL_TITLE_FEATURE=true
+USE_ECNUM_FEATURE=true
+\end{lstlisting}
+
+The \texttt{USE\_DOC\_ID} variable triggers the extraction of the paper PMID.
+This variable must be kept set to ``true'',
+since it is needed to output the classification predictions according to the document ID.
+\begin{lstlisting}
+USE_DOC_ID=true
+\end{lstlisting}
+
+The following variables set specific conditions on feature frequency (the number of times a feature was found in the training set)
+and feature length (the number of characters in a feature) to be taken into account when extracting the feature list.
+The default parameters are defined below, but they can be adjusted according to the user's needs.
+\begin{lstlisting}
+FEATURE_MIN_FREQ=2
+FEATURE_MIN_LENGTH=3
+\end{lstlisting}
+
+\paragraph{Annotation features}
+The following variables provide annotation features to generate .ARFF files.
+To load the bioentity annotations extracted from the training set, the value of \texttt{USE\_ANNOTATION\_FEATURE} must be set to true.
+When setting \texttt{USE\_ANNOTATION\_TYPE} to true, the bioentity types will be loaded to generate .ARFFs.
+Finally, when setting \texttt{USE\_TITLE\_FEATURE} to true, the bioentities annotated in paper titles will be
+considered separately from the annotations found in abstracts.
+\begin{lstlisting}
+USE_ANNOTATION_FEATURE=true
+USE_ANNOTATION_TYPE=true
+USE_TITLE_FEATURE=true
+\end{lstlisting}
+
+\paragraph{N-Gram features}
+The following variables provide n-gram features to generate .ARFF files.
+To load the n-grams extracted from the training set, the value of \texttt{USE\_NGRAM\_FEATURE} must be set to true.
+When setting \texttt{USE\_TITLE\_NGRAMS} to true, the n-grams found in paper titles will be
+considered separately from the n-grams found in abstracts.
+Use \texttt{NGRAM\_STOP} to remove stopwords from the feature list.
+\begin{lstlisting}
+USE_NGRAM_FEATURE=true
+USE_TITLE_NGRAMS=false
+NGRAM_STOP=true
+\end{lstlisting}
+The variable \texttt{NGRAM\_SIZE} determines the number of words used to
+form n-grams. The default value is 1; however, the system can also generate
+bigrams (\texttt{NGRAM\_SIZE=2}) and trigrams (\texttt{NGRAM\_SIZE=3}).
+\begin{lstlisting}
+NGRAM_SIZE=1
+\end{lstlisting}
+
+To apply a weight to an n-gram, set the following variable to true and
+determine the value of the weight.
+This configuration simply multiplies the current n-gram frequency by the value provided in \texttt{WEIGHT}.
+\begin{lstlisting}
+USE_WEIGHTED_NGRAM=false
+WEIGHT=3
+\end{lstlisting}
+
+\subsection{Feature Selection Setting}
+\label{subsec:featselec}
+The feature selection configuration is taken into account before feeding .ARFF files to the classification algorithms.
+To enable Odds Ratio (OR) or Inverse Document Frequency (IDF) filtering, set one of the following variables to true:
+\begin{lstlisting}
+USE_ODDS_RATIO=true
+USE_IDF=false
+\end{lstlisting}
+It is recommended to apply Odds Ratio or IDF, but not both together.
+To determine the minimum threshold for keeping a feature, adjust the following variables (the default is set to 1):
+\begin{lstlisting}
+OR_THRESHOLD=1
+IDF_THRESHOLD=1
+\end{lstlisting}
+
+\subsection{Experiment}
+The experiment type is used to generate .XML and .ARFF files.
+To generate training files, set \texttt{EXP\_TYPE=0}, and to generate test files, set \texttt{EXP\_TYPE=1}.
+\begin{lstlisting}
+EXP_TYPE=0
+\end{lstlisting}
+
+%
+% \subsubsection{N-Grams}
+% \label{ngrams}
+% To determine the size of N-Grams features, please set the number of \texttt{NGRAM\_SIZE} variable on the file to \texttt{1}, \texttt{2} or \texttt{3}.
+% In order to have a single relation of all the N-Grams from both paper abstract and title, the features should be configured as the following:
+% \begin{lstlisting}
+% USE_NGRAM_FEATURE=true
+% USE_TITLE_NGRAMS=false
+% \end{lstlisting}
+% If you require the title N-Grams as separated features from the abstract N-Grams, please define its value also as \texttt{true}.
+%
+% Yet, if you require that N-Grams from the paper abstract should not be considered and only the title text must be taken into account, use the following configuration:
+% \begin{lstlisting}
+% USE_NGRAM_FEATURE=false
+% USE_TITLE_NGRAMS=true
+% \end{lstlisting}
+%
+% \subsubsection{Annotations}
+% \label{annotations}
+% The same configuration set for abstract and title is valid for the annotations. To have a single relation from both paper abstract and title, use:
+% \begin{lstlisting}
+% USE_ANNOTATION_FEATURE=true
+% USE_TITLE_FEATURE=false
+% \end{lstlisting}
+% If separated lists of annotation features from abstract and title are needed, please define both values as \texttt{true}.
+% +% However, if you wish to have only the annotations found on the paper title, but not on the paper abstract, just apply the variables value as the following: +% \begin{lstlisting} +% USE_ANNOTATION_FEATURE=false +% USE_TITLE_FEATURE=true +% \end{lstlisting} + + +\section{Using \mycos{}} +\mycos{} can be used from a command line interface. +The system utilizes Apache Ant to build the five different modules (.JAR files), +which are available in the \texttt{executables} folder. +To execute \mycos{} modules, it is necessary to access the system home folder. +In a command line interface (a terminal in Linux OS, or a prompt in Microsoft Windows), +navigate until the \homefolder{} folder, such as: + +\begin{lstlisting} + user@machine $ cd /home/usr/mycosort-pck-version +\end{lstlisting} + +On a Microsoft Windows system, the forward slashes should be replaced by back slashes +(e.g. \texttt{home\textbackslash usr\textbackslash ...}). +From now on, the instructions will assume that a Linux OS is being used. + +\paragraph{Compiling} +After accessing the system home folder, it is necessary to first compile \mycos{} modules. +To do so, simply type \texttt{"ant"} in the command line, as the example below: +\begin{lstlisting} +user@home/usr/mycosort-pck-version $ ant +\end{lstlisting} +The system should be re-compiled if any parameter is changed or edited in the \configfile file. +Following we describe the usage and configuration for each of the five \mycos{} modules: +\begin{itemize} +\item SampleCorpus +\item CorpusHandler +\item FeatureExtractor +\item NgramExtractor +\item BuildModel +\item Trainer +\end{itemize} + + +\subsection{SampleCorpus} +The \texttt{SampleCorpus} module allows the user to generate training and test collections. +It utilizes all .XML documents contained in the \texttt{corpus/positive} and \texttt{corpus/negative} folders. +\paragraph{Example 1} +When generating the test collection, the .XML instances randomly selected will be moved +from the \texttt{corpus/positive} and \texttt{corpus/negative} folders to the +\texttt{corpus/test} folder. +To execute the test sampling and generate a test collection that represents +15\% of the entire document collection, and that contains 10\% of positive instances, +edit the following variables in the \configfile: +\begin{lstlisting} +SAMPLE_TEST=true +PERCT_TEST=15 +PERCT_POS_TEST=10 +\end{lstlisting} + +\paragraph{Example 2} +When generating the training collection, the .XML instances randomly selected will be +copied from \texttt{corpus/positive} and \texttt{corpus/negative} folders to the +\texttt{corpus/train\_ (PERCT\_POS\_TRAIN)} folder. +To execute the training sampling and generate a trainng collection that contains +50\% of negative instances and 50\% of positive instances, +edit the following variables in the \configfile: +\begin{lstlisting} +SAMPLE_TRAIN=true +PERCT_POS_TRAIN=50 +\end{lstlisting} + +To execute \texttt{SampleCorpus}, run the following instruction in the command line interface: +\begin{lstlisting} +user@home/usr/mycosort-pck-version $ ant +user@home/usr/mycosort-pck-version $ ant sample-corpus +\end{lstlisting} +After running the instruction, the selected sampling (training or test) will be executed. +The training collection will then be copied to a \texttt{corpus/train\_50} folder. +The training collection can be generated multiple times, with different class distributions. 
+
+\subsection{CorpusHandler}
+The \texttt{CorpusHandler} module is used to create the training and test corpora,
+by generating a combined .XML file containing all .XML instances either in the
+\texttt{corpus/test} folder or in the \texttt{corpus/train\_(PERCT\_POS\_TRAIN)} folder.
+\paragraph{Example 3}
+Besides generating the training and test corpora, this module can also perform a check for duplicates.
+To check for duplicates between an existing training file
+and a given \texttt{DUP\_DIR} folder containing several .XML files,
+edit the following variables in the \configfile file:
+\begin{lstlisting}
+TRAINING_FILE=triage0.xml
+DUP_DIR=test/
+\end{lstlisting}
+To execute \texttt{CorpusHandler} and check for duplicates between the training file and a given folder,
+run the following instructions in the command line interface:
+\begin{lstlisting}
+user@home/usr/mycosort-pck-version $ ant
+user@home/usr/mycosort-pck-version $ ant -Doptions=df corpus-handler
+\end{lstlisting}
+\paragraph{Example 4}
+To check for duplicates between the train or test folders containing all .XMLs and
+a given \texttt{DUP\_DIR} folder containing several other .XML files,
+edit the following variables in the \configfile file:
+\begin{lstlisting}
+DUP_DIR=test/
+EXP_TYPE=0
+\end{lstlisting}
+In this case, set \texttt{EXP\_TYPE=0} if the train .XMLs must be considered,
+or \texttt{EXP\_TYPE=1} if the test .XMLs must be considered.
+To execute \texttt{CorpusHandler} and check for duplicates between the train or test folder and a given folder,
+run the following instructions in the command line interface:
+\begin{lstlisting}
+user@home/usr/mycosort-pck-version $ ant
+user@home/usr/mycosort-pck-version $ ant -Doptions=dc corpus-handler
+\end{lstlisting}
+When checking for duplicates, the duplicates found will, by default, be renamed only in the \texttt{DUP\_DIR} folder. \\
+
+\paragraph{Example 5}
+To generate a training corpus, the following variables must be edited in the \configfile file:
+\begin{lstlisting}
+PERCT_POS_TRAIN=50
+EXP_TYPE=0
+\end{lstlisting}
+To generate a testing corpus, edit the following variables:
+\begin{lstlisting}
+PERCT_POS_TEST=10
+EXP_TYPE=1
+\end{lstlisting}
+In order to generate the corpora, it is first required to clean (\texttt{-Doptions=cl}),
+and only then concatenate (\texttt{-Doptions=cc}) all .XMLs in a given folder.
+Thus, when creating the training or test corpora,
+run the following instructions in the command line interface:
+\begin{lstlisting}
+user@home/usr/mycosort-pck-version $ ant
+user@home/usr/mycosort-pck-version $ ant -Doptions=cl,cc corpus-handler
+\end{lstlisting}
+
+\subsection{NgramExtractor}
+The \texttt{NgramExtractor} is a feature extraction module, used to extract n-grams
+(small units of text) from the paper title and abstract.
+N-grams can be generated in three different sizes: unigrams (one word), bigrams (two words), and trigrams (three words).
+The default n-gram size is one word, since this extraction already results in a long list of features.
+It is also recommended to discard stopwords when extracting n-grams,
+which can be done by keeping the value of \texttt{NGRAM\_STOP} as \texttt{true}.
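+
+For instance, for the title fragment ``machine learning methods'',
+the units extracted for each n-gram size would be the following (an illustrative example):
+\begin{lstlisting}
+NGRAM_SIZE=1  machine | learning | methods
+NGRAM_SIZE=2  machine learning | learning methods
+NGRAM_SIZE=3  machine learning methods
+\end{lstlisting}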
+\paragraph{Example 6}
+To perform the n-gram extraction, the following variables must be edited in the \configfile file:
+\begin{lstlisting}
+TRAINING_FILE=triage0.xml
+NGRAM_STOP=true
+NGRAM_SIZE=1
+FEATURE_MIN_FREQ=2
+FEATURE_MIN_LENGTH=3
+\end{lstlisting}
+To execute \texttt{NgramExtractor}, run the following instructions in the command line interface:
+\begin{lstlisting}
+user@home/usr/mycosort-pck-version $ ant
+user@home/usr/mycosort-pck-version $ ant ngram-extractor
+\end{lstlisting}
+
+\subsection{FeatureExtractor}
+The \texttt{FeatureExtractor} is a feature extraction module, used to extract domain annotations
+(specific XML tags) from the paper title and abstract.
+To specify the list of annotations (tags) to be considered by the \texttt{FeatureExtractor} module,
+please refer to the \texttt{entities.txt} file in the root of the \homefolder folder.
+To consider a new annotation type (tag) besides the ones provided in the file,
+simply add a new line containing the annotation type name and its level (sentence or entity).
+To exclude a given type, simply add a \# at the beginning of its line.
+
+\paragraph{Example 7}
+To perform the feature extraction, the following variables must be edited in the \configfile file:
+\begin{lstlisting}
+TRAINING_FILE=triage0.xml
+FEATURE_MIN_FREQ=2
+FEATURE_MIN_LENGTH=3
+\end{lstlisting}
+To execute \texttt{FeatureExtractor}, run the following instructions in the command line interface:
+\begin{lstlisting}
+user@home/usr/mycosort-pck-version $ ant
+user@home/usr/mycosort-pck-version $ ant feature-extractor
+\end{lstlisting}
+
+
+\subsection{BuildModel}
+The \texttt{BuildModel} module is used to represent the training and test sets as
+matrices of document vectors, which will later be fed to a classification algorithm.
+Models are saved in the .ARFF file format, and can be generated with several different configurations of features.
+All generated models are saved in the \texttt{arff} folder.
+
+\paragraph{Example 8}
+To determine the feature configuration used in a given model,
+the chosen options, as described in~\ref{subsec:featureset},
+must be set to \texttt{true} in the \configfile file.
+As an example, if the user wants to generate a model based only on unigram features,
+the setup of n-gram features must be set to true, as described in~\ref{subsec:featureset},
+while the annotation features setup must be set to false.
+
+In addition, the following variables must also be edited,
+to indicate whether the model should be generated based on the training set or the test set,
+as well as to indicate which percentage of positives is currently used in the training set.
+\begin{lstlisting}
+PERCT_POS_TRAIN=50
+EXP_TYPE=1
+\end{lstlisting}
+
+To execute \texttt{BuildModel}, run the following instructions in the command line interface:
+\begin{lstlisting}
+user@home/usr/mycosort-pck-version $ ant
+user@home/usr/mycosort-pck-version $ ant build-model
+\end{lstlisting}
+
+\subsection{Trainer}
+The \texttt{Trainer} module processes the training .ARFF files and uses a classification algorithm to
+learn a decision function and output predictions for the instances in the test .ARFF files.
+The corresponding training and test .ARFF files must be indicated
+in the \configfile file before executing the \texttt{Trainer} module.
+In order to specify the correct files, please refer to these two items:
+\begin{lstlisting}
+ARFF_TRAIN=triage0.arff
+ARFF_TEST=triage1.arff
+\end{lstlisting}
+
+While training and testing the models, feature selection methods can also be set up.
+To perform IDF or Odds Ratio filtering, please refer to the items described in~\ref{subsec:featselec}.
+It is recommended to apply only one filtering method per execution, either IDF or Odds Ratio,
+rather than both in the same execution.
+
+\paragraph{Example 9}
+A model can be trained using three different classification algorithms:
+{Na\"{\i}ve} Bayes (\texttt{-Dclassifier=nb}), Support Vector Machine (\texttt{-Dclassifier=svm}), or Logistic Model Tree (\texttt{-Dclassifier=lmt}).
+
+To execute \texttt{Trainer} using LMT, run the following instructions in the command line interface:
+\begin{lstlisting}
+user@home/usr/mycosort-pck-version $ ant
+user@home/usr/mycosort-pck-version $ ant -Dclassifier=lmt trainer
+\end{lstlisting}
+
+
+\section{Contacts}
+Should you have any questions, comments or bug reports, the authors can be reached at the following addresses:\\
+\url{hayda.almeida@concordia.ca} \\
+\url{marie-jean.meurs@concordia.ca}
+
+
+\appendix
+
+
+\bibliographystyle{acm}
+% \renewcommand{\baselinestretch}{0.0}
+\bibliography{usermanual}
+
+\end{document}