diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..038e6d4
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,27 @@
+The MIT License (MIT)
+
+Copyright (c) 2014
+
+Hayda Almeida
+Marie-Jean Meurs
+
+Concordia University
+Tsang Lab
+
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..76992cf
--- /dev/null
+++ b/README.md
@@ -0,0 +1,12 @@
+# mycoSORT
+
+A machine learning system for supporting the triage of biological literature.
+
+
+
+
+
+
+
+
+
diff --git a/config-sample.cfg b/config-sample.cfg
new file mode 100644
index 0000000..a9b3483
--- /dev/null
+++ b/config-sample.cfg
@@ -0,0 +1,118 @@
+#################################################
+#
+#
+# Configuration file for mycoSORT
+#
+#
+##################################################
+########################### DIRECTORIES ##########
+# project home
+HOME_DIR=/.
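+#
+# NOTE: the directory and file values below are concatenated onto
+# HOME_DIR at runtime (e.g. HOME_DIR + CORPUS_DIR + TRAIN_DIR +
+# TRAINING_FILE, as done in BuildModel), so with HOME_DIR=/. all
+# paths resolve relative to the directory the tool is launched from.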
+#
+# corpus directory
+CORPUS_DIR=corpus/
+#
+# train directory
+TRAIN_DIR=train/
+#
+# test directory
+TEST_DIR=test/
+#
+# feature directory
+FEATURE_DIR=features/
+#
+# output directory for arff files
+OUTPUT_MODEL=arff/
+#
+#################################################
+########################## INPUT FILES ##########
+# training file
+TRAINING_FILE=/triagecorpus_train.xml
+#
+# test file
+TEST_FILE=/triagecorpus_test.xml
+#
+# arff training file
+ARFF_TRAIN=triage0.arff
+#
+# arff testing file
+ARFF_TEST=triage1.arff
+#
+# stopwords list
+STOP_LIST=stopList.txt
+#
+##################################################
+########################## OUTPUT FILES ##########
+# EC numbers feature list
+ECNUM_FEATURES=ecnumbers.txt
+#
+# Journal title feature list
+JOURNAL_TITLE_FEATURES=journaltitles.txt
+#
+# Abstract annotations feature list
+ANNOTATION_FEATURES=annotations.txt
+#
+# Paper title annotations feature list
+TITLE_FEATURES=titleAnnotations.txt
+#
+# Abstract ngrams feature list
+NGRAM_FEATURES=ngrams_features.txt
+#
+# Paper title n-grams feature list
+TITLE_NGRAMS=titleGrams.txt
+#
+###################################################
+########################## FEATURE SETUP ##########
+# Extract size of abstract and title
+USE_TEXT_SIZE=false
+#
+# Extract journal title of publication
+USE_JOURNAL_TITLE_FEATURE=false
+#
+# Extract EC Numbers
+USE_ECNUM_FEATURE=true
+#
+# minimum frequency to consider a feature
+FEATURE_MIN_FREQ=2
+#
+# minimum length (in chars) to consider a feature
+FEATURE_MIN_LENGTH=3
+#
+#############################
+######### ANNOTATIONS #######
+# Extract annotation content
+USE_ANNOTATION_FEATURE=true
+#
+# Extract annotation entities
+USE_ANNOTATION_TYPE=true
+#
+# Extract annotations from title separately
+USE_TITLE_FEATURE=false
+#
+#############################
+########## N-GRAMS ##########
+# Extract ngrams
+USE_NGRAM_FEATURE=false
+#
+# Extract ngrams from title separately
+USE_TITLE_NGRAMS=false
+#
+# use of stopwords list on n-grams
+NGRAM_STOP=true
+#
+# Define size of extracted n-grams
+NGRAM_SIZE=1
+#
+# Apply weights to ngrams
+#USE_WEIGHTED_NGRAM=false
+#
+# Define weight of features
+#WEIGHT=3
+#
+#################################################
+########################### TASK SETUP ##########
+# experiment type : train = 0 / test = 1
+EXP_TYPE=0
+#
+# limit the number of features used: keep the top N, or -1 to use all features in the file
+NB_PARAMS=-1
diff --git a/entities.txt b/entities.txt
new file mode 100644
index 0000000..7714e43
--- /dev/null
+++ b/entities.txt
@@ -0,0 +1,23 @@
+annotation_type annotation_level
+AccessionNumber entity
+ActivityAssayConditions sentence
+Assay entity
+Buffer entity
+Characterization entity
+Enzyme entity
+Expression sentence
+Family entity
+Fungus entity
+Gene entity
+Glycoside_Hydrolase entity
+Glycosylation sentence
+Kinetics sentence
+Laccase entity
+Lipase entity
+Peroxidase entity
+pH sentence
+ProductAnalysis sentence
+Temperature sentence
+SpecificActivity sentence
+Substrate entity
+SubstrateSpecificity sentence
\ No newline at end of file
diff --git a/jar/README b/jar/README
new file mode 100644
index 0000000..9a9b435
--- /dev/null
+++ b/jar/README
@@ -0,0 +1,7 @@
+Please add to this folder the following libraries:
+commons-lang3-3.2.1.jar
+jsoup-1.7.3.jar
+weka.jar
+LibSVM.jar
+LibSVM/libsvm.jar
+
diff --git a/jar/README~ b/jar/README~
new file mode 100644
index 0000000..56f2ce9
--- /dev/null
+++ b/jar/README~
@@ -0,0 +1,7 @@
+Please add to this folder the following libraries:
+commons-lang3-3.2.1.jar
+jsoup-1.7.3.jar
+weka.jar
+LibSVM.jar
+libsvm.jar
+
diff --git a/src/analyse/.gitignore b/src/analyse/.gitignore
new file mode 100644
index 0000000..6b468b6
--- /dev/null
+++ b/src/analyse/.gitignore
@@ -0,0 +1 @@
+*.class
diff --git a/src/analyse/ConcatXML.java b/src/analyse/ConcatXML.java
new file mode 100644
index 0000000..9c24173
--- /dev/null
+++ b/src/analyse/ConcatXML.java
@@ -0,0 +1,734 @@
+/*
+ * The MIT License (MIT)
+
+Copyright (c) 2014
+
+Hayda Almeida
+Marie-Jean Meurs
+
+Concordia University
+Tsang Lab
+
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+package analyse;
+
+import java.io.BufferedOutputStream;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.PrintWriter;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardCopyOption;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Date;
+import java.util.List;
+
+import configure.PathConstants;
+
+/**
+ * Generates a corpus from raw XML doc instances,
+ * so that features can be extracted from it
+ *
+ * @author halmeida
+ *
+ */
+public class ConcatXML extends Extractor{
+
+	private String tag1;
+	private String tag2;
+	private String tag3;
+
+
+	public ConcatXML(){
+		this.id = "";
+		//tag1 must be initialized as well, otherwise getTag1()
+		//returns null and cleanXML() throws a NullPointerException
+		this.tag1 = "";
+		this.tag2 = "";
+		this.tag3 = "";
+	}
+
+	public static void main(String[] args) throws IOException {
+
+		PathConstants pathVars = new PathConstants();
+
+		String timeStamp = new SimpleDateFormat("yyyyMMdd_hh:mm").format(new Date());
+
+		String trainCorpusPath = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAIN_DIR +pathVars.TRAINING_FILE;
+		String xmlDir = "train";
+		String sourceDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + "all_nbs/"+ xmlDir;
+		String duplicatesDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + "/src"+ "/annotated_GH27-36_2013_12_31";
+
+		String concatCorpus = pathVars.HOME_DIR + pathVars.CORPUS_DIR +"triagecorpus_"+ xmlDir +"_"+timeStamp+".xml";
+		String tagCorpus = concatCorpus;
+
+		ConcatXML concat = new ConcatXML();
+
+		//================= Checking for duplicates =====================//
+		//concat.checkDupCorpus(trainCorpusPath, sourceDir);
+		//concat.checkDupFolder(sourceDir, duplicatesDir);
+
+
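+		//The corpus build below runs in three steps: cleanXML() strips
+		//stray XML markup and leftover corpus tags from each file,
+		//concatenateXML() streams every file into a single output file,
+		//and tagCorpus() wraps the result in the corpus-level open/close tags.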
+		//================== Creating corpus ==========================//
+		concat.cleanXML(sourceDir);
+		//concat.cleanXML(duplicatesDir);
+		concat.concatenateXML(sourceDir, "", concatCorpus);
+		concat.tagCorpus(tagCorpus);
+	}
+
+	/**
+	 * Reads the file IDs in a folder and
+	 * checks a second folder for duplicates.
+	 *
+	 * @param dirSrc source folder
+	 * @param dirDup folder to check for duplicates
+	 */
+
+	public void checkDupFolder(String dirSrc, String dirDup){
+		ArrayList<String> sourceIDs = new ArrayList<String>();
+		ArrayList<String> duplicated = new ArrayList<String>();
+		ArrayList<String> dupIDs = new ArrayList<String>();
+		int ids = 0;
+
+		if(dirSrc.contentEquals(dirDup)){
+			System.out.println("Source and duplicates directories are the same.\n\n========================\n");
+		}
+		else {
+
+			File sourceDir = new File(dirSrc);
+			File[] srcXMLs = sourceDir.listFiles(new FilenameFilter(){
+				@Override
+				public boolean accept(File dir, String name){
+					return name.endsWith(".xml");
+				}
+			});
+
+			try{
+				//for each file on the source dir
+				for (File xml : srcXMLs){
+
+					try{
+						BufferedReader reader = new BufferedReader(new FileReader(xml.getPath()));
+
+						String line = null;
+
+						String id = null;
+
+						while((line = reader.readLine()) != null){
+
+							line = line.replaceAll("\t","");
+							line = line.replace("\"", "");
+
+							//get the IDs of the new files:
+							//strip the open and close ID tags, keeping only the ID value
+							if (line.contains(getid())){
+
+								line = line.substring(line.indexOf(getid()));
+								line = line.replace(getid(), "");
+
+								id = line.replace(getendId(), "");
+
+								sourceIDs.add(id);
+
+								line = reader.readLine();
+								line = line.replaceAll("\t","");
+							}
+
+							if(line.contains(getOpenJournal())){
+								ids++;
+							}
+
+							line = line.replaceAll("\t","");
+							line = line.replace("\"", "");
+						}
+
+						reader.close();
+
+					}catch (FileNotFoundException e) {
+						e.printStackTrace();
+					}
+
+				}
+
+			}catch (FileNotFoundException e) {
+				e.printStackTrace();
+			}
+			catch(Exception e){
+				throw new RuntimeException(e);
+			}
+
+			System.out.println(ids + " source file IDs encountered.");
+			ids = 0;
+
+			File dupDir = new File(dirDup);
+
+			File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){
+				@Override
+				public boolean accept(File dir, String name){
+					return name.endsWith(".xml");
+				}
+			});
+
+			try{
+				//for each file on the possibly duplicated dir
+				for (File xml : dupXMLs){
+
+					try{
+						BufferedReader reader = new BufferedReader(new FileReader(xml.getPath()));
+
+						String line = null;
+
+						String id = null;
+
+						while((line = reader.readLine()) != null){
+
+							line = line.replaceAll("\t","");
+							line = line.replace("\"", "");
+
+							//get the IDs of the new files
+							if (line.contains(getid())){
+
+								line = line.substring(line.indexOf(getid()));
+								line = line.replace(getid(), "");
+
+								id = line.replace(getendId(), "");
+
+								dupIDs.add(id);
+								String dupFileID = id;
+
+								for(int j = 0; j < sourceIDs.size(); j++){
+									if(sourceIDs.get(j).equalsIgnoreCase(dupFileID)){
+										//moving the original file
+										Path from = xml.toPath(); //convert from File to Path
+										Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path
+										Files.move(from, to, StandardCopyOption.REPLACE_EXISTING);
+									}
+								}
+
+
+								line = reader.readLine();
+								line = line.replaceAll("\t","");
+							}
+
+							if(line.contains(getOpenJournal())){
+								ids++;
+							}
+
+							line = line.replaceAll("\t","");
+							line = line.replace("\"", "");
+						}
+
+						reader.close();
+
+					}catch (FileNotFoundException e) {
+						e.printStackTrace();
+					}
+
+				}
+
+			}catch (FileNotFoundException e) {
+				e.printStackTrace();
+			}
+			catch(Exception e){
+				throw new RuntimeException(e);
+			}
+
+			//count number of existing papers on possibly duplicated folder
+			//just to make sure we are gathering all IDs
System.out.println(ids + " new file IDs encountered."); + ids = 0; + + //for each possible duplicated ID, + //check if it exists on source folder ID list + //if yes, list the duplicated ones + for(int i = 0; i < dupIDs.size(); i++){ + for(int j = 0; j < sourceIDs.size(); j++){ + if(sourceIDs.get(j).equalsIgnoreCase(dupIDs.get(i))){ + duplicated.add(dupIDs.get(i)); + } + } + } + + //sorting the list of duplicated IDs + Collections.sort(duplicated, new Comparator(){ + @Override + public int compare(String one, String two){ + return one.compareTo(two); + } + }); + + System.out.println("\nReaded source files: " + sourceIDs.size()); + System.out.println("Readed new files: " + dupIDs.size()); + + System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n"); + + System.out.println("\nDuplicated files IDs: "); + for(int i = 0; i < duplicated.size(); i++){ + System.out.println(duplicated.get(i)); + } + + System.out.println("\n========================\n"); + } + + + } + + /** + * Reads the corpus and checks the papers IDs + * to identify duplicates in case new papers + * are being concatenated to corpus. + * + * @param corpus path to current corpora to check + * @param dir path to folder with new files to be concatenated + */ + + public void checkDupCorpus(String corpus, String dir){ + ArrayList trainingIDs = new ArrayList(); + ArrayList duplicated = new ArrayList(); + ArrayList newFiles = new ArrayList(); + + int ids = 0; + + try + { + BufferedReader reader = new BufferedReader(new FileReader(corpus)); + + String line = null; + String id = null; + + + while((line = reader.readLine()) != null){ + + line = line.replaceAll("\t",""); + line = line.replace("\"", ""); + + //on the previous training corpus + //find exact paper ID and store it + if (line.contains(getid())){ + + line = line.substring(line.indexOf(">", ""); + + id = line.replace(getendId(), ""); + + //insert paper ID to existing training file list + trainingIDs.add(id); + + line = reader.readLine(); + line = line.replaceAll("\t",""); + } + + //count number of existing papers on the training file + //just to make sure we are gathering all IDs + if(line.contains(getOpenJournal())){ + ids++; + } + + line = line.replaceAll("\t",""); + line = line.replace("\"", ""); + } + + reader.close(); + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + + System.out.println(ids + " training file IDs encountered."); + ids = 0; + + File corpusDir = new File(dir); + File[] newXMLs = corpusDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + try{ + //for each file on the corpus dir + for (File xml : newXMLs){ + + try{ + BufferedReader reader = new BufferedReader(new FileReader(xml.getPath())); + + String line = null; + + String id = null; + + while((line = reader.readLine()) != null){ + + line = line.replaceAll("\t",""); + line = line.replace("\"", ""); + + //get the IDs of the new files + if (line.contains(getid())){ + + line = line.substring(line.indexOf(">", ""); + + id = line.replace(getendId(), ""); + + newFiles.add(id); + String newFileID = id; + + for(int j = 0; j < trainingIDs.size(); j++){ + if(trainingIDs.get(j).equalsIgnoreCase(newFileID)){ + //moving the original file + Path from = xml.toPath(); //convert from File to Path + Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path + Files.move(from, to, StandardCopyOption.REPLACE_EXISTING); + } + } + + + line = 
+							line = line.replaceAll("\t","");
+						}
+
+						if(line.contains(getOpenJournal())){
+							ids++;
+						}
+
+						line = line.replaceAll("\t","");
+						line = line.replace("\"", "");
+					}
+
+					reader.close();
+
+				}catch (FileNotFoundException e) {
+					e.printStackTrace();
+				}
+
+			}
+
+		}catch (FileNotFoundException e) {
+			e.printStackTrace();
+		}
+		catch(Exception e){
+			throw new RuntimeException(e);
+		}
+
+		//count number of existing papers on the training file
+		//just to make sure we are gathering all IDs
+		System.out.println(ids + " new file IDs encountered.");
+		ids = 0;
+
+		//for each new ID, check if it exists on training file ID list
+		//if yes, list the duplicated ones
+		for(int i = 0; i < newFiles.size(); i++){
+			for(int j = 0; j < trainingIDs.size(); j++){
+				if(trainingIDs.get(j).equalsIgnoreCase(newFiles.get(i))){
+					duplicated.add(newFiles.get(i));
+				}
+			}
+		}
+
+		//sorting the list of duplicated IDs
+		Collections.sort(duplicated, new Comparator<String>(){
+			@Override
+			public int compare(String one, String two){
+				return one.compareTo(two);
+			}
+		});
+
+		System.out.println("\nRead training files: " + trainingIDs.size());
+		System.out.println("Read new files: " + newFiles.size());
+
+		System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n");
+
+		System.out.println("\nDuplicated files IDs: ");
+		for(int i = 0; i < duplicated.size(); i++){
+			System.out.println(duplicated.get(i));
+		}
+
+		System.out.println("\n========================\n");
+
+	}
+
+
+	/**
+	 * Reads and edits the XML files in a folder
+	 * to remove XML markup and previous corpus tags,
+	 * preparing the files to be concatenated.
+	 *
+	 * @param dir string with folder path
+	 */
+
+	public void cleanXML(String dir){
+
+		//listing files on corpus dir
+		File sourceDir = new File(dir);
+
+		File[] newXMLs = sourceDir.listFiles(new FilenameFilter(){
+			@Override
+			public boolean accept(File dir, String name){
+				return name.endsWith(".xml");
+			}
+		});
+
+		System.out.println("... Files list loaded.");
+
+		try{
+			//for each file on the corpus dir
+			for (File xml : newXMLs){
+
+				try{
+					BufferedReader reader = new BufferedReader(new FileReader(xml.getPath()));
+
+					String line = null;
+					ArrayList<String> allLines = new ArrayList<String>();
+					String content = null;
+
+					while((line = reader.readLine()) != null){
+						content = line;
+
+						//cleaning XML markups
+						if(content.contains(getTag1())){
+							content = content.replace(getTag1(), "");
+						}
+						if(content.contains(getTag2())){
+							content = content.replace(getTag2(), "");
+						}
+						if(content.contains(getTag3())){
+							content = content.replace(getTag3(), "");
+						}
+
+						//cleaning previous corpus tags
+						if(content.contains(getOpenFile())){
+							content = content.replace(getOpenFile(), "");
+						}
+						if(content.contains(getendFile())){
+							content = content.replace(getendFile(), "");
+						}
+
+						//each cleaned line is stored exactly once
+						allLines.add(content);
+					}
+
+					PrintWriter writer = new PrintWriter(xml.getPath());
+
+					for (String l : allLines){
+						writer.println(l);
+					}
+					reader.close();
+					writer.close();
+
+				}catch (FileNotFoundException e) {
+					e.printStackTrace();
+				}
+
+			}
+
+		}catch (FileNotFoundException e) {
+			e.printStackTrace();
+		}
+		catch(Exception e){
+			throw new RuntimeException(e);
+		}
+
+		System.out.println("... Files cleaned and saved.");
+		System.out.println("Ready for concatenation.");
+		System.out.println("\n========================\n");
+	}
+
+	/**
+	 * Concatenates all XML files in one folder, or across two folders.
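+	 * Files are appended byte-for-byte through a buffered stream, so each
+	 * input must already be a clean standalone fragment (see cleanXML).
+	 * An illustrative call, mirroring the paths built in main():
+	 * {@code concat.concatenateXML(sourceDir, "", concatCorpus)}
+	 *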
+	 * @param sourceDir main directory with XML files.
+	 * @param duplicDir second directory with duplicated XML files
+	 * @param concatFile path name of the saved concatenated corpus
+	 */
+
+	public void concatenateXML(String sourceDir, String duplicDir, String concatFile){
+
+		final int BUFFER = 1024 << 8;
+		byte[] buffer = new byte[BUFFER];
+
+		//listing files on corpus dir
+		File srcDir = new File(sourceDir);
+		File[] srcXMLs = srcDir.listFiles(new FilenameFilter(){
+			@Override
+			public boolean accept(File dir, String name){
+				return name.endsWith(".xml");
+			}
+		});
+
+		File dupDir = new File(duplicDir);
+		File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){
+			@Override
+			public boolean accept(File dir, String name) {
+				return name.endsWith(".xml");
+			}
+		});
+
+		System.out.println("... Files list loaded.");
+
+		//defining the output file (concatenated)
+		File newCorpus = new File(concatFile);
+
+		try{
+			OutputStream output = new BufferedOutputStream(new FileOutputStream(newCorpus));
+
+
+			//for each file on the corpus dir
+			for (File xmls : srcXMLs){
+				InputStream input = new FileInputStream(xmls);
+				int count;
+
+				//while the file has not been fully read
+				try{
+					while((count = input.read(buffer)) >= 0){
+
+						//write it on the concatenated final file
+						output.write(buffer, 0, count);
+					}
+				}finally{
+					input.close();
+				}
+			}
+
+			if(dupXMLs != null){
+				for(File xmld : dupXMLs){
+					InputStream input = new FileInputStream(xmld);
+					int count;
+
+					//while the file has not been fully read
+					try{
+						while((count = input.read(buffer)) >= 0){
+
+							//write it on the concatenated final file
+							output.write(buffer, 0, count);
+						}
+					}finally{
+						input.close();
+					}
+				}
+			}
+			output.flush();
+			output.close();
+
+		}catch (FileNotFoundException e) {
+			e.printStackTrace();
+		}
+		catch(Exception e){
+			throw new RuntimeException(e);
+		}
+
+		System.out.println("... File concatenated and saved.");
+		System.out.println("Ready for corpus tagging.");
+		System.out.println("\n========================\n");
+	}
+
+	/**
+	 * Inserts the corpus tags in the XML file
+	 *
+	 * @param pathToCorpus path to
+	 * concatenated corpus
+	 */
+
+	public void tagCorpus(String pathToCorpus){
+
+		//tagging as corpus
+		try{
+			BufferedReader reader = new BufferedReader(new FileReader(pathToCorpus));
+
+			String line = null;
+			List<String> allLines = new ArrayList<String>();
+
+			//adds tag at beginning of corpus
+			allLines.add(getOpenFile());
+
+			while((line = reader.readLine()) != null){
+
+				allLines.add(line);
+			}
+			//adds tag at the end of corpus
+			allLines.add(getendFile());
+
+			System.out.println("... Corpus loaded and tagged.");
+			//re-writing the file
+			PrintWriter writer = new PrintWriter(pathToCorpus);
+
+			for (String l : allLines){
+				writer.println(l);
+			}
+			reader.close();
+			writer.close();
+
+			System.out.println("...
File saved as tagged corpus."); + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(IOException e){ + e.printStackTrace(); + } + } + + public String getTag1() { + return tag1; + } + + public void setTag1(String tag1) { + this.tag1 = tag1; + } + + public String getTag2() { + return tag2; + } + + public void setTag2(String tag2) { + this.tag2 = tag2; + } + + public String getTag3() { + return tag3; + } + + public void setTag3(String tag3) { + this.tag3 = tag3; + } + + +} + + diff --git a/src/analyse/Extractor.java b/src/analyse/Extractor.java new file mode 100644 index 0000000..c97cfa7 --- /dev/null +++ b/src/analyse/Extractor.java @@ -0,0 +1,442 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package analyse; + +import java.io.BufferedWriter; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +/** + * Implements common tools to FeatureExtractor + * and NgramExtractor classes that are used to + * extract features from doc instances + * + * @author halmeida + * + */ +public class Extractor { + + //String pathFile; + String id; + String endId; + String openFile; + String endFile; + String openAbst; + String closeAbst; + String abstractLabel; + String openEC; + String closeEC; + String classTag; + String openTitle; + String closeTitle; + String openJournal; + String closeJournal; + String copyR; + String closeCopyR; + + /** + * Replaces special characters to clean + * text for tokenizing. 
+ * + * @param str text to be cleaned + * @return string with cleaned text + */ + public String removeSpecialChar(String str){ + str = str.replace("}", ""); + str = str.replace("{", ""); + str = str.replace("]", ""); + str = str.replace("[", ""); + str = str.replace("#", ""); + str = str.replace("*", ""); + str = str.replace(">", ""); + str = str.replace("&apos", ""); + str = str.replace("%", ""); + str = str.replace(""", ""); + str = str.replace("&", ""); + str = str.replace("=", ""); + str = str.replace("?", ""); + str = str.replace(";", ""); + str = str.replace(":", ""); + str = str.replace(",", ""); + str = str.replace(".", ""); + str = str.replace(")", ""); + str = str.replace("(", ""); + str = str.replace("\t\t", "\t"); + str = str.replace("-", ""); + str = str.replace(" ", ""); + + return str; + } + + /** + * Handles external tags (and multiple abstract + * text tags) present in a single paper + * @param str abstract content + * @return string without external tags + */ + + public String processAbstract(String str){ + str = str.replace(" ", ""); + String[] remove = str.split(""); + StringBuilder sb = new StringBuilder(); + String temp = ""; + String abstrac = ""; + + for(int i = 0; i < remove.length; i++){ + temp = temp + remove[i]; + + if(temp.contains(""))); + } + if(temp.contains("Copyright ")){ + temp = ""; + do{ + i++; + //an exception here can mean that a copyright information + //tag content did not ended with a period + }while(!(remove[i]).equalsIgnoreCase(".")); + } + else sb.append(remove[i]); + } + + abstrac = sb.toString(); + abstrac = removeAbstractTags(abstrac); + + return abstrac; + } + + + /** + * Removes specific tags encountered on Abstract texts. + * This is used to clean the abstract text before + * processing the feature count on the model. + * @param str + * @return + */ + + public String removeAbstractTags(String str){ + //this order of removing tags matters to + //exclude the first tag from the abstracts. + + str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("Copyright", ""); + str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("", ""); + + return str; + } + + + /** + * Removes the markup annotations of a + * text field, and keeps its content + * + * @param str text containing markups + * @return string with cleaned text + */ + public String removeTags(String str) { + String[] remove = str.split(""); + StringBuilder sb = new StringBuilder(); + + for(int i = 0; i < remove.length; i++){ + + if(remove[i].equalsIgnoreCase("<")){ + do{ + i++; + } + while(!(remove[i].equalsIgnoreCase(">"))); + } + else sb.append(remove[i]); + } + + return sb.toString(); + } + + + /** + * Displays the keys and values of the + * maps created. 
+ * + * @param hash HashMap containing list, + * values, counts + */ + public void displayList(HashMap hash){ + Iterator itr = hash.keySet().iterator(); + int sum = 0; + while(itr.hasNext()){ + Object str = itr.next(); + System.out.println("key: "+str+"\t value: "+hash.get(str)); + } + } + + + /** + * Exports hashmap of values extracted + * from dataset to external file + * + * @param location folder, file name and file extension + * @param list values to be exported + */ + public void exportFile(String location, HashMap list){ + + String SEPARATOR = "\t"; + StringBuffer line = new StringBuffer(); + Iterator itr = list.keySet().iterator(); + + try{ + BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(location), "UTF-8")); + + while(itr.hasNext()){ + Object str = itr.next(); + if(str != null){ + line.append(str).append(SEPARATOR).append(list.get(str)); + if(line.toString().contains("=")) + line.replace(line.indexOf("="), line.indexOf("=")+1,SEPARATOR); + //handling specificities from title content extraction + if(line.toString().contains(",")) + line.replace(line.indexOf(","), line.indexOf(",")+1,SEPARATOR); + } + if(itr.hasNext()){ + //writer.newLine(); + line.append("\n"); + } + writer.write(removeSpecialChar(line.toString())); + line.replace(0, line.length(), ""); + //writer.newLine(); + } + writer.flush(); + writer.close(); + } + catch(UnsupportedEncodingException e){ + e.printStackTrace(); + } + catch(FileNotFoundException e){ + e.printStackTrace(); + } + catch(IOException e){ + e.printStackTrace(); + } + + + //} + } + + + /** + * Exports list of values extracted + * from dataset to a string variable + * + * @param list list of values to be exported + * @return string containing values on list + * @deprecated + */ + public String exportContent(HashMap list){ + String SEPARATOR = "\t"; + Iterator itr = list.keySet().iterator(); + StringBuffer export = new StringBuffer(); + //try{ + while(itr.hasNext()){ + String str = itr.next(); + if(str != null){ + export.append(str).append(SEPARATOR).append(list.get(str)); + + if(export.toString().contains("=")) + export.replace(export.indexOf("="), export.indexOf("=")+1,SEPARATOR); + } + + if(itr.hasNext()){ + export.append("\n"); + } + } + /*} + catch(Exception e){ + + }*/ + + return removeSpecialChar(export.toString()); + } + + + /** + * Exports list of values extracted + * from dataset to external file + * + * @param location folder, file name and file extension + * @param list list of values to be exported + * + */ + public void exportList(String location, ArrayList list){ + + String SEPARATOR = "\n"; + StringBuffer line = new StringBuffer(); + + try{ + BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(location), "UTF-8")); + + for(int i = 0; i < list.size(); i++){ + String str = list.get(i); + if(str != null){ + line.append(str).append(SEPARATOR); + } + } + writer.write(removeSpecialChar(line.toString())); + + writer.flush(); + writer.close(); + } + catch(UnsupportedEncodingException e){ + e.printStackTrace(); + } + catch(FileNotFoundException e){ + e.printStackTrace(); + } + catch(IOException e){ + e.printStackTrace(); + } + + } + + + public void initialize(){ + + } + + + /** + * Accessors and mutators methods + * for Extractor variables. 
+ * @return + */ + /*public String getPathFile() { + return pathFile; + } + public void setPathFile(String pathFile) { + this.pathFile = pathFile; + }*/ + public String getid() { + return id; + } + public void setid(String id) { + this.id = id; + } + public String getendId() { + return endId; + } + public void setendId(String endId) { + this.endId = endId; + } + public String getOpenFile() { + return openFile; + } + public void setOpenFile(String openFile) { + this.openFile = openFile; + } + public String getendFile() { + return endFile; + } + public void setendFile(String endFile) { + this.endFile = endFile; + } + public String getopenAbst() { + return openAbst; + } + public void setopenAbst(String openAbst) { + this.openAbst = openAbst; + } + public String getcloseAbst() { + return closeAbst; + } + public void setcloseAbst(String closeAbst) { + this.closeAbst = closeAbst; + } + public String getOpenEC() { + return openEC; + } + public void setOpenEC(String openEC) { + this.openEC = openEC; + } + public String getCloseEC() { + return closeEC; + } + public void setCloseEC(String closeEC) { + this.closeEC = closeEC; + } + public String getAbstractLabel() { + return abstractLabel; + } + public void setAbstractLabel(String abstractLabel) { + this.abstractLabel = abstractLabel; + } + public String getClassTag() { + return classTag; + } + public void setClassTag(String classTag) { + this.classTag = classTag; + } + public String getOpenTitle() { + return openTitle; + } + public void setOpenTitle(String titleTag) { + this.openTitle = titleTag; + } + public String getCloseTitle() { + return closeTitle; + } + public void setCloseTitle(String closeTitle) { + this.closeTitle = closeTitle; + } + public String getOpenJournal() { + return openJournal; + } + public void setOpenJournal(String openJournal) { + this.openJournal = openJournal; + } + public String getCloseJournal() { + return closeJournal; + } + public void setCloseJournal(String closeJournal) { + this.closeJournal = closeJournal; + } + +} \ No newline at end of file diff --git a/src/analyse/FeatureExtractor.java b/src/analyse/FeatureExtractor.java new file mode 100644 index 0000000..4ca93aa --- /dev/null +++ b/src/analyse/FeatureExtractor.java @@ -0,0 +1,591 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package analyse; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import org.apache.commons.lang3.StringUtils; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import configure.PathConstants; + + +/** + * This class extracts and parses domain + * annotation features from doc instances + * + * @author halmeida + */ + +public class FeatureExtractor extends Extractor{ + + public FeatureExtractor(){ + + this.id = ""; + this.endId = ""; + this.endFile = ""; + this.openAbst = ""; + this.closeAbst = ""; + this.abstractLabel = ",Integer> abstract_count = new HashMap,Integer>(); + //store all features, type and classification + HashMap,String> abstract_type = new HashMap,String>(); + + //store title features, type and classification + HashMap,String> title_type = new HashMap,String>(); + //store title features, type and count + HashMap, Integer> title_count = new HashMap, Integer>(); + //store title features, whole journal title content and classification + HashMap,String> title_content = new HashMap,String>(); + + //store title content and EC numbers + ArrayList ec_numbers = new ArrayList(); + + fextrac.initialize(); + int jTitle = 0; + + try + { + BufferedReader reader = new BufferedReader(new FileReader(AnCorpus)); + + //--------------------------- + // repeat until all lines of the file are read + //--------------------------- + String line = null; + String features = null; + // String id = null; + + + while((line = reader.readLine()) != null){ + + line = line.replaceAll("\t",""); + line = line.replace("\"", ""); + + //find paper ID and store it + if (line.contains(fextrac.getid())){ + line = line.replace(fextrac.getid(), ""); + // id = line.replace(fextrac.getendId(), ""); + + //continue reading + features = reader.readLine(); + features = features.replaceAll("\t",""); + + String journal = ""; + + //continue reading until the end of file + while(!(features.contentEquals(fextrac.getendFile()))){ + + //find relevant doc section - Journal title + if(features.contains(fextrac.getOpenJournal())){ + + features = features.replace(fextrac.getOpenJournal(),""); + features = features.replace(fextrac.getCloseJournal(), ""); + features = fextrac.removeSpecialChar(features); + + //separating only the journal title content + journal = fextrac.removeTags(features); + //counting # of journal titles + jTitle++; + + features = reader.readLine(); + features = features.replaceAll("\t",""); + } + + //find relevant doc section - Article title + if(features.contains(fextrac.getOpenTitle())){ + + features = features.replace(fextrac.getOpenTitle(),""); + features = features.replace(fextrac.getCloseTitle(), ""); + features = fextrac.removeSpecialChar(features); + + //separating the title by annotations + String title_annotation = features; + + //extracting annotations and inserting them on lists + fextrac.annotations(title_annotation, title_count, title_type, pathVars); + fextrac.addContent(title_annotation, journal, title_content); + + features = reader.readLine(); + features = features.replaceAll("\t",""); + } + + if(features.contains(fextrac.getAbstractLabel())){ + + String temp = ""; + String newAbs = fextrac.getopenAbst(); + + //handling cases when the tag is already within abstract content + if(features.contains("")){ + temp = temp + 
fextrac.processAbstract(features); + } + else{ + do{ + temp = temp + fextrac.processAbstract(features); + features = reader.readLine(); + }while(!(features.contains(""))); + } + newAbs = newAbs + temp; + features = newAbs + fextrac.getcloseAbst(); + } + + //find relevant doc section - Abstract + if(features.contains(fextrac.getopenAbst())){ + + features = features.replace(fextrac.getopenAbst(),""); + features = features.replace(fextrac.getcloseAbst(), ""); + features = fextrac.removeSpecialChar(features); + + //handle lines in which abstract text tag + //is separated from the actual text + if(features.isEmpty()){ + features = reader.readLine(); + features = features.replaceAll("\t",""); + features = features.replace(fextrac.getopenAbst(),""); + features = features.replace(fextrac.getcloseAbst(), ""); + features = fextrac.removeSpecialChar(features); + } + + features = fextrac.removeAbstractTags(features); + + //gathering abstract annotations + String abstrac = features; + + //extract annotations and insert them on lists + fextrac.annotations(abstrac, abstract_count, abstract_type, pathVars); + + features = reader.readLine(); + features = features.replaceAll("\t",""); + //features = features.replaceAll("\\s+", ""); + } + + //identifying EC number + if(features.contains(fextrac.getOpenEC())){ + features = features.replace(fextrac.getOpenEC(), ""); + features = features.replace(fextrac.getCloseEC(), ""); + features = fextrac.removeSpecialChar(features); + + ec_numbers.add(features); + + features = reader.readLine(); + features = features.replaceAll("\t",""); + } + + //find classification of the document + if(features.contains(fextrac.getClassTag())){ + + //adding classification to the list of annotations + String classif = fextrac.getClassif(features); + fextrac.addClass(classif, abstract_type); + fextrac.addClass(classif, title_type); + fextrac.addClass(classif, title_content); + + features = reader.readLine(); + features = features.replaceAll("\t",""); + } + + features = reader.readLine(); + features = features.replaceAll("\t",""); + + } + + } + + } + + reader.close(); + + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + + //Use for sample output + //System.out.println("\n===========TITLE==ANNOTATIONS============="); + //fextrac.displayList(title_count); + //fextrac.displayList(title_type); + //fextrac.displayList(title_content); + //System.out.println("\n========ABSTRACT==ANNOTATIONS============="); + //fextrac.displayList(abstract_count); + //fextrac.displayList(abstract_type); + + //Before exporting, take into account the + //occurence of all extracted features + fextrac.considerOccurence(abstract_count, pathVars); + fextrac.considerOccurence(title_count, pathVars); + + + System.out.println("\n===========FEATURE==EXPORT==============="); + fextrac.exportList(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.ECNUM_FEATURES, ec_numbers); + System.out.println("..."+ ec_numbers.size()+" EC numbers saved."); + fextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.ANNOTATION_FEATURES, abstract_count); + System.out.println("..."+ abstract_count.size()+" unique Abstract annotations saved."); + fextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.TITLE_FEATURES, title_count); + System.out.println("..."+ title_count.size() +" unique Title annotations saved."); + fextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.JOURNAL_TITLE_FEATURES, title_content); + 
System.out.println("..."+jTitle+" Journal titles saved."); + System.out.println("\n=========================================\n"); + + } + + /** + * Identifies the classification on doc + * + * @param clas text containing classification (after char removal) + * @return classification of doc + */ + private String getClassif(String clas) { + + //parsing the not edited text into HTML using Jsoup + Document doc = Jsoup.parseBodyFragment(clas); + //saving the text as an Jsoup element, with a main tag (the HTML body), + //attributes and child nodes (TRIAGE tags) + Element text = doc.body(); + + Elements classification = text.getElementsByTag("TRIAGE"); + + return classification.text(); + } + + /** + * Inserts the classification + * on the list of features + * + * @param class information to insert on list + * @param list list of features used + */ + private void addClass(String element, HashMap, String> list){ + //going over list to insert + //classif on document instances + Iterator>it = list.keySet().iterator(); + + while(it.hasNext()){ + Map str = it.next(); + + if(list.get(str).contains("positive") || list.get(str).contains("negative")){ + + } + else list.put(str, element); + } + } + + + /** + * Removes from feature list all features with + * frequency not statistically relevant (2 or less) + * @param list to be cleaned + */ + private void considerOccurence(HashMap,Integer> list, PathConstants vars){ + //going over the list of annotations and removing the + //features with occurance lower than specified. + + Iterator> iterator = list.keySet().iterator(); + + while(iterator.hasNext()){ + Map key = iterator.next(); + int valor = list.get(key).intValue(); + + if(valor < Integer.parseInt(vars.FEATURE_MIN_FREQ)){ + iterator.remove(); + } + } + } + + + /** + * Extract the annotations from a determined section + * of the document and add them to the specified lists. 
+	 * @param annot cleaned and split line with the annotation
+	 * @param count list that holds annotation, its type and its count
+	 * @param type list that holds annotation, its type and its classification
+	 */
+	private void annotations(String annot, HashMap<Map<String,String>, Integer> count, HashMap<Map<String,String>,String> type, PathConstants pathVars) {
+		HashMap<String,String> features = loadAnnotationEntities();
+		PathConstants pathVar = new PathConstants();
+		NgramExtractor nextrac = new NgramExtractor();
+		ArrayList<String> content = new ArrayList<String>();
+
+		//parsing the unedited text as HTML using Jsoup
+		Document doc = Jsoup.parseBodyFragment(annot);
+		//saving the text as a Jsoup element, with a main tag (the HTML body),
+		//attributes and child nodes (annotation tags)
+		Element annotations = doc.body();
+
+		//iterating over list of entities
+		for(Map.Entry<String,String> value : features.entrySet()){
+
+			String an_type = value.getKey();
+			String an_level = value.getValue();
+
+			//for each entity, find the annotations on abstract
+			Elements annots = annotations.getElementsByTag(an_type);
+
+			//for each annotation found,
+			for(Element an : annots){
+
+				//grabbing annotation content:
+				//if the annotation is made on the sentence level:
+				if(an_level.contains("sentence")){
+
+					//checking if the sentence contains inner annotations
+					if(an.childNodeSize() != 0){
+
+						//going over list of inner annotations
+						for(Element child : an.children()){
+
+							//if child is sentence (sentence inside of sentence),
+							//then add the annotations as ngrams
+							if(features.get(child.nodeName()).contains("sentence")) {
+								content.addAll(nextrac.nGrams(child.text(), pathVar));
+								insertAnnotation(content, an.nodeName(), count, type, pathVars);
+							}
+							//adding annotations on sentence as they are - no ngrams here
+							else {
+								content.add(child.text());
+								insertAnnotation(content, an.nodeName(), count, type, pathVars);
+							}
+						}
+
+						//removing inner annotations from sentence, they are already added
+						Element tempAnnot = an.clone();
+						tempAnnot.children().remove();
+
+						//splitting what is left of the sentence into ngrams
+						content.addAll(nextrac.nGrams(tempAnnot.text(), pathVar));
+						insertAnnotation(content, an.nodeName(), count, type, pathVars);
+					}
+
+				}
+				else {
+					//keeping original annotation content for other cases
+					content.add(an.text());
+					insertAnnotation(content, an.nodeName(), count, type, pathVars);
+				}
+			}
+
+		}
+
+	}
+
+
+	/**
+	 * Insert an annotation (or the ngram list of an annotation)
+	 * on the lists, used by the annotations() method
+	 * @param content content of annotation
+	 * @param an_type type extracted from text (entity)
+	 * @param count list of annotations and their count
+	 * @param type list of annotations and their type
+	 */
+	private void insertAnnotation(ArrayList<String> content, String an_type, HashMap<Map<String,String>, Integer> count, HashMap<Map<String,String>,String> type, PathConstants pathVars){
+
+		//iterating over list of annotations
+		for(int i = 0; i < content.size(); i++){
+
+			if(content.get(i).length() >= Integer.parseInt(pathVars.FEATURE_MIN_LENGTH)){
+
+				//creating the list key as: content - type mapping
+				Map<String,String> an_content = new HashMap<String,String>();
+				an_content.put(content.get(i), an_type);
+
+				//for each annotation (or ngram on annotation)
+				//insert content and related type
+				if(count.containsKey(an_content)){
+					try{
+						int cnt = count.get(an_content);
+						count.put(an_content, cnt+1);
+
+					}catch(Exception e){
+						count.put(an_content, 1);
+					}
+				}
+				else{
+					count.put(an_content, 1);
+				}
+				//populating list of feature_an_types, with:
+				//feature--an_type--class
+				type.put(an_content, "");
+			}
+		}
+
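+		//the same list instance is passed in and reused by the caller
+		//across annotations, so empty it once its entries are stored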
+		content.clear();
+
+	}
+
+
+	/**
+	 * Inserts the text (e.g. title) content into
+	 * a list of features (e.g. title features)
+	 *
+	 * @param annot text with the annotations to be handled
+	 * @param wContent whole field to be added on the list of features
+	 * @param list list of features used
+	 *
+	 */
+	private void addContent(String annot, String wContent, HashMap<Map<String,String>,String> list) {
+
+		HashMap<String,String> features = loadAnnotationEntities();
+		ArrayList<String> content = new ArrayList<String>();
+		NgramExtractor nextrac = new NgramExtractor();
+		PathConstants pathVar = new PathConstants();
+
+		//parsing the unedited text as HTML using Jsoup
+		Document doc = Jsoup.parseBodyFragment(annot);
+		//saving the text as a Jsoup element, with a main tag (the HTML body),
+		//attributes and child nodes (annotation tags)
+		Element annotations = doc.body();
+
+		//iterating over annotation types
+		for(Map.Entry<String,String> value : features.entrySet()){
+
+			String an_type = value.getKey();
+			String an_level = value.getValue();
+
+			//for each annotation type, find all related annotations on the abstract
+			Elements annots = annotations.getElementsByTag(an_type);
+
+			//for each annotation type,
+			for(Element an : annots){
+
+				//grab annotation content
+				if(an_level.contains("sentence"))
+					//splitting in ngrams for sentence level annotations
+					content = nextrac.nGrams(an.text(), pathVar);
+				else
+					//keeping original annotation for other cases
+					content.add(an.text());
+
+				//iterating over list of annotations
+				for(int i = 0; i < content.size(); i++){
+
+					Map<String,String> an_content = new HashMap<String,String>();
+					an_content.put(content.get(i), wContent);
+
+					//populating list of feature_an_types, with:
+					//feature--an_type--class
+					list.put(an_content, "");
+				}
+				content.clear();
+			}
+		}
+	}
+
+
+	/**
+	 * Loads the list of entities from the
+	 * external file entities.txt
+	 *
+	 * @return map of entity names to their annotation levels
+	 */
+	public HashMap<String,String> loadAnnotationEntities(){
+
+		String pathEntities = "entities.txt";
+		HashMap<String,String> values = new HashMap<String,String>();
+
+		try{
+			BufferedReader reader = new BufferedReader(new FileReader(pathEntities));
+
+			String line = null;
+
+			while((line = reader.readLine()) != null){
+
+				String[] value = StringUtils.split(line, " ");
+				values.put(value[0].toLowerCase(), value[1].toLowerCase());
+			}
+
+			reader.close();
+
+		}catch (FileNotFoundException e) {
+			e.printStackTrace();
+		} catch (IOException e) {
+			e.printStackTrace();
+		}
+		//String[] entities = values.toArray(new String[values.size()]);
+
+		return values;
+	}
+
+
+	/**
+	 * Handles the content of annotations; when
+	 * there are multiple elements, they are
+	 * concatenated after extraction
+	 *
+	 * @param str list of annotation elements
+	 * @return single string with all elements
+	 */
+	public String contentToString(String[] str){
+		String cont = "";
+
+		for(int i = 0; i < str.length; i++){
+			if(cont.contentEquals("")){
+				cont = cont + str[i];
+			}
+			else cont = cont+" "+ str[i];
+
+		}
+
+		return cont;
+	}
+
+
+
+}
diff --git a/src/analyse/NgramExtractor.java b/src/analyse/NgramExtractor.java
new file mode 100644
index 0000000..e3a8085
--- /dev/null
+++ b/src/analyse/NgramExtractor.java
@@ -0,0 +1,442 @@
+/*
+ * The MIT License (MIT)
+
+Copyright (c) 2014
+
+Hayda Almeida
+Marie-Jean Meurs
+
+Concordia University
+Tsang Lab
+
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute,
sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package analyse; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; +import configure.PathConstants; + +/** + * This class extracts and parses n-grams + * from doc instances. + * + * @author halmeida + */ + +public class NgramExtractor extends Extractor{ + + public NgramExtractor(){ + this.id = ""; + this.endId = ""; + this.endFile = ""; + this.openAbst = ""; + this.closeAbst = ""; + this.abstractLabel = " ngram_count = new HashMap(); + //store abstract ngrams, count and "relevance(TBD)" + HashMap,Integer> ngrams = new HashMap,Integer>(); + //store title ngrams and its count + HashMap ngram_title_count = new HashMap(); + //store title ngrams, count and "relevance(TBD)" + HashMap,Integer> ngram_title = new HashMap,Integer>(); + + nextrac.initialize(); + + try + { + BufferedReader reader = new BufferedReader(new FileReader(AnCorpus)); + + //--------------------------- + // repeat until all lines + // of the file are read + //--------------------------- + String line = null; + String features = null; + String id = null; + + + while((line = reader.readLine()) != null){ + + line = line.replaceAll("\t",""); + line = line.replace("\"", ""); + + //find paper ID and store it + if (line.contains(nextrac.getid())){ + line = line.replace(nextrac.getid(), ""); + id = line.replace(nextrac.getendId(), ""); + + //keep reading the file + features = reader.readLine(); + features = features.replaceAll("\t",""); + + String tit_content = ""; + + //continue reading until the end of file + while(!(features.contentEquals(nextrac.getendFile()))){ + + String abstrac = ""; + + //find relevant doc section - Article title + if(features.contains(nextrac.getOpenTitle())){ + + //cleaning title content + features = features.replace(nextrac.getOpenTitle(),""); + features = features.replace(nextrac.getCloseTitle(), ""); + features = nextrac.removeSpecialChar(features); + tit_content = nextrac.removeTags(features); + + //extract n-grams from section + ArrayList title_c = nextrac.nGrams(tit_content, pathVars); + nextrac.addNGram(title_c, ngram_title_count,ngram_title, pathVars); + + features = reader.readLine(); + features = features.replaceAll("\t",""); + } + + + if(features.contains(nextrac.getAbstractLabel())){ + + String temp = ""; + String newAbs = nextrac.getopenAbst(); + + if(features.contains("")){ + temp = temp + nextrac.processAbstract(features); + } + else{ + do{ + temp = temp + nextrac.processAbstract(features); + features = reader.readLine(); + }while(!(features.contains(""))); + } + + newAbs = newAbs + temp; + features = newAbs + 
nextrac.getcloseAbst(); + } + + //find relevant paper section + if(features.contains(nextrac.getopenAbst())){ + + features = features.replace(nextrac.getopenAbst(),""); + features = features.replace(nextrac.getcloseAbst(), ""); + features = features.replace("-", " "); + features = nextrac.removeSpecialChar(features); + + //handle lines in which abstract text tag + //is separated from the actual text + if(features.isEmpty()){ + features = reader.readLine(); + features = features.replaceAll("\t",""); + features = features.replace(nextrac.getopenAbst(),""); + features = features.replace(nextrac.getcloseAbst(), ""); + features = features.replace("-", " "); + features = nextrac.removeSpecialChar(features); + } + + //features = nextrac.removeSpecialChar(features); + abstrac = nextrac.removeTags(features); + abstrac = nextrac.removeAbstractTags(abstrac); + //extract n-grams from section + ArrayList abstract_c = nextrac.nGrams(abstrac, pathVars); + nextrac.addNGram(abstract_c, ngram_count, ngrams, pathVars); + + //keep reading file + features = reader.readLine(); + features = features.replaceAll("\t",""); + //features = features.replaceAll("\\s+", ""); + } + + features = reader.readLine(); + features = features.replaceAll("\t",""); + //features = features.replaceAll("\\s+", ""); + } + } + } + + reader.close(); + + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + + //print list of extracted n-grams + //System.out.println("\n========ABSTRACT==NGRAMS============="); + //nextrac.displayList(ngram_count); + //nextrac.displayList(ngram_title); + //System.out.println("\n===========TITLE==NGRAMS============="); + //nextrac.displayList(ngram_title_count); + + + nextrac.considerOccurance(ngram_count, pathVars); + nextrac.considerOccurance(ngram_title_count, pathVars); + + + System.out.println("\n===========NGRAMS==EXPORT===============\n"); + nextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.NGRAM_FEATURES, ngram_count); + System.out.println("..."+ ngram_count.size()+" unique Abstract ngrams saved."); + nextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.TITLE_NGRAMS, ngram_title_count); + System.out.println("... 
"+ ngram_title_count.size() +" unique Title ngrams saved."); + System.out.println("\n========================================\n"); + + } + + + /** + * Removes from feature list all features with + * frequency not statistically relevant (2 or less) + * @param list to be cleaned + */ + + private void considerOccurance(HashMap list, PathConstants vars){ + //going over the list of annotations and removing the + //statistically not significant features - frequency less than 2 + + Iterator iterator = list.values().iterator(); + + while(iterator.hasNext()){ + Integer key = iterator.next(); + + if(key < Integer.parseInt(vars.FEATURE_MIN_FREQ)){ + iterator.remove(); + } + } + } + + private void addNGram(ArrayList str, HashMap list_count, HashMap,Integer> list, PathConstants pathVars){ + + for(int i = 0; i < str.size(); i++){ + String currentNGram = str.get(i); + + if(list_count.containsKey(currentNGram)){ + int count = list_count.get(currentNGram); + list_count.put(currentNGram, count+1); + + /*if(list.containsKey(currentNGram)){ + int cnt = list.get(currentNGram).get(certainty); + list.get(currentNGram).put(certainty, cnt+1); + } + else{ + list.get(currentNGram).put(certainty, 1); + }*/ + } + else { + if(currentNGram.length() >= Integer.parseInt(pathVars.FEATURE_MIN_LENGTH)){ + list_count.put(currentNGram, 1); + + /* list.put(currentNGram, new HashMap()); + list.get(currentNGram).put(certainty, 1);*/ + } + } + } + } + + /** + * Extracts n-grams from the content field + * and populates mapping with n-gram +count + * @param str + * @param id + * @param gram + */ + + public ArrayList nGrams(String str, PathConstants pathVar){ + + //cleaning further chars on sentence + str = str.replace("/", ""); + str = str.replace("\\", ""); + str = str.replace(" ", "-"); + //Tokenize the sentence + String[] words = StringUtils.split(str,"-"); + ArrayList ngramList = new ArrayList(); + + int ngram =Integer.parseInt(pathVar.NGRAM_SIZE); + + if(Boolean.valueOf(pathVar.NGRAM_STOP)){ + words = StringUtils.split(removeStopList(words, pathVar)," "); + } + + for(int i=0; i < words.length - (ngram - 1); i++){ + switch(pathVar.NGRAM_SIZE){ + case "1": + ngramList.add(words[i].toLowerCase()); + break; + case "2": + ngramList.add(words[i].toLowerCase()+" "+words[i+1].toLowerCase()); + break; + case "3": + ngramList.add(words[i].toLowerCase()+" "+words[i+1].toLowerCase()+" "+words[i+2].toLowerCase()); + break; + } + } + + return ngramList; + } + + /** + * Removes the stopwords from ngrams list + * + * @param str list of ngrams + * @param pathVar constants from + * @return + */ + + public String removeStopList(String[] str, PathConstants pathVar){ + + String pathStop = "stopList.txt"; + String[] stop = null; + StringBuilder cleaned = new StringBuilder(); + + try{ + + BufferedReader reader = new BufferedReader(new FileReader(pathStop)); + + String line = null; + + while((line = reader.readLine()) != null){ + stop = StringUtils.split(line,","); + line = reader.readLine(); + } + + reader.close(); + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + + for(int i = 0; i < str.length; i++){ + for(int j = 0; j < stop.length; j++){ + + if(str[i].equalsIgnoreCase(stop[j])){ + str[i] = str[i].replace(str[i],"*"); + } + } + if(!(str[i].contentEquals("*"))){ + cleaned.append(str[i]).append(" "); + } + } + return cleaned.toString().replace(" ", " "); + } + + /** + * Evaluates the level of certainty... + * TBD!!! 
+ * @param list map of n-grams to relevance counts
+ * @return the certainty label
+ */
+ public String getCertainty(HashMap<String,HashMap<String,Integer>> list){
+
+ //placeholder for the certainty label; the weighting
+ //logic below is still incomplete (see TBD above)
+ String certainty = "";
+
+ ArrayList gramsAr = new ArrayList(list.entrySet());
+
+ Iterator itr = gramsAr.iterator();
+ while(itr.hasNext()){
+ String str = itr.next().toString();
+ String[] splitted = StringUtils.split(str,"=");
+
+ int relevance = 0;
+ int count = 0;
+
+ try{
+ count = list.get(splitted[0]).get(certainty);
+ } catch(Exception e){
+ e.printStackTrace();
+ }
+
+ //relevance = count * getWeight();
+
+ if(relevance == 1)
+ list.get(splitted[0]).put("fairly relevant", list.get(splitted[0]).get(certainty));
+ else if (relevance == 2)
+ list.get(splitted[0]).put("relevant", list.get(splitted[0]).get(certainty));
+ else
+ list.get(splitted[0]).put("very relevant", list.get(splitted[0]).get(certainty));
+ }
+
+ return certainty;
+ }
+
+ /**
+ * Displays the keys and values of the
+ * maps created with n-grams and counts.
+ * @param hash HashMap containing n-grams
+ */
+ @Override
+ public void displayList(HashMap hash){
+ super.displayList(hash);
+ //sum = sum + hash.get(str);
+ System.out.println("\n=======================================\n");
+ System.out.println("Number of unique n-grams: "+hash.size());
+ System.out.println("\n=======================================\n");
+ }
+
+ /**
+ * Accessor and mutator methods for the export
+ * string with list values, so the vector class
+ * can access its content.
+ * @return string with list of values.
+ */
+ /*public static String getNgramCount() {
+ //ngramCount = exportContent(ngram_count);
+ return ngramCount;
+ }
+ public void setNgramCount(String ngramCount) {
+ this.ngramCount = ngramCount;
+ }
+ public static String getNgram() {
+ //ngram = exportContent(ngrams);
+ return ngram;
+ }
+ public void setNgram(String ngram) {
+ this.ngram = ngram;
+ } */
+
+}
diff --git a/src/arffmatrix/.gitignore b/src/arffmatrix/.gitignore
new file mode 100644
index 0000000..ec5761d
--- /dev/null
+++ b/src/arffmatrix/.gitignore
@@ -0,0 +1,2 @@
+/buildmodel.class
+/buildtest.class
diff --git a/src/arffmatrix/BuildModel.class b/src/arffmatrix/BuildModel.class
new file mode 100644
index 0000000..0be977c
Binary files /dev/null and b/src/arffmatrix/BuildModel.class differ
diff --git a/src/arffmatrix/BuildModel.java b/src/arffmatrix/BuildModel.java
new file mode 100644
index 0000000..65869e8
--- /dev/null
+++ b/src/arffmatrix/BuildModel.java
@@ -0,0 +1,299 @@
+/*
+ * The MIT License (MIT)
+
+Copyright (c) 2014
+
+Hayda Almeida
+Marie-Jean Meurs
+
+Concordia University
+Tsang Lab
+
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +*** +* This class re-uses https://code.google.com/p/deft2013/source/browse/trunk/src/corpus/buildmodel.java +* The code authors: Eric Charton http://www.echarton.com twitter.com/ericcharton +* Marie-Jean Meurs http://mjmrsc.com/research/ twitter.com/mjmrsc +* +* This software is free to use, modify and redistribute under Creative Commons by-nc/3.0 License Term +* http://creativecommons.org/licenses/by-nc/3.0/ +*/ + +package arffmatrix; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.text.SimpleDateFormat; +import java.util.Date; +import analyse.Extractor; +import arffvector.CreateVector; +import configure.PathConstants; + +/** + * This class reads the corpus instances and uses + * the CreateVector class to generate a model file (ARFF) * + * + */ + +public class BuildModel { + + private static String configfile = null; + + public static void main(String[] args) { + + //----------------------------------- + // instantiate classes of constants + // and configuration file. + //----------------------------------- + + PathConstants pathVars; + + if (configfile == null){ + pathVars = new PathConstants(); + } + else{ + pathVars = new PathConstants(configfile); + } + + Extractor model = new Extractor(); + model.initialize(); + CreateVector vectorgenerator = new CreateVector(pathVars); + String attributes = vectorgenerator.informFeatures(pathVars); + System.out.println("Features loaded ..."); + + // name output ARFF files + String timeStamp = new SimpleDateFormat("yyyyMMdd_hh:mm").format(new Date()); + String arffFileName = "triage" + pathVars.EXP_TYPE + attributes +"_"+ timeStamp + ".arff"; + + try + { + //by default + String sortarffFileName = pathVars.HOME_DIR + pathVars.OUTPUT_MODEL + arffFileName; // default + + // create file + //FileWriter fstream = new FileWriter(sortarffFileName); + BufferedWriter out = new BufferedWriter(new FileWriter(sortarffFileName)); + + // load ARFF header and write it + String outHeaderArff = vectorgenerator.genArffHeader(pathVars,Integer.parseInt(pathVars.EXP_TYPE)); + //System.out.println(outHeaderArff); // verbose + out.write(outHeaderArff + "\n"); + + // reader for corpus + BufferedReader reader = null; + //train corpus + if(Integer.parseInt(pathVars.EXP_TYPE) == 0) + reader = new BufferedReader(new FileReader(pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAIN_DIR + pathVars.TRAINING_FILE)); + //test corpus + else if(Integer.parseInt(pathVars.EXP_TYPE) ==1) + reader = new BufferedReader(new FileReader(pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TEST_DIR + pathVars.TEST_FILE)); + + //-------------------------------------------- + // repeat until all lines have been read + // from the file + //-------------------------------------------- + String text = null; + String content = null; + + String abstracttext = ""; + String journaltitle = ""; + String title = ""; + String ecnumber = ""; + String classtriage = ""; + int hasText = 0; + int journaltitlecount = 0; + int abstracttitlecount = 0; + int abstracttextcount = 0; + int positivecount = 0; + int negativecount = 0; + + + while ((text = reader.readLine()) != null) { + + // detect a PubMed 
abstract
+ // (note: the XML tag literals in this block were lost when the
+ // diff was extracted; <PMID>, <Title> and <AbstractText> are
+ // recovered from surviving code and comments, while the other
+ // tag names below are best-guess placeholders)
+ if (text.contains("<PubmedArticle>")){
+
+ // Reinitialize journal title
+ journaltitle = "";
+
+ // Reinitialize abstract title
+ title = "";
+
+ // Reinitialize abstract text
+ abstracttext = "";
+
+ // Reinitialize hasText to false
+ hasText = 0;
+
+ // accumulators for annotated abstract content
+ // (declarations missing in the extracted diff)
+ String temp = "";
+ String newAbs = "";
+
+ String pmid = text.replaceFirst(".*<PMID>", "");
+ pmid = pmid.replace("</PMID>", "");
+ System.out.println("PMID : " + pmid);
+
+ // continue to read
+ content = reader.readLine();
+ content = content.replaceAll("\t", "");
+ content = content.replaceFirst("\\s+", "");
+
+ while ( ! content.contentEquals("</PubmedArticle>") ) {
+
+ if (content.contains("<Title>")){
+
+ journaltitlecount++;
+
+ content = content.replace("<Title>", "");
+ content = content.replace("</Title>", "");
+ journaltitle = content;
+ System.out.println("Journal title : " + content);
+ }
+
+ if (content.contains("<ArticleTitle>")){
+
+ abstracttitlecount++;
+
+ content = content.replace("<ArticleTitle>", "");
+ content = content.replace("</ArticleTitle>", "");
+ title = content;
+ System.out.println("Paper title : " + content);
+ }
+
+ if (content.contains("<AbstractText>")){
+
+ abstracttextcount++;
+ hasText = 1; // use it to indicate if the abstract has some text or not
+
+ content = content.replace("<AbstractText>", "");
+
+ //checks if there are empty lines after AbstractText tag
+ //and keeps reading until finds the abstract content
+ while(content.isEmpty()){
+ content = reader.readLine();
+ }
+ abstracttext = abstracttext + content;
+ // clean
+ abstracttext = model.removeAbstractTags(abstracttext);
+
+ content = reader.readLine();
+ // converting toLowerCase is not relevant in bio context
+ // because it introduces ambiguities (ie Gene name / Enzyme alias)
+ // abstracttext = abstracttext.toLowerCase();
+ }
+
+ // NOTE: the guard condition of this block was lost in
+ // extraction; hasText is assumed here so the sentence-level
+ // processing only runs for abstracts with text content
+ if (hasText == 1){
+
+ if (content.contains("</AbstractText>")){
+ temp = temp + model.processAbstract(content);
+ }
+ else{
+ do{
+ temp = temp + model.processAbstract(content);
+ content = reader.readLine();
+ }while(!(content.contains("</AbstractText>")));
+ }
+
+ newAbs = newAbs + temp;
+ content = newAbs + "</AbstractText>";
+
+ abstracttext = content;
+ abstracttext = model.removeAbstractTags(abstracttext);
+
+ content = reader.readLine();
+ }
+
+ if (content.contains("EC ")){
+ content = content.replace("EC ", "");
+ content = content.replace("", ""); // FIXME: closing tag literal lost in extraction
+ ecnumber = content;
+ }
+
+ // FIXME: the classification tag literal was lost in extraction;
+ // "<TriageClass>" below is a placeholder, not the original tag
+ if (content.contains("<TriageClass>")){
+
+ content = content.replace("<TriageClass>", "");
+ content = content.replace("</TriageClass>", "");
+ classtriage = content;
+ if(content.contains("positive")){
+ positivecount++;
+ }
+ if(content.contains("negative")){
+ negativecount++;
+ }
+ System.out.println("Triage classification : " + content);
+ }
+
+ content = reader.readLine();
+ content = content.replaceAll("\t", "");
+ content = content.replaceFirst("\\s+", "");
+ }
+
+ System.out.println("Abstract : " + abstracttext.toString() + "\n\n");
+
+ // end of article: collect data and write the ARFF line
+ String Arffline = vectorgenerator.getArffLine(
+ journaltitle,
+ title,
+ abstracttext,
+ ecnumber,
+ classtriage,
+ Integer.parseInt(pathVars.EXP_TYPE)
+ );
+
+ Arffline = Arffline + "\n";
+ // write line on disc
+ out.write(Arffline);
+ // out.write(id + " " + Arffline + "\n");
+ }
+
+ }
+
+ System.out.println(
+ "Abstracts processed: " + abstracttitlecount
+ + "\t with text content: " + abstracttextcount
+ + "\t from " + journaltitlecount + " journals"
+ + "\nTotal of: \n" + positivecount + " positive"
+ + "\t and " + negativecount + " negative documents");
+ out.write("\n");
+ out.close();
+
+ reader.close();
+
+ }catch (FileNotFoundException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+
+ }
+
+}
diff --git a/src/arffvector/.gitignore
b/src/arffvector/.gitignore new file mode 100644 index 0000000..bdc0ba3 --- /dev/null +++ b/src/arffvector/.gitignore @@ -0,0 +1,7 @@ +/buildvector.class +/FeatureVector.class +/CreateVector.class +/CreateWeightedVector.class +/ArbitraryWeight.class +/CountsWeightedVector.class +/ArbitraryWeightedVector.class diff --git a/src/arffvector/CreateVector.java b/src/arffvector/CreateVector.java new file mode 100644 index 0000000..ce81dee --- /dev/null +++ b/src/arffvector/CreateVector.java @@ -0,0 +1,872 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +*** +* This class re-uses https://code.google.com/p/deft2013/source/browse/trunk/src/vector/buildvector.java +* The code authors: Eric Charton http://www.echarton.com twitter.com/ericcharton +* Marie-Jean Meurs http://mjmrsc.com/research/ twitter.com/mjmrsc +* +* This software is free to use, modify and redistribute under Creative Commons by-nc/3.0 License Term +* http://creativecommons.org/licenses/by-nc/3.0/ +*/ + + + + +package arffvector; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import org.apache.commons.lang3.StringUtils; +import configure.PathConstants; + +/** + * Uses the features extracted and the + * generated corpus to create a feature vector + * (a matrix representation of the corpus) + * + * + */ +public class CreateVector { + + ArrayList annotations = new ArrayList(); + ArrayList annotationsType = new ArrayList(); + ArrayList journalTitles = new ArrayList(); + ArrayList ecnumbers = new ArrayList(); + ArrayList titleGrams = new ArrayList(); + ArrayList titleAnnot = new ArrayList(); + ArrayList nGrams = new ArrayList(); + + PathConstants pathVars = null; + + /** + * Constructor to load all features extracted + * from training files. These features will be + * used to generate the ARFF header and the + * ARFF vector lines. 
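+ *
+ * Each feature file is read as one tab-separated entry per
+ * line. As an illustration (hypothetical values, not taken
+ * from a real feature file), an annotation feature file
+ * could look like:
+ * <pre>
+ * laccase	Enzyme
+ * glucose	Substrate
+ * </pre>
+ * where the loaders below read column 0 for annotation
+ * content, and column 1 for annotation types and journal titles.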
+ * + * @param extVars Variables holding system paths + */ + + public CreateVector(PathConstants extVars) { + + pathVars = extVars; + + String pathJournalT = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.JOURNAL_TITLE_FEATURES; + try{ + String journalT = ""; + + //receiving journal title + BufferedReader reader = new BufferedReader(new FileReader(pathJournalT)); + int featcount = 0; + while (( journalT = reader.readLine()) != null) { + + if (Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)){ + + String[] features = StringUtils.split(journalT,"\n"); + + for(int i = 0; i < features.length; i++){ + + String[] featurename = StringUtils.split(features[i],"\t"); + + //checking for journal titles duplicates + if(featurename[1] != "" && !(journalTitles.contains(featurename[1]))){ + journalTitles.add(featurename[1]); + } + } + } + if ( featcount >= Integer.parseInt(pathVars.NB_PARAMS) && Integer.parseInt(pathVars.NB_PARAMS) != -1 ) { break;} + + } + reader.close(); + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + String pathAnnotations = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.ANNOTATION_FEATURES; + String pathTitleAnnot = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.TITLE_FEATURES; + + try{ + String abstAnnot = ""; + String tAnnot = ""; + + //receiving abstract annotations (features) + BufferedReader reader = new BufferedReader(new FileReader(pathAnnotations)); + BufferedReader readerT = new BufferedReader(new FileReader(pathTitleAnnot)); + + int featcount = 0; + + while (( abstAnnot = reader.readLine()) != null) { + + if (Boolean.valueOf(pathVars.USE_ANNOTATION_FEATURE)){ + String[] features = StringUtils.split(abstAnnot,"\n"); + + for(int i = 0; i < features.length; i++){ + + String[] featurename = StringUtils.split(features[i],"\t"); + + //checking for duplicate abstract annotations + if(featurename[0] != "" && !(annotations.contains(featurename[0]))){ + annotations.add(featurename[0]); + } + } + } + if ( featcount >= Integer.parseInt(pathVars.NB_PARAMS) && Integer.parseInt(pathVars.NB_PARAMS) != -1 ) { break;} + } + + + if(!(Boolean.valueOf(pathVars.USE_TITLE_FEATURE))){ + while((tAnnot = readerT.readLine()) != null){ + + String[] features = StringUtils.split(tAnnot,"\n"); + + for(int i = 0; i < features.length; i++){ + + String[] featurename = StringUtils.split(features[i],"\t"); + + //checking for duplicate annotations + if(featurename[0] != "" && !(annotations.contains(featurename[0]))){ + annotations.add(featurename[0]); + } + } + + } + + } + + reader.close(); + readerT.close(); + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + try{ + String abstAnType = ""; + + //receiving abstract annotation types + BufferedReader reader = new BufferedReader(new FileReader(pathAnnotations)); + int featcount = 0; + while (( abstAnType = reader.readLine()) != null) { + + if (Boolean.valueOf(pathVars.USE_ANNOTATION_TYPE)){ + + String[] features = StringUtils.split(abstAnType,"\n"); + + for(int i = 0; i < features.length; i++){ + + String[] featurename = StringUtils.split(features[i],"\t"); + + //checking for duplicate abstract annotation types + if(featurename[1] != "" && !(annotationsType.contains(featurename[1]))){ + annotationsType.add(featurename[1]); + } + + } + } + if ( featcount >= Integer.parseInt(pathVars.NB_PARAMS) && Integer.parseInt(pathVars.NB_PARAMS) != -1 ) { break;} + + } + reader.close(); + } + catch 
(FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + + + try{ + String titAnnot = ""; + + //receiving title annotations (features) + BufferedReader reader = new BufferedReader(new FileReader(pathTitleAnnot)); + // int featcount = 0; + while (( titAnnot = reader.readLine()) != null) { + + if(Boolean.valueOf(pathVars.USE_TITLE_FEATURE)){ + + //String titAnnot = FeatureExtractor.getTitCount(); + + String[] features = StringUtils.split(titAnnot,"\n"); + + for(int i = 0; i < features.length; i++){ + String[] featurename = StringUtils.split(features[i],"\t"); + + //checking for duplicate title annotations + if(!(titleAnnot.contains(featurename[0]))){ + titleAnnot.add(featurename[0]); + } + } + } + } + reader.close(); + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + + String pathECNumFeatures = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.ECNUM_FEATURES; + + try{ + String ECNum = ""; + + //receiving EC numbers (features) + BufferedReader reader = new BufferedReader(new FileReader(pathECNumFeatures)); + // int featcount = 0; + while ((ECNum = reader.readLine()) != null) { + + if(Boolean.valueOf(pathVars.USE_ECNUM_FEATURE)){ + + //String titAnnot = FeatureExtractor.getTitCount(); + + String[] features = StringUtils.split(ECNum,"\n"); + + for(int i = 0; i < features.length; i++){ + String[] featurename = StringUtils.split(features[i],"\t"); + + //checking for duplicate EC numbers + if(!(ecnumbers.contains(featurename[0]))){ + ecnumbers.add(featurename[0]); + } + } + } + } + reader.close(); + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + + String pathTitleGrams = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.TITLE_NGRAMS; + + + try{ + String titCont = ""; + // String grams = ""; + + //receiving title ngrams + BufferedReader reader = new BufferedReader(new FileReader(pathTitleGrams)); + + int featcount = 0; + while (( titCont = reader.readLine()) != null) { + + if(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)){ + + String[] content = StringUtils.split(titCont,"\n"); + + for(int i = 0; i < content.length; i++){ + String[] featurename = StringUtils.split(content[i],"\t"); + + //check for duplicate title ngrams + if(!(titleGrams.contains(featurename[0]))){ + titleGrams.add(featurename[0]); + } + } + } + } + + reader.close(); + + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + String pathNgrams = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.NGRAM_FEATURES; + try{ + String grams = ""; + String tgrams = ""; + + //receiving ngrams + BufferedReader reader = new BufferedReader(new FileReader(pathNgrams)); + BufferedReader readerT = new BufferedReader(new FileReader(pathTitleGrams)); + + // int featcount = 0; + while (( grams = reader.readLine()) != null) { + + if(Boolean.valueOf(pathVars.USE_NGRAM_FEATURE)){ + + String[] features = StringUtils.split(grams,"\n"); + + for(int i = 0; i < features.length; i++){ + String[] featurename = StringUtils.split(features[i],"\t"); + + //check for duplicate abstract ngrams + if(!(nGrams.contains(featurename[0]))){ + nGrams.add(featurename[0]); + } + } + } + + } + + //if not using title grams separately, + // then insert title grams with abstract grams. 
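+ // e.g. with USE_TITLE_NGRAMS=false a title bigram such as
+ // "xylanase activity" (hypothetical) is merged into the shared
+ // nGrams list; with USE_TITLE_NGRAMS=true it stays in titleGrams
+ // and gets its own titleNgram attribute in genArffHeader below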
+ if (!(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS))){ + while (( tgrams = readerT.readLine()) != null) { + + String[] features = StringUtils.split(tgrams,"\n"); + + for(int i = 0; i < features.length; i++){ + String[] featurename = StringUtils.split(features[i],"\t"); + + //check for duplicate ngrams + if(!(nGrams.contains(featurename[0]))){ + nGrams.add(featurename[0]); + } + } + } + } + + reader.close(); + readerT.close(); + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + } + + /** + * Gathers the list of features, according to + * experimental configurations. The list of + * features will be written on the ARFF header. + * + * @param pathVars Variables holding system paths + * @param exp experiment type: train or test + * @return a String containing the ARFF header + */ + + public String genArffHeader(PathConstants pathVars, int exp){ + + StringBuilder headerArff = new StringBuilder(); + + switch(exp){ + case 0: + headerArff.append("% Weka training file - mycoCLAP triage - CSFG 2014\n\n"); + break; + case 1: + headerArff.append("% Weka test file - mycoCLAP triage - CSFG 2014\n\n"); + break; + } + + headerArff.append("@RELATION triage\n"); + + if (Boolean.valueOf(pathVars.USE_TEXT_SIZE)){ + // writing the list of text sizes + headerArff.append("@ATTRIBUTE sizeoftitle \tREAL \t\t%size of title\n"); + headerArff.append("@ATTRIBUTE sizeoftext \tREAL \t\t%size of text\n"); + } + + if(Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)){ + for(int i = 0; i < journalTitles.size(); i++){ + // writing list of journal titles + String feature = journalTitles.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + namefeature = namefeature.replaceAll("<|>", ""); + String ref = "journalTitle" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + + } + } + + if (Boolean.valueOf(pathVars.USE_ANNOTATION_FEATURE)){ + // writing list of annotation features + for(int i = 0; i < annotations.size(); i++){ + + String feature = annotations.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + namefeature = namefeature.replaceAll("<|>", ""); + String ref = "annotation" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + + } + } + + if(Boolean.valueOf(pathVars.USE_ANNOTATION_TYPE)){ + // writing list of annotation entities + for(int i = 0; i < annotationsType.size(); i++){ + String feature = annotationsType.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + namefeature = namefeature.replaceAll("<|>", ""); + String ref = "annotationType" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + + } + } + + if(Boolean.valueOf(pathVars.USE_TITLE_FEATURE)){ + // write list of title features + for( int i = 0; i < titleAnnot.size(); i++){ + + String feature = titleAnnot.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + namefeature = namefeature.replaceAll("<|>", ""); + String ref = "titleAnnot" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + + } + + } + + if(Boolean.valueOf(pathVars.USE_ECNUM_FEATURE)){ + // writing list of EC numbers + 
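+ // (illustrative) a stored EC number such as 3.2.1.4 yields the
+ // header line: @ATTRIBUTE ECnumber03.2.1.4	REAL		%3.2.1.4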
for(int i = 0; i < ecnumbers.size(); i++){ + String feature = ecnumbers.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + namefeature = namefeature.replaceAll("<|>", ""); + String ref = "ECnumber" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + } + } + + if (Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)){ + // writing list of ngrams on titles + for( int i = 0; i < titleGrams.size(); i++){ + + String feature = titleGrams.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + namefeature = namefeature.replaceAll("<|>", ""); + String ref = "titleNgram" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + + } + } + + if (Boolean.valueOf(pathVars.USE_NGRAM_FEATURE)){ + // write list of ngrams + for(int i = 0; i < nGrams.size(); i++){ + + String feature = nGrams.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + String ref = "Ngram" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + + } + } + + // writing the dataset classes + headerArff.append("@ATTRIBUTE class {positive, negative}\n"); + headerArff.append("@DATA\n"); + + return headerArff.toString(); + } + + /** + * Iterates over the list of features and + * counts number of features containing + * on a given document. + * + * @param jTitle title of journal + * @param title title of paper + * @param text abstract content + * @param ecnum paper EC numbers + * @param classTriage triage classification: positive or negative + * @param exp experiment type: train or test + * @return String holding counts for all features found in a document + */ + + public String getArffLine(String jTitle, String title, String text, String ecnum, String classTriage, int exp){ + //String vectorArff = ""; + StringBuilder vectorArff = new StringBuilder(); + + text = removeSpecialChar(text.toLowerCase()); + title = removeSpecialChar(title.toLowerCase()); + jTitle = removeSpecialChar(jTitle.toLowerCase()); + ecnum = removeSpecialChar(ecnum); + + int emptyabs = 0; + + // fill title and text sizes (number of words) + // annotation markups do not matter because + // they do not introduce blank spaces hence + // they do not modify the number of words found + if (Boolean.valueOf(pathVars.USE_TEXT_SIZE)){ + + String[] titleGrams = StringUtils.split(title," "); + int titlesize = titleGrams.length; + + String[] abstractcontent = StringUtils.split(text," "); + int abstractsize = abstractcontent.length; + + if(abstractsize == 1){ + emptyabs++; + } + + vectorArff.append(titlesize).append(",").append(abstractsize).append(","); + } + + //fill values of journal titles + if(Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)){ + + for(int i = 0; i < journalTitles.size(); i++){ + String jfeat = ""; + int jfeatcount = 0; + jfeat = journalTitles.get(i).replaceFirst(" ", ""); + + if(jTitle.contains(jfeat)){ + jfeatcount = StringUtils.countMatches(jTitle, jfeat); + vectorArff.append(jfeatcount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + } + + // fill values of annotation types taken into account + // either only the abstract or abstract and title + // adds on vector the count of occurrences + if (Boolean.valueOf(pathVars.USE_ANNOTATION_FEATURE)){ + + for(int i = 0; i < 
annotations.size(); i++){ + String anfeat = ""; + int anfeatcount = 0; + anfeat = annotations.get(i).replaceFirst(" ", "").toLowerCase(); + + //in case the text has current annotation + if (text.contains(anfeat)){ + //check the count of the annotation + if((Boolean.valueOf(pathVars.USE_TITLE_FEATURE))){ + anfeatcount = StringUtils.countMatches(text, anfeat); + } + //adding title annot count to annotations + else if (!(Boolean.valueOf(pathVars.USE_TITLE_FEATURE))){ + anfeatcount = StringUtils.countMatches(text, anfeat); + //in case title has annotation, add to count + if(title.contains(anfeat)){ + anfeatcount = anfeatcount + StringUtils.countMatches(title, anfeat); + } + } + vectorArff.append(anfeatcount).append(","); + } + //handles the case that only the title (but not abstract) has current annotation + else if((!(Boolean.valueOf(pathVars.USE_TITLE_FEATURE)))){ + if(title.contains(anfeat)){ + anfeatcount = StringUtils.countMatches(title, anfeat); + } + vectorArff.append(anfeatcount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + } + + //fill values of abstract annotation types + if(Boolean.valueOf(pathVars.USE_ANNOTATION_TYPE)){ + + for(int i = 0; i < annotationsType.size(); i++){ + String antype = ""; + int antypecount = 0; + antype = annotationsType.get(i).replaceFirst(" ", "").toLowerCase(); + + if (text.contains(antype)){ + //divided by 2 to match occurance + //(count considers open and close tags) + antypecount = (StringUtils.countMatches(text, antype))/2; + vectorArff.append(antypecount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + + } + + //fill values of title annotations + if (Boolean.valueOf(pathVars.USE_TITLE_FEATURE)){ + + for( int i =0; i < titleAnnot.size(); i++){ + String titfeat = ""; + int titfeatcount = 0; + titfeat = titleAnnot.get(i).replaceFirst(" ", "").toLowerCase(); + + if (title.contains(titfeat)){ + titfeatcount = StringUtils.countMatches(title, titfeat); + vectorArff.append(titfeatcount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + } + + if(Boolean.valueOf(pathVars.USE_ECNUM_FEATURE)){ + + for(int i = 0; i < ecnumbers.size(); i++){ + String ecfeat = ""; + int ecnumcount = 0; + ecfeat = ecnumbers.get(i); + + if(ecnum.contains(ecfeat)){ + ecnumcount = StringUtils.countMatches(ecnum, ecfeat); + vectorArff.append(ecnumcount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + } + + // fill only values of title ngrams + if(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)){ + + String cleanTitle = removeTags(title.toLowerCase()); + + for( int i =0; i < titleGrams.size(); i++){ + String titgram = ""; + int titgramcount = 0; + titgram = titleGrams.get(i).toLowerCase(); + + //in case the title has current ngram + if (cleanTitle.contains(titgram)){ + //check the count of the ngram + titgramcount = StringUtils.countMatches(cleanTitle, titgram); + + //adding weight to current ngram count + if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)){ + titgramcount = applyWeight(titgramcount, Integer.parseInt(pathVars.WEIGHT)); + } + vectorArff.append(titgramcount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + } + + // fill values of ngrams + if (Boolean.valueOf(pathVars.USE_NGRAM_FEATURE)){ + String cleanText = removeTags(text.toLowerCase()); + String cleanTitle = removeTags(title.toLowerCase()); + + for( int i = 0; i < nGrams.size(); i++){ + String ngramfeat = ""; + int ngramcount = 0; + ngramfeat = nGrams.get(i).toLowerCase(); + + //in case the text has current ngram + if (cleanText.contains(ngramfeat)){ + //check 
the count of the ngram + if(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)){ + ngramcount = StringUtils.countMatches(cleanText, ngramfeat); + + //adding weight to current ngram count + if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)){ + ngramcount = applyWeight(ngramcount, Integer.parseInt(pathVars.WEIGHT)); + } + } + //checking if title ngrams should be added to the count + else if(!(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS))){ + ngramcount = StringUtils.countMatches(cleanText, ngramfeat); + + //in case title has ngram, add to count + if(cleanTitle.contains(ngramfeat)){ + ngramcount += StringUtils.countMatches(cleanTitle, ngramfeat); + } + + //adding weight to current ngram count + if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)){ + ngramcount = applyWeight(ngramcount, Integer.parseInt(pathVars.WEIGHT)); + } + } + + vectorArff.append(ngramcount).append(","); + } + ////handles the case that only the title (but not abstract) has current ngram + else if (!(cleanText.contains(ngramfeat))){ + //in case only the title has the ngram, add to count + if(cleanTitle.contains(ngramfeat)){ + ngramcount = StringUtils.countMatches(cleanTitle, ngramfeat); + + //adding weight to ngram count + if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)){ + ngramcount = applyWeight(ngramcount, Integer.parseInt(pathVars.WEIGHT)); + } + } + vectorArff.append(ngramcount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + } + + + //if(exp == 0){ + if (classTriage.contains("positive")){ + vectorArff.append("positive"); + //vectorArff.append("?"); + } + else { + vectorArff.append("negative"); + //vectorArff.append("?"); + } + //} + + /*else if (exp == 1){ + vectorArff.append("?"); + } */ + + return vectorArff.toString(); + } + + /** + * Cleans a given String from special characters + * + * @param str String to be cleaned + * @return String without special characters + */ + + public String removeSpecialChar(String str){ + str = str.replace("}", ""); + str = str.replace("{", ""); + str = str.replace("]", ""); + str = str.replace("[", ""); + str = str.replace("#", ""); + str = str.replace("*", ""); + str = str.replace(">", ""); + str = str.replace(""", ""); + str = str.replace("&apos", ""); + str = str.replace("%", ""); + str = str.replace("/", ""); + str = str.replace("\\", ""); + str = str.replace("&", ""); + str = str.replace("=", ""); + str = str.replace("?", ""); + str = str.replace(",", ""); + str = str.replace(":", ""); + str = str.replace(";", ""); + str = str.replace(".", ""); + str = str.replace(")", ""); + str = str.replace("(", ""); + str = str.replace("\t\t", "\t"); + str = str.replace("-", ""); + str = str.replace(" ", ""); + + return str; + } + + /** + * + * @param str + * @return + */ + public String removeTags(String str){ + String[] remove = StringUtils.split(str,""); + StringBuilder sb = new StringBuilder(); + + for(int i = 0; i < remove.length; i++){ + + if(remove[i].equalsIgnoreCase("<")){ + do{ + i++; + } + while(!(remove[i].equalsIgnoreCase(">"))); + } + else sb.append(remove[i]); + } + + return sb.toString(); + } + + public int applyWeight(int count, int weight){ + + if(weight > 0){ + count = count * weight; + } + return count; + } + + + public String informFeatures(PathConstants pathVars){ + String value = ""; + if(Boolean.valueOf(pathVars.USE_ANNOTATION_FEATURE)) + value = value + "_annotations"; + if(Boolean.valueOf(pathVars.USE_ANNOTATION_TYPE)) + value = value + "_types"; + if(Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)) + value = value + "_journal"; + 
if(Boolean.valueOf(pathVars.USE_TITLE_FEATURE) || Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)) + value = value + "_title"; + if(Boolean.valueOf(pathVars.USE_ECNUM_FEATURE)) + value = value + "_ecnum"; + if(Boolean.valueOf(pathVars.USE_NGRAM_FEATURE)) + value = value + "_ngrams_size"+ pathVars.NGRAM_SIZE; + if(Boolean.valueOf(pathVars.USE_NGRAM_FEATURE) && Boolean.valueOf(pathVars.NGRAM_STOP)) + value = value + "_stopwords"; + if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)) + value = value + "_weight"+ pathVars.WEIGHT; + + return value; + } + + +} diff --git a/src/classifier/.gitignore b/src/classifier/.gitignore new file mode 100644 index 0000000..b92cc15 --- /dev/null +++ b/src/classifier/.gitignore @@ -0,0 +1,3 @@ +/test.class +/train.class +/Trainer.class diff --git a/src/classifier/Trainer.java b/src/classifier/Trainer.java new file mode 100644 index 0000000..7417982 --- /dev/null +++ b/src/classifier/Trainer.java @@ -0,0 +1,315 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + +package classifier; +import java.util.Random; + +import weka.attributeSelection.LatentSemanticAnalysis; +import weka.attributeSelection.PrincipalComponents; +import weka.attributeSelection.GainRatioAttributeEval; +import weka.attributeSelection.CorrelationAttributeEval; +import weka.attributeSelection.Ranker; +import weka.classifiers.Classifier; +import weka.classifiers.CostMatrix; +import weka.classifiers.Evaluation; +import weka.classifiers.bayes.NaiveBayes; +import weka.classifiers.functions.LibSVM; +import weka.classifiers.meta.AttributeSelectedClassifier; +import weka.classifiers.meta.CostSensitiveClassifier; +import weka.classifiers.trees.LMT; +import weka.core.Instances; +import weka.core.converters.ConverterUtils.DataSource; +import configure.PathConstants; + +/** + * Trains and tests a classifier, + * executes k-fold cross validation on train data + * and outputs the classification results. 
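+ *
+ * A minimal usage sketch (illustrative; assumes the train and test
+ * ARFF files configured in config.cfg already exist):
+ * <pre>
+ * PathConstants pathVars = new PathConstants();
+ * Trainer evaluator = new Trainer();
+ * DataSource sourceTrain = new DataSource(pathVars.HOME_DIR
+ *     + pathVars.OUTPUT_MODEL + pathVars.TRAIN_DIR + pathVars.ARFF_TRAIN);
+ * Instances trainData = sourceTrain.getDataSet();
+ * trainData.setClassIndex(trainData.numAttributes() - 1);
+ * DataSource sourceTest = new DataSource(pathVars.HOME_DIR
+ *     + pathVars.OUTPUT_MODEL + pathVars.TEST_DIR + pathVars.ARFF_TEST);
+ * Instances testData = sourceTest.getDataSet();
+ * testData.setClassIndex(testData.numAttributes() - 1);
+ * evaluator.classify(trainData, testData, new LMT());
+ * </pre>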
+ *
+ * @author halmeida
+ *
+ */
+
+public class Trainer {
+
+ public static int SEED = 1; //the seed for randomizing the data
+ public static int FOLDS = 5; //the # of folds to generate
+ double[][] ranking;
+ String rank;
+
+ /**
+ * @param args
+ * @throws Exception
+ */
+ public static void main(String[] args) throws Exception {
+
+ PathConstants pathVars = new PathConstants();
+ Trainer evaluator = new Trainer();
+
+ //Creating classifier
+ Classifier cls = (Classifier) new LMT();
+ //Classifier cls = (Classifier) new NaiveBayes();
+ //Classifier cls = (Classifier) new LibSVM();
+
+ //Loading train data
+ DataSource sourceTrain = new DataSource(pathVars.HOME_DIR + pathVars.OUTPUT_MODEL + pathVars.TRAIN_DIR + pathVars.ARFF_TRAIN);
+ Instances trainData = sourceTrain.getDataSet();
+
+ //Flagging the class index on the training data
+ trainData.setClassIndex(trainData.numAttributes()-1);
+ System.out.println("Class index set on training data.");
+
+ System.out.println("Training data loaded. Number of instances: " + trainData.numInstances() + "\n");
+
+ //Executing k-fold cross validation
+ //evaluator.crossFold(trainData, cls);
+
+ //Loading test data
+ DataSource sourceTest = new DataSource(pathVars.HOME_DIR + pathVars.OUTPUT_MODEL + pathVars.TEST_DIR + pathVars.ARFF_TEST);
+ Instances testData = sourceTest.getDataSet();
+
+ //Flagging the class index on the test data
+ testData.setClassIndex(testData.numAttributes()-1);
+ System.out.println("Class index set on testing data.");
+
+ System.out.println("Test data loaded. Number of instances: " + testData.numInstances() + "\n");
+
+ //Creating filtered classifiers
+ //AttributeSelectedClassifier PCAclassifier = evaluator.setPCAFilter(cls);
+ //AttributeSelectedClassifier LSAclassifier = evaluator.setLSAFilter(cls);
+ //AttributeSelectedClassifier GRclassifier = evaluator.setGRFilter(cls);
+ //AttributeSelectedClassifier Corrclassifier = evaluator.setCorrFilter(cls);
+
+ //Training and testing classifier
+ evaluator.classify(trainData, testData, cls);
+
+ //Training and testing costSensitive classifier
+ //evaluator.classify(trainData, testData, evaluator.classifySensitive(cls));
+
+ //Executing k-fold cross validation on filtered classifiers
+ //evaluator.crossFold(trainData, PCAclassifier);
+ //evaluator.crossFold(trainData, LSAclassifier);
+
+ }
+
+ /**
+ * Trains and tests a classifier when two separate
+ * datasets are provided.
+ *
+ * @param train training data to build the classifier
+ * @param test test data to evaluate the classifier
+ * @param classif type of classifier applied
+ * @throws Exception
+ */
+ public void classify(Instances train, Instances test, Classifier classif) throws Exception{
+
+ classif.buildClassifier(train);
+ Evaluation evaluateClassifier = new Evaluation(train);
+ evaluateClassifier.evaluateModel(classif, test);
+
+ stats(evaluateClassifier, classif);
+ }
+
+ /**
+ * Wraps a classifier in a cost-sensitive classifier
+ * built on a fixed cost matrix.
+ *
+ * @param classif type of classifier to be wrapped
+ * @return CostSensitive classifier with costs and classifier
+ * @throws Exception
+ */
+ public CostSensitiveClassifier classifySensitive(Classifier classif) throws Exception{
+ CostSensitiveClassifier costSensitive = new CostSensitiveClassifier();
+ CostMatrix matrix = new CostMatrix(2);
+ //misclassifying a positive document (row 0) as negative costs 4,
+ //misclassifying a negative document (row 1) as positive costs 1
+ matrix.setElement(0, 1, 4);
+ matrix.setElement(1, 0, 1);
+ costSensitive.setClassifier(classif);
+ costSensitive.setCostMatrix(matrix);
+
+ return costSensitive;
+ }
+
+ /**
+ * Outputs classifier results.
+ *
+ * @param eval Evaluation model built by a classifier
+ * @param classif type of classifier applied
+ * @throws Exception
+ */
+ public void stats(Evaluation eval, Classifier classif) throws Exception{
+ System.out.println("Number of attributes: " + eval.getHeader().numAttributes());
+ System.out.println(eval.toSummaryString("\n======== RESULTS ========\n", false));
+ System.out.println(eval.toClassDetailsString("\n\n======== Detailed accuracy by class ========\n"));
+ System.out.println(eval.toMatrixString("\n\n======== Confusion Matrix ========\n"));
+ }
+
+ /**
+ * Executes k-fold cross validation
+ * on a given dataset.
+ * @param data training data provided
+ * @param classif type of classifier used
+ * @throws Exception
+ */
+ public void crossFold(Instances data, Classifier classif) throws Exception{
+
+ Random random = new Random(SEED); //creating seed number generator
+ Evaluation evaluateClassifier = new Evaluation(data);
+
+ System.out.println("Classifier working...\n\n");
+ //The classifier must not be trained before cross-validation is executed,
+ //because subsequent calls to the buildClassifier method would always return the same results.
+ evaluateClassifier.crossValidateModel(classif, data, FOLDS, random);
+
+ stats(evaluateClassifier, classif);
+ }
+
+ /**
+ * Implements a filtered GainRatio classifier,
+ * using the ranker as a search method.
+ *
+ * @param classif type of classifier to be used
+ * @return filtered classif with GainRatio analysis
+ */
+ public AttributeSelectedClassifier setGRFilter(Classifier classif){
+ AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier();
+
+ //Creating evaluator and search method
+ GainRatioAttributeEval GR = new GainRatioAttributeEval();
+ Ranker rank = new Ranker();
+ //return the attributes with evaluation greater than 0
+ double threshold = 0.0;
+ rank.setThreshold(threshold);
+
+ //Setting GainRatio filtered classifier
+ fClassif.setClassifier(classif);
+ fClassif.setEvaluator(GR);
+ fClassif.setSearch(rank);
+
+ return fClassif;
+ }
+
+ /**
+ * Implements a filtered Correlation classifier,
+ * using the ranker as a search method.
+ *
+ * @param classif type of classifier to be used
+ * @return filtered classif with Correlation analysis
+ */
+ public AttributeSelectedClassifier setCorrFilter(Classifier classif){
+ AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier();
+
+ //Creating evaluator and search method
+ CorrelationAttributeEval Corr = new CorrelationAttributeEval();
+ Ranker rank = new Ranker();
+
+ //return the attributes with evaluation greater than 0.03
+ double threshold = 0.03;
+ rank.setThreshold(threshold);
+
+ //Setting Correlation filtered classifier
+ fClassif.setClassifier(classif);
+ fClassif.setEvaluator(Corr);
+ fClassif.setSearch(rank);
+
+ return fClassif;
+ }
+
+ /**
+ * Implements a filtered PCA classifier,
+ * using the ranker as a search method.
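+ * For instance (illustrative), the filtered classifier can be
+ * cross-validated in place of the plain one:
+ * <pre>
+ * AttributeSelectedClassifier pca = evaluator.setPCAFilter(new LMT());
+ * evaluator.crossFold(trainData, pca);
+ * </pre>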
+ * + * @param classif type of classifier to be used + * @return filtered classif with PCA analysis config + */ + public AttributeSelectedClassifier setPCAFilter(Classifier classif){ + AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); + + //Creating evaluator and search method + PrincipalComponents PCA = new PrincipalComponents(); + PCA.setMaximumAttributeNames(-1); + Ranker rank = new Ranker(); + //return the attributes with evaluation greater than 0 + rank.setThreshold(0); + + //Setting the PCA classifier configurations + fClassif.setClassifier(classif); + fClassif.setEvaluator(PCA); + fClassif.setSearch(rank); + + return fClassif; + } + + /** + * Implements a Filtered LSA classifier, + * using the ranker as a search method + * @param classif + * @return + */ + + private AttributeSelectedClassifier setLSAFilter(Classifier classif) { + AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); + + //Creating evaluator + LatentSemanticAnalysis LSA = new LatentSemanticAnalysis(); + LSA.setMaximumAttributeNames(-1); + //value between 0 and 1 includes proportion of total latent variables + //greater than 1 = exact # of variables to include; + //less than or equal zero = include all; + //default = 0.95 (proportional) + double defaul = 0; + LSA.setRank(defaul); + //Creating search method + Ranker rank = new Ranker(); + rank.setThreshold(0); + + //Setting the LSA classifier configurations + fClassif.setClassifier(classif); + fClassif.setEvaluator(LSA); + fClassif.setSearch(rank); + + return fClassif; + } + + + +} diff --git a/src/configure/.gitignore b/src/configure/.gitignore new file mode 100644 index 0000000..26ecd44 --- /dev/null +++ b/src/configure/.gitignore @@ -0,0 +1,2 @@ +/DeprecatedVariables.class +/PathConstants.class diff --git a/src/configure/PathConstants.java b/src/configure/PathConstants.java new file mode 100644 index 0000000..2103118 --- /dev/null +++ b/src/configure/PathConstants.java @@ -0,0 +1,180 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*** +* This class re-uses https://code.google.com/p/semlinker/source/browse/trunk/src/configure/NistKBPConfiguration.java +* The code authors: Eric Charton http://www.echarton.com twitter.com/ericcharton +* Marie-Jean Meurs http://mjmrsc.com/research/ twitter.com/mjmrsc +* +* This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License +* as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. +*/ + +package configure; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.logging.Level; +import java.util.logging.Logger; + +/** + * + * Variables used by the software + * + * @author Marie-Jean Meurs + * @since 2013 + * + */ +public class PathConstants { + + /** + * Default constructor + */ + public PathConstants() { + initVars(); + } + + /** + * Constructor with custom parameter file. + * @param configfile + */ + public PathConstants(String configfile) { + CONFIG_FILE = configfile; + initVars(); + } + + + public static String CONFIG_FILE = "config.cfg"; + public HashMap CONFIG_MAP = new HashMap(); + + //Input files + public String HOME_DIR; + public String CORPUS_DIR; + public String TRAIN_DIR; + public String TEST_DIR; + public String FEATURE_DIR; + public String OUTPUT_MODEL; + public String TRAINING_FILE; + public String TEST_FILE; + public String ARFF_TRAIN; + public String ARFF_TEST; + public String STOP_LIST; + + //Output files + public String JOURNAL_TITLE_FEATURES; + public String ECNUM_FEATURES; + public String ANNOTATION_FEATURES; + public String TITLE_FEATURES; + public String NGRAM_FEATURES; + public String TITLE_NGRAMS; + + //Feature setup + public String USE_TEXT_SIZE; + public String USE_JOURNAL_TITLE_FEATURE; + public String USE_ECNUM_FEATURE; + public String FEATURE_MIN_FREQ; + public String FEATURE_MIN_LENGTH; + + //Feature setup - Annotations + public String USE_ANNOTATION_FEATURE; + public String USE_ANNOTATION_TYPE; + public String USE_TITLE_FEATURE; + + + //Feature setup - Ngrams + public String USE_NGRAM_FEATURE; + public String USE_TITLE_NGRAMS; + public String NGRAM_STOP; + public String NGRAM_SIZE; + public String USE_WEIGHTED_NGRAM; + public String WEIGHT; + + //Task setup + public String EXP_TYPE; + public String NB_PARAMS; + + + private void initVars() { + String text = null; + + try { + BufferedReader reader = new BufferedReader(new FileReader(CONFIG_FILE)); + while ((text = reader.readLine()) != null) { + if (! 
text.startsWith("#")) { + String label = text.split("=")[0]; + String value = text.split("=")[1]; + CONFIG_MAP.put(label, value); + } + } + reader.close(); + } catch (IOException ex) { + Logger.getLogger(PathConstants.class.getName()).log(Level.SEVERE, null, ex); + } + HOME_DIR = CONFIG_MAP.get("HOME_DIR"); + CORPUS_DIR = CONFIG_MAP.get("CORPUS_DIR"); + TRAIN_DIR = CONFIG_MAP.get("TRAIN_DIR"); + TEST_DIR = CONFIG_MAP.get("TEST_DIR"); + FEATURE_DIR = CONFIG_MAP.get("FEATURE_DIR"); + OUTPUT_MODEL = CONFIG_MAP.get("OUTPUT_MODEL"); + TRAINING_FILE = CONFIG_MAP.get("TRAINING_FILE"); + TEST_FILE = CONFIG_MAP.get("TEST_FILE"); + ARFF_TRAIN = CONFIG_MAP.get("ARFF_TRAIN"); + ARFF_TEST = CONFIG_MAP.get("ARFF_TEST"); + STOP_LIST = CONFIG_MAP.get("STOP_LIST"); + + JOURNAL_TITLE_FEATURES = CONFIG_MAP.get("JOURNAL_TITLE_FEATURES"); + ECNUM_FEATURES = CONFIG_MAP.get("ECNUM_FEATURES"); + ANNOTATION_FEATURES = CONFIG_MAP.get("ANNOTATION_FEATURES"); + TITLE_FEATURES = CONFIG_MAP.get("TITLE_FEATURES"); + NGRAM_FEATURES = CONFIG_MAP.get("NGRAM_FEATURES"); + TITLE_NGRAMS = CONFIG_MAP.get("TITLE_NGRAMS"); + + USE_TEXT_SIZE = CONFIG_MAP.get("USE_TEXT_SIZE"); + USE_JOURNAL_TITLE_FEATURE = CONFIG_MAP.get("USE_JOURNAL_TITLE_FEATURE"); + USE_ECNUM_FEATURE = CONFIG_MAP.get("USE_ECNUM_FEATURE"); + FEATURE_MIN_FREQ = CONFIG_MAP.get("FEATURE_MIN_FREQ"); + FEATURE_MIN_LENGTH = CONFIG_MAP.get("FEATURE_MIN_LENGTH"); + + USE_ANNOTATION_FEATURE = CONFIG_MAP.get("USE_ANNOTATION_FEATURE"); + USE_ANNOTATION_TYPE = CONFIG_MAP.get("USE_ANNOTATION_TYPE"); + USE_TITLE_FEATURE = CONFIG_MAP.get("USE_TITLE_FEATURE"); + + USE_NGRAM_FEATURE = CONFIG_MAP.get("USE_NGRAM_FEATURE"); + USE_TITLE_NGRAMS = CONFIG_MAP.get("USE_TITLE_NGRAMS"); + NGRAM_STOP = CONFIG_MAP.get("NGRAM_STOP"); + NGRAM_SIZE = CONFIG_MAP.get("NGRAM_SIZE"); + USE_WEIGHTED_NGRAM = CONFIG_MAP.get("USE_WEIGHTED_NGRAM"); + WEIGHT = CONFIG_MAP.get("WEIGHT"); + + EXP_TYPE = CONFIG_MAP.get("EXP_TYPE"); + NB_PARAMS = CONFIG_MAP.get("NB_PARAMS"); + + } +} diff --git a/stopList.txt b/stopList.txt new file mode 100644 index 0000000..d42a69c --- /dev/null +++ b/stopList.txt @@ -0,0 +1 @@ +a,about,again,all,almost,also,although,always,among,an,and,another,any,are,as,at,be,because,been,before,being,between,both,but,by,can,could,did,do,does,done,due,during,each,either,enough,especially,etc,for,found,from,further,had,has,have,having,here,how,however,i,if,in,into,is,it,its,itself,just,kg,km,made,mainly,make,may,mg,might,ml,mm,most,mostly,must,nearly,neither,no,nor,obtained,of,often,on,our,overall,perhaps,pmid,quite,rather,really,regarding,seem,seen,several,should,show,showed,shown,shows,significantly,since,so,some,such,than,that,the,their,theirs,them,then,there,therefore,these,they,this,those,through,thus,to,upon,use,used,using,various,very,was,we,were,what,when,which,while,with,within,without,would