diff --git a/config-sample.cfg b/config-sample.cfg index 3c8295f..137b6b9 100644 --- a/config-sample.cfg +++ b/config-sample.cfg @@ -7,14 +7,11 @@ ################################################## ########################### DIRECTORIES ########## # project home -HOME_DIR=/. +HOME_DIR=/home/usr/mycosort-pck-version/ # # corpus directory CORPUS_DIR=corpus/ # -# source documents directory -SOURCE_DIR=src/ -# # duplicate documents directory DUP_DIR=test/ # @@ -37,6 +34,23 @@ FEATURE_DIR=features/ OUTPUT_MODEL=arff/ # ################################################# +###################### CORPUS SAMPLING ########## +# true if training set must be sampled +SAMPLE_TRAIN=false +# +# true if test set must be sampled +SAMPLE_TEST=false +# +# % of test corpus WRT the collection +PERCT_TEST=15 +# +# % positive on training set +PERCT_POS_TRAIN=50 +# +# % positive on test set +PERCT_POS_TEST=10 +# +################################################# ########################## INPUT FILES ########## # training file TRAINING_FILE=/triagecorpus_train.xml diff --git a/jar/README b/jar/README deleted file mode 100644 index 9a9b435..0000000 --- a/jar/README +++ /dev/null @@ -1,7 +0,0 @@ -Please add to this folder the following libraries: -commons-lang3-3.2.1.jar -jsoup-1.7.3.jar -weka.jar -LibSVM.jar -LibSVM/libsvm.jar - diff --git a/jar/README~ b/jar/README~ deleted file mode 100644 index 56f2ce9..0000000 --- a/jar/README~ +++ /dev/null @@ -1,7 +0,0 @@ -Please add to this folder the following libraries: -commons-lang3-3.2.1.jar -jsoup-1.7.3.jar -weka.jar -LibSVM.jar -libsvm.jar - diff --git a/src/.gitignore b/src/.gitignore deleted file mode 100644 index 1924ede..0000000 --- a/src/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -*.classpath -*.project -*.*~ diff --git a/src/analyse/.gitignore b/src/analyse/.gitignore deleted file mode 100644 index 6b468b6..0000000 --- a/src/analyse/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.class diff --git a/src/analyse/ConcatXML.java b/src/analyse/ConcatXML.java deleted file mode 100644 index 9c24173..0000000 --- a/src/analyse/ConcatXML.java +++ /dev/null @@ -1,734 +0,0 @@ -/* - * The MIT License (MIT) - -Copyright (c) 2014 - -Hayda Almeida -Marie-Jean Meurs - -Concordia University -Tsang Lab - - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - - -package analyse; - -import java.io.BufferedOutputStream; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.FileReader; -import java.io.FilenameFilter; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.io.PrintWriter; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.nio.file.StandardCopyOption; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.Date; -import java.util.List; - -import configure.PathConstants; - -/** - * Generates a corpus from raw XML doc instances, - * so that features can be extracted from it - * - * @author halmeida - * - */ -public class ConcatXML extends Extractor{ - - private String tag1; - private String tag2; - private String tag3; - - - public ConcatXML(){ - this.id = ""; - this.tag2 = ""; - this.tag3 = ""; - } - - public static void main(String[] args) throws IOException { - - PathConstants pathVars = new PathConstants(); - - String timeStamp = new SimpleDateFormat("yyyyMMdd_hh:mm").format(new Date()); - - String trainCorpusPath = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAIN_DIR +pathVars.TRAINING_FILE; - String xmlDir = "train"; - String sourceDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + "all_nbs/"+ xmlDir; - String duplicatesDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + "/src"+ "/annotated_GH27-36_2013_12_31"; - - String concatCorpus = pathVars.HOME_DIR + pathVars.CORPUS_DIR +"triagecorpus_"+ xmlDir +"_"+timeStamp+".xml"; - String tagCorpus = concatCorpus; - - ConcatXML concat = new ConcatXML(); - - //================= Checking for duplicates =====================// - //concat.checkDupCorpus(trainCorpusPath, sourceDir); - //concat.checkDupFolder(sourceDir, duplicatesDir); - - - //================== Creating corpus ==========================// - concat.cleanXML(sourceDir); - //concat.cleanXML(duplicatesDir); - concat.concatenateXML(sourceDir, "", concatCorpus); - concat.tagCorpus(tagCorpus); - } - - /** - * Reads the file IDs in a folder and - * checks a second folder for duplicates. 
- * - * @param dirSrc source folder - * @param dirDup folder to check for duplicates - */ - - public void checkDupFolder(String dirSrc, String dirDup){ - ArrayList sourceIDs = new ArrayList(); - ArrayList duplicated = new ArrayList(); - ArrayList dupIDs = new ArrayList(); - int ids = 0; - - if(dirSrc.contentEquals(dirDup)){ - System.out.println("Source and duplicates directories are the same.\n\n========================\n"); - } - else { - - File sourceDir = new File(dirSrc); - File[] srcXMLs = sourceDir.listFiles(new FilenameFilter(){ - @Override - public boolean accept(File dir, String name){ - return name.endsWith(".xml"); - } - }); - - try{ - //for each file on the source dir - for (File xml : srcXMLs){ - - try{ - BufferedReader reader = new BufferedReader(new FileReader(xml.getPath())); - - String line = null; - - String id = null; - - while((line = reader.readLine()) != null){ - - line = line.replaceAll("\t",""); - line = line.replace("\"", ""); - - //get the IDs of the new files - if (line.contains(getid())){ - - line = line.substring(line.indexOf(">", ""); - - id = line.replace(getendId(), ""); - - sourceIDs.add(id); - - line = reader.readLine(); - line = line.replaceAll("\t",""); - } - - if(line.contains(getOpenJournal())){ - ids++; - } - - line = line.replaceAll("\t",""); - line = line.replace("\"", ""); - } - - reader.close(); - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - - } - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch(Exception e){ - throw new RuntimeException(e); - } - - System.out.println(ids + " source file IDs encountered."); - ids = 0; - - File dupDir = new File(dirDup); - - File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){ - @Override - public boolean accept(File dir, String name){ - return name.endsWith(".xml"); - } - }); - - try{ - //for each file on the possibly duplicated dir - for (File xml : dupXMLs){ - - try{ - BufferedReader reader = new BufferedReader(new FileReader(xml.getPath())); - - String line = null; - - String id = null; - - while((line = reader.readLine()) != null){ - - line = line.replaceAll("\t",""); - line = line.replace("\"", ""); - - //get the IDs of the new files - if (line.contains(getid())){ - - line = line.substring(line.indexOf(">", ""); - - id = line.replace(getendId(), ""); - - dupIDs.add(id); - String dupFileID = id; - - for(int j = 0; j < sourceIDs.size(); j++){ - if(sourceIDs.get(j).equalsIgnoreCase(dupFileID)){ - //moving the original file - Path from = xml.toPath(); //convert from File to Path - Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path - Files.move(from, to, StandardCopyOption.REPLACE_EXISTING); - } - } - - - line = reader.readLine(); - line = line.replaceAll("\t",""); - } - - if(line.contains(getOpenJournal())){ - ids++; - } - - line = line.replaceAll("\t",""); - line = line.replace("\"", ""); - } - - reader.close(); - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - - } - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch(Exception e){ - throw new RuntimeException(e); - } - - //count number of existing papers on possibly duplicated folder - //just to make sure we are gathering all IDs - System.out.println(ids + " new file IDs encountered."); - ids = 0; - - //for each possible duplicated ID, - //check if it exists on source folder ID list - //if yes, list the duplicated ones - for(int i = 0; i < dupIDs.size(); i++){ - for(int j = 0; j < sourceIDs.size(); j++){ - 
if(sourceIDs.get(j).equalsIgnoreCase(dupIDs.get(i))){ - duplicated.add(dupIDs.get(i)); - } - } - } - - //sorting the list of duplicated IDs - Collections.sort(duplicated, new Comparator(){ - @Override - public int compare(String one, String two){ - return one.compareTo(two); - } - }); - - System.out.println("\nReaded source files: " + sourceIDs.size()); - System.out.println("Readed new files: " + dupIDs.size()); - - System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n"); - - System.out.println("\nDuplicated files IDs: "); - for(int i = 0; i < duplicated.size(); i++){ - System.out.println(duplicated.get(i)); - } - - System.out.println("\n========================\n"); - } - - - } - - /** - * Reads the corpus and checks the papers IDs - * to identify duplicates in case new papers - * are being concatenated to corpus. - * - * @param corpus path to current corpora to check - * @param dir path to folder with new files to be concatenated - */ - - public void checkDupCorpus(String corpus, String dir){ - ArrayList trainingIDs = new ArrayList(); - ArrayList duplicated = new ArrayList(); - ArrayList newFiles = new ArrayList(); - - int ids = 0; - - try - { - BufferedReader reader = new BufferedReader(new FileReader(corpus)); - - String line = null; - String id = null; - - - while((line = reader.readLine()) != null){ - - line = line.replaceAll("\t",""); - line = line.replace("\"", ""); - - //on the previous training corpus - //find exact paper ID and store it - if (line.contains(getid())){ - - line = line.substring(line.indexOf(">", ""); - - id = line.replace(getendId(), ""); - - //insert paper ID to existing training file list - trainingIDs.add(id); - - line = reader.readLine(); - line = line.replaceAll("\t",""); - } - - //count number of existing papers on the training file - //just to make sure we are gathering all IDs - if(line.contains(getOpenJournal())){ - ids++; - } - - line = line.replaceAll("\t",""); - line = line.replace("\"", ""); - } - - reader.close(); - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } - - System.out.println(ids + " training file IDs encountered."); - ids = 0; - - File corpusDir = new File(dir); - File[] newXMLs = corpusDir.listFiles(new FilenameFilter(){ - @Override - public boolean accept(File dir, String name){ - return name.endsWith(".xml"); - } - }); - - try{ - //for each file on the corpus dir - for (File xml : newXMLs){ - - try{ - BufferedReader reader = new BufferedReader(new FileReader(xml.getPath())); - - String line = null; - - String id = null; - - while((line = reader.readLine()) != null){ - - line = line.replaceAll("\t",""); - line = line.replace("\"", ""); - - //get the IDs of the new files - if (line.contains(getid())){ - - line = line.substring(line.indexOf(">", ""); - - id = line.replace(getendId(), ""); - - newFiles.add(id); - String newFileID = id; - - for(int j = 0; j < trainingIDs.size(); j++){ - if(trainingIDs.get(j).equalsIgnoreCase(newFileID)){ - //moving the original file - Path from = xml.toPath(); //convert from File to Path - Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path - Files.move(from, to, StandardCopyOption.REPLACE_EXISTING); - } - } - - - line = reader.readLine(); - line = line.replaceAll("\t",""); - } - - if(line.contains(getOpenJournal())){ - ids++; - } - - line = line.replaceAll("\t",""); - line = line.replace("\"", ""); - } - - reader.close(); - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - - } - 
- }catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch(Exception e){ - throw new RuntimeException(e); - } - - //count number of existing papers on the training file - //just to make sure we are gathering all IDs - System.out.println(ids + " new file IDs encountered."); - ids = 0; - - //for each new ID, check if it exists on training file ID list - //if yes, list the duplicated ones - for(int i = 0; i < newFiles.size(); i++){ - for(int j = 0; j < trainingIDs.size(); j++){ - if(trainingIDs.get(j).equalsIgnoreCase(newFiles.get(i))){ - duplicated.add(newFiles.get(i)); - } - } - } - - //sorting the list of duplicated IDs - Collections.sort(duplicated, new Comparator(){ - @Override - public int compare(String one, String two){ - return one.compareTo(two); - } - }); - - System.out.println("\nReaded training files: " + trainingIDs.size()); - System.out.println("Readed new files: " + newFiles.size()); - - System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n"); - - System.out.println("\nDuplicated files IDs: "); - for(int i = 0; i < duplicated.size(); i++){ - System.out.println(duplicated.get(i)); - } - - System.out.println("\n========================\n"); - - } - - - /** - * Reads and edits a list of XMLs files in a folder - * to remove XML and previous corpus tags, - * preparing the files to be concatenated. - * - * @param dir string with folder path - */ - - public void cleanXML(String dir){ - - //listing files on corpus dir - File sourceDir = new File(dir); - - File[] newXMLs = sourceDir.listFiles(new FilenameFilter(){ - @Override - public boolean accept(File dir, String name){ - return name.endsWith(".xml"); - } - }); - - System.out.println("... Files list loaded."); - - try{ - //for each file on the corpus dir - for (File xml : newXMLs){ - - try{ - BufferedReader reader = new BufferedReader(new FileReader(xml.getPath())); - - String line = null; - ArrayList allLines = new ArrayList(); - String content = null; - - while((line = reader.readLine()) != null){ - content = line; - - //cleaning XML markups - if(content.contains(getTag1())){ - content = content.replace(getTag1(), ""); - allLines.add(content); - } - if(content.contains(getTag2())){ - content = content.replace(getTag2(), ""); - allLines.add(content); - } - if(content.contains(getTag3())){ - content = content.replace(getTag3(), ""); - allLines.add(content); - } - - //cleaning previous corpus tags - if(content.contains(getOpenFile())){ - content = content.replace(getOpenFile(), ""); - allLines.add(content); - } - if(content.contains(getendFile())){ - content = content.replace(getendFile(), ""); - allLines.add(content); - } - - allLines.add(content); - } - - PrintWriter writer = new PrintWriter(xml.getPath()); - - for (String l : allLines){ - writer.println(l); - } - reader.close(); - writer.close(); - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - - } - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch(Exception e){ - throw new RuntimeException(e); - } - - System.out.println("... Files cleaned and saved."); - System.out.println("Ready for concatenation."); - System.out.println("\n========================\n"); - } - - /** - * Concatenates all XMLs in one folder or between two folders. - * @param sourceDir main directory with XML files. 
- * @param duplicDir second directory with duplicated XML files - * @param concatFile path name to saved concatenated corpus - */ - - public void concatenateXML(String sourceDir, String duplicDir, String concatFile){ - - final int BUFFER = 1024 << 8; - byte[] buffer = new byte[BUFFER]; - - //listing files on corpus dir - File srcDir = new File(sourceDir); - File[] srcXMLs = srcDir.listFiles(new FilenameFilter(){ - @Override - public boolean accept(File dir, String name){ - return name.endsWith(".xml"); - } - }); - - File dupDir = new File(duplicDir); - File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){ - @Override - public boolean accept(File dir, String name) { - return name.endsWith(".xml"); - } - }); - - System.out.println("... Files list loaded."); - - //defining the output file (concatenated) - File newCorpus = new File(concatFile); - - try{ - OutputStream output = new BufferedOutputStream(new FileOutputStream(newCorpus)); - - - //for each file on the corpus dir - for (File xmls : srcXMLs){ - InputStream input = new FileInputStream(xmls); - int count; - - //if the file is not empty/finished - try{ - while((count = input.read(buffer)) >= 0){ - - //write it on the concatenated final file - output.write(buffer, 0, count); - } - }finally{ - input.close(); - } - } - - if(dupXMLs != null){ - for(File xmld : dupXMLs){ - InputStream input = new FileInputStream(xmld); - int count; - - //if the file is not empty/finished - try{ - while((count = input.read(buffer)) >= 0){ - - //write it on the concatenated final file - output.write(buffer, 0, count); - } - }finally{ - input.close(); - } - } - } - output.flush(); - output.close(); - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch(Exception e){ - throw new RuntimeException(e); - } - - System.out.println("... File concatenated and saved."); - System.out.println("Ready for corpus tagging."); - System.out.println("\n========================\n"); - } - - /** - * Inserts corpus tag on XML file - * - * @param pathToCorpus path to - * concatenated corpus - */ - - public void tagCorpus(String pathToCorpus){ - - //tagging as corpus - try{ - BufferedReader reader = new BufferedReader(new FileReader(pathToCorpus)); - - String line = null; - String edit = null; - List allLines = new ArrayList(); - - //adds tag at beggining of corpus - allLines.add(getOpenFile()); - - while((line = reader.readLine()) != null){ - - allLines.add(line); - } - //adds tag at the end of corpus - allLines.add(getendFile()); - - System.out.println("... Corpus loaded and tagged."); - //re-writting the file - PrintWriter writer = new PrintWriter(pathToCorpus); - - for (String l : allLines){ - writer.println(l); - } - reader.close(); - writer.close(); - - System.out.println("... 
File saved as tagged corpus."); - } - catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch(IOException e){ - e.printStackTrace(); - } - } - - public String getTag1() { - return tag1; - } - - public void setTag1(String tag1) { - this.tag1 = tag1; - } - - public String getTag2() { - return tag2; - } - - public void setTag2(String tag2) { - this.tag2 = tag2; - } - - public String getTag3() { - return tag3; - } - - public void setTag3(String tag3) { - this.tag3 = tag3; - } - - -} - - diff --git a/src/analyse/Extractor.java b/src/analyse/Extractor.java deleted file mode 100644 index 8e91951..0000000 --- a/src/analyse/Extractor.java +++ /dev/null @@ -1,443 +0,0 @@ -/* - * The MIT License (MIT) - -Copyright (c) 2014 - -Hayda Almeida -Marie-Jean Meurs - -Concordia University -Tsang Lab - - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package analyse; - -import java.io.BufferedWriter; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.io.UnsupportedEncodingException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Iterator; -/** - * Implements common tools to FeatureExtractor - * and NgramExtractor classes that are used to - * extract features from doc instances - * - * @author halmeida - * - */ -public class Extractor { - - //String pathFile; - String id; - protected String endId; - String openFile; - String endFile; - String openAbst; - String closeAbst; - String abstractLabel; - String openEC; - String closeEC; - String classTag; - String openTitle; - String closeTitle; - String openJournal; - String closeJournal; - String copyR; - String closeCopyR; - - /** - * Replaces special characters to clean - * text for tokenizing. 
- *
- * @param str text to be cleaned
- * @return string with cleaned text
- */
-	public String removeSpecialChar(String str){
-		str = str.replace("}", "");
-		str = str.replace("{", "");
-		str = str.replace("]", "");
-		str = str.replace("[", "");
-		str = str.replace("#", "");
-		str = str.replace("*", "");
-		str = str.replace(">", "");
-		str = str.replace("&apos", "");
-		str = str.replace("%", "");
-		str = str.replace("&quot;", "");
-		str = str.replace("&", "");
-		str = str.replace("=", "");
-		str = str.replace("?", "");
-		str = str.replace(";", "");
-		str = str.replace(":", "");
-		str = str.replace(",", "");
-		str = str.replace(".", "");
-		str = str.replace(")", "");
-		str = str.replace("(", "");
-		str = str.replace("\t\t", "\t");
-		//losing ngrams because of hifen between names
-		str = str.replace("-", " ");
-		str = str.replace("  ", " ");
-
-		return str;
-	}
-
-	/**
-	 * Handles external tags (and multiple abstract
-	 * text tags) present in a single paper
-	 * @param str abstract content
-	 * @return string without external tags
-	 */
-
-	public String processAbstract(String str){
-		str = str.replace("  ", " ");
-		String[] remove = str.split("");
-		StringBuilder sb = new StringBuilder();
-		String temp = "";
-		String abstrac = "";
-
-		for(int i = 0; i < remove.length; i++){
-			temp = temp + remove[i];
-
-			if(temp.contains("<")){
-				do{
-					i++;
-				}while(!(remove[i]).equalsIgnoreCase(">"));
-			}
-			if(temp.contains("Copyright ")){
-				temp = "";
-				do{
-					i++;
-					//an exception here can mean that a copyright information
-					//tag content did not ended with a period
-				}while(!(remove[i]).equalsIgnoreCase("."));
-			}
-			else sb.append(remove[i]);
-		}
-
-		abstrac = sb.toString();
-		abstrac = removeAbstractTags(abstrac);
-
-		return abstrac;
-	}
-
-
-	/**
-	 * Removes specific tags encountered on Abstract texts.
-	 * This is used to clean the abstract text before
-	 * processing the feature count on the model.
-	 * @param str
-	 * @return
-	 */
-
-	public String removeAbstractTags(String str){
-		//this order of removing tags matters to
-		//exclude the first tag from the abstracts.
-
-		str = str.replace("", "");
-		str = str.replace("", "");
-		str = str.replace("", "");
-		str = str.replace("copyright", "");
-		str = str.replace("", "");
-		str = str.replace("", "");
-		str = str.replace("", "");
-		str = str.replace("", "");
-
-		return str;
-	}
-
-
-	/**
-	 * Removes the markup annotations of a
-	 * text field, and keeps its content
-	 *
-	 * @param str text containing markups
-	 * @return string with cleaned text
-	 */
-	public String removeTags(String str) {
-		String[] remove = str.split("");
-		StringBuilder sb = new StringBuilder();
-
-		for(int i = 0; i < remove.length; i++){
-
-			if(remove[i].equalsIgnoreCase("<")){
-				do{
-					i++;
-				}
-				while(!(remove[i].equalsIgnoreCase(">")));
-			}
-			else sb.append(remove[i]);
-		}
-
-		return sb.toString();
-	}
-
-
-	/**
-	 * Displays the keys and values of the
-	 * maps created.
- * - * @param hash HashMap containing list, - * values, counts - */ - public void displayList(HashMap hash){ - Iterator itr = hash.keySet().iterator(); - int sum = 0; - while(itr.hasNext()){ - Object str = itr.next(); - System.out.println("key: "+str+"\t value: "+hash.get(str)); - } - } - - - /** - * Exports hashmap of values extracted - * from dataset to external file - * - * @param location folder, file name and file extension - * @param list values to be exported - */ - public void exportFile(String location, HashMap list){ - - String SEPARATOR = "\t"; - StringBuffer line = new StringBuffer(); - Iterator itr = list.keySet().iterator(); - - try{ - BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(location), "UTF-8")); - - while(itr.hasNext()){ - Object str = itr.next(); - if(str != null){ - line.append(str).append(SEPARATOR).append(list.get(str)); - if(line.toString().contains("=")) - line.replace(line.indexOf("="), line.indexOf("=")+1,SEPARATOR); - //handling specificities from title content extraction - if(line.toString().contains(",")) - line.replace(line.indexOf(","), line.indexOf(",")+1,SEPARATOR); - } - if(itr.hasNext()){ - //writer.newLine(); - line.append("\n"); - } - writer.write(removeSpecialChar(line.toString())); - line.replace(0, line.length(), ""); - //writer.newLine(); - } - writer.flush(); - writer.close(); - } - catch(UnsupportedEncodingException e){ - e.printStackTrace(); - } - catch(FileNotFoundException e){ - e.printStackTrace(); - } - catch(IOException e){ - e.printStackTrace(); - } - - - //} - } - - - /** - * Exports list of values extracted - * from dataset to a string variable - * - * @param list list of values to be exported - * @return string containing values on list - * @deprecated - */ - public String exportContent(HashMap list){ - String SEPARATOR = "\t"; - Iterator itr = list.keySet().iterator(); - StringBuffer export = new StringBuffer(); - //try{ - while(itr.hasNext()){ - String str = itr.next(); - if(str != null){ - export.append(str).append(SEPARATOR).append(list.get(str)); - - if(export.toString().contains("=")) - export.replace(export.indexOf("="), export.indexOf("=")+1,SEPARATOR); - } - - if(itr.hasNext()){ - export.append("\n"); - } - } - /*} - catch(Exception e){ - - }*/ - - return removeSpecialChar(export.toString()); - } - - - /** - * Exports list of values extracted - * from dataset to external file - * - * @param location folder, file name and file extension - * @param list list of values to be exported - * - */ - public void exportList(String location, ArrayList list){ - - String SEPARATOR = "\n"; - StringBuffer line = new StringBuffer(); - - try{ - BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(location), "UTF-8")); - - for(int i = 0; i < list.size(); i++){ - String str = list.get(i); - if(str != null){ - line.append(str).append(SEPARATOR); - } - } - writer.write(removeSpecialChar(line.toString())); - - writer.flush(); - writer.close(); - } - catch(UnsupportedEncodingException e){ - e.printStackTrace(); - } - catch(FileNotFoundException e){ - e.printStackTrace(); - } - catch(IOException e){ - e.printStackTrace(); - } - - } - - - public void initialize(){ - - } - - - /** - * Accessors and mutators methods - * for Extractor variables. 
- * @return - */ - /*public String getPathFile() { - return pathFile; - } - public void setPathFile(String pathFile) { - this.pathFile = pathFile; - }*/ - public String getid() { - return id; - } - public void setid(String id) { - this.id = id; - } - public String getendId() { - return endId; - } - public void setendId(String endId) { - this.endId = endId; - } - public String getOpenFile() { - return openFile; - } - public void setOpenFile(String openFile) { - this.openFile = openFile; - } - public String getendFile() { - return endFile; - } - public void setendFile(String endFile) { - this.endFile = endFile; - } - public String getopenAbst() { - return openAbst; - } - public void setopenAbst(String openAbst) { - this.openAbst = openAbst; - } - public String getcloseAbst() { - return closeAbst; - } - public void setcloseAbst(String closeAbst) { - this.closeAbst = closeAbst; - } - public String getOpenEC() { - return openEC; - } - public void setOpenEC(String openEC) { - this.openEC = openEC; - } - public String getCloseEC() { - return closeEC; - } - public void setCloseEC(String closeEC) { - this.closeEC = closeEC; - } - public String getAbstractLabel() { - return abstractLabel; - } - public void setAbstractLabel(String abstractLabel) { - this.abstractLabel = abstractLabel; - } - public String getClassTag() { - return classTag; - } - public void setClassTag(String classTag) { - this.classTag = classTag; - } - public String getOpenTitle() { - return openTitle; - } - public void setOpenTitle(String titleTag) { - this.openTitle = titleTag; - } - public String getCloseTitle() { - return closeTitle; - } - public void setCloseTitle(String closeTitle) { - this.closeTitle = closeTitle; - } - public String getOpenJournal() { - return openJournal; - } - public void setOpenJournal(String openJournal) { - this.openJournal = openJournal; - } - public String getCloseJournal() { - return closeJournal; - } - public void setCloseJournal(String closeJournal) { - this.closeJournal = closeJournal; - } - -} \ No newline at end of file diff --git a/src/analyse/FeatureExtractor.java b/src/analyse/FeatureExtractor.java deleted file mode 100644 index 4d66d4f..0000000 --- a/src/analyse/FeatureExtractor.java +++ /dev/null @@ -1,526 +0,0 @@ -/* - * The MIT License (MIT) - -Copyright (c) 2014 - -Hayda Almeida -Marie-Jean Meurs - -Concordia University -Tsang Lab - - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package analyse; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; - -import org.apache.commons.lang3.StringUtils; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; - -import configure.PathConstants; -import filter.NaiveFilter; - - -/** - * This class extracts and parses domain - * annotation features from doc instances - * - * @author Hayda Almeida - * @since 2014 - * - */ - -public class FeatureExtractor extends Extractor{ - - public FeatureExtractor(){ - - this.id = "PMID"; - this.openAbst = "AbstractText"; - this.abstractLabel = "AbstractText "; - this.openEC = "RegistryNumber"; - this.classTag = "TRIAGE"; - this.openJournal = "Title"; - this.openTitle = "ArticleTitle"; - } - - - public static void main(String[] args) { - - PathConstants pathVars = new PathConstants(); - boolean verbose = false; - - String AnCorpus = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAIN_DIR + pathVars.TRAINING_FILE; - FeatureExtractor fextrac = new FeatureExtractor(); - NaiveFilter featFilter = new NaiveFilter(); - - //store all features, type and count - HashMap,Integer> abstract_count = new HashMap,Integer>(); - //store all features, type and classification - HashMap,String> abstract_type = new HashMap,String>(); - - //store title features, type and classification - HashMap,String> title_type = new HashMap,String>(); - //store title features, type and count - HashMap, Integer> title_count = new HashMap, Integer>(); - //store title features, whole journal title content and classification - HashMap,String> title_content = new HashMap,String>(); - //store title content and EC numbers - ArrayList ec_numbers = new ArrayList(); - - //store ID, class and features - HashMap PMIDs = new HashMap(); - - fextrac.initialize(); - int jTitle = 0; - - try - { - String line = null; - String features = null; - //Loading file - File input = new File(AnCorpus); - //Jsoup parse - Document doc = Jsoup.parse(input, "UTF-8"); - - Elements corpus = doc.body().getElementsByTag("pubmedarticleset"); - - //Fetching elements - - for(Element paper : corpus ){ - - //Fetching elements - Elements journalTitle = paper.getElementsByTag(fextrac.getOpenJournal()); - Elements title = paper.getElementsByTag(fextrac.getOpenTitle()); - Elements abstractC = paper.getElementsByTag(fextrac.getopenAbst()); - Elements ECnumber = paper.getElementsByTag(fextrac.getOpenEC()); - Elements classDoc = paper.getElementsByTag(fextrac.getClassTag()); - - String journal = ""; - String docID = ""; - String label = ""; - ArrayList tempList = new ArrayList(); - StringBuffer sb = new StringBuffer(); - - //fetching the paper ID - - //for all items in a paper, retrieve only PMIDs - for(Element e : paper.select(fextrac.getid())){ - //only consider the ID if the parent is medline citation - if(e.parentNode().nodeName().contains("medline")){ - docID = e.text(); - } - } - //fetch the doc label as well - if(classDoc.hasText()){ - label = classDoc.text(); - } - - PMIDs.put(docID, label); - - if(journalTitle.hasText()){ - - jTitle++; - journal = journalTitle.toString(); - journal = fextrac.removeSpecialChar(journal); - journal = fextrac.removeTags(journal); - } - - String title_annotation = ""; - if(title.hasText()){ - title_annotation = title.toString(); - title_annotation = 
fextrac.removeSpecialChar(title_annotation); - - tempList.addAll(fextrac.annotations(title_annotation, title_count, title_type, featFilter, pathVars)); - fextrac.addContent(title_annotation, journal, title_content, featFilter); - } - - String abstrac = ""; - if(abstractC.hasText()){ - abstrac = abstractC.toString(); - abstrac = fextrac.removeSpecialChar(abstrac); - abstrac = fextrac.removeAbstractTags(abstrac); - - tempList.addAll(fextrac.annotations(abstrac, abstract_count, abstract_type, featFilter, pathVars)); - } - - String ecnum = ""; - if(ECnumber.hasText()){ - for(Element number : ECnumber){ - ecnum = number.toString(); - if(ecnum.contains("EC")){ - ecnum = fextrac.removeSpecialChar(ecnum); - ecnum = fextrac.removeTags(ecnum); - ec_numbers.add(features); - } - } - } - - String triage = ""; - if(classDoc.hasText()){ - triage = classDoc.toString(); - triage = fextrac.removeSpecialChar(triage); - triage = fextrac.removeTags(triage); - - fextrac.addClass(triage, abstract_type); - fextrac.addClass(triage, title_type); - fextrac.addClass(triage, title_content); - } - -// for(int i = 0; i < tempList.size(); i++){ -// sb.append(tempList.get(i) + "-"); -// } -// -// PMIDs.put(docIDLabel, sb.toString()); - } - - } - - catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch (IOException e) { - e.printStackTrace(); - } - - if(verbose){ - //print list of extracted features - System.out.println("\n===========TITLE==ANNOTATIONS============="); - fextrac.displayList(title_count); - fextrac.displayList(title_type); - fextrac.displayList(title_content); - System.out.println("\n========ABSTRACT==ANNOTATIONS============="); - fextrac.displayList(abstract_count); - fextrac.displayList(abstract_type); - } - - //filter features by occurence - featFilter.considerAnnotationOccurence(abstract_count, pathVars); - featFilter.considerAnnotationOccurence(title_count, pathVars); - - System.out.println("\n===========FEATURE==EXPORT==============="); - fextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.DOC_IDS, PMIDs); - System.out.println("..."+ PMIDs.size()+" document IDs listed."); - fextrac.exportList(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.ECNUM_FEATURES, ec_numbers); - System.out.println("..."+ ec_numbers.size()+" EC numbers saved."); - fextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.ANNOTATION_FEATURES, abstract_count); - System.out.println("..."+ abstract_count.size()+" unique Abstract annotations saved."); - fextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.TITLE_FEATURES, title_count); - System.out.println("..."+ title_count.size() +" unique Title annotations saved."); - fextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.JOURNAL_TITLE_FEATURES, title_content); - System.out.println("..."+jTitle+" Journal titles saved."); - System.out.println("\n=========================================\n"); - - } - - /** - * Identifies the classification on doc - * - * @param clas text containing classification (after char removal) - * @return classification of doc - */ - private String getClassif(String clas) { - - //parsing the not edited text into HTML using Jsoup - Document doc = Jsoup.parseBodyFragment(clas); - //saving the text as an Jsoup element, with a main tag (the HTML body), - //attributes and child nodes (TRIAGE tags) - Element text = doc.body(); - - Elements classification = text.getElementsByTag("TRIAGE"); - - return classification.text(); - } - - /** - * Inserts the classification - * on the list of 
features - * - * @param class information to insert on list - * @param list list of features used - */ - private void addClass(String element, HashMap, String> list){ - //going over list to insert - //classif on document instances - Iterator>it = list.keySet().iterator(); - - while(it.hasNext()){ - Map str = it.next(); - - if(list.get(str).contains(element)){ - //if(list.get(str).contains("positive") || list.get(str).contains("negative")){ - - } - else list.put(str, element); - } - } - - - /** - * Extract the annotations from a determined section - * of the document and add them to the specified lists. - * - * @param annotation cleaned and splitted line with annotation - * @param count list that holds annotation, its type and its count - * @param type list that holds annotation, its type and its classification - */ - private ArrayList annotations(String annot, HashMap, Integer> count, HashMap,String> type, NaiveFilter filter, PathConstants pathVars) { - HashMap features = loadAnnotationEntities(); - PathConstants pathVar = new PathConstants(); - NgramExtractor nextrac = new NgramExtractor(); - ArrayList content = new ArrayList(); - - //parsing the not edited text into HTML using Jsoup - Document doc = Jsoup.parseBodyFragment(annot); - //saving the text as an Jsoup element, with a main tag (the HTML body), - //attributes and child nodes (annotation tags) - Element annotations = doc.body(); - - //iterating over list of entities - for(Map.Entry value : features.entrySet()){ - - String an_type = value.getKey(); - String an_level = value.getValue(); - - //for each entity, find the annotations on abstract - Elements annots = annotations.getElementsByTag(an_type); - - //for each annotation found, - for(Element an : annots){ - - //grabbing annotation content: - //if the annotation is made on the sentence level: - if(an_level.contains("sentence")){ - - //checkingh if sentence contains inner annotations - if(an.childNodeSize() != 0){ - - //going over list of inner annotations - for(Element child : an.children()){ - - //if child is sentence (sentence inside of sentence), - //then add annotations as ngrams on this - if(features.get(child.nodeName()).contains("sentence")) { - content.addAll(nextrac.nGrams(child.text(), filter, pathVar)); - insertAnnotation(content, an.nodeName(), count, type, pathVars); - } - //adding annotations on sentence as they are - no ngrams on this - else { - content.add(child.text()); - insertAnnotation(content, an.nodeName(), count, type, pathVars); - } - } - - //removing inner annotations from sentence, they are already added - Element tempAnnot = an.clone(); - tempAnnot.children().remove(); - - //splitting content in ngrams to whats left on the sentence - content.addAll(nextrac.nGrams(tempAnnot.text(), filter, pathVar)); - insertAnnotation(content, an.nodeName(), count, type, pathVars); - } - - } - else { - //keeping original annotation content for other cases - content.add(an.text()); - insertAnnotation(content, an.nodeName(), count, type, pathVars); - } - } - - } - return content; - - } - - - /** - * Insert annotation (or ngram list of annotation) - * on lists, used on @annotations method - * @param content content of annotation - * @param an_type type extracted from text (entity) - * @param count list of annotations and their count - * @param type list of annotations and their type - */ - private void insertAnnotation(ArrayList content, String an_type, HashMap, Integer> count, HashMap,String> type, PathConstants pathVars){ - - //iterating over list of annotations - 
for(int i = 0; i < content.size(); i++){ - - if(content.get(i).length() >= Integer.parseInt(pathVars.FEATURE_MIN_LENGTH)){ - - //creating the list key as: content - type mapping - Map an_content = new HashMap(); - an_content.put(content.get(i), an_type); - - //for each annotation (or ngram on annotation) - //insert content and related type - if(count.containsKey(an_content)){ - try{ - int cnt = count.get(an_content); - count.put(an_content, cnt+1); - - }catch(Exception e){ - count.put(an_content, 1); - } - } - else{ - count.put(an_content, 1); - } - //populating list of feature_an_types, with: - //feature--an_type--class - type.put(an_content, ""); - } - } - - content.clear(); - - } - - - /** - * Inserts the text (e.g.title) content into - * a list of features (e.g.title features) - * - * @param annot text with the annotations to be handled - * @param wContent whole field to be added on the list of features - * @param list features used - * - */ - private void addContent(String annot, String wContent, HashMap,String> list, NaiveFilter filter) { - - HashMap features = loadAnnotationEntities(); - ArrayList content = new ArrayList(); - NgramExtractor nextrac = new NgramExtractor(); - PathConstants pathVar = new PathConstants(); - - //parsing not edited text into HTML using Jsoup - Document doc = Jsoup.parseBodyFragment(annot); - //saving the text as an Jsoup element, with a main tag (the HTML body), - //attributes and child nodes (annotation tags) - Element annotations = doc.body(); - - //iterating over annotation types - for(Map.Entry value : features.entrySet()){ - - String an_type = value.getKey(); - String an_level = value.getValue(); - - //for each annotation type, find all related annotations on the abstract - Elements annots = annotations.getElementsByTag(an_type); - - //for each annotation type, - for(Element an : annots){ - - //grab annotation content - if(an_level.contains("sentence")) - //splitting in ngrams for sentence level annotations - content = nextrac.nGrams(an.text(), filter, pathVar); - else - //keeping original annotation for other cases - content.add(an.text()); - - //iterating over list of annotations - for(int i = 0; i < content.size(); i++){ - - Map an_content = new HashMap(); - an_content.put(content.get(i), wContent); - - //populating list of feature_an_types, with: - //feature--an_type--class - list.put(an_content, ""); - } - content.clear(); - } - } - } - - - /** - * Loads list of entities from external file - * - * @param str list of entities - * @param pathVar constants from - * @return - */ - public HashMap loadAnnotationEntities(){ - - String pathEntities = "entities.txt"; - HashMap values = new HashMap(); - - try{ - BufferedReader reader = new BufferedReader(new FileReader(pathEntities)); - - String line = null; - - while((line = reader.readLine()) != null){ - - String[] value = StringUtils.split(line, " "); - values.put(value[0].toLowerCase(), value[1].toLowerCase()); - } - - reader.close(); - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } - //String[] entities = values.toArray(new String[values.size()]); - - return values; - } - - - /** - * Handles the content of annotations; when - * there is multiple elements, they are - * concatenated after extracted - * - * @param str list of annotation elements - * @return single string with all elements - */ - public String contentToString(String[] str){ - String cont = ""; - - for(int i = 0; i < str.length; i++){ - if(cont.contentEquals("")){ - cont 
= cont + str[i]; - } - else cont = cont+" "+ str[i]; - - } - - return cont; - } - - - -} diff --git a/src/analyse/NgramExtractor.java b/src/analyse/NgramExtractor.java deleted file mode 100644 index c101c25..0000000 --- a/src/analyse/NgramExtractor.java +++ /dev/null @@ -1,340 +0,0 @@ -/* - * The MIT License (MIT) - -Copyright (c) 2014 - -Hayda Almeida -Marie-Jean Meurs - -Concordia University -Tsang Lab - - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package analyse; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Map; - -import org.apache.commons.lang3.StringUtils; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; - -import configure.PathConstants; -import filter.NaiveFilter; - -/** - * This class extracts and parses n-grams - * from XML doc instances. 
- * - * @author Hayda Almeida - * @since 2014 - * - */ - -public class NgramExtractor extends Extractor{ - - public NgramExtractor(){ - - //defining relevant paper text fields - this.id = "PMID"; - this.openJournal = "Title"; - this.openAbst = "AbstractText"; - this.openEC = "RegistryNumber"; - this.classTag = "TRIAGE"; - this.openTitle = "ArticleTitle"; - } - - - public static void main(String[] args) { - - PathConstants pathVars = new PathConstants(); - boolean verbose = false; - - String AnCorpus = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAIN_DIR +pathVars.TRAINING_FILE; - NgramExtractor nextrac = new NgramExtractor(); - NaiveFilter featFilter = new NaiveFilter(); - - //store abstract ngrams and its count - HashMap ngram_count = new HashMap(); - //store abstract ngrams and doc ID - HashMap ngram_ID = new HashMap(); - //store title ngrams and its count - HashMap ngram_title_count = new HashMap(); - //store title ngrams, count and "relevance(TBD)" - HashMap,Integer> ngram_title = new HashMap,Integer>(); - //store ID and label of documents - HashMap PMIDs = new HashMap(); - - nextrac.initialize(); - - try - { - - //Loading file - File input = new File(AnCorpus); - //Jsoup parse - Document doc = Jsoup.parse(input, "UTF-8"); - - Elements corpus = doc.body().getElementsByTag("pubmedarticleset"); - - //Fetching elements - - for(Element paper : corpus ){ - - Elements journalTitle = paper.getElementsByTag(nextrac.getOpenJournal()); - Elements title = paper.getElementsByTag(nextrac.getOpenTitle()); - Elements abstractC = paper.getElementsByTag(nextrac.getopenAbst()); - Elements ECnumber = paper.getElementsByTag(nextrac.getOpenEC()); - Elements classDoc = paper.getElementsByTag(nextrac.getClassTag()); - - String journal = ""; - String docID = ""; - String label = ""; - int jTitle = 0; - - //fetching the paper ID - - //for all items in a paper, retrieve only PMIDs - for(Element e : paper.select(nextrac.getid())){ - //only consider the ID if the parent is medline citation - if(e.parentNode().nodeName().contains("medline")){ - docID = e.text(); - } - } - //fetch the doc label as well - if(classDoc.hasText()){ - label = classDoc.text(); - } - - PMIDs.put(docID, label); - - //Extracting the Journal Title - if(journalTitle.hasText()){ - jTitle++; - journal = journalTitle.toString(); - journal = nextrac.removeSpecialChar(journal); - journal = nextrac.removeTags(journal); - } - - String tit_content = ""; - //Extracting the Paper Title - if(title.hasText()){ - tit_content = title.toString(); - tit_content = nextrac.removeSpecialChar(tit_content); - tit_content = nextrac.removeTags(tit_content); - - ArrayList title_c = nextrac.nGrams(tit_content, featFilter, pathVars); - nextrac.addNGram(title_c, ngram_title_count, pathVars); - } - - String abstrac = ""; - //Extracting the Paper abstract - if(abstractC.hasText()){ - abstrac = abstractC.toString(); - abstrac = nextrac.removeTags(abstrac); - abstrac = nextrac.removeSpecialChar(abstrac); - abstrac = nextrac.removeAbstractTags(abstrac); - - ArrayList abstract_c = nextrac.nGrams(abstrac, featFilter, pathVars); - nextrac.addNGram(abstract_c, ngram_count, pathVars); - } - } - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } - - if(verbose){ - //print list of extracted n-grams - nextrac.displayList(PMIDs); - System.out.println("\n========ABSTRACT==NGRAMS============="); - nextrac.displayList(ngram_count); - nextrac.displayList(ngram_title); - 
System.out.println("\n===========TITLE==NGRAMS============="); - nextrac.displayList(ngram_title_count); - } - - //filter features by occurence - featFilter.considerNgramOccurence(ngram_count, pathVars); - featFilter.considerNgramOccurence(ngram_title_count, pathVars); - - System.out.println("\n===========NGRAMS==EXPORT===============\n"); - nextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.DOC_IDS, PMIDs); - System.out.println("..."+ PMIDs.size()+" document IDs listed."); - nextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.NGRAM_FEATURES, ngram_count); - System.out.println("..."+ ngram_count.size()+" unique Abstract ngrams saved."); - nextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.TITLE_NGRAMS, ngram_title_count); - System.out.println("... "+ ngram_title_count.size() +" unique Title ngrams saved."); - System.out.println("\n========================================\n"); - - } - - - /** - * Inserts ngrams into list of features - * with a mapping for ngram count - * @param str relation of ngrams extracted - * @param list_count mapping for ngram counts - * @param pathVars - */ - - private void addNGram(ArrayList str, HashMap list_count, PathConstants pathVars){ - - //iterating over ngram list - for(int i = 0; i < str.size(); i++){ - String currentNGram = str.get(i); - - //checking existence of current ngram on list mapping - if(list_count.containsKey(currentNGram)){ - //retrieve the amount of current ngrams on mapping - int count = list_count.get(currentNGram); - //insert the updated count of ngrams - list_count.put(currentNGram, count+1); - } - else { - //insert ngram on mapping list - if(currentNGram.length() >= Integer.parseInt(pathVars.FEATURE_MIN_LENGTH)){ - list_count.put(currentNGram, 1); - } - } - } - } - - /** - * Extracts n-grams from a given content field - * - * @param str text to extract ngrams - * @return list of extracted grams - */ - public ArrayList nGrams(String str, NaiveFilter filter, PathConstants pathVar){ - - //removing ASCII special characters - str = str.replace("/", ""); - str = str.replace("\\", ""); - //str = str.replace("\n", " "); - str = str.replaceAll("\\s+"," "); - str = str.replace(" ", "-"); - - //Tokenizing the sentence - String[] words = StringUtils.split(str,"-"); - ArrayList ngramList = new ArrayList(); - - int ngram =Integer.parseInt(pathVar.NGRAM_SIZE); - - //Stop-words removal - if(Boolean.valueOf(pathVar.NGRAM_STOP)){ - words = StringUtils.split(filter.removeStopList(words, pathVar)," "); - } - - //extracting ngrams according to gram size (1, 2, 3) - for(int i=0; i < words.length - (ngram - 1); i++){ - switch(pathVar.NGRAM_SIZE){ - case "1": - ngramList.add(words[i].toLowerCase()); - break; - case "2": - ngramList.add(words[i].toLowerCase()+" "+words[i+1].toLowerCase()); - break; - case "3": - ngramList.add(words[i].toLowerCase()+" "+words[i+1].toLowerCase()+" "+words[i+2].toLowerCase()); - break; - } - } - - return ngramList; - } - -// /** -// * Removes stopwords from ngrams list -// * -// * @param str list of ngrams -// * @param constants -// * @return cleaned list of ngrams -// */ -// public String removeStopList(String[] str, PathConstants pathVar){ -// -// //stop-words file name -// String pathStop = "stopList.txt"; -// String[] stop = null; -// StringBuilder cleaned = new StringBuilder(); -// -// try{ -// -// BufferedReader reader = new BufferedReader(new FileReader(pathStop)); -// -// String line = null; -// //loading stop-words list -// while((line = reader.readLine()) != null){ 
-// stop = StringUtils.split(line,","); -// line = reader.readLine(); -// } -// -// reader.close(); -// -// }catch (FileNotFoundException e) { -// e.printStackTrace(); -// } catch (IOException e) { -// e.printStackTrace(); -// } -// -// //iteraing over text to be cleaned -// for(int i = 0; i < str.length; i++){ -// //iterating over stop-words list -// for(int j = 0; j < stop.length; j++){ -// -// //when stop-word is encountered, replace it -// if(str[i].equalsIgnoreCase(stop[j])){ -// str[i] = str[i].replace(str[i],"*"); -// } -// } -// //retrieve the text without stop-words replacements -// if(!(str[i].contentEquals("*"))){ -// cleaned.append(str[i]).append(" "); -// } -// } -// return cleaned.toString().replace(" ", " "); -// } - - - /** - * Displays the keys and values of the - * maps created with n-grams and counts. - * @param hash HashMap containing n-grams - */ - @Override - public void displayList(HashMap hash){ - super.displayList(hash); - //sum = sum + hash.get(str); - System.out.println("\n=======================================\n"); - System.out.println("Number of unique n-grams: "+hash.size()); - System.out.println("\n=======================================\n"); - } - - - -} diff --git a/src/arffmatrix/.gitignore b/src/arffmatrix/.gitignore deleted file mode 100644 index ec5761d..0000000 --- a/src/arffmatrix/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -/buildmodel.class -/buildtest.class diff --git a/src/arffmatrix/BuildModel.class b/src/arffmatrix/BuildModel.class deleted file mode 100644 index 0be977c..0000000 Binary files a/src/arffmatrix/BuildModel.class and /dev/null differ diff --git a/src/arffmatrix/BuildModel.java b/src/arffmatrix/BuildModel.java deleted file mode 100644 index f8d0fac..0000000 --- a/src/arffmatrix/BuildModel.java +++ /dev/null @@ -1,301 +0,0 @@ -/* - * The MIT License (MIT) - -Copyright (c) 2014 - -Hayda Almeida -Marie-Jean Meurs - -Concordia University -Tsang Lab - - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- -*** -* This class re-uses https://code.google.com/p/deft2013/source/browse/trunk/src/corpus/buildmodel.java -* The code authors: Eric Charton http://www.echarton.com twitter.com/ericcharton -* Marie-Jean Meurs http://mjmrsc.com/research/ twitter.com/mjmrsc -* -* This software is free to use, modify and redistribute under Creative Commons by-nc/3.0 License Term -* http://creativecommons.org/licenses/by-nc/3.0/ -*/ - -package arffmatrix; - -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.FileWriter; -import java.io.IOException; -import java.text.SimpleDateFormat; -import java.util.Date; -import analyse.Extractor; -import arffvector.CreateVector; -import configure.PathConstants; - -/** - * This class reads the corpus instances and uses - * the CreateVector class to generate a model file (ARFF) * - * - * @author Hayda Almeida, Marie-Jean Meurs - * @since 2014 - * - */ - -public class BuildModel { - - private static String configfile = null; - - public static void main(String[] args) { - - //----------------------------------- - // instantiate classes of constants - // and configuration file. - //----------------------------------- - - PathConstants pathVars; - - if (configfile == null){ - pathVars = new PathConstants(); - } - else{ - pathVars = new PathConstants(configfile); - } - - Extractor model = new Extractor(); - model.initialize(); - CreateVector vectorgenerator = new CreateVector(pathVars); - String attributes = vectorgenerator.informFeatures(pathVars); - System.out.println("Features loaded ..."); - - // name output ARFF files - String timeStamp = new SimpleDateFormat("yyyyMMdd_hh:mm").format(new Date()); - String arffFileName = "triage" + pathVars.EXP_TYPE + attributes +"_"+ timeStamp + ".arff"; - - try - { - //by default - String sortarffFileName = pathVars.HOME_DIR + pathVars.OUTPUT_MODEL + arffFileName; // default - - // create file - BufferedWriter out = new BufferedWriter(new FileWriter(sortarffFileName)); - - // load ARFF header and write it - String outHeaderArff = vectorgenerator.genArffHeader(pathVars,Integer.parseInt(pathVars.EXP_TYPE)); - //System.out.println(outHeaderArff); // verbose - out.write(outHeaderArff + "\n"); - - // reader for corpus - BufferedReader reader = null; - //train corpus - if(Integer.parseInt(pathVars.EXP_TYPE) == 0) - reader = new BufferedReader(new FileReader(pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAIN_DIR + pathVars.TRAINING_FILE)); - //test corpus - else if(Integer.parseInt(pathVars.EXP_TYPE) ==1) - reader = new BufferedReader(new FileReader(pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TEST_DIR + pathVars.TEST_FILE)); - - //-------------------------------------------- - // repeat until all lines have been read - // from the file - //-------------------------------------------- - String text = null; - String content = null; - - String abstracttext = ""; - String journaltitle = ""; - String title = ""; - String ecnumber = ""; - String classtriage = ""; - int hasText = 0; - int journaltitlecount = 0; - int abstracttitlecount = 0; - int abstracttextcount = 0; - int positivecount = 0; - int negativecount = 0; - - - while ((text = reader.readLine()) != null) { - - // detect a PubMed abstract - if (text.contains("")){ - - // Reinitialize journal title - journaltitle = ""; - - // Reinitialize abstract title - title = ""; - - // Reinitialize abstract text - abstracttext = ""; - - // Reinitialize hasText to false - hasText = 0; - - String 
pmid = text.replaceFirst(".*", ""); - pmid = pmid.replace("", ""); - System.out.println("PMID : " + pmid); - - // continue to read - content = reader.readLine(); - content = content.replaceAll("\t", ""); - content = content.replaceFirst("\\s+", ""); - - while ( ! content.contentEquals("") ) { - - if (content.contains("")){ - - journaltitlecount++; - - content = content.replace("<Title>", ""); - content = content.replace("", ""); - journaltitle = content; - System.out.println("Journal title : " + content); - } - - if (content.contains("")){ - - abstracttitlecount++; - - content = content.replace("", ""); - content = content.replace("", ""); - title = content; - System.out.println("Paper title : " + content); - } - - - if (content.contains("")){ - - abstracttextcount++; - hasText = 1; // use it to indicate if the abstract has some text or not - - content = content.replace("", ""); - - //checks if there are empty lines after AbstractText tag - //and keeps reading until finds the abstract content - while(content.isEmpty()){ - content = reader.readLine(); - } - abstracttext = abstracttext + content; - // clean - abstracttext = model.removeAbstractTags(abstracttext); - - - content = reader.readLine(); - // converting toLowerCase is not relevant in bio context - // because it introduces ambiguities (ie Gene name / Enzyme alias) - // abstracttext = abstracttext.toLowerCase(); - } - - if (content.contains("")){ - temp = temp + model.processAbstract(content); - } - else{ - do{ - temp = temp + model.processAbstract(content); - content = reader.readLine(); - }while(!(content.contains(""))); - } - - newAbs = newAbs + temp; - content = newAbs + ""; - - abstracttext = content; - abstracttext = model.removeAbstractTags(abstracttext); - - content = reader.readLine(); - - } - - if (content.contains("EC ")){ - content = content.replace("EC ", ""); - content = content.replace("", ""); - ecnumber = content; - } - - if (content.contains("")){ - - content = content.replace("", ""); - content = content.replace("", ""); - classtriage = content; - if(content.contains("positive")){ - positivecount++; - } - if(content.contains("negative")){ - negativecount++; - } - System.out.println("Triage classification : " + content); - } - - content = reader.readLine(); - content = content.replaceAll("\t", ""); - content = content.replaceFirst("\\s+", ""); - } - - System.out.println("Abstract : " + abstracttext.toString() + "\n\n"); - - // end of if: collect data and write ARFF - String Arffline = vectorgenerator.getArffLine(pmid, - journaltitle, - title, - abstracttext, - ecnumber, - classtriage, - Integer.parseInt(pathVars.EXP_TYPE) - ); - - Arffline = Arffline + "\n"; - // write line on disc - out.write(Arffline); - // out.write(id + " " + Arffline + "\n"); // - } - - } - - System.out.println( - "Abstracts processed: " + abstracttitlecount - + "\t with text content: " + abstracttextcount - + "\t from " + journaltitlecount + " journals" - + "\nTotal of: \n" + positivecount + " positive" - + "\t and " + negativecount + " negative documents"); - out.write("\n"); - out.close(); - - reader.close(); - - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } - - } - - - -} - - - diff --git a/src/arffvector/.gitignore b/src/arffvector/.gitignore deleted file mode 100644 index bdc0ba3..0000000 --- a/src/arffvector/.gitignore +++ /dev/null @@ -1,7 +0,0 @@ -/buildvector.class -/FeatureVector.class -/CreateVector.class -/CreateWeightedVector.class -/ArbitraryWeight.class 
-/CountsWeightedVector.class -/ArbitraryWeightedVector.class diff --git a/src/arffvector/CreateVector.java b/src/arffvector/CreateVector.java deleted file mode 100644 index b112ea5..0000000 --- a/src/arffvector/CreateVector.java +++ /dev/null @@ -1,893 +0,0 @@ -/* - * The MIT License (MIT) - -Copyright (c) 2014 - -Hayda Almeida -Marie-Jean Meurs - -Concordia University -Tsang Lab - - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -*** -* This class re-uses https://code.google.com/p/deft2013/source/browse/trunk/src/vector/buildvector.java -* The code authors: Eric Charton http://www.echarton.com twitter.com/ericcharton -* Marie-Jean Meurs http://mjmrsc.com/research/ twitter.com/mjmrsc -* -* This software is free to use, modify and redistribute under Creative Commons by-nc/3.0 License Term -* http://creativecommons.org/licenses/by-nc/3.0/ -*/ - - - - -package arffvector; - -import java.io.BufferedReader; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.IOException; -import java.util.ArrayList; -import org.apache.commons.lang3.StringUtils; -import configure.PathConstants; - -/** - * Uses the features extracted and the - * generated corpus to create a feature vector - * (a matrix representation of the corpus) - * - * @author Hayda Almeida, Marie-Jean Meurs - * @since 2014 - * - */ -public class CreateVector { - - ArrayList annotations = new ArrayList(); - ArrayList annotationsType = new ArrayList(); - ArrayList journalTitles = new ArrayList(); - ArrayList ecnumbers = new ArrayList(); - ArrayList titleGrams = new ArrayList(); - ArrayList titleAnnot = new ArrayList(); - ArrayList nGrams = new ArrayList(); - ArrayList docID = new ArrayList(); - - PathConstants pathVars = null; - - /** - * Constructor to load all features extracted - * from training files. These features will be - * used to generate the ARFF header and the - * ARFF vector lines. 
- * - * @param extVars Variables holding system paths - */ - - public CreateVector(PathConstants extVars) { - - pathVars = extVars; - - String pathJournalT = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.JOURNAL_TITLE_FEATURES; - try{ - String journalT = ""; - - //receiving journal title - BufferedReader reader = new BufferedReader(new FileReader(pathJournalT)); - int featcount = 0; - while (( journalT = reader.readLine()) != null) { - - if (Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)){ - - String[] features = StringUtils.split(journalT,"\n"); - - for(int i = 0; i < features.length; i++){ - - String[] featurename = StringUtils.split(features[i],"\t"); - - //checking for journal titles duplicates - if(featurename[1] != "" && !(journalTitles.contains(featurename[1]))){ - journalTitles.add(featurename[1]); - } - } - } - if ( featcount >= Integer.parseInt(pathVars.NB_PARAMS) && Integer.parseInt(pathVars.NB_PARAMS) != -1 ) { break;} - - } - reader.close(); - } - catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch (IOException e) { - e.printStackTrace(); - } - - String pathAnnotations = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.ANNOTATION_FEATURES; - String pathTitleAnnot = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.TITLE_FEATURES; - - try{ - String abstAnnot = ""; - String tAnnot = ""; - - //receiving abstract annotations (features) - BufferedReader reader = new BufferedReader(new FileReader(pathAnnotations)); - BufferedReader readerT = new BufferedReader(new FileReader(pathTitleAnnot)); - - int featcount = 0; - - while (( abstAnnot = reader.readLine()) != null) { - - if (Boolean.valueOf(pathVars.USE_ANNOTATION_FEATURE)){ - String[] features = StringUtils.split(abstAnnot,"\n"); - - for(int i = 0; i < features.length; i++){ - - String[] featurename = StringUtils.split(features[i],"\t"); - - //checking for duplicate abstract annotations - if(featurename[0] != "" && !(annotations.contains(featurename[0]))){ - annotations.add(featurename[0]); - } - } - } - if ( featcount >= Integer.parseInt(pathVars.NB_PARAMS) && Integer.parseInt(pathVars.NB_PARAMS) != -1 ) { break;} - } - - - if(!(Boolean.valueOf(pathVars.USE_TITLE_FEATURE))){ - while((tAnnot = readerT.readLine()) != null){ - - String[] features = StringUtils.split(tAnnot,"\n"); - - for(int i = 0; i < features.length; i++){ - - String[] featurename = StringUtils.split(features[i],"\t"); - - //checking for duplicate annotations - if(featurename[0] != "" && !(annotations.contains(featurename[0]))){ - annotations.add(featurename[0]); - } - } - - } - - } - - reader.close(); - readerT.close(); - } - catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch (IOException e) { - e.printStackTrace(); - } - - try{ - String abstAnType = ""; - - //receiving abstract annotation types - BufferedReader reader = new BufferedReader(new FileReader(pathAnnotations)); - int featcount = 0; - while (( abstAnType = reader.readLine()) != null) { - - if (Boolean.valueOf(pathVars.USE_ANNOTATION_TYPE)){ - - String[] features = StringUtils.split(abstAnType,"\n"); - - for(int i = 0; i < features.length; i++){ - - String[] featurename = StringUtils.split(features[i],"\t"); - - //checking for duplicate abstract annotation types - if(featurename[1] != "" && !(annotationsType.contains(featurename[1]))){ - annotationsType.add(featurename[1]); - } - - } - } - if ( featcount >= Integer.parseInt(pathVars.NB_PARAMS) && Integer.parseInt(pathVars.NB_PARAMS) != -1 ) { break;} - - } - reader.close(); - } - catch 
(FileNotFoundException e) { - e.printStackTrace(); - } - catch (IOException e) { - e.printStackTrace(); - } - - - - try{ - String titAnnot = ""; - - //receiving title annotations (features) - BufferedReader reader = new BufferedReader(new FileReader(pathTitleAnnot)); - // int featcount = 0; - while (( titAnnot = reader.readLine()) != null) { - - if(Boolean.valueOf(pathVars.USE_TITLE_FEATURE)){ - - //String titAnnot = FeatureExtractor.getTitCount(); - - String[] features = StringUtils.split(titAnnot,"\n"); - - for(int i = 0; i < features.length; i++){ - String[] featurename = StringUtils.split(features[i],"\t"); - - //checking for duplicate title annotations - if(!(titleAnnot.contains(featurename[0]))){ - titleAnnot.add(featurename[0]); - } - } - } - } - reader.close(); - } - catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch (IOException e) { - e.printStackTrace(); - } - - - String pathECNumFeatures = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.ECNUM_FEATURES; - - try{ - String ECNum = ""; - - //receiving EC numbers (features) - BufferedReader reader = new BufferedReader(new FileReader(pathECNumFeatures)); - // int featcount = 0; - while ((ECNum = reader.readLine()) != null) { - - if(Boolean.valueOf(pathVars.USE_ECNUM_FEATURE)){ - - //String titAnnot = FeatureExtractor.getTitCount(); - - String[] features = StringUtils.split(ECNum,"\n"); - - for(int i = 0; i < features.length; i++){ - String[] featurename = StringUtils.split(features[i],"\t"); - - //checking for duplicate EC numbers - if(!(ecnumbers.contains(featurename[0]))){ - ecnumbers.add(featurename[0]); - } - } - } - } - reader.close(); - } - catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch (IOException e) { - e.printStackTrace(); - } - - - String pathTitleGrams = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.TITLE_NGRAMS; - - - try{ - String titCont = ""; - // String grams = ""; - - //receiving title ngrams - BufferedReader reader = new BufferedReader(new FileReader(pathTitleGrams)); - - int featcount = 0; - while (( titCont = reader.readLine()) != null) { - - if(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)){ - - String[] content = StringUtils.split(titCont,"\n"); - - for(int i = 0; i < content.length; i++){ - String[] featurename = StringUtils.split(content[i],"\t"); - - //check for duplicate title ngrams - if(!(titleGrams.contains(featurename[0]))){ - titleGrams.add(featurename[0]); - } - } - } - } - - reader.close(); - - } - catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch (IOException e) { - e.printStackTrace(); - } - - String pathNgrams = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.NGRAM_FEATURES; - try{ - String grams = ""; - String tgrams = ""; - - //receiving ngrams - BufferedReader reader = new BufferedReader(new FileReader(pathNgrams)); - BufferedReader readerT = new BufferedReader(new FileReader(pathTitleGrams)); - - // int featcount = 0; - while (( grams = reader.readLine()) != null) { - - if(Boolean.valueOf(pathVars.USE_NGRAM_FEATURE)){ - - String[] features = StringUtils.split(grams,"\n"); - - for(int i = 0; i < features.length; i++){ - String[] featurename = StringUtils.split(features[i],"\t"); - - //check for duplicate abstract ngrams - if(!(nGrams.contains(featurename[0]))){ - nGrams.add(featurename[0]); - } - } - } - - } - - //if not using title grams separately, - // then insert title grams with abstract grams. 
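// For reference: each loader in this constructor splits a feature-file line on
// "\t" and keeps a single column (column 0 for ngrams and annotations, column 1
// for journal titles and annotation types). A plausible NGRAM_FEATURES line,
// with illustrative values rather than real corpus data, would therefore be
//
// enzyme activity	12
//
// feature string first, corpus count after the tab.
// The fallback below folds the title ngrams into the abstract ngram list when
// title ngrams are not used as a separate feature set: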
- if (!(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS))){ - while (( tgrams = readerT.readLine()) != null) { - - String[] features = StringUtils.split(tgrams,"\n"); - - for(int i = 0; i < features.length; i++){ - String[] featurename = StringUtils.split(features[i],"\t"); - - //check for duplicate ngrams - if(!(nGrams.contains(featurename[0]))){ - nGrams.add(featurename[0]); - } - } - } - } - - reader.close(); - readerT.close(); - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch (IOException e) { - e.printStackTrace(); - } - } - - /** - * Gathers the list of features, according to - * experimental configurations. The list of - * features will be written on the ARFF header. - * - * @param pathVars Variables holding system paths - * @param exp experiment type: train or test - * @return a String containing the ARFF header - */ - - public String genArffHeader(PathConstants pathVars, int exp){ - - StringBuilder headerArff = new StringBuilder(); - - switch(exp){ - case 0: - headerArff.append("% Weka training file - mycoCLAP triage - CSFG 2015\n\n"); - break; - case 1: - headerArff.append("% Weka test file - mycoCLAP triage - CSFG 2015\n\n"); - break; - } - - headerArff.append("@RELATION triage\n"); - - if(Boolean.valueOf(pathVars.USE_TEXT_SIZE)){ - // writing the list of text sizes - headerArff.append("@ATTRIBUTE sizeoftitle \tREAL \t\t%size of title\n"); - headerArff.append("@ATTRIBUTE sizeoftext \tREAL \t\t%size of text\n"); - } - - if(Boolean.valueOf(pathVars.USE_DOC_ID)){ - //writing the docIDs - headerArff.append("@ATTRIBUTE docID \tREAL \t\t%PMID of paper\n"); - - } - - if(Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)){ - for(int i = 0; i < journalTitles.size(); i++){ - // writing list of journal titles - String feature = journalTitles.get(i); - String namefeature = feature.replaceAll("\\s", "-"); - namefeature = namefeature.replaceAll("[,:=+']", "-"); - namefeature = namefeature.replaceAll("<|>", ""); - String ref = "journalTitle" + String.valueOf(i) + namefeature; - - headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); - - } - } - - if (Boolean.valueOf(pathVars.USE_ANNOTATION_FEATURE)){ - // writing list of annotation features - for(int i = 0; i < annotations.size(); i++){ - - String feature = annotations.get(i); - String namefeature = feature.replaceAll("\\s", "-"); - namefeature = namefeature.replaceAll("[,:=+']", "-"); - namefeature = namefeature.replaceAll("<|>", ""); - String ref = "annotation" + String.valueOf(i) + namefeature; - - headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); - - } - } - - if(Boolean.valueOf(pathVars.USE_ANNOTATION_TYPE)){ - // writing list of annotation entities - for(int i = 0; i < annotationsType.size(); i++){ - String feature = annotationsType.get(i); - String namefeature = feature.replaceAll("\\s", "-"); - namefeature = namefeature.replaceAll("[,:=+']", "-"); - namefeature = namefeature.replaceAll("<|>", ""); - String ref = "annotationType" + String.valueOf(i) + namefeature; - - headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); - - } - } - - if(Boolean.valueOf(pathVars.USE_TITLE_FEATURE)){ - // write list of title features - for( int i = 0; i < titleAnnot.size(); i++){ - - String feature = titleAnnot.get(i); - String namefeature = feature.replaceAll("\\s", "-"); - namefeature = namefeature.replaceAll("[,:=+']", "-"); - namefeature = namefeature.replaceAll("<|>", ""); - String ref = "titleAnnot" + String.valueOf(i) + namefeature; - - headerArff.append("@ATTRIBUTE 
" + ref + "\tREAL \t\t%" + feature + "\n"); - - } - - } - - if(Boolean.valueOf(pathVars.USE_ECNUM_FEATURE)){ - // writing list of EC numbers - for(int i = 0; i < ecnumbers.size(); i++){ - String feature = ecnumbers.get(i); - String namefeature = feature.replaceAll("\\s", "-"); - namefeature = namefeature.replaceAll("[,:=+']", "-"); - namefeature = namefeature.replaceAll("<|>", ""); - String ref = "ECnumber" + String.valueOf(i) + namefeature; - - headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); - } - } - - if (Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)){ - // writing list of ngrams on titles - for( int i = 0; i < titleGrams.size(); i++){ - - String feature = titleGrams.get(i); - String namefeature = feature.replaceAll("\\s", "-"); - namefeature = namefeature.replaceAll("[,:=+']", "-"); - namefeature = namefeature.replaceAll("<|>", ""); - String ref = "titleNgram" + String.valueOf(i) + namefeature; - - headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); - - } - } - - if (Boolean.valueOf(pathVars.USE_NGRAM_FEATURE)){ - // write list of ngrams - for(int i = 0; i < nGrams.size(); i++){ - - String feature = nGrams.get(i); - String namefeature = feature.replaceAll("\\s", "-"); - namefeature = namefeature.replaceAll("[,:=+']", "-"); - String ref = "Ngram" + String.valueOf(i) + namefeature; - - headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); - - } - } - - // writing the dataset classes - headerArff.append("@ATTRIBUTE class {positive, negative}\n"); - headerArff.append("@DATA\n"); - - return headerArff.toString(); - } - - /** - * Iterates over the list of features and - * counts number of features containing - * on a given document. - * - * @param jTitle title of journal - * @param title title of paper - * @param text abstract content - * @param ecnum paper EC numbers - * @param classTriage triage classification: positive or negative - * @param exp experiment type: train or test - * @return String holding counts for all features found in a document - */ - - public String getArffLine(String paperID, String jTitle, String title, String text, String ecnum, String classTriage, int exp){ - //String vectorArff = ""; - StringBuilder vectorArff = new StringBuilder(); - - paperID = removeSpecialChar(paperID.toLowerCase()); - text = removeSpecialChar(text.toLowerCase()); - title = removeSpecialChar(title.toLowerCase()); - jTitle = removeSpecialChar(jTitle.toLowerCase()); - ecnum = removeSpecialChar(ecnum); - - int emptyabs = 0; - - // fill title and text sizes (number of words) - // annotation markups do not matter because - // they do not introduce blank spaces hence - // they do not modify the number of words found - if (Boolean.valueOf(pathVars.USE_TEXT_SIZE)){ - - String[] titleGrams = StringUtils.split(title," "); - int titlesize = titleGrams.length; - - String[] abstractcontent = StringUtils.split(text," "); - int abstractsize = abstractcontent.length; - - if(abstractsize == 1){ - emptyabs++; - } - - vectorArff.append(titlesize).append(",").append(abstractsize).append(","); - } - - //fill ID of documents - if(Boolean.valueOf(pathVars.USE_DOC_ID)){ - - if(paperID.length()>0){ - vectorArff.append(paperID).append(","); - } - else{ - vectorArff.append("0,"); - } - } - - //fill values of journal titles - if(Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)){ - - for(int i = 0; i < journalTitles.size(); i++){ - String jfeat = ""; - int jfeatcount = 0; - jfeat = journalTitles.get(i).replaceFirst(" ", ""); - - if(jTitle.contains(jfeat)){ 
- jfeatcount = StringUtils.countMatches(jTitle, jfeat); - vectorArff.append(jfeatcount).append(","); - } - else{ - vectorArff.append("0,"); - } - } - } - - // fill values of annotation types taken into account - // either only the abstract or abstract and title - // adds on vector the count of occurrences - if (Boolean.valueOf(pathVars.USE_ANNOTATION_FEATURE)){ - - for(int i = 0; i < annotations.size(); i++){ - String anfeat = ""; - int anfeatcount = 0; - anfeat = annotations.get(i).replaceFirst(" ", "").toLowerCase(); - - //in case the text has current annotation - if (text.contains(anfeat)){ - //check the count of the annotation - if((Boolean.valueOf(pathVars.USE_TITLE_FEATURE))){ - anfeatcount = StringUtils.countMatches(text, anfeat); - } - //adding title annot count to annotations - else if (!(Boolean.valueOf(pathVars.USE_TITLE_FEATURE))){ - anfeatcount = StringUtils.countMatches(text, anfeat); - //in case title has annotation, add to count - if(title.contains(anfeat)){ - anfeatcount = anfeatcount + StringUtils.countMatches(title, anfeat); - } - } - vectorArff.append(anfeatcount).append(","); - } - //handles the case that only the title (but not abstract) has current annotation - else if((!(Boolean.valueOf(pathVars.USE_TITLE_FEATURE)))){ - if(title.contains(anfeat)){ - anfeatcount = StringUtils.countMatches(title, anfeat); - } - vectorArff.append(anfeatcount).append(","); - } - else{ - vectorArff.append("0,"); - } - } - } - - //fill values of abstract annotation types - if(Boolean.valueOf(pathVars.USE_ANNOTATION_TYPE)){ - - for(int i = 0; i < annotationsType.size(); i++){ - String antype = ""; - int antypecount = 0; - antype = annotationsType.get(i).replaceFirst(" ", "").toLowerCase(); - - if (text.contains(antype)){ - //divided by 2 to match occurance - //(count considers open and close tags) - antypecount = (StringUtils.countMatches(text, antype))/2; - vectorArff.append(antypecount).append(","); - } - else{ - vectorArff.append("0,"); - } - } - - } - - //fill values of title annotations - if (Boolean.valueOf(pathVars.USE_TITLE_FEATURE)){ - - for( int i =0; i < titleAnnot.size(); i++){ - String titfeat = ""; - int titfeatcount = 0; - titfeat = titleAnnot.get(i).replaceFirst(" ", "").toLowerCase(); - - if (title.contains(titfeat)){ - titfeatcount = StringUtils.countMatches(title, titfeat); - vectorArff.append(titfeatcount).append(","); - } - else{ - vectorArff.append("0,"); - } - } - } - - if(Boolean.valueOf(pathVars.USE_ECNUM_FEATURE)){ - - for(int i = 0; i < ecnumbers.size(); i++){ - String ecfeat = ""; - int ecnumcount = 0; - ecfeat = ecnumbers.get(i); - - if(ecnum.contains(ecfeat)){ - ecnumcount = StringUtils.countMatches(ecnum, ecfeat); - vectorArff.append(ecnumcount).append(","); - } - else{ - vectorArff.append("0,"); - } - } - } - - // fill only values of title ngrams - if(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)){ - - String cleanTitle = removeTags(title.toLowerCase()); - - for( int i =0; i < titleGrams.size(); i++){ - String titgram = ""; - int titgramcount = 0; - titgram = titleGrams.get(i).toLowerCase(); - - //in case the title has current ngram - if (cleanTitle.contains(titgram)){ - //check the count of the ngram - titgramcount = StringUtils.countMatches(cleanTitle, titgram); - - //adding weight to current ngram count - if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)){ - titgramcount = applyWeight(titgramcount, Integer.parseInt(pathVars.WEIGHT)); - } - vectorArff.append(titgramcount).append(","); - } - else{ - vectorArff.append("0,"); - } - } - } - - // fill values of 
ngrams
- if (Boolean.valueOf(pathVars.USE_NGRAM_FEATURE)){
- String cleanText = removeTags(text.toLowerCase());
- String cleanTitle = removeTags(title.toLowerCase());
-
- for( int i = 0; i < nGrams.size(); i++){
- String ngramfeat = "";
- int ngramcount = 0;
- ngramfeat = nGrams.get(i).toLowerCase();
-
- //in case the text has current ngram
- if (cleanText.contains(ngramfeat)){
- //check the count of the ngram
- if(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)){
- ngramcount = StringUtils.countMatches(cleanText, ngramfeat);
-
- //adding weight to current ngram count
- if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)){
- ngramcount = applyWeight(ngramcount, Integer.parseInt(pathVars.WEIGHT));
- }
- }
- //checking if title ngrams should be added to the count
- else if(!(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS))){
- ngramcount = StringUtils.countMatches(cleanText, ngramfeat);
-
- //in case title has ngram, add to count
- if(cleanTitle.contains(ngramfeat)){
- ngramcount += StringUtils.countMatches(cleanTitle, ngramfeat);
- }
-
- //adding weight to current ngram count
- if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)){
- ngramcount = applyWeight(ngramcount, Integer.parseInt(pathVars.WEIGHT));
- }
- }
-
- vectorArff.append(ngramcount).append(",");
- }
- //handles the case that only the title (but not abstract) has current ngram
- else if (!(cleanText.contains(ngramfeat))){
- //in case only the title has the ngram, add to count
- if(cleanTitle.contains(ngramfeat)){
- ngramcount = StringUtils.countMatches(cleanTitle, ngramfeat);
-
- //adding weight to ngram count
- if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)){
- ngramcount = applyWeight(ngramcount, Integer.parseInt(pathVars.WEIGHT));
- }
- }
- vectorArff.append(ngramcount).append(",");
- }
- else{
- vectorArff.append("0,");
- }
- }
- }
-
-
- //if(exp == 0){
- if (classTriage.contains("positive")){
- vectorArff.append("positive");
- //vectorArff.append("?");
- }
- else {
- vectorArff.append("negative");
- //vectorArff.append("?");
- }
- //}
-
- /*else if (exp == 1){
- vectorArff.append("?");
- } */
-
- return vectorArff.toString();
- }
-
- /**
- * Cleans a given String from special characters
- *
- * @param str String to be cleaned
- * @return String without special characters
- */
-
- public String removeSpecialChar(String str){
- str = str.replace("}", "");
- str = str.replace("{", "");
- str = str.replace("]", "");
- str = str.replace("[", "");
- str = str.replace("#", "");
- str = str.replace("*", "");
- str = str.replace(">", "");
- str = str.replace("&quot;", "");
- str = str.replace("&apos", "");
- str = str.replace("%", "");
- str = str.replace("/", "");
- str = str.replace("\\", "");
- str = str.replace("&", "");
- str = str.replace("=", "");
- str = str.replace("?", "");
- str = str.replace(",", "");
- str = str.replace(":", "");
- str = str.replace(";", "");
- str = str.replace(".", "");
- str = str.replace(")", "");
- str = str.replace("(", "");
- str = str.replace("\t\t", "\t");
- str = str.replace("-", "");
- str = str.replace(" ", "");
-
- return str;
- }
-
- /**
- * Strips annotation markup from a given String
- *
- * @param str String to be cleaned
- * @return String without tags
- */
- public String removeTags(String str){
- String[] remove = StringUtils.split(str,"");
- StringBuilder sb = new StringBuilder();
-
- for(int i = 0; i < remove.length; i++){
-
- if(remove[i].equalsIgnoreCase("<")){
- do{
- i++;
- }
- while(!(remove[i].equalsIgnoreCase(">")));
- }
- else sb.append(remove[i]);
- }
-
- return sb.toString();
- }
-
- public int applyWeight(int count, int weight){
-
- if(weight > 0){
- count = count * weight;
- }
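// For example, with USE_WEIGHTED_NGRAM=true and WEIGHT=3 in the config, a raw
// ngram count of 2 is scored as 6 in the ARFF line; a weight of 0 or less
// leaves the count unchanged (illustrative values, not project defaults).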
- return count; - } - - - public String informFeatures(PathConstants pathVars){ - String value = ""; - if(Boolean.valueOf(pathVars.USE_ANNOTATION_FEATURE)) - value = value + "_annotations"; - if(Boolean.valueOf(pathVars.USE_ANNOTATION_TYPE)) - value = value + "_types"; - if(Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)) - value = value + "_journal"; - if(Boolean.valueOf(pathVars.USE_TITLE_FEATURE) || Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)) - value = value + "_title"; - if(Boolean.valueOf(pathVars.USE_ECNUM_FEATURE)) - value = value + "_ecnum"; - if(Boolean.valueOf(pathVars.USE_NGRAM_FEATURE)) - value = value + "_ngrams_size"+ pathVars.NGRAM_SIZE; - if(Boolean.valueOf(pathVars.USE_NGRAM_FEATURE) && Boolean.valueOf(pathVars.NGRAM_STOP)) - value = value + "_stopwords"; - if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)) - value = value + "_weight"+ pathVars.WEIGHT; - - return value; - } - - -} diff --git a/src/classifier/.gitignore b/src/classifier/.gitignore deleted file mode 100644 index b92cc15..0000000 --- a/src/classifier/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -/test.class -/train.class -/Trainer.class diff --git a/src/classifier/Trainer.java b/src/classifier/Trainer.java deleted file mode 100644 index 4ec0da2..0000000 --- a/src/classifier/Trainer.java +++ /dev/null @@ -1,489 +0,0 @@ -/* - * The MIT License (MIT) - -Copyright (c) 2014 - -Hayda Almeida -Marie-Jean Meurs - -Concordia University -Tsang Lab - - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - - -package classifier; -import java.util.ArrayList; -import java.util.Random; -import weka.attributeSelection.LatentSemanticAnalysis; -import weka.attributeSelection.PrincipalComponents; -import weka.attributeSelection.GainRatioAttributeEval; -import weka.attributeSelection.CorrelationAttributeEval; -import weka.attributeSelection.Ranker; -import weka.classifiers.Classifier; -import weka.classifiers.CostMatrix; -import weka.classifiers.Evaluation; -import weka.classifiers.bayes.NaiveBayes; -import weka.classifiers.evaluation.NominalPrediction; -import weka.classifiers.evaluation.Prediction; -import weka.classifiers.evaluation.output.prediction.PlainText; -import weka.classifiers.functions.LibSVM; -import weka.classifiers.meta.AttributeSelectedClassifier; -import weka.classifiers.meta.CostSensitiveClassifier; -import weka.classifiers.meta.FilteredClassifier; -import weka.classifiers.trees.LMT; -import weka.core.Attribute; -import weka.core.Instances; -import weka.core.Range; -import weka.core.converters.ConverterUtils.DataSource; -import weka.filters.Filter; -import weka.filters.unsupervised.attribute.Remove; -import configure.PathConstants; -import filter.InformedFilter; - -/** - * Trains and tests a classifier, - * executes k-fold cross validation on train data - * and outputs the classification results. - * - * @author Hayda Almeida - * @since 2014 - * - */ - -public class Trainer { - - public static int SEED = 1; //the seed for randomizing the data - public static int FOLDS = 5; //the # of folds to generate - double[][] ranking; - String rank; - - boolean verbose = false; - - - /** - * @param args - * @throws Exception - */ - public static void main(String[] args) throws Exception { - - - String classifier= ""; - - for(int i = 0; i < args.length; i++){ - try{ - if(args[i].matches("-lmt")) - classifier = "lmt"; - if(args[i].matches("-svm")) - classifier = "svm"; - if(args[i].matches("-nb")) - classifier = "nb"; - } - catch(Exception e){ - System.out.println("A classifier must be given as argument. Use: \n" - + "-lmt -> a LMT classifier; \n " - + "-svm -> a SVM classifier; \n" - + "-nb -> a Naive Bayes classifier. "); - System.exit(0); - } - } - - PathConstants pathVars = new PathConstants(); - Trainer evaluator = new Trainer(); - InformedFilter filter = new InformedFilter(); - Classifier cls; - - //Creating classifier - if(classifier.contains("lmt")) - cls = (Classifier) new LMT(); - else if (classifier.contains("svm")) - cls = (Classifier) new LibSVM(); - else - cls = (Classifier) new NaiveBayes(); - - //Loading train data - DataSource sourceTrain = new DataSource(pathVars.HOME_DIR + pathVars.OUTPUT_MODEL + pathVars.TRAIN_DIR + pathVars.ARFF_TRAIN); - Instances trainData = sourceTrain.getDataSet(); - - //Flagging the class index on the training data - trainData.setClassIndex(trainData.numAttributes()-1); - System.out.println("Class index set on training data."); - - System.out.println("Training data loaded. Number of instances: " + trainData.numInstances() + "\n"); - - - //Loading test data - DataSource sourceTest = new DataSource(pathVars.HOME_DIR + pathVars.OUTPUT_MODEL + pathVars.TEST_DIR + pathVars.ARFF_TEST); - Instances testData = sourceTest.getDataSet(); - - //Flagging the class index on the training data - testData.setClassIndex(trainData.numAttributes()-1); - System.out.println("Class index set on testing data."); - - System.out.println("Test data loaded. 
Number of instances: " + testData.numInstances() + "\n"); - - - //filter the file IDs, consider the new training set - Instances filteredTrainData = evaluator.filteredIDs(trainData); - Instances filteredTestData = evaluator.filteredIDs(testData); - - if(Boolean.valueOf(pathVars.USE_ODDS_RATIO)){ - //Calculate OddsRatio for all instances - double[] OR = evaluator.loadFeatureFilter(filteredTrainData, filter, 1, Integer.parseInt(pathVars.OR_THRESHOLD)); - - //Apply Odds Ratio filtering in instances - filteredTrainData = evaluator.applyFilter(pathVars.OR_THRESHOLD, OR, filteredTrainData); - filteredTestData = evaluator.applyFilter(pathVars.OR_THRESHOLD, OR, filteredTestData); - } - - if(Boolean.valueOf(pathVars.USE_IDF)){ - //Calculate idf for all instances - double[] idf = evaluator.loadFeatureFilter(filteredTrainData, filter, 2, Integer.parseInt(pathVars.IDF_THRESHOLD)); - - //Apply idf filtering in instances - filteredTrainData = evaluator.applyFilter(pathVars.IDF_THRESHOLD, idf, filteredTrainData); - filteredTestData = evaluator.applyFilter(pathVars.IDF_THRESHOLD, idf, filteredTestData); - } - - //Training and testing classifier - evaluator.classify(filteredTrainData, filteredTestData, cls, testData); - - } - - /** - * Loads evaluation of attributes according - * to feature selection method provided. - * - * @param data data instances - * @param filter informed filter instance - * @param method identifier for selection method - * @return - */ - private double[] loadFeatureFilter(Instances data, InformedFilter filter, int method, int threshold){ - - double[] values = new double[data.numAttributes()]; - - switch(method){ - - case 1: - values = filter.oddsRatio(data, threshold); - break; - case 2: - values = filter.idf(data, threshold); - break; - } - - return values; - } - - /** - * Uses evaluation of features according to - * selection method to remove attributes from - * the dataset before training phase. 
- * - * @param threshold selection method threshold - * @param values evaluation of attributes according to method - * @param data dataset instances - * @return filtered dataset instances - * @throws Exception - */ - private Instances applyFilter(String threshold, double[] values, Instances data) throws Exception{ - int numberRemoved = 0; - - String indexRemove = ""; - - for(int i = 0; i < values.length; i++){ - if(values[i] == 0){ - - int ind = i+1; - - if(indexRemove.length()==0) indexRemove = ind + ""; - else indexRemove = indexRemove + "," + ind; - - numberRemoved++; - } - } - - try{ - indexRemove = indexRemove.substring(0, indexRemove.length()-1); - //if(verbose) - System.out.println("\n = = = = => Filter removed " + numberRemoved +" attributes: " + indexRemove.toString() ); - } - catch (Exception e){ - System.out.println("\n = = = = => Filter threshold did not remove any attribute."); - } - - Remove remove = new Remove(); - remove.setAttributeIndices(indexRemove); - remove.setInvertSelection(false); - remove.setInputFormat(data); - - Instances dataSubset = Filter.useFilter(data, remove); - return dataSubset; - } - - - /** - * Removes the ID attribute (index 1) - * from a given dataset - * - * @param data instances - * @return filtered dataset - * @throws Exception - */ - private Instances filteredIDs(Instances data) throws Exception { - Remove remove = new Remove(); - //setting index to be removed - remove.setAttributeIndices("1"); - remove.setInvertSelection(false); - remove.setInputFormat(data); - - Instances dataSubset = Filter.useFilter(data, remove); - return dataSubset; - } - - - /** - * Trains and tests a classifier when two separated - * datasets are provided. - * - * @param train training data to build classifier - * @param test test data to evaluate classifier - * @param classif type of classifier applied - * @throws Exception - */ - public void classify(Instances filteredTrain, Instances filteredTest, Classifier classif, Instances test) throws Exception{ - - StringBuffer sb = new StringBuffer(); - PlainText prediction = new PlainText(); - Range attributesToShow = null; - prediction.setBuffer(sb); - prediction.setHeader(test); - prediction.setOutputDistribution(true); - - classif.buildClassifier(filteredTrain); - - Evaluation evaluateClassifier = new Evaluation(filteredTrain); - evaluateClassifier.evaluateModel(classif, filteredTest, prediction, attributesToShow, true); - //evaluateClassifier.evaluateModel(classif, filteredTest); - - stats(evaluateClassifier, classif); - - ArrayList output = evaluateClassifier.predictions(); - - if(verbose){ - for(int i = 0; i < output.size(); i++){ - double act = output.get(i).actual(); - String actual; - if(act == 1.0) actual = "negative"; else actual = "positive"; - - double pred = output.get(i).predicted(); - String predicted; - if(pred == 1.0) predicted = "negative"; else predicted = "positive"; - - String value = test.instance(i).toString(0); - - System.out.println("PMID: "+ value + "\t" + - "Actual: " + actual + "\t" + - "Predicted: " + predicted - ); - } } - } - - - /** - * Outputs classifier results. 
- * - * @param eval Evaluation model built by a classifier - * @param classif type of classifier applied - * @throws Exception - */ - public void stats(Evaluation eval, Classifier classif) throws Exception{ - System.out.println("Number of attributes: " + eval.getHeader().numAttributes()); - System.out.println(eval.toSummaryString("\n======== RESULTS ========\n", false)); - System.out.println(eval.toClassDetailsString("\n\n======== Detailed accuracy by class ========\n")); - System.out.println(eval.toMatrixString("\n\n======== Confusion Matrix ========\n")); - } - - - //Training and testing costSensitive classifier - //evaluator.classify(trainData, testData, evaluator.classifySensitive(cls)); - -// /** -// * Trains and tests a classifier using a -// * provided Cost matrix -// * -// * @param classif type of classifier to be trained -// * @return CostSensitive classifier with costs and classifier -// * @throws Exception -// */ -// public CostSensitiveClassifier classifySensitive(Classifier classif) throws Exception{ -// CostSensitiveClassifier costSensitive = new CostSensitiveClassifier(); -// CostMatrix matrix = new CostMatrix(2); -// matrix.setElement(0, 1, 4); -// matrix.setElement(1, 0, 1); -// costSensitive.setClassifier(classif); -// costSensitive.setCostMatrix(matrix); -// -// return costSensitive; -// } - - //Executing k-fold cross validation on filtered classifiers - //evaluator.crossFold(trainData, PCAclassifier); - //evaluator.crossFold(trainData, LSAclassifier); - -// /** -// * Executes k-fold cross validation -// * on a given dataset -// * @param data training data provided -// * @param classif type of classifier usedsearch -// * @throws Exception -// */ -// public void crossFold(Instances data, Classifier classif) throws Exception{ -// -// Random random = new Random(SEED); //creating seed number generator -// Evaluation evaluateClassifier = new Evaluation(data); -// -// System.out.println("Classifier working...\n\n"); -// //Classifier should not be trained when cross-validation is executed. -// //because subsequent calls to buildClassifier method will return the same results always. -// evaluateClassifier.crossValidateModel(classif, data, FOLDS, random); -// -// stats(evaluateClassifier, classif); -// } - - - //Creating filtered classifiers - //AttributeSelectedClassifier PCAclassifier = evaluator.setPCAFilter(cls); - //AttributeSelectedClassifier LSAclassifier = evaluator.setLSAFilter(cls); - //AttributeSelectedClassifier GRclassifier = evaluator.setGRFilter(cls); - //AttributeSelectedClassifier Corrclassifier = evaluator.setCorrFilter(cls); - -// /** -// * Implements a Filtered GainRatio classifier, -// * using the ranker as a search method. -// * -// * @param classif type of classifier to be used -// * @return filtered classif with Correlation analysis -// */ -// public AttributeSelectedClassifier setGRFilter(Classifier classif){ -// AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); -// -// //Creating evaluator and search method -// GainRatioAttributeEval GR = new GainRatioAttributeEval(); -// Ranker rank = new Ranker(); -// //return the attributes with evaluation greater than 0 -// double threshold = 0.0; -// rank.setThreshold(threshold); -// -// //Setting GainRatio filtered classifier -// fClassif.setClassifier(classif); -// fClassif.setEvaluator(GR); -// fClassif.setSearch(rank); -// -// return fClassif; -// -// } -// -// /** -// * Implements a Filtered Correlation classifier, -// * using the ranker as a search method. 
-// * -// * @param classif type of classifier to be used -// * @return filtered classif with Correlation analysis -// */ -// public AttributeSelectedClassifier setCorrFilter(Classifier classif){ -// AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); -// -// //Creating evaluator and search method -// CorrelationAttributeEval Corr = new CorrelationAttributeEval(); -// Ranker rank = new Ranker(); -// -// //return the attributes with evaluation greater than 0 -// double threshold = 0.03; -// rank.setThreshold(threshold); -// -// //Setting GainRatio filtered classifier -// fClassif.setClassifier(classif); -// fClassif.setEvaluator(Corr); -// fClassif.setSearch(rank); -// -// return fClassif; -// -// } -// -// /** -// * Implements a Filtered PCA classifier, -// * using the ranker as a search method. -// * -// * @param classif type of classifier to be used -// * @return filtered classif with PCA analysis config -// */ -// public AttributeSelectedClassifier setPCAFilter(Classifier classif){ -// AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); -// -// //Creating evaluator and search method -// PrincipalComponents PCA = new PrincipalComponents(); -// PCA.setMaximumAttributeNames(-1); -// Ranker rank = new Ranker(); -// //return the attributes with evaluation greater than 0 -// rank.setThreshold(0); -// -// //Setting the PCA classifier configurations -// fClassif.setClassifier(classif); -// fClassif.setEvaluator(PCA); -// fClassif.setSearch(rank); -// -// return fClassif; -// } -// -// /** -// * Implements a Filtered LSA classifier, -// * using the ranker as a search method -// * @param classif -// * @return -// */ -// private AttributeSelectedClassifier setLSAFilter(Classifier classif) { -// AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); -// -// //Creating evaluator -// LatentSemanticAnalysis LSA = new LatentSemanticAnalysis(); -// LSA.setMaximumAttributeNames(-1); -// //value between 0 and 1 includes proportion of total latent variables -// //greater than 1 = exact # of variables to include; -// //less than or equal zero = include all; -// //default = 0.95 (proportional) -// double defaul = 0; -// LSA.setRank(defaul); -// //Creating search method -// Ranker rank = new Ranker(); -// rank.setThreshold(0); -// -// //Setting the LSA classifier configurations -// fClassif.setClassifier(classif); -// fClassif.setEvaluator(LSA); -// fClassif.setSearch(rank); -// -// return fClassif; -// } - - - -} diff --git a/src/configure/.gitignore b/src/configure/.gitignore deleted file mode 100644 index 26ecd44..0000000 --- a/src/configure/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -/DeprecatedVariables.class -/PathConstants.class diff --git a/src/configure/PathConstants.java b/src/configure/PathConstants.java deleted file mode 100644 index dab7b82..0000000 --- a/src/configure/PathConstants.java +++ /dev/null @@ -1,202 +0,0 @@ -/* - * The MIT License (MIT) - -Copyright (c) 2014 - -Hayda Almeida -Marie-Jean Meurs - -Concordia University -Tsang Lab - - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -The above copyright notice and this permission notice shall be 
included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*** -* This class re-uses https://code.google.com/p/semlinker/source/browse/trunk/src/configure/NistKBPConfiguration.java -* The code authors: Eric Charton http://www.echarton.com twitter.com/ericcharton -* Marie-Jean Meurs http://mjmrsc.com/research/ twitter.com/mjmrsc -* -* This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License -* as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. -*/ - -package configure; - -import java.io.BufferedReader; -import java.io.FileReader; -import java.io.IOException; -import java.util.HashMap; -import java.util.logging.Level; -import java.util.logging.Logger; - -/** - * - * Variables used by the software - * - * @author Marie-Jean Meurs - * @since 2013 - * - */ -public class PathConstants { - - /** - * Default constructor - */ - public PathConstants() { - initVars(); - } - - /** - * Constructor with custom parameter file. - * @param configfile - */ - public PathConstants(String configfile) { - CONFIG_FILE = configfile; - initVars(); - } - - - public static String CONFIG_FILE = "config.cfg"; - public HashMap CONFIG_MAP = new HashMap(); - - //Input files - public String HOME_DIR; - public String CORPUS_DIR; - public String SOURCE_DIR; - public String DUP_DIR; - public String POS_DIR; - public String NEG_DIR; - public String TRAIN_DIR; - public String TEST_DIR; - public String FEATURE_DIR; - public String OUTPUT_MODEL; - public String TRAINING_FILE; - public String TEST_FILE; - public String ARFF_TRAIN; - public String ARFF_TEST; - public String STOP_LIST; - - //Output files - public String JOURNAL_TITLE_FEATURES; - public String ECNUM_FEATURES; - public String ANNOTATION_FEATURES; - public String TITLE_FEATURES; - public String NGRAM_FEATURES; - public String TITLE_NGRAMS; - public String DOC_IDS; - - //Feature setup - public String USE_TEXT_SIZE; - public String USE_JOURNAL_TITLE_FEATURE; - public String USE_ECNUM_FEATURE; - public String FEATURE_MIN_FREQ; - public String FEATURE_MIN_LENGTH; - - //Feature setup - Annotations - public String USE_ANNOTATION_FEATURE; - public String USE_ANNOTATION_TYPE; - public String USE_TITLE_FEATURE; - public String USE_DOC_ID; - - //Feature setup - Ngrams - public String USE_NGRAM_FEATURE; - public String USE_TITLE_NGRAMS; - public String NGRAM_STOP; - public String NGRAM_SIZE; - public String USE_WEIGHTED_NGRAM; - public String WEIGHT; - - //Feature filtering - public String USE_ODDS_RATIO; - public String OR_THRESHOLD; - public String USE_IDF; - public String IDF_THRESHOLD; - - //Task setup - public String EXP_TYPE; - public String NB_PARAMS; - - - private void initVars() { - String text = null; - - try { - BufferedReader reader = new BufferedReader(new FileReader(CONFIG_FILE)); - while ((text = reader.readLine()) != null) { - if (! 
text.startsWith("#")) { - String label = text.split("=")[0]; - String value = text.split("=")[1]; - CONFIG_MAP.put(label, value); - } - } - reader.close(); - } catch (IOException ex) { - Logger.getLogger(PathConstants.class.getName()).log(Level.SEVERE, null, ex); - } - HOME_DIR = CONFIG_MAP.get("HOME_DIR"); - CORPUS_DIR = CONFIG_MAP.get("CORPUS_DIR"); - SOURCE_DIR = CONFIG_MAP.get("SOURCE_DIR"); - DUP_DIR = CONFIG_MAP.get("DUP_DIR"); - POS_DIR = CONFIG_MAP.get("POS_DIR"); - NEG_DIR = CONFIG_MAP.get("NEG_DIR"); - TRAIN_DIR = CONFIG_MAP.get("TRAIN_DIR"); - TEST_DIR = CONFIG_MAP.get("TEST_DIR"); - FEATURE_DIR = CONFIG_MAP.get("FEATURE_DIR"); - OUTPUT_MODEL = CONFIG_MAP.get("OUTPUT_MODEL"); - TRAINING_FILE = CONFIG_MAP.get("TRAINING_FILE"); - TEST_FILE = CONFIG_MAP.get("TEST_FILE"); - ARFF_TRAIN = CONFIG_MAP.get("ARFF_TRAIN"); - ARFF_TEST = CONFIG_MAP.get("ARFF_TEST"); - STOP_LIST = CONFIG_MAP.get("STOP_LIST"); - - JOURNAL_TITLE_FEATURES = CONFIG_MAP.get("JOURNAL_TITLE_FEATURES"); - ECNUM_FEATURES = CONFIG_MAP.get("ECNUM_FEATURES"); - ANNOTATION_FEATURES = CONFIG_MAP.get("ANNOTATION_FEATURES"); - TITLE_FEATURES = CONFIG_MAP.get("TITLE_FEATURES"); - NGRAM_FEATURES = CONFIG_MAP.get("NGRAM_FEATURES"); - TITLE_NGRAMS = CONFIG_MAP.get("TITLE_NGRAMS"); - DOC_IDS = CONFIG_MAP.get("DOC_IDS"); - - USE_TEXT_SIZE = CONFIG_MAP.get("USE_TEXT_SIZE"); - USE_JOURNAL_TITLE_FEATURE = CONFIG_MAP.get("USE_JOURNAL_TITLE_FEATURE"); - USE_ECNUM_FEATURE = CONFIG_MAP.get("USE_ECNUM_FEATURE"); - FEATURE_MIN_FREQ = CONFIG_MAP.get("FEATURE_MIN_FREQ"); - FEATURE_MIN_LENGTH = CONFIG_MAP.get("FEATURE_MIN_LENGTH"); - - USE_ANNOTATION_FEATURE = CONFIG_MAP.get("USE_ANNOTATION_FEATURE"); - USE_ANNOTATION_TYPE = CONFIG_MAP.get("USE_ANNOTATION_TYPE"); - USE_TITLE_FEATURE = CONFIG_MAP.get("USE_TITLE_FEATURE"); - USE_DOC_ID = CONFIG_MAP.get("USE_DOC_ID"); - - USE_NGRAM_FEATURE = CONFIG_MAP.get("USE_NGRAM_FEATURE"); - USE_TITLE_NGRAMS = CONFIG_MAP.get("USE_TITLE_NGRAMS"); - NGRAM_STOP = CONFIG_MAP.get("NGRAM_STOP"); - NGRAM_SIZE = CONFIG_MAP.get("NGRAM_SIZE"); - USE_WEIGHTED_NGRAM = CONFIG_MAP.get("USE_WEIGHTED_NGRAM"); - WEIGHT = CONFIG_MAP.get("WEIGHT"); - - USE_ODDS_RATIO = CONFIG_MAP.get("USE_ODDS_RATIO"); - OR_THRESHOLD = CONFIG_MAP.get("OR_THRESHOLD"); - USE_IDF = CONFIG_MAP.get("USE_IDF"); - IDF_THRESHOLD = CONFIG_MAP.get("IDF_THRESHOLD"); - - EXP_TYPE = CONFIG_MAP.get("EXP_TYPE"); - NB_PARAMS = CONFIG_MAP.get("NB_PARAMS"); - - } -} diff --git a/src/filter/.gitignore b/src/filter/.gitignore deleted file mode 100644 index 6b468b6..0000000 --- a/src/filter/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.class diff --git a/src/filter/InformedFilter.java b/src/filter/InformedFilter.java deleted file mode 100644 index 4b125db..0000000 --- a/src/filter/InformedFilter.java +++ /dev/null @@ -1,182 +0,0 @@ -package filter; - -import weka.core.Attribute; -import weka.core.Instances; - -/** - * This class implements informed feature selection - * methods, to be used as filters after vector - * generation and pre-model building - * - * @author Hayda Almeida - * @since 2015 - * - */ -public class InformedFilter { - - private boolean verbose = true; - - /** - * Calculates oddsRatio of each feature - * in a given set of Instances - * - * @param data set of instances, read from ARFF file - * @return oddsRatio for each attribute in the matrix - */ - public double[] oddsRatio(Instances data, int threshold){ - - double[] oddsRatio = new double[data.numAttributes()]; - - - for(int i = 0; i < data.numAttributes()-1; i++ ){ - - double OR = 0; - - 
Attribute current = data.attribute(i); - double pos_docs = 0, //number of documents in class C - pos_oc = 0, //number of times term t occured in class C - pos_term_docs = 0, //number of docs in class C that have term - pos_not_docs = 0, //number of docs in class C that do not have term - neg_term_docs = 0, //number of docs not in class C with term - neg_not_docs = 0, //number of docs not in class C nor with term - neg_docs = 0; //number of documents not in class C - - for(int j = 0; j < data.size(); j++){ - - double current_value = data.instance(j).value(current); - double current_class = data.instance(j).classValue(); - - //class is positive - if(current_class < 1){ - pos_docs = pos_docs + 1; - - //the feature occurred in the document - if(current_value > 0){ - pos_oc = pos_oc + current_value; - pos_term_docs = pos_term_docs +1; - } - //the feature did not occur in positive docs - else pos_not_docs = pos_not_docs + 1; - } - //class is negative - else{ - neg_docs = neg_docs+1; - - //the feature occurred in the document - if(current_value > 0){ - neg_term_docs = neg_term_docs +1; - } - //the feature did not occur in negative docs - else neg_not_docs = neg_not_docs + 1; - } - - } - - OR = ( ( (pos_term_docs / pos_docs) / (pos_not_docs/ pos_docs) ) / - ( (neg_term_docs / neg_docs) / (neg_not_docs / neg_docs) ) ); - - // OR = (pos_term_docs / pos_not_docs) / (neg_term_docs / neg_not_docs); - - - //99% confidence: 2.575 - //95% confidence: 1.96 - double confidenceLow = Math.exp(Math.log(OR) - (1.96 * Math.sqrt((1/pos_term_docs) + (1/pos_not_docs) + (1/neg_term_docs) + (1/neg_not_docs)))); - double confidenceHigh = Math.exp(Math.log(OR) + (1.96 * Math.sqrt((1/pos_term_docs) + (1/pos_not_docs) + (1/neg_term_docs) + (1/neg_not_docs)))); - - //checking if OR value is within the confidence interval - //and if it satisfies the threshold - if( ((OR <= confidenceHigh) && (OR >= confidenceLow) - && !(OR == threshold)) - //checking if the confidence interval holds the null hypothesis (i.e., spans 1.0) - && !(confidenceLow <=1 && confidenceHigh >=1)) - oddsRatio[i] = OR; - else - oddsRatio[i] = 0; - - if(verbose){ - System.out.println("Attribute: "+ data.attribute(i).toString() +"\t\t OddsRatio: " + oddsRatio[i] + - "\tConfidenceLow: " + confidenceLow + "\tConfidenceHigh: "+ confidenceHigh); - } - } - - return oddsRatio; - } - - /** - * Calculates the inverse document frequency - * for each attribute in the dataset. 
- * - * @param data instances - * @param threshold - * @return list of idfs for each attribute - */ - public double[] idf(Instances data, int threshold){ - - double[] idf = new double[data.numAttributes()]; - - for(int i = 0; i < data.numAttributes()-1; i++ ){ - - double idf_at = 0; - double idf_at2 = 0; - - Attribute current = data.attribute(i); - double pos_docs = 0, //number of documents in class C - pos_term_docs = 0, //number of docs in class C that have term - neg_term_docs = 0, //number of docs not in class C with term - neg_docs = 0; //number of documents not in class C - - for(int j = 0; j < data.size(); j++){ - - double current_value = data.instance(j).value(current); - double current_class = data.instance(j).classValue(); - - //class is positive - if(current_class < 1){ - pos_docs = pos_docs + 1; - - //the feature occurred in the document - if(current_value > 0){ - pos_term_docs = pos_term_docs +1; - } - } - else{ - //class is negative - neg_docs = neg_docs+1; - - //the feature occurred in the document - if(current_value > 0){ - neg_term_docs = neg_term_docs +1; - } - } - } - -// double idf_pos = Math.log((pos_docs)/(pos_term_docs)); -// double idf_neg = Math.log((neg_docs)/(neg_term_docs)); - - //check if the idf in the "positive" collection - //is greater than the idf in the "negative" collection -// if (idf_pos > idf_neg) -// idf_at = idf_pos; -// -// else idf_at = 0; - - idf_at = Math.log((pos_docs + neg_docs)/(pos_term_docs + neg_term_docs)); - - if(idf_at <= threshold) - idf[i] = 0; - else - idf[i] = idf_at; - } - - if(verbose){ - for(int i = 0; i < idf.length; i++){ - if(idf[i]>0) - System.out.println("Attribute: "+ data.attribute(i).toString()+ "\t\t\t IDF: " + idf[i]); - } - } - - return idf; - } - - -} diff --git a/src/filter/NaiveFilter.java b/src/filter/NaiveFilter.java deleted file mode 100644 index db8a32e..0000000 --- a/src/filter/NaiveFilter.java +++ /dev/null @@ -1,117 +0,0 @@ -package filter; - -import java.io.BufferedReader; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.IOException; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; -import org.apache.commons.lang3.StringUtils; -import configure.PathConstants; - -/** - * - * This class implements naive feature filtering methods - * to be used by the extractor processes pre-vector building - * - * @author Hayda Almeida - * @since 2015 - * - */ -public class NaiveFilter { - - private boolean verbose = true; - - /** - * Removes from feature list all features with - * frequency not statistically relevant (2 or less) - * @param list to be cleaned - */ - public void considerAnnotationOccurence(HashMap,Integer> list, PathConstants vars){ - //going over the list of annotations and removing the - //features with occurance lower than specified. 
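- 		//e.g. with FEATURE_MIN_FREQ=3, an annotation feature observed only
- 		//twice across the corpus is dropped from the map before vectors are built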
- - Iterator> iterator = list.keySet().iterator(); - - while(iterator.hasNext()){ - Map key = iterator.next(); - int valor = list.get(key).intValue(); - - if(valor < Integer.parseInt(vars.FEATURE_MIN_FREQ)){ - iterator.remove(); - } - } - } - - /** - * Removes from feature list all features with - * frequency not statistically relevant (2 or less) - * @param list to be cleaned - */ - public void considerNgramOccurence(HashMap list, PathConstants vars){ - //going over the list of annotations and removing the - //statistically not significant features - frequency less than 2 - Iterator iterator = list.values().iterator(); - - while(iterator.hasNext()){ - Integer key = iterator.next(); - - if(key < Integer.parseInt(vars.FEATURE_MIN_FREQ)){ - iterator.remove(); - } - } - } - - /** - * Removes stopwords from ngrams list - * - * @param str list of ngrams - * @param constants - * @return cleaned list of ngrams - */ - public String removeStopList(String[] str, PathConstants pathVar){ - - //stop-words file name - String pathStop = "stopList.txt"; - String[] stop = null; - StringBuilder cleaned = new StringBuilder(); - - try{ - - BufferedReader reader = new BufferedReader(new FileReader(pathStop)); - - String line = null; - //loading stop-words list - while((line = reader.readLine()) != null){ - stop = StringUtils.split(line,","); - line = reader.readLine(); - } - - reader.close(); - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } - - //iteraing over text to be cleaned - for(int i = 0; i < str.length; i++){ - //iterating over stop-words list - for(int j = 0; j < stop.length; j++){ - - //when stop-word is encountered, replace it - if(str[i].equalsIgnoreCase(stop[j])){ - str[i] = str[i].replace(str[i],"*"); - } - } - //retrieve the text without stop-words replacements - if(!(str[i].contentEquals("*"))){ - cleaned.append(str[i]).append(" "); - } - } - return cleaned.toString().replace(" ", " "); - } - -} diff --git a/src/preprocessing/.gitignore b/src/preprocessing/.gitignore deleted file mode 100644 index 6b468b6..0000000 --- a/src/preprocessing/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.class diff --git a/src/preprocessing/ConcatXML.java b/src/preprocessing/ConcatXML.java deleted file mode 100644 index 89e255f..0000000 --- a/src/preprocessing/ConcatXML.java +++ /dev/null @@ -1,717 +0,0 @@ -/* - * The MIT License (MIT) - -Copyright (c) 2014 - -Hayda Almeida -Marie-Jean Meurs - -Concordia University -Tsang Lab - - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - - -package preprocessing; - -import java.io.BufferedOutputStream; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.FileReader; -import java.io.FilenameFilter; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.io.PrintWriter; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.nio.file.StandardCopyOption; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.Date; -import java.util.List; - -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; - -import configure.PathConstants; - -/** - * Generates a corpus from raw XML doc instances, - * so that features can be extracted from it - * - * @author Hayda Almeida - * @since 2014 - * - */ -public class ConcatXML{ - - private String tag1; - private String tag2; - private String tag3; - private String tag4; - private String id; - private String corpusTag; - private String corpusTagC; - - - public ConcatXML(){ - - this.setId("PMID"); - this.setTag1("(?s)<.*?xml.*?>"); - this.setTag2("(?s)<.*?!DOCTYPE.*?>"); - this.setTag3("(?s)<.*?corpus.*?>"); - this.seTag4("(?s)<.*?/corpus.*?>"); - this.setCorpusTag(""); - this.setCorpusTag(""); - } - - - - public static void main(String[] args) throws Exception { - - PathConstants pathVars = new PathConstants(); - - String xmlDir = ""; - if(Integer.parseInt(pathVars.EXP_TYPE)== 1) - xmlDir = "test"; - else xmlDir = "train"; - - String sourceDir = "", duplicatesDir = ""; - - Boolean dc = false, df = false, cl = false, cc = false; - - for(int i = 0; i < args.length; i++){ - try{ - if(args[i].matches("-dc")) dc = true; - if(args[i].matches("-df")) df = true; - if(args[i].matches("-cl")) cl = true; - if(args[i].matches("-cc")) cc = true; - } - catch(Exception e){ - System.out.println("Use: \n" - + "-tr -> train, -ts -> test; \n " - + "-dc -> check duplicates in corpus vs. 
folder; \n " - + "-df -> check duplicates in two folders; \n" - + "-cl -> clean a source folder; \n" - + "-cc -> concatenate files in a folder "); - System.exit(0); - }; - } - - String timeStamp = new SimpleDateFormat("yyyyMMdd_hh:mm").format(new Date()); - String trainCorpusPath = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAINING_FILE; - - sourceDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + xmlDir; - duplicatesDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.DUP_DIR; - - String concatCorpus = pathVars.HOME_DIR + pathVars.CORPUS_DIR +"triagecorpus_"+ xmlDir +"_"+timeStamp+".xml"; - String tagCorpus = concatCorpus; - - ConcatXML concat = new ConcatXML(); - - //================= Checking for duplicates =====================// - if(dc) concat.checkDupCorpus(trainCorpusPath, sourceDir); - if(df) concat.checkDupFolder(sourceDir, duplicatesDir); - - //================== Creating corpus ==========================// - if(cl){ - concat.cleanXML(sourceDir); - if(duplicatesDir.length()>1) - concat.cleanXML(duplicatesDir); - } - if(cc){ - concat.concatenateXML(sourceDir, "", concatCorpus); - concat.tagCorpus(tagCorpus); - } - } - - /** - * Returns the ID of a XML jsoup document - * @param doc a XML doc parsed by jsoup - * @return ID string - * @throws IOException - */ - public String returnID(Document doc) throws IOException{ - - String id = ""; - - Elements paper = doc.body().getElementsByTag("pubmedarticleset"); - - //fetching the paper ID - - //for all items in a paper, retrieve only PMIDs - for(Element e : paper.select(getId())){ - //only consider the ID if the parent is medline citation - if(e.parentNode().nodeName().contains("medline")){ - id = e.text(); - } - } - return id; - } - - /** - * Reads the file IDs in a folder and - * checks a second folder for duplicates. 
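For reference, a minimal sketch of how returnID above resolves a paper's PMID with jsoup; the XML fragment and class name are invented for illustration. jsoup's HTML parser lowercases tag names, which is why the selector and the parent check are lowercase even though the source tags are not.

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;

    public class ReturnIdSketch {
        public static void main(String[] args) {
            //toy fragment: the PMID under MedlineCitation identifies the paper;
            //PMIDs elsewhere (e.g. in a reference list) must be ignored
            String xml = "<PubmedArticleSet><MedlineCitation><PMID>12345</PMID>"
                       + "</MedlineCitation></PubmedArticleSet>";

            Document doc = Jsoup.parse(xml);
            String id = "";
            for (Element e : doc.select("pmid")) {
                //keep only the PMID whose parent is the MedlineCitation element
                if (e.parentNode().nodeName().contains("medline")) {
                    id = e.text();
                }
            }
            System.out.println(id); //prints 12345
        }
    }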
- * - * @param dirSrc source folder - * @param dirDup folder to check for duplicates - */ - - public void checkDupFolder(String dirSrc, String dirDup){ - ArrayList sourceIDs = new ArrayList(); - ArrayList duplicated = new ArrayList(); - ArrayList dupIDs = new ArrayList(); - int ids = 0; - - if(dirSrc.contentEquals(dirDup)){ - System.out.println("Source and duplicates directories are the same.\n\n========================\n"); - } - else { - - File sourceDir = new File(dirSrc); - File[] srcXMLs = sourceDir.listFiles(new FilenameFilter(){ - @Override - public boolean accept(File dir, String name){ - return name.endsWith(".xml"); - } - }); - - try{ - //for each file on the source dir - for (File xml : srcXMLs){ - - try{ - - String id = ""; - //Loading file - File input = new File(xml.getPath()); - //Jsoup parse - Document doc = Jsoup.parse(input, "UTF-8"); - - //fetching the document ID - id = returnID(doc); - - if(!id.isEmpty()){ - sourceIDs.add(id); - ids++; - } - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - - } - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch(Exception e){ - throw new RuntimeException(e); - } - - System.out.println(ids + " source file IDs encountered."); - ids = 0; - - File dupDir = new File(dirDup); - - File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){ - @Override - public boolean accept(File dir, String name){ - return name.endsWith(".xml"); - } - }); - - try{ - //for each file on the possibly duplicated dir - for (File xml : dupXMLs){ - - try{ - String id = ""; - //Loading file - File input = new File(xml.getPath()); - //Jsoup parse - Document doc = Jsoup.parse(input, "UTF-8"); - - //fetching the document ID - id = returnID(doc); - - if(!id.isEmpty()){ - dupIDs.add(id); - String dupFileID = id; - ids++; - - for(int j = 0; j < sourceIDs.size(); j++){ - if(sourceIDs.get(j).equalsIgnoreCase(dupFileID)){ - - //add ID to duplicated list - duplicated.add(dupFileID); - - //rename the original file - Path from = xml.toPath(); //convert from File to Path - Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path - Files.move(from, to, StandardCopyOption.REPLACE_EXISTING); - } - } - } - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - } - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch(Exception e){ - throw new RuntimeException(e); - } - - //count number of existing papers on possibly duplicated folder - //just to make sure we are gathering all IDs - System.out.println(ids + " new file IDs encountered."); - ids = 0; - //sorting the list of duplicated IDs - Collections.sort(duplicated, new Comparator(){ - @Override - public int compare(String one, String two){ - return one.compareTo(two); - } - }); - - System.out.println("\nReaded source files: " + sourceIDs.size()); - System.out.println("Readed new files: " + dupIDs.size()); - - System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n"); - - System.out.println("\nDuplicated files IDs: "); - for(int i = 0; i < duplicated.size(); i++){ - System.out.println(duplicated.get(i)); - } - - System.out.println("\n========================\n"); - } - - - } - - /** - * Reads the corpus and checks the papers IDs - * to identify duplicates in case new papers - * are being concatenated to corpus. 
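Both duplicate checks scan the source-ID list once per candidate file, a nested loop over the two collections. A set-based containment test does the same job with one lookup per file; a minimal sketch with invented IDs, not the class's own method:

    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;

    public class DupCheckSketch {
        public static void main(String[] args) {
            //IDs gathered from the existing corpus (hypothetical values)
            Set<String> sourceIDs = new HashSet<String>(
                    Arrays.asList("12345", "67890", "24680"));

            //IDs of the files about to be concatenated
            List<String> newIDs = Arrays.asList("13579", "67890");

            for (String id : newIDs) {
                if (sourceIDs.contains(id)) {
                    //in ConcatXML this is where the file is renamed
                    //to <name>.xml.duplicated and skipped
                    System.out.println(id + " is a duplicate");
                }
            }
        }
    }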
- * - * @param corpus path to current corpora to check - * @param dir path to folder with new files to be concatenated - */ - - public void checkDupCorpus(String corpus, String dir){ - ArrayList trainingIDs = new ArrayList(); - ArrayList duplicated = new ArrayList(); - ArrayList newFiles = new ArrayList(); - - int ids = 0; - - try - { - File input = new File(corpus); - //Jsoup parse - Document doc = Jsoup.parse(input, "UTF-8"); - Elements corp = doc.body().getElementsByTag("pubmedarticleset"); - - String id = ""; - - for(Element paper : corp){ - Document thisDoc = Jsoup.parseBodyFragment(paper.toString()); - - //fetching the document ID - id = returnID(thisDoc); - - if(!id.isEmpty()){ - trainingIDs.add(id); - ids++; - } - } - }catch (FileNotFoundException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } - - System.out.println(ids + " training file IDs encountered."); - ids = 0; - - File corpusDir = new File(dir); - File[] newXMLs = corpusDir.listFiles(new FilenameFilter(){ - @Override - public boolean accept(File dir, String name){ - return name.endsWith(".xml"); - } - }); - - try{ - //for each file on the corpus dir - for (File xml : newXMLs){ - - try{ - String id = ""; - //Loading file - File input = new File(xml.getPath()); - //Jsoup parse - Document doc = Jsoup.parse(input, "UTF-8"); - - //fetching the document ID - id = returnID(doc); - - if(!id.isEmpty()){ - - newFiles.add(id); - String newFileID = id; - ids++; - - - for(int j = 0; j < trainingIDs.size(); j++){ - if(trainingIDs.get(j).equalsIgnoreCase(newFileID)){ - - //add ID to duplicated list - duplicated.add(newFileID); - - //moving the original file - Path from = xml.toPath(); //convert from File to Path - Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path - Files.move(from, to, StandardCopyOption.REPLACE_EXISTING); - } - } - } - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - } - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch(Exception e){ - throw new RuntimeException(e); - } - - //count number of existing papers on the training file - //just to make sure we are gathering all IDs - System.out.println(ids + " new file IDs encountered."); - ids = 0; - - - //sorting the list of duplicated IDs - Collections.sort(duplicated, new Comparator(){ - @Override - public int compare(String one, String two){ - return one.compareTo(two); - } - }); - - System.out.println("\nReaded training files: " + trainingIDs.size()); - System.out.println("Readed new files: " + newFiles.size()); - - System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n"); - - System.out.println("\nDuplicated files IDs: "); - for(int i = 0; i < duplicated.size(); i++){ - System.out.println(duplicated.get(i)); - } - - System.out.println("\n========================\n"); - - } - - - /** - * Reads and edits a list of XMLs files in a folder - * to remove XML and previous corpus tags, - * preparing the files to be concatenated. - * - * @param dir string with folder path - */ - - public void cleanXML(String dir){ - - //listing files on corpus dir - File sourceDir = new File(dir); - - File[] newXMLs = sourceDir.listFiles(new FilenameFilter(){ - @Override - public boolean accept(File dir, String name){ - return name.endsWith(".xml"); - } - }); - - System.out.println("... 
Files list loaded."); - - try{ - //for each file on the corpus dir - for (File xml : newXMLs){ - - try{ - BufferedReader reader = new BufferedReader(new FileReader(xml.getPath())); - - String line = null; - ArrayList allLines = new ArrayList(); - String content = null; - - while((line = reader.readLine()) != null){ - content = line; - - //cleaning XML markups - content = content.replaceFirst(getTag1(), ""); - content = content.replaceFirst(getTag2(), ""); - //cleaning previous corpus tags - content = content.replaceFirst(getTag3(), ""); - content = content.replaceFirst(getTag4(), ""); - allLines.add(content); - } - - PrintWriter writer = new PrintWriter(xml.getPath()); - - for (String l : allLines){ - writer.println(l); - } - reader.close(); - writer.close(); - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - - } - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch(Exception e){ - throw new RuntimeException(e); - } - - System.out.println("... Files cleaned and saved."); - System.out.println("Ready for concatenation."); - System.out.println("\n========================\n"); - } - - - - /** - * Concatenates all XMLs in one folder or between two folders. - * @param sourceDir main directory with XML files. - * @param duplicDir second directory with duplicated XML files - * @param concatFile path name to saved concatenated corpus - */ - - public void concatenateXML(String sourceDir, String duplicDir, String concatFile){ - - final int BUFFER = 1024 << 8; - byte[] buffer = new byte[BUFFER]; - - //listing files on corpus dir - File srcDir = new File(sourceDir); - File[] srcXMLs = srcDir.listFiles(new FilenameFilter(){ - @Override - public boolean accept(File dir, String name){ - return name.endsWith(".xml"); - } - }); - - File dupDir = new File(duplicDir); - File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){ - @Override - public boolean accept(File dir, String name) { - return name.endsWith(".xml"); - } - }); - - System.out.println("... Files list loaded."); - - //defining the output file (concatenated) - File newCorpus = new File(concatFile); - - try{ - OutputStream output = new BufferedOutputStream(new FileOutputStream(newCorpus)); - - - //for each file on the corpus dir - for (File xmls : srcXMLs){ - InputStream input = new FileInputStream(xmls); - int count; - - //if the file is not empty/finished - try{ - while((count = input.read(buffer)) >= 0){ - - //write it on the concatenated final file - output.write(buffer, 0, count); - } - }finally{ - input.close(); - } - } - - if(dupXMLs != null){ - for(File xmld : dupXMLs){ - InputStream input = new FileInputStream(xmld); - int count; - - //if the file is not empty/finished - try{ - while((count = input.read(buffer)) >= 0){ - - //write it on the concatenated final file - output.write(buffer, 0, count); - } - }finally{ - input.close(); - } - } - } - output.flush(); - output.close(); - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch(Exception e){ - throw new RuntimeException(e); - } - - System.out.println("... 
File concatenated and saved."); - System.out.println("Ready for corpus tagging."); - System.out.println("\n========================\n"); - } - - /** - * Inserts corpus tag on XML file - * - * @param pathToCorpus path to - * concatenated corpus - */ - - public void tagCorpus(String pathToCorpus){ - - //tagging as corpus - try{ - BufferedReader reader = new BufferedReader(new FileReader(pathToCorpus)); - - String line = null; - String edit = null; - List allLines = new ArrayList(); - - //adds tag at beggining of corpus - allLines.add(getCorpusTag()); - - while((line = reader.readLine()) != null){ - - allLines.add(line); - } - //adds tag at the end of corpus - allLines.add(getCorpusTagC()); - - System.out.println("... Corpus loaded and tagged."); - //re-writting the file - PrintWriter writer = new PrintWriter(pathToCorpus); - - for (String l : allLines){ - writer.println(l); - } - reader.close(); - writer.close(); - - System.out.println("... File saved as tagged corpus."); - } - catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch(IOException e){ - e.printStackTrace(); - } - } - - private String getCorpusTagC() { - return corpusTagC; - } - - private String getCorpusTag() { - // TODO Auto-generated method stub - return corpusTag; - } - - public String getTag1() { - return tag1; - } - - public void setTag1(String tag1) { - this.tag1 = tag1; - } - - public String getTag2() { - return tag2; - } - - public void setTag2(String tag2) { - this.tag2 = tag2; - } - - private String getTag4() { - // TODO Auto-generated method stub - return tag4; - } - - private String getTag3() { - // TODO Auto-generated method stub - return tag3; - } - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - private void setCorpusTag(String string) { - this.corpusTag = string; - - } - - private void seTag4(String string) { - this.tag4 = string; - - } - - private void setTag3(String string) { - this.tag3 = string; - - } - -} - - diff --git a/src/preprocessing/SampleCorpus.java b/src/preprocessing/SampleCorpus.java deleted file mode 100644 index 63613a8..0000000 --- a/src/preprocessing/SampleCorpus.java +++ /dev/null @@ -1,237 +0,0 @@ -package preprocessing; - -import java.io.File; -import java.io.FilenameFilter; -import java.nio.file.Files; -import java.nio.file.StandardCopyOption; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; - -import configure.PathConstants; - -/** - * Performs document instances sampling - * generating training and test files - * with specific balance input by user. 
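One note on the ConcatXML pipeline above: the corpus tag literals in the flattened constructor appear as empty strings; assuming they were <corpus> and </corpus>, the clean-then-tag round trip looks roughly like this sketch (class name and sample strings invented):

    import java.util.ArrayList;
    import java.util.List;

    public class CorpusTagSketch {
        public static void main(String[] args) {
            //one raw file as cleanXML sees it: XML declaration first
            String raw = "<?xml version=\"1.0\"?><PubmedArticleSet>...</PubmedArticleSet>";

            //tag1 and tag2 from the ConcatXML constructor strip the
            //declaration and any DOCTYPE, leaving only document content
            String cleaned = raw.replaceFirst("(?s)<.*?xml.*?>", "")
                                .replaceFirst("(?s)<.*?!DOCTYPE.*?>", "");

            //concatenateXML appends the cleaned files; tagCorpus then wraps
            //the result in a single root element (assumed tag names)
            List<String> corpus = new ArrayList<String>();
            corpus.add("<corpus>");
            corpus.add(cleaned);
            corpus.add("</corpus>");

            for (String line : corpus) System.out.println(line);
        }
    }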
- *
- * @author Hayda Almeida
- * @since 2015
- *
- */
- public class SampleCorpus {
-
-     public static void main(String[] args) throws Exception {
-
-         PathConstants pathVars = new PathConstants();
-         SampleCorpus sampling = new SampleCorpus();
-
-         String positiveDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.POS_DIR;
-         List<File> positives = new LinkedList<File>();
-
-         String negativeDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.NEG_DIR;
-         List<File> negatives = new LinkedList<File>();
-
-         //train or test sampling
-         Boolean tr = true, ts = true;
-         //% of test corpus WRT the collection, % positive on training set, % positive on test set
-         int percTs = 20, posTr = 50, posTs = 10;
-
-         for(int i = 0; i < args.length; i++){
-             try{
-                 if(args[i].matches("-tr")){
-                     tr = true;
-                     posTr = Integer.parseInt(args[i+1]);
-                 }
-                 if(args[i].matches("-ts")){
-                     ts = true;
-                     percTs = Integer.parseInt(args[i+1]);
-                     posTs = Integer.parseInt(args[i+2]);
-                 }
-             }
-             catch(Exception e){
-                 System.out.println(" Use: \n "
-                     + "-tr -> (% of positives) to sample training set \n"
-                     + "-ts -> (% of collection) (% of positives) to sample test set");
-                 System.exit(0);
-             }
-         }
-
-         positives = sampling.loadFiles(positiveDir);
-         negatives = sampling.loadFiles(negativeDir);
-
-         //sample the test set first: sampleTest moves files out of the
-         //collection, so the training set is drawn from what remains
-         if(ts) sampling.sampleTest(pathVars, positives, negatives, percTs, posTs);
-
-         if(tr) sampling.sampleTrain(pathVars, positives, negatives, posTr);
-
-     }
-
-     /**
-      * Lists XML files within a folder
-      * @param dirSrc folder path
-      * @return list of XML files found in the folder
-      */
-     public List<File> loadFiles(String dirSrc){
-
-         File sourceDir = new File(dirSrc);
-         File[] srcXMLs = sourceDir.listFiles(new FilenameFilter(){
-             @Override
-             public boolean accept(File dir, String name){
-                 return name.endsWith(".xml");
-             }
-         });
-
-         return new LinkedList<File>(Arrays.asList(srcXMLs));
-     }
-
-     /**
-      * Moves a specific number of files
-      * in a list from origin folder to a test folder
-      * @param pathVars
-      * @param files list of files
-      * @param numFiles number of files to be moved
-      */
-     public void moveFile(PathConstants pathVars, List<File> files, int numFiles){
-
-         Iterator<File> filesList = files.iterator();
-         File testDir = new File(pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TEST_DIR);
-
-         if(!testDir.exists()){
-             try{
-                 testDir.mkdir();
-             }catch(Exception e){
-                 System.out.println("Error creating Test folder.");
-                 System.exit(0);
-             }
-         }
-
-         while(filesList.hasNext() && numFiles > 0){
-             try{
-                 File file = filesList.next();
-                 File newFile = new File(testDir + "/" + file.getName());
-
-                 Files.move(file.toPath(), newFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
-
-                 //remove the moved file from the collection list so it
-                 //cannot be drawn again when the training set is sampled
-                 filesList.remove();
-                 numFiles--;
-             }
-             catch(Exception e){
-                 System.out.println("Error moving files.");
-                 System.exit(0);
-             }
-         }
-
-     }
-
-     /**
-      * Copies a specific number of files
-      * in a list from origin folder to a train folder
-      * @param pathVars
-      * @param files list of files
-      * @param numFiles number of files to be copied
-      */
-     public void copyFile(PathConstants pathVars, List<File> files, int numFiles){
-
-         Iterator<File> filesList = files.iterator();
-         File trainDir = new File(pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAIN_DIR);
-
-         if(!trainDir.exists()){
-             try{
-                 trainDir.mkdir();
-             }catch(Exception e){
-                 System.out.println("Error creating Training folder.");
-                 System.exit(0);
-             }
-         }
-
-         while(filesList.hasNext() && numFiles > 0){
-             try{
-                 File file = filesList.next();
-                 File newFile = new File(trainDir + "/" + file.getName());
-
-                 Files.copy(file.toPath(), newFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
-
-                 //decrement the counter; otherwise every file in the list
-                 //is copied and numFiles is ignored
-                 numFiles--;
-             }
-             catch(Exception e){
-                 System.out.println("Error copying files.");
-                 System.exit(0);
-             }
-         }
-
-     }
-
-     /**
-      * Samples document instances from the collection
-      * to generate a test set.
-      *
-      * @param pathVars
-      * @param positives list of positive documents
-      * @param negatives list of negative documents
-      * @param total percentage of the document collection for test
-      * @param pos percentage of positive documents in the test set
-      */
-     public void sampleTest(PathConstants pathVars, List<File> positives, List<File> negatives, int total, int pos){
-
-         int instances = positives.size() + negatives.size();
-         int testSize = (instances * total) / 100;
-         int posSize = (testSize * pos) / 100;
-         int negSize = testSize - posSize;
-
-         Collections.shuffle(negatives);
-         System.out.println("===== Test > Negative instances shuffled for test set.");
-         moveFile(pathVars, negatives, negSize);
-         System.out.println("===== Test > Negative instances moved to test folder. \n");
-
-         Collections.shuffle(positives);
-         System.out.println("===== Test > Positive instances shuffled for test set.");
-         moveFile(pathVars, positives, posSize);
-         System.out.println("===== Test > Positive instances moved to test folder. \n");
-
-     }
-
-     /**
-      * Samples document instances from the collection
-      * to generate a training set.
-      *
-      * @param pathVars
-      * @param positives list of positive documents
-      * @param negatives list of negative documents
-      * @param pos percentage of positive documents in the training set
-      */
-     public void sampleTrain(PathConstants pathVars, List<File> positives, List<File> negatives, int pos){
-
-         int trainSize = positives.size() + negatives.size();
-         int posSize = (trainSize * pos) / 100;
-         int negSize = trainSize - posSize;
-
-         if(positives.size() < posSize){
-             System.out.println("Not enough positive instances for training set.");
-             System.exit(0);
-         }
-         else if(negatives.size() < negSize){
-             System.out.println("Not enough negative instances for training set.");
-             System.exit(0);
-         }
-         else{
-             Collections.shuffle(negatives);
-             System.out.println("===== Training > Negative instances shuffled for training set.");
-             copyFile(pathVars, negatives, negSize);
-             System.out.println("===== Training > Negative instances copied to training folder. \n");
-
-             Collections.shuffle(positives);
-             System.out.println("===== Training > Positive instances shuffled for training set.");
-             copyFile(pathVars, positives, posSize);
-             System.out.println("===== Training > Positive instances copied to training folder. \n");
-         }
-
-     }
-
- }
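A worked example of the sampling arithmetic, on an invented collection of 1000 positive and 1000 negative documents, with -ts handled by sampleTest and -tr by sampleTrain as above (integer division, as in the code):

    java preprocessing.SampleCorpus -ts 20 10 -tr 60

    sampleTest:   testSize = (2000 * 20) / 100 = 400
                  posSize  = (400 * 10) / 100  =  40 positives moved to the test folder
                  negSize  = 400 - 40          = 360 negatives moved to the test folder

    sampleTrain:  960 positives and 640 negatives remain after the move;
                  trainSize = 1600
                  posSize   = (1600 * 60) / 100 = 960 positives copied to the training folder
                  negSize   = 1600 - 960        = 640 negatives copied to the training folder

Requesting more positives or negatives than remain (say -tr 70 here, which would need 1120 positives) trips the "Not enough ... instances" guard and the program exits.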