diff --git a/config-sample.cfg b/config-sample.cfg index a9b3483..3c8295f 100644 --- a/config-sample.cfg +++ b/config-sample.cfg @@ -12,6 +12,18 @@ HOME_DIR=/. # corpus directory CORPUS_DIR=corpus/ # +# source documents directory +SOURCE_DIR=src/ +# +# duplicate documents directory +DUP_DIR=test/ +# +# positive instances directory +POS_DIR=positives/ +# +# negative instances directory +NEG_DIR=negatives/ +# # train directory TRAIN_DIR=train/ # @@ -61,6 +73,9 @@ NGRAM_FEATURES=ngrams_features.txt # Paper title n-grams feature list TITLE_NGRAMS=titleGrams.txt # +# Paper ID and class +DOC_IDS=docIDs.txt +# ################################################### ########################## FEATURE SETUP ########## # Extract size of abstract and title @@ -78,6 +93,9 @@ FEATURE_MIN_FREQ=2 # minimum length (in chars) to consider a feature FEATURE_MIN_LENGTH=3 # +# extract document IDs +USE_DOC_ID=true +# ############################# ######### ANNOTATIONS ####### # Extract annotation content @@ -109,6 +127,20 @@ NGRAM_SIZE=1 # Define weight of features #WEIGHT=3 # +################################################### +########################## FEATURE SELECTION SETUP ########## +# Enable Odds Ratio (OR) filtering +USE_ODDS_RATIO=false +# +# Define minimum OR threshold to keep attribute +OR_THRESHOLD=1 +# +# Enable inverse document frequency (idf) filtering +USE_IDF=false +# +# Define minimum idf threshold to keep attribute +IDF_THRESHOLD=1 +# ################################################# ########################### TASK SETUP ########## # experiment type : train = 0 / test = 1 diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000..1924ede --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,3 @@ +*.classpath +*.project +*.*~ diff --git a/src/analyse/Extractor.java b/src/analyse/Extractor.java index c97cfa7..8e91951 100644 --- a/src/analyse/Extractor.java +++ b/src/analyse/Extractor.java @@ -51,7 +51,7 @@ public class Extractor { //String pathFile; String id; - String endId; + protected String endId; String openFile; String endFile; String openAbst; @@ -95,7 +95,8 @@ public String removeSpecialChar(String str){ str = str.replace(")", ""); str = str.replace("(", ""); str = str.replace("\t\t", "\t"); - str = str.replace("-", ""); + //avoid losing ngrams because of hyphens between names + str = str.replace("-", " "); str = str.replace(" ", ""); return str; @@ -154,15 +155,15 @@ public String removeAbstractTags(String str){ //this order of removing tags matters to //exclude the first tag from the abstracts.
- str = str.replace("", ""); - str = str.replace("", ""); - str = str.replace("", ""); - str = str.replace("Copyright", ""); - str = str.replace("", ""); - str = str.replace("", ""); - str = str.replace("", ""); - str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("copyright", ""); + str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("", ""); return str; } diff --git a/src/analyse/FeatureExtractor.java b/src/analyse/FeatureExtractor.java index 4ca93aa..4d66d4f 100644 --- a/src/analyse/FeatureExtractor.java +++ b/src/analyse/FeatureExtractor.java @@ -31,6 +31,7 @@ this software and associated documentation files (the "Software"), to deal in package analyse; import java.io.BufferedReader; +import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; @@ -38,47 +39,48 @@ this software and associated documentation files (the "Software"), to deal in import java.util.HashMap; import java.util.Iterator; import java.util.Map; + import org.apache.commons.lang3.StringUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; + import configure.PathConstants; +import filter.NaiveFilter; /** * This class extracts and parses domain * annotation features from doc instances * - * @author halmeida + * @author Hayda Almeida + * @since 2014 + * */ public class FeatureExtractor extends Extractor{ public FeatureExtractor(){ - this.id = ""; - this.endId = ""; - this.endFile = ""; - this.openAbst = ""; - this.closeAbst = ""; - this.abstractLabel = ",Integer> abstract_count = new HashMap,Integer>(); @@ -90,189 +92,142 @@ public static void main(String[] args) { //store title features, type and count HashMap, Integer> title_count = new HashMap, Integer>(); //store title features, whole journal title content and classification - HashMap,String> title_content = new HashMap,String>(); - + HashMap,String> title_content = new HashMap,String>(); //store title content and EC numbers ArrayList ec_numbers = new ArrayList(); + + //store ID, class and features + HashMap PMIDs = new HashMap(); fextrac.initialize(); int jTitle = 0; try { - BufferedReader reader = new BufferedReader(new FileReader(AnCorpus)); - - //--------------------------- - // repeat until all lines of the file are read - //--------------------------- String line = null; String features = null; - // String id = null; - - - while((line = reader.readLine()) != null){ + //Loading file + File input = new File(AnCorpus); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); - line = line.replaceAll("\t",""); - line = line.replace("\"", ""); + Elements corpus = doc.body().getElementsByTag("pubmedarticleset"); - //find paper ID and store it - if (line.contains(fextrac.getid())){ - line = line.replace(fextrac.getid(), ""); - // id = line.replace(fextrac.getendId(), ""); + //Fetching elements - //continue reading - features = reader.readLine(); - features = features.replaceAll("\t",""); - - String journal = ""; + for(Element paper : corpus ){ - //continue reading until the end of file - while(!(features.contentEquals(fextrac.getendFile()))){ - - //find relevant doc section - Journal title - if(features.contains(fextrac.getOpenJournal())){ - - features = features.replace(fextrac.getOpenJournal(),""); - features = features.replace(fextrac.getCloseJournal(), ""); - features = 
fextrac.removeSpecialChar(features); - - //separating only the journal title content - journal = fextrac.removeTags(features); - //counting # of journal titles - jTitle++; - - features = reader.readLine(); - features = features.replaceAll("\t",""); - } - - //find relevant doc section - Article title - if(features.contains(fextrac.getOpenTitle())){ - - features = features.replace(fextrac.getOpenTitle(),""); - features = features.replace(fextrac.getCloseTitle(), ""); - features = fextrac.removeSpecialChar(features); - - //separating the title by annotations - String title_annotation = features; - - //extracting annotations and inserting them on lists - fextrac.annotations(title_annotation, title_count, title_type, pathVars); - fextrac.addContent(title_annotation, journal, title_content); - - features = reader.readLine(); - features = features.replaceAll("\t",""); - } - - if(features.contains(fextrac.getAbstractLabel())){ - - String temp = ""; - String newAbs = fextrac.getopenAbst(); - - //handling cases when the tag is already within abstract content - if(features.contains("")){ - temp = temp + fextrac.processAbstract(features); - } - else{ - do{ - temp = temp + fextrac.processAbstract(features); - features = reader.readLine(); - }while(!(features.contains(""))); - } - newAbs = newAbs + temp; - features = newAbs + fextrac.getcloseAbst(); - } - - //find relevant doc section - Abstract - if(features.contains(fextrac.getopenAbst())){ - - features = features.replace(fextrac.getopenAbst(),""); - features = features.replace(fextrac.getcloseAbst(), ""); - features = fextrac.removeSpecialChar(features); - - //handle lines in which abstract text tag - //is separated from the actual text - if(features.isEmpty()){ - features = reader.readLine(); - features = features.replaceAll("\t",""); - features = features.replace(fextrac.getopenAbst(),""); - features = features.replace(fextrac.getcloseAbst(), ""); - features = fextrac.removeSpecialChar(features); - } - - features = fextrac.removeAbstractTags(features); - - //gathering abstract annotations - String abstrac = features; - - //extract annotations and insert them on lists - fextrac.annotations(abstrac, abstract_count, abstract_type, pathVars); - - features = reader.readLine(); - features = features.replaceAll("\t",""); - //features = features.replaceAll("\\s+", ""); - } - - //identifying EC number - if(features.contains(fextrac.getOpenEC())){ - features = features.replace(fextrac.getOpenEC(), ""); - features = features.replace(fextrac.getCloseEC(), ""); - features = fextrac.removeSpecialChar(features); - - ec_numbers.add(features); - - features = reader.readLine(); - features = features.replaceAll("\t",""); - } - - //find classification of the document - if(features.contains(fextrac.getClassTag())){ - - //adding classification to the list of annotations - String classif = fextrac.getClassif(features); - fextrac.addClass(classif, abstract_type); - fextrac.addClass(classif, title_type); - fextrac.addClass(classif, title_content); - - features = reader.readLine(); - features = features.replaceAll("\t",""); - } + //Fetching elements + Elements journalTitle = paper.getElementsByTag(fextrac.getOpenJournal()); + Elements title = paper.getElementsByTag(fextrac.getOpenTitle()); + Elements abstractC = paper.getElementsByTag(fextrac.getopenAbst()); + Elements ECnumber = paper.getElementsByTag(fextrac.getOpenEC()); + Elements classDoc = paper.getElementsByTag(fextrac.getClassTag()); - features = reader.readLine(); - features = features.replaceAll("\t",""); + String 
journal = ""; + String docID = ""; + String label = ""; + ArrayList tempList = new ArrayList(); + StringBuffer sb = new StringBuffer(); + + //fetching the paper ID - + //for all items in a paper, retrieve only PMIDs + for(Element e : paper.select(fextrac.getid())){ + //only consider the ID if the parent is medline citation + if(e.parentNode().nodeName().contains("medline")){ + docID = e.text(); + } + } + //fetch the doc label as well + if(classDoc.hasText()){ + label = classDoc.text(); + } + + PMIDs.put(docID, label); - } + if(journalTitle.hasText()){ + jTitle++; + journal = journalTitle.toString(); + journal = fextrac.removeSpecialChar(journal); + journal = fextrac.removeTags(journal); } - } + String title_annotation = ""; + if(title.hasText()){ + title_annotation = title.toString(); + title_annotation = fextrac.removeSpecialChar(title_annotation); - reader.close(); + tempList.addAll(fextrac.annotations(title_annotation, title_count, title_type, featFilter, pathVars)); + fextrac.addContent(title_annotation, journal, title_content, featFilter); + } + String abstrac = ""; + if(abstractC.hasText()){ + abstrac = abstractC.toString(); + abstrac = fextrac.removeSpecialChar(abstrac); + abstrac = fextrac.removeAbstractTags(abstrac); + + tempList.addAll(fextrac.annotations(abstrac, abstract_count, abstract_type, featFilter, pathVars)); + } + + String ecnum = ""; + if(ECnumber.hasText()){ + for(Element number : ECnumber){ + ecnum = number.toString(); + if(ecnum.contains("EC")){ + ecnum = fextrac.removeSpecialChar(ecnum); + ecnum = fextrac.removeTags(ecnum); + ec_numbers.add(features); + } + } + } + + String triage = ""; + if(classDoc.hasText()){ + triage = classDoc.toString(); + triage = fextrac.removeSpecialChar(triage); + triage = fextrac.removeTags(triage); + + fextrac.addClass(triage, abstract_type); + fextrac.addClass(triage, title_type); + fextrac.addClass(triage, title_content); + } + +// for(int i = 0; i < tempList.size(); i++){ +// sb.append(tempList.get(i) + "-"); +// } +// +// PMIDs.put(docIDLabel, sb.toString()); + } + } + catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); - } - - - //Use for sample output - //System.out.println("\n===========TITLE==ANNOTATIONS============="); - //fextrac.displayList(title_count); - //fextrac.displayList(title_type); - //fextrac.displayList(title_content); - //System.out.println("\n========ABSTRACT==ANNOTATIONS============="); - //fextrac.displayList(abstract_count); - //fextrac.displayList(abstract_type); - - //Before exporting, take into account the - //occurence of all extracted features - fextrac.considerOccurence(abstract_count, pathVars); - fextrac.considerOccurence(title_count, pathVars); + } + if(verbose){ + //print list of extracted features + System.out.println("\n===========TITLE==ANNOTATIONS============="); + fextrac.displayList(title_count); + fextrac.displayList(title_type); + fextrac.displayList(title_content); + System.out.println("\n========ABSTRACT==ANNOTATIONS============="); + fextrac.displayList(abstract_count); + fextrac.displayList(abstract_type); + } - System.out.println("\n===========FEATURE==EXPORT==============="); + //filter features by occurence + featFilter.considerAnnotationOccurence(abstract_count, pathVars); + featFilter.considerAnnotationOccurence(title_count, pathVars); + + System.out.println("\n===========FEATURE==EXPORT==============="); + fextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.DOC_IDS, PMIDs); + System.out.println("..."+ 
PMIDs.size()+" document IDs listed."); fextrac.exportList(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.ECNUM_FEATURES, ec_numbers); System.out.println("..."+ ec_numbers.size()+" EC numbers saved."); fextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.ANNOTATION_FEATURES, abstract_count); @@ -319,35 +274,14 @@ private void addClass(String element, HashMap, String> list){ while(it.hasNext()){ Map str = it.next(); - if(list.get(str).contains("positive") || list.get(str).contains("negative")){ + if(list.get(str).contains(element)){ + //if(list.get(str).contains("positive") || list.get(str).contains("negative")){ } else list.put(str, element); } - } - - - /** - * Removes from feature list all features with - * frequency not statistically relevant (2 or less) - * @param list to be cleaned - */ - private void considerOccurence(HashMap,Integer> list, PathConstants vars){ - //going over the list of annotations and removing the - //features with occurance lower than specified. - - Iterator> iterator = list.keySet().iterator(); - - while(iterator.hasNext()){ - Map key = iterator.next(); - int valor = list.get(key).intValue(); - - if(valor < Integer.parseInt(vars.FEATURE_MIN_FREQ)){ - iterator.remove(); - } - } } - + /** * Extract the annotations from a determined section @@ -357,11 +291,11 @@ private void considerOccurence(HashMap,Integer> list, PathCon * @param count list that holds annotation, its type and its count * @param type list that holds annotation, its type and its classification */ - private void annotations(String annot, HashMap, Integer> count, HashMap,String> type, PathConstants pathVars) { + private ArrayList annotations(String annot, HashMap, Integer> count, HashMap,String> type, NaiveFilter filter, PathConstants pathVars) { HashMap features = loadAnnotationEntities(); PathConstants pathVar = new PathConstants(); NgramExtractor nextrac = new NgramExtractor(); - ArrayList content = new ArrayList(); + ArrayList content = new ArrayList(); //parsing the not edited text into HTML using Jsoup Document doc = Jsoup.parseBodyFragment(annot); @@ -394,7 +328,7 @@ private void annotations(String annot, HashMap, Integer> cou //if child is sentence (sentence inside of sentence), //then add annotations as ngrams on this if(features.get(child.nodeName()).contains("sentence")) { - content.addAll(nextrac.nGrams(child.text(), pathVar)); + content.addAll(nextrac.nGrams(child.text(), filter, pathVar)); insertAnnotation(content, an.nodeName(), count, type, pathVars); } //adding annotations on sentence as they are - no ngrams on this @@ -409,7 +343,7 @@ private void annotations(String annot, HashMap, Integer> cou tempAnnot.children().remove(); //splitting content in ngrams to whats left on the sentence - content.addAll(nextrac.nGrams(tempAnnot.text(), pathVar)); + content.addAll(nextrac.nGrams(tempAnnot.text(), filter, pathVar)); insertAnnotation(content, an.nodeName(), count, type, pathVars); } @@ -422,6 +356,7 @@ private void annotations(String annot, HashMap, Integer> cou } } + return content; } @@ -479,7 +414,7 @@ private void insertAnnotation(ArrayList content, String an_type, HashMap * @param list features used * */ - private void addContent(String annot, String wContent, HashMap,String> list) { + private void addContent(String annot, String wContent, HashMap,String> list, NaiveFilter filter) { HashMap features = loadAnnotationEntities(); ArrayList content = new ArrayList(); @@ -507,7 +442,7 @@ private void addContent(String annot, String wContent, HashMap ngram_count = new 
HashMap(); - //store abstract ngrams, count and "relevance(TBD)" - HashMap,Integer> ngrams = new HashMap,Integer>(); + //store abstract ngrams and doc ID + HashMap ngram_ID = new HashMap(); //store title ngrams and its count HashMap ngram_title_count = new HashMap(); //store title ngrams, count and "relevance(TBD)" HashMap,Integer> ngram_title = new HashMap,Integer>(); - + //store ID and label of documents + HashMap PMIDs = new HashMap(); + nextrac.initialize(); try - { - BufferedReader reader = new BufferedReader(new FileReader(AnCorpus)); - - //--------------------------- - // repeat until all lines - // of the file are read - //--------------------------- - String line = null; - String features = null; - String id = null; - - - while((line = reader.readLine()) != null){ - - line = line.replaceAll("\t",""); - line = line.replace("\"", ""); - - //find paper ID and store it - if (line.contains(nextrac.getid())){ - line = line.replace(nextrac.getid(), ""); - id = line.replace(nextrac.getendId(), ""); - - //keep reading the file - features = reader.readLine(); - features = features.replaceAll("\t",""); - - String tit_content = ""; + { + + //Loading file + File input = new File(AnCorpus); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); - //continue reading until the end of file - while(!(features.contentEquals(nextrac.getendFile()))){ + Elements corpus = doc.body().getElementsByTag("pubmedarticleset"); - String abstrac = ""; - - //find relevant doc section - Article title - if(features.contains(nextrac.getOpenTitle())){ - - //cleaning title content - features = features.replace(nextrac.getOpenTitle(),""); - features = features.replace(nextrac.getCloseTitle(), ""); - features = nextrac.removeSpecialChar(features); - tit_content = nextrac.removeTags(features); - - //extract n-grams from section - ArrayList title_c = nextrac.nGrams(tit_content, pathVars); - nextrac.addNGram(title_c, ngram_title_count,ngram_title, pathVars); + //Fetching elements + + for(Element paper : corpus ){ + + Elements journalTitle = paper.getElementsByTag(nextrac.getOpenJournal()); + Elements title = paper.getElementsByTag(nextrac.getOpenTitle()); + Elements abstractC = paper.getElementsByTag(nextrac.getopenAbst()); + Elements ECnumber = paper.getElementsByTag(nextrac.getOpenEC()); + Elements classDoc = paper.getElementsByTag(nextrac.getClassTag()); + + String journal = ""; + String docID = ""; + String label = ""; + int jTitle = 0; + + //fetching the paper ID - + //for all items in a paper, retrieve only PMIDs + for(Element e : paper.select(nextrac.getid())){ + //only consider the ID if the parent is medline citation + if(e.parentNode().nodeName().contains("medline")){ + docID = e.text(); + } + } + //fetch the doc label as well + if(classDoc.hasText()){ + label = classDoc.text(); + } - features = reader.readLine(); - features = features.replaceAll("\t",""); - } - + PMIDs.put(docID, label); - if(features.contains(nextrac.getAbstractLabel())){ - - String temp = ""; - String newAbs = nextrac.getopenAbst(); - - if(features.contains("")){ - temp = temp + nextrac.processAbstract(features); - } - else{ - do{ - temp = temp + nextrac.processAbstract(features); - features = reader.readLine(); - }while(!(features.contains(""))); - } - - newAbs = newAbs + temp; - features = newAbs + nextrac.getcloseAbst(); - } + //Extracting the Journal Title + if(journalTitle.hasText()){ + jTitle++; + journal = journalTitle.toString(); + journal = nextrac.removeSpecialChar(journal); + journal = nextrac.removeTags(journal); + } - 
//find relevant paper section - if(features.contains(nextrac.getopenAbst())){ - - features = features.replace(nextrac.getopenAbst(),""); - features = features.replace(nextrac.getcloseAbst(), ""); - features = features.replace("-", " "); - features = nextrac.removeSpecialChar(features); - - //handle lines in which abstract text tag - //is separated from the actual text - if(features.isEmpty()){ - features = reader.readLine(); - features = features.replaceAll("\t",""); - features = features.replace(nextrac.getopenAbst(),""); - features = features.replace(nextrac.getcloseAbst(), ""); - features = features.replace("-", " "); - features = nextrac.removeSpecialChar(features); - } - - //features = nextrac.removeSpecialChar(features); - abstrac = nextrac.removeTags(features); - abstrac = nextrac.removeAbstractTags(abstrac); - //extract n-grams from section - ArrayList abstract_c = nextrac.nGrams(abstrac, pathVars); - nextrac.addNGram(abstract_c, ngram_count, ngrams, pathVars); + String tit_content = ""; + //Extracting the Paper Title + if(title.hasText()){ + tit_content = title.toString(); + tit_content = nextrac.removeSpecialChar(tit_content); + tit_content = nextrac.removeTags(tit_content); - //keep reading file - features = reader.readLine(); - features = features.replaceAll("\t",""); - //features = features.replaceAll("\\s+", ""); - } - - features = reader.readLine(); - features = features.replaceAll("\t",""); - //features = features.replaceAll("\\s+", ""); - } + ArrayList title_c = nextrac.nGrams(tit_content, featFilter, pathVars); + nextrac.addNGram(title_c, ngram_title_count, pathVars); } - } - reader.close(); + String abstrac = ""; + //Extracting the Paper abstract + if(abstractC.hasText()){ + abstrac = abstractC.toString(); + abstrac = nextrac.removeTags(abstrac); + abstrac = nextrac.removeSpecialChar(abstrac); + abstrac = nextrac.removeAbstractTags(abstrac); + ArrayList abstract_c = nextrac.nGrams(abstrac, featFilter, pathVars); + nextrac.addNGram(abstract_c, ngram_count, pathVars); + } + } }catch (FileNotFoundException e) { e.printStackTrace(); @@ -202,19 +169,23 @@ public static void main(String[] args) { e.printStackTrace(); } - //print list of extracted n-grams - //System.out.println("\n========ABSTRACT==NGRAMS============="); - //nextrac.displayList(ngram_count); - //nextrac.displayList(ngram_title); - //System.out.println("\n===========TITLE==NGRAMS============="); - //nextrac.displayList(ngram_title_count); - + if(verbose){ + //print list of extracted n-grams + nextrac.displayList(PMIDs); + System.out.println("\n========ABSTRACT==NGRAMS============="); + nextrac.displayList(ngram_count); + nextrac.displayList(ngram_title); + System.out.println("\n===========TITLE==NGRAMS============="); + nextrac.displayList(ngram_title_count); + } - nextrac.considerOccurance(ngram_count, pathVars); - nextrac.considerOccurance(ngram_title_count, pathVars); + //filter features by occurence + featFilter.considerNgramOccurence(ngram_count, pathVars); + featFilter.considerNgramOccurence(ngram_title_count, pathVars); - - System.out.println("\n===========NGRAMS==EXPORT===============\n"); + System.out.println("\n===========NGRAMS==EXPORT===============\n"); + nextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.DOC_IDS, PMIDs); + System.out.println("..."+ PMIDs.size()+" document IDs listed."); nextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.NGRAM_FEATURES, ngram_count); System.out.println("..."+ ngram_count.size()+" unique Abstract ngrams saved."); 
nextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.TITLE_NGRAMS, ngram_title_count); @@ -223,80 +194,64 @@ public static void main(String[] args) { } - + /** - * Removes from feature list all features with - * frequency not statistically relevant (2 or less) - * @param list to be cleaned + * Inserts ngrams into list of features + * with a mapping for ngram count + * @param str relation of ngrams extracted + * @param list_count mapping for ngram counts + * @param pathVars */ - private void considerOccurance(HashMap list, PathConstants vars){ - //going over the list of annotations and removing the - //statistically not significant features - frequency less than 2 - - Iterator iterator = list.values().iterator(); - - while(iterator.hasNext()){ - Integer key = iterator.next(); - - if(key < Integer.parseInt(vars.FEATURE_MIN_FREQ)){ - iterator.remove(); - } - } - } - - private void addNGram(ArrayList str, HashMap list_count, HashMap,Integer> list, PathConstants pathVars){ + private void addNGram(ArrayList str, HashMap list_count, PathConstants pathVars){ + //iterating over ngram list for(int i = 0; i < str.size(); i++){ String currentNGram = str.get(i); + //checking existence of current ngram on list mapping if(list_count.containsKey(currentNGram)){ + //retrieve the amount of current ngrams on mapping int count = list_count.get(currentNGram); - list_count.put(currentNGram, count+1); - - /*if(list.containsKey(currentNGram)){ - int cnt = list.get(currentNGram).get(certainty); - list.get(currentNGram).put(certainty, cnt+1); - } - else{ - list.get(currentNGram).put(certainty, 1); - }*/ + //insert the updated count of ngrams + list_count.put(currentNGram, count+1); } else { + //insert ngram on mapping list if(currentNGram.length() >= Integer.parseInt(pathVars.FEATURE_MIN_LENGTH)){ list_count.put(currentNGram, 1); - - /* list.put(currentNGram, new HashMap()); - list.get(currentNGram).put(certainty, 1);*/ } } } } /** - * Extracts n-grams from the content field - * and populates mapping with n-gram +count - * @param str - * @param id - * @param gram - */ - - public ArrayList nGrams(String str, PathConstants pathVar){ + * Extracts n-grams from a given content field + * + * @param str text to extract ngrams + * @return list of extracted grams + */ + public ArrayList nGrams(String str, NaiveFilter filter, PathConstants pathVar){ - //cleaning further chars on sentence + //removing ASCII special characters str = str.replace("/", ""); - str = str.replace("\\", ""); - str = str.replace(" ", "-"); - //Tokenize the sentence + str = str.replace("\\", ""); + //str = str.replace("\n", " "); + str = str.replaceAll("\\s+"," "); + str = str.replace(" ", "-"); + + //Tokenizing the sentence String[] words = StringUtils.split(str,"-"); ArrayList ngramList = new ArrayList(); int ngram =Integer.parseInt(pathVar.NGRAM_SIZE); + //Stop-words removal if(Boolean.valueOf(pathVar.NGRAM_STOP)){ - words = StringUtils.split(removeStopList(words, pathVar)," "); - } - + words = StringUtils.split(filter.removeStopList(words, pathVar)," "); + } + + //extracting ngrams according to gram size (1, 2, 3) for(int i=0; i < words.length - (ngram - 1); i++){ switch(pathVar.NGRAM_SIZE){ case "1": @@ -314,94 +269,58 @@ public ArrayList nGrams(String str, PathConstants pathVar){ return ngramList; } - /** - * Removes the stopwords from ngrams list - * - * @param str list of ngrams - * @param pathVar constants from - * @return - */ +// /** +// * Removes stopwords from ngrams list +// * +// * @param str list of ngrams +// * @param 
constants +// * @return cleaned list of ngrams +// */ +// public String removeStopList(String[] str, PathConstants pathVar){ +// +// //stop-words file name +// String pathStop = "stopList.txt"; +// String[] stop = null; +// StringBuilder cleaned = new StringBuilder(); +// +// try{ +// +// BufferedReader reader = new BufferedReader(new FileReader(pathStop)); +// +// String line = null; +// //loading stop-words list +// while((line = reader.readLine()) != null){ +// stop = StringUtils.split(line,","); +// line = reader.readLine(); +// } +// +// reader.close(); +// +// }catch (FileNotFoundException e) { +// e.printStackTrace(); +// } catch (IOException e) { +// e.printStackTrace(); +// } +// +// //iteraing over text to be cleaned +// for(int i = 0; i < str.length; i++){ +// //iterating over stop-words list +// for(int j = 0; j < stop.length; j++){ +// +// //when stop-word is encountered, replace it +// if(str[i].equalsIgnoreCase(stop[j])){ +// str[i] = str[i].replace(str[i],"*"); +// } +// } +// //retrieve the text without stop-words replacements +// if(!(str[i].contentEquals("*"))){ +// cleaned.append(str[i]).append(" "); +// } +// } +// return cleaned.toString().replace(" ", " "); +// } - public String removeStopList(String[] str, PathConstants pathVar){ - - String pathStop = "stopList.txt"; - String[] stop = null; - StringBuilder cleaned = new StringBuilder(); - try{ - - BufferedReader reader = new BufferedReader(new FileReader(pathStop)); - - String line = null; - - while((line = reader.readLine()) != null){ - stop = StringUtils.split(line,","); - line = reader.readLine(); - } - - reader.close(); - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } - - for(int i = 0; i < str.length; i++){ - for(int j = 0; j < stop.length; j++){ - - if(str[i].equalsIgnoreCase(stop[j])){ - str[i] = str[i].replace(str[i],"*"); - } - } - if(!(str[i].contentEquals("*"))){ - cleaned.append(str[i]).append(" "); - } - } - return cleaned.toString().replace(" ", " "); - } - - /** - * Evaluates the level of certainty... - * TBD!!! - * @param list - * @return - */ - - public String getCertainty(HashMap> list){ - - ArrayList gramsAr = new ArrayList(list.entrySet()); - //String certainty; - - Iterator itr = gramsAr.iterator(); - while(itr.hasNext()){ - String str = itr.next().toString(); - String[] splitted = StringUtils.split(str,"="); - - int relevance = 0; - int count = 0; - - - try{ - count = list.get(splitted[0]).get(certainty); - } catch(Exception e){ - e.printStackTrace(); - } - - //relevance = count * getWeight(); - - if(relevance == 1) - list.get(splitted[0]).put("fairly relevant", list.get(splitted[0]).get(certainty)); - else if (relevance == 2) - list.get(splitted[0]).put("relevant", list.get(splitted[0]).get(certainty)); - else - list.get(splitted[0]).put("very relevant", list.get(splitted[0]).get(certainty)); - - } - - return certainty; - } - /** * Displays the keys and values of the * maps created with n-grams and counts. @@ -416,27 +335,6 @@ public void displayList(HashMap hash){ System.out.println("\n=======================================\n"); } - - /** - * Accessor and mutator methods for the export - * string with list values - so vector class - * can access its content. - * @return string with list of values. 
- */ - /*public static String getNgramCount() { - //ngramCount = exportContent(ngram_count); - return ngramCount; - } - public void setNgramCount(String ngramCount) { - this.ngramCount = ngramCount; - } - public static String getNgram() { - //ngram = exportContent(ngrams); - return ngram; - } - public void setNgram(String ngram) { - this.ngram = ngram; - } */ - + } diff --git a/src/arffmatrix/BuildModel.java b/src/arffmatrix/BuildModel.java index 65869e8..f8d0fac 100644 --- a/src/arffmatrix/BuildModel.java +++ b/src/arffmatrix/BuildModel.java @@ -54,6 +54,9 @@ this software and associated documentation files (the "Software"), to deal in * This class reads the corpus instances and uses * the CreateVector class to generate a model file (ARFF) * * + * @author Hayda Almeida, Marie-Jean Meurs + * @since 2014 + * */ public class BuildModel { @@ -91,8 +94,7 @@ public static void main(String[] args) { //by default String sortarffFileName = pathVars.HOME_DIR + pathVars.OUTPUT_MODEL + arffFileName; // default - // create file - //FileWriter fstream = new FileWriter(sortarffFileName); + // create file BufferedWriter out = new BufferedWriter(new FileWriter(sortarffFileName)); // load ARFF header and write it @@ -254,7 +256,7 @@ else if(Integer.parseInt(pathVars.EXP_TYPE) ==1) System.out.println("Abstract : " + abstracttext.toString() + "\n\n"); // end of if: collect data and write ARFF - String Arffline = vectorgenerator.getArffLine( + String Arffline = vectorgenerator.getArffLine(pmid, journaltitle, title, abstracttext, diff --git a/src/arffvector/CreateVector.java b/src/arffvector/CreateVector.java index ce81dee..b112ea5 100644 --- a/src/arffvector/CreateVector.java +++ b/src/arffvector/CreateVector.java @@ -54,6 +54,8 @@ this software and associated documentation files (the "Software"), to deal in * generated corpus to create a feature vector * (a matrix representation of the corpus) * + * @author Hayda Almeida, Marie-Jean Meurs + * @since 2014 * */ public class CreateVector { @@ -64,7 +66,8 @@ public class CreateVector { ArrayList ecnumbers = new ArrayList(); ArrayList titleGrams = new ArrayList(); ArrayList titleAnnot = new ArrayList(); - ArrayList nGrams = new ArrayList(); + ArrayList nGrams = new ArrayList(); + ArrayList docID = new ArrayList(); PathConstants pathVars = null; @@ -79,7 +82,7 @@ public class CreateVector { public CreateVector(PathConstants extVars) { - pathVars = extVars; + pathVars = extVars; String pathJournalT = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.JOURNAL_TITLE_FEATURES; try{ @@ -395,21 +398,27 @@ public String genArffHeader(PathConstants pathVars, int exp){ switch(exp){ case 0: - headerArff.append("% Weka training file - mycoCLAP triage - CSFG 2014\n\n"); + headerArff.append("% Weka training file - mycoCLAP triage - CSFG 2015\n\n"); break; case 1: - headerArff.append("% Weka test file - mycoCLAP triage - CSFG 2014\n\n"); + headerArff.append("% Weka test file - mycoCLAP triage - CSFG 2015\n\n"); break; } headerArff.append("@RELATION triage\n"); - if (Boolean.valueOf(pathVars.USE_TEXT_SIZE)){ + if(Boolean.valueOf(pathVars.USE_TEXT_SIZE)){ // writing the list of text sizes headerArff.append("@ATTRIBUTE sizeoftitle \tREAL \t\t%size of title\n"); headerArff.append("@ATTRIBUTE sizeoftext \tREAL \t\t%size of text\n"); } + if(Boolean.valueOf(pathVars.USE_DOC_ID)){ + //writing the docIDs + headerArff.append("@ATTRIBUTE docID \tREAL \t\t%PMID of paper\n"); + + } + if(Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)){ for(int i = 0; i < journalTitles.size(); i++){ // 
writing list of journal titles @@ -532,10 +541,11 @@ public String genArffHeader(PathConstants pathVars, int exp){ * @return String holding counts for all features found in a document */ - public String getArffLine(String jTitle, String title, String text, String ecnum, String classTriage, int exp){ + public String getArffLine(String paperID, String jTitle, String title, String text, String ecnum, String classTriage, int exp){ //String vectorArff = ""; StringBuilder vectorArff = new StringBuilder(); + paperID = removeSpecialChar(paperID.toLowerCase()); text = removeSpecialChar(text.toLowerCase()); title = removeSpecialChar(title.toLowerCase()); jTitle = removeSpecialChar(jTitle.toLowerCase()); @@ -562,6 +572,17 @@ public String getArffLine(String jTitle, String title, String text, String ecnum vectorArff.append(titlesize).append(",").append(abstractsize).append(","); } + //fill ID of documents + if(Boolean.valueOf(pathVars.USE_DOC_ID)){ + + if(paperID.length()>0){ + vectorArff.append(paperID).append(","); + } + else{ + vectorArff.append("0,"); + } + } + //fill values of journal titles if(Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)){ diff --git a/src/classifier/Trainer.java b/src/classifier/Trainer.java index 7417982..4ec0da2 100644 --- a/src/classifier/Trainer.java +++ b/src/classifier/Trainer.java @@ -30,8 +30,8 @@ this software and associated documentation files (the "Software"), to deal in package classifier; +import java.util.ArrayList; import java.util.Random; - import weka.attributeSelection.LatentSemanticAnalysis; import weka.attributeSelection.PrincipalComponents; import weka.attributeSelection.GainRatioAttributeEval; @@ -41,20 +41,30 @@ this software and associated documentation files (the "Software"), to deal in import weka.classifiers.CostMatrix; import weka.classifiers.Evaluation; import weka.classifiers.bayes.NaiveBayes; +import weka.classifiers.evaluation.NominalPrediction; +import weka.classifiers.evaluation.Prediction; +import weka.classifiers.evaluation.output.prediction.PlainText; import weka.classifiers.functions.LibSVM; import weka.classifiers.meta.AttributeSelectedClassifier; import weka.classifiers.meta.CostSensitiveClassifier; +import weka.classifiers.meta.FilteredClassifier; import weka.classifiers.trees.LMT; +import weka.core.Attribute; import weka.core.Instances; +import weka.core.Range; import weka.core.converters.ConverterUtils.DataSource; +import weka.filters.Filter; +import weka.filters.unsupervised.attribute.Remove; import configure.PathConstants; +import filter.InformedFilter; /** * Trains and tests a classifier, * executes k-fold cross validation on train data * and outputs the classification results. * - * @author halmeida + * @author Hayda Almeida + * @since 2014 * */ @@ -65,6 +75,8 @@ public class Trainer { double[][] ranking; String rank; + boolean verbose = false; + /** * @param args @@ -72,15 +84,40 @@ public class Trainer { */ public static void main(String[] args) throws Exception { + + String classifier= ""; + + for(int i = 0; i < args.length; i++){ + try{ + if(args[i].matches("-lmt")) + classifier = "lmt"; + if(args[i].matches("-svm")) + classifier = "svm"; + if(args[i].matches("-nb")) + classifier = "nb"; + } + catch(Exception e){ + System.out.println("A classifier must be given as argument. Use: \n" + + "-lmt -> a LMT classifier; \n " + + "-svm -> a SVM classifier; \n" + + "-nb -> a Naive Bayes classifier. 
"); + System.exit(0); + } + } + PathConstants pathVars = new PathConstants(); Trainer evaluator = new Trainer(); - + InformedFilter filter = new InformedFilter(); + Classifier cls; //Creating classifier - Classifier cls = (Classifier) new LMT(); - //Classifier cls = (Classifier) new NaiveBayes(); - //Classifier cls = (Classifier) new LibSVM(); - + if(classifier.contains("lmt")) + cls = (Classifier) new LMT(); + else if (classifier.contains("svm")) + cls = (Classifier) new LibSVM(); + else + cls = (Classifier) new NaiveBayes(); + //Loading train data DataSource sourceTrain = new DataSource(pathVars.HOME_DIR + pathVars.OUTPUT_MODEL + pathVars.TRAIN_DIR + pathVars.ARFF_TRAIN); Instances trainData = sourceTrain.getDataSet(); @@ -90,9 +127,7 @@ public static void main(String[] args) throws Exception { System.out.println("Class index set on training data."); System.out.println("Training data loaded. Number of instances: " + trainData.numInstances() + "\n"); - - //Executing k-fold cross validation - //train.crossFold(trainData, cls); + //Loading test data DataSource sourceTest = new DataSource(pathVars.HOME_DIR + pathVars.OUTPUT_MODEL + pathVars.TEST_DIR + pathVars.ARFF_TEST); @@ -103,64 +138,175 @@ public static void main(String[] args) throws Exception { System.out.println("Class index set on testing data."); System.out.println("Test data loaded. Number of instances: " + testData.numInstances() + "\n"); + + + //filter the file IDs, consider the new training set + Instances filteredTrainData = evaluator.filteredIDs(trainData); + Instances filteredTestData = evaluator.filteredIDs(testData); + + if(Boolean.valueOf(pathVars.USE_ODDS_RATIO)){ + //Calculate OddsRatio for all instances + double[] OR = evaluator.loadFeatureFilter(filteredTrainData, filter, 1, Integer.parseInt(pathVars.OR_THRESHOLD)); + + //Apply Odds Ratio filtering in instances + filteredTrainData = evaluator.applyFilter(pathVars.OR_THRESHOLD, OR, filteredTrainData); + filteredTestData = evaluator.applyFilter(pathVars.OR_THRESHOLD, OR, filteredTestData); + } + + if(Boolean.valueOf(pathVars.USE_IDF)){ + //Calculate idf for all instances + double[] idf = evaluator.loadFeatureFilter(filteredTrainData, filter, 2, Integer.parseInt(pathVars.IDF_THRESHOLD)); - //Creating filtered classifiers - //AttributeSelectedClassifier PCAclassifier = evaluator.setPCAFilter(cls); - //AttributeSelectedClassifier LSAclassifier = evaluator.setLSAFilter(cls); - //AttributeSelectedClassifier GRclassifier = evaluator.setGRFilter(cls); - //AttributeSelectedClassifier Corrclassifier = evaluator.setCorrFilter(cls); + //Apply idf filtering in instances + filteredTrainData = evaluator.applyFilter(pathVars.IDF_THRESHOLD, idf, filteredTrainData); + filteredTestData = evaluator.applyFilter(pathVars.IDF_THRESHOLD, idf, filteredTestData); + } //Training and testing classifier - evaluator.classify(trainData, testData, cls); + evaluator.classify(filteredTrainData, filteredTestData, cls, testData); - //Training and testing costSensitive classifier - //evaluator.classify(trainData, testData, evaluator.classifySensitive(cls)); + } + + /** + * Loads evaluation of attributes according + * to feature selection method provided. 
+ * + * @param data data instances + * @param filter informed filter instance + * @param method identifier for selection method + * @return + */ + private double[] loadFeatureFilter(Instances data, InformedFilter filter, int method, int threshold){ + + double[] values = new double[data.numAttributes()]; - //Executing k-fold cross validation on filtered classifiers - //evaluator.crossFold(trainData, PCAclassifier); - //evaluator.crossFold(trainData, LSAclassifier); + switch(method){ + case 1: + values = filter.oddsRatio(data, threshold); + break; + case 2: + values = filter.idf(data, threshold); + break; + } + + return values; } - /** - * Trains and tests a classifier when two separated - * datasets are provided. + * Uses evaluation of features according to + * selection method to remove attributes from + * the dataset before training phase. * - * @param train training data to build classifier - * @param test test data to evaluate classifier - * @param classif type of classifier applied + * @param threshold selection method threshold + * @param values evaluation of attributes according to method + * @param data dataset instances + * @return filtered dataset instances * @throws Exception - */ - public void classify(Instances train, Instances test, Classifier classif) throws Exception{ - - classif.buildClassifier(train); - Evaluation evaluateClassifier = new Evaluation(train); - evaluateClassifier.evaluateModel(classif, test); + */ + private Instances applyFilter(String threshold, double[] values, Instances data) throws Exception{ + int numberRemoved = 0; + + String indexRemove = ""; - stats(evaluateClassifier, classif); + for(int i = 0; i < values.length; i++){ + if(values[i] == 0){ + + int ind = i+1; + + if(indexRemove.length()==0) indexRemove = ind + ""; + else indexRemove = indexRemove + "," + ind; + + numberRemoved++; + } + } + + try{ + indexRemove = indexRemove.substring(0, indexRemove.length()-1); + //if(verbose) + System.out.println("\n = = = = => Filter removed " + numberRemoved +" attributes: " + indexRemove.toString() ); + } + catch (Exception e){ + System.out.println("\n = = = = => Filter threshold did not remove any attribute."); + } + + Remove remove = new Remove(); + remove.setAttributeIndices(indexRemove); + remove.setInvertSelection(false); + remove.setInputFormat(data); + + Instances dataSubset = Filter.useFilter(data, remove); + return dataSubset; } + /** - * Trains and tests a classifier using a - * provided Cost matrix + * Removes the ID attribute (index 1) + * from a given dataset * - * @param classif type of classifier to be trained - * @return CostSensitive classifier with costs and classifier + * @param data instances + * @return filtered dataset * @throws Exception */ - - public CostSensitiveClassifier classifySensitive(Classifier classif) throws Exception{ - CostSensitiveClassifier costSensitive = new CostSensitiveClassifier(); - CostMatrix matrix = new CostMatrix(2); - matrix.setElement(0, 1, 4); - matrix.setElement(1, 0, 1); - costSensitive.setClassifier(classif); - costSensitive.setCostMatrix(matrix); + private Instances filteredIDs(Instances data) throws Exception { + Remove remove = new Remove(); + //setting index to be removed + remove.setAttributeIndices("1"); + remove.setInvertSelection(false); + remove.setInputFormat(data); - return costSensitive; + Instances dataSubset = Filter.useFilter(data, remove); + return dataSubset; } - + + + /** + * Trains and tests a classifier when two separated + * datasets are provided. 
+ * + * @param train training data to build classifier + * @param test test data to evaluate classifier + * @param classif type of classifier applied + * @throws Exception + */ + public void classify(Instances filteredTrain, Instances filteredTest, Classifier classif, Instances test) throws Exception{ + + StringBuffer sb = new StringBuffer(); + PlainText prediction = new PlainText(); + Range attributesToShow = null; + prediction.setBuffer(sb); + prediction.setHeader(test); + prediction.setOutputDistribution(true); + + classif.buildClassifier(filteredTrain); + + Evaluation evaluateClassifier = new Evaluation(filteredTrain); + evaluateClassifier.evaluateModel(classif, filteredTest, prediction, attributesToShow, true); + //evaluateClassifier.evaluateModel(classif, filteredTest); + + stats(evaluateClassifier, classif); + + ArrayList output = evaluateClassifier.predictions(); + + if(verbose){ + for(int i = 0; i < output.size(); i++){ + double act = output.get(i).actual(); + String actual; + if(act == 1.0) actual = "negative"; else actual = "positive"; + + double pred = output.get(i).predicted(); + String predicted; + if(pred == 1.0) predicted = "negative"; else predicted = "positive"; + + String value = test.instance(i).toString(0); + + System.out.println("PMID: "+ value + "\t" + + "Actual: " + actual + "\t" + + "Predicted: " + predicted + ); + } } + } + /** * Outputs classifier results. @@ -169,7 +315,6 @@ public CostSensitiveClassifier classifySensitive(Classifier classif) throws Exce * @param classif type of classifier applied * @throws Exception */ - public void stats(Evaluation eval, Classifier classif) throws Exception{ System.out.println("Number of attributes: " + eval.getHeader().numAttributes()); System.out.println(eval.toSummaryString("\n======== RESULTS ========\n", false)); @@ -177,138 +322,167 @@ public void stats(Evaluation eval, Classifier classif) throws Exception{ System.out.println(eval.toMatrixString("\n\n======== Confusion Matrix ========\n")); } - /** - * Executes k-fold cross validation - * on a given dataset - * @param data training data provided - * @param classif type of classifier usedsearch - * @throws Exception - */ - - public void crossFold(Instances data, Classifier classif) throws Exception{ - - Random random = new Random(SEED); //creating seed number generator - Evaluation evaluateClassifier = new Evaluation(data); - - System.out.println("Classifier working...\n\n"); - //Classifier should not be trained when cross-validation is executed. - //because subsequent calls to buildClassifier method will return the same results always. - evaluateClassifier.crossValidateModel(classif, data, FOLDS, random); - - stats(evaluateClassifier, classif); - } - - /** - * Implements a Filtered GainRatio classifier, - * using the ranker as a search method. 
- * - * @param classif type of classifier to be used - * @return filtered classif with Correlation analysis - */ + //Training and testing costSensitive classifier + //evaluator.classify(trainData, testData, evaluator.classifySensitive(cls)); - public AttributeSelectedClassifier setGRFilter(Classifier classif){ - AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); - - //Creating evaluator and search method - GainRatioAttributeEval GR = new GainRatioAttributeEval(); - Ranker rank = new Ranker(); - //return the attributes with evaluation greater than 0 - double threshold = 0.0; - rank.setThreshold(threshold); - - //Setting GainRatio filtered classifier - fClassif.setClassifier(classif); - fClassif.setEvaluator(GR); - fClassif.setSearch(rank); - - return fClassif; - - } +// /** +// * Trains and tests a classifier using a +// * provided Cost matrix +// * +// * @param classif type of classifier to be trained +// * @return CostSensitive classifier with costs and classifier +// * @throws Exception +// */ +// public CostSensitiveClassifier classifySensitive(Classifier classif) throws Exception{ +// CostSensitiveClassifier costSensitive = new CostSensitiveClassifier(); +// CostMatrix matrix = new CostMatrix(2); +// matrix.setElement(0, 1, 4); +// matrix.setElement(1, 0, 1); +// costSensitive.setClassifier(classif); +// costSensitive.setCostMatrix(matrix); +// +// return costSensitive; +// } - /** - * Implements a Filtered Correlation classifier, - * using the ranker as a search method. - * - * @param classif type of classifier to be used - * @return filtered classif with Correlation analysis - */ + //Executing k-fold cross validation on filtered classifiers + //evaluator.crossFold(trainData, PCAclassifier); + //evaluator.crossFold(trainData, LSAclassifier); - public AttributeSelectedClassifier setCorrFilter(Classifier classif){ - AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); - - //Creating evaluator and search method - CorrelationAttributeEval Corr = new CorrelationAttributeEval(); - Ranker rank = new Ranker(); - - //return the attributes with evaluation greater than 0 - double threshold = 0.03; - rank.setThreshold(threshold); - - //Setting GainRatio filtered classifier - fClassif.setClassifier(classif); - fClassif.setEvaluator(Corr); - fClassif.setSearch(rank); - - return fClassif; - - } +// /** +// * Executes k-fold cross validation +// * on a given dataset +// * @param data training data provided +// * @param classif type of classifier usedsearch +// * @throws Exception +// */ +// public void crossFold(Instances data, Classifier classif) throws Exception{ +// +// Random random = new Random(SEED); //creating seed number generator +// Evaluation evaluateClassifier = new Evaluation(data); +// +// System.out.println("Classifier working...\n\n"); +// //Classifier should not be trained when cross-validation is executed. +// //because subsequent calls to buildClassifier method will return the same results always. +// evaluateClassifier.crossValidateModel(classif, data, FOLDS, random); +// +// stats(evaluateClassifier, classif); +// } - /** - * Implements a Filtered PCA classifier, - * using the ranker as a search method. 
- * - * @param classif type of classifier to be used - * @return filtered classif with PCA analysis config - */ - public AttributeSelectedClassifier setPCAFilter(Classifier classif){ - AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); - - //Creating evaluator and search method - PrincipalComponents PCA = new PrincipalComponents(); - PCA.setMaximumAttributeNames(-1); - Ranker rank = new Ranker(); - //return the attributes with evaluation greater than 0 - rank.setThreshold(0); - - //Setting the PCA classifier configurations - fClassif.setClassifier(classif); - fClassif.setEvaluator(PCA); - fClassif.setSearch(rank); - - return fClassif; - } - /** - * Implements a Filtered LSA classifier, - * using the ranker as a search method - * @param classif - * @return - */ + //Creating filtered classifiers + //AttributeSelectedClassifier PCAclassifier = evaluator.setPCAFilter(cls); + //AttributeSelectedClassifier LSAclassifier = evaluator.setLSAFilter(cls); + //AttributeSelectedClassifier GRclassifier = evaluator.setGRFilter(cls); + //AttributeSelectedClassifier Corrclassifier = evaluator.setCorrFilter(cls); - private AttributeSelectedClassifier setLSAFilter(Classifier classif) { - AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); - - //Creating evaluator - LatentSemanticAnalysis LSA = new LatentSemanticAnalysis(); - LSA.setMaximumAttributeNames(-1); - //value between 0 and 1 includes proportion of total latent variables - //greater than 1 = exact # of variables to include; - //less than or equal zero = include all; - //default = 0.95 (proportional) - double defaul = 0; - LSA.setRank(defaul); - //Creating search method - Ranker rank = new Ranker(); - rank.setThreshold(0); - - //Setting the LSA classifier configurations - fClassif.setClassifier(classif); - fClassif.setEvaluator(LSA); - fClassif.setSearch(rank); - - return fClassif; - } +// /** +// * Implements a Filtered GainRatio classifier, +// * using the ranker as a search method. +// * +// * @param classif type of classifier to be used +// * @return filtered classif with Correlation analysis +// */ +// public AttributeSelectedClassifier setGRFilter(Classifier classif){ +// AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); +// +// //Creating evaluator and search method +// GainRatioAttributeEval GR = new GainRatioAttributeEval(); +// Ranker rank = new Ranker(); +// //return the attributes with evaluation greater than 0 +// double threshold = 0.0; +// rank.setThreshold(threshold); +// +// //Setting GainRatio filtered classifier +// fClassif.setClassifier(classif); +// fClassif.setEvaluator(GR); +// fClassif.setSearch(rank); +// +// return fClassif; +// +// } +// +// /** +// * Implements a Filtered Correlation classifier, +// * using the ranker as a search method. 
+// * +// * @param classif type of classifier to be used +// * @return filtered classif with Correlation analysis +// */ +// public AttributeSelectedClassifier setCorrFilter(Classifier classif){ +// AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); +// +// //Creating evaluator and search method +// CorrelationAttributeEval Corr = new CorrelationAttributeEval(); +// Ranker rank = new Ranker(); +// +// //return the attributes with evaluation greater than 0 +// double threshold = 0.03; +// rank.setThreshold(threshold); +// +// //Setting GainRatio filtered classifier +// fClassif.setClassifier(classif); +// fClassif.setEvaluator(Corr); +// fClassif.setSearch(rank); +// +// return fClassif; +// +// } +// +// /** +// * Implements a Filtered PCA classifier, +// * using the ranker as a search method. +// * +// * @param classif type of classifier to be used +// * @return filtered classif with PCA analysis config +// */ +// public AttributeSelectedClassifier setPCAFilter(Classifier classif){ +// AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); +// +// //Creating evaluator and search method +// PrincipalComponents PCA = new PrincipalComponents(); +// PCA.setMaximumAttributeNames(-1); +// Ranker rank = new Ranker(); +// //return the attributes with evaluation greater than 0 +// rank.setThreshold(0); +// +// //Setting the PCA classifier configurations +// fClassif.setClassifier(classif); +// fClassif.setEvaluator(PCA); +// fClassif.setSearch(rank); +// +// return fClassif; +// } +// +// /** +// * Implements a Filtered LSA classifier, +// * using the ranker as a search method +// * @param classif +// * @return +// */ +// private AttributeSelectedClassifier setLSAFilter(Classifier classif) { +// AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); +// +// //Creating evaluator +// LatentSemanticAnalysis LSA = new LatentSemanticAnalysis(); +// LSA.setMaximumAttributeNames(-1); +// //value between 0 and 1 includes proportion of total latent variables +// //greater than 1 = exact # of variables to include; +// //less than or equal zero = include all; +// //default = 0.95 (proportional) +// double defaul = 0; +// LSA.setRank(defaul); +// //Creating search method +// Ranker rank = new Ranker(); +// rank.setThreshold(0); +// +// //Setting the LSA classifier configurations +// fClassif.setClassifier(classif); +// fClassif.setEvaluator(LSA); +// fClassif.setSearch(rank); +// +// return fClassif; +// } diff --git a/src/configure/PathConstants.java b/src/configure/PathConstants.java index 2103118..dab7b82 100644 --- a/src/configure/PathConstants.java +++ b/src/configure/PathConstants.java @@ -76,7 +76,11 @@ public PathConstants(String configfile) { //Input files public String HOME_DIR; - public String CORPUS_DIR; + public String CORPUS_DIR; + public String SOURCE_DIR; + public String DUP_DIR; + public String POS_DIR; + public String NEG_DIR; public String TRAIN_DIR; public String TEST_DIR; public String FEATURE_DIR; @@ -94,6 +98,7 @@ public PathConstants(String configfile) { public String TITLE_FEATURES; public String NGRAM_FEATURES; public String TITLE_NGRAMS; + public String DOC_IDS; //Feature setup public String USE_TEXT_SIZE; @@ -106,7 +111,7 @@ public PathConstants(String configfile) { public String USE_ANNOTATION_FEATURE; public String USE_ANNOTATION_TYPE; public String USE_TITLE_FEATURE; - + public String USE_DOC_ID; //Feature setup - Ngrams public String USE_NGRAM_FEATURE; @@ -115,6 +120,12 @@ public PathConstants(String configfile) { 
public String NGRAM_SIZE; public String USE_WEIGHTED_NGRAM; public String WEIGHT; + + //Feature filtering + public String USE_ODDS_RATIO; + public String OR_THRESHOLD; + public String USE_IDF; + public String IDF_THRESHOLD; //Task setup public String EXP_TYPE; @@ -139,6 +150,10 @@ private void initVars() { } HOME_DIR = CONFIG_MAP.get("HOME_DIR"); CORPUS_DIR = CONFIG_MAP.get("CORPUS_DIR"); + SOURCE_DIR = CONFIG_MAP.get("SOURCE_DIR"); + DUP_DIR = CONFIG_MAP.get("DUP_DIR"); + POS_DIR = CONFIG_MAP.get("POS_DIR"); + NEG_DIR = CONFIG_MAP.get("NEG_DIR"); TRAIN_DIR = CONFIG_MAP.get("TRAIN_DIR"); TEST_DIR = CONFIG_MAP.get("TEST_DIR"); FEATURE_DIR = CONFIG_MAP.get("FEATURE_DIR"); @@ -155,6 +170,7 @@ private void initVars() { TITLE_FEATURES = CONFIG_MAP.get("TITLE_FEATURES"); NGRAM_FEATURES = CONFIG_MAP.get("NGRAM_FEATURES"); TITLE_NGRAMS = CONFIG_MAP.get("TITLE_NGRAMS"); + DOC_IDS = CONFIG_MAP.get("DOC_IDS"); USE_TEXT_SIZE = CONFIG_MAP.get("USE_TEXT_SIZE"); USE_JOURNAL_TITLE_FEATURE = CONFIG_MAP.get("USE_JOURNAL_TITLE_FEATURE"); @@ -165,6 +181,7 @@ private void initVars() { USE_ANNOTATION_FEATURE = CONFIG_MAP.get("USE_ANNOTATION_FEATURE"); USE_ANNOTATION_TYPE = CONFIG_MAP.get("USE_ANNOTATION_TYPE"); USE_TITLE_FEATURE = CONFIG_MAP.get("USE_TITLE_FEATURE"); + USE_DOC_ID = CONFIG_MAP.get("USE_DOC_ID"); USE_NGRAM_FEATURE = CONFIG_MAP.get("USE_NGRAM_FEATURE"); USE_TITLE_NGRAMS = CONFIG_MAP.get("USE_TITLE_NGRAMS"); @@ -172,6 +189,11 @@ private void initVars() { NGRAM_SIZE = CONFIG_MAP.get("NGRAM_SIZE"); USE_WEIGHTED_NGRAM = CONFIG_MAP.get("USE_WEIGHTED_NGRAM"); WEIGHT = CONFIG_MAP.get("WEIGHT"); + + USE_ODDS_RATIO = CONFIG_MAP.get("USE_ODDS_RATIO"); + OR_THRESHOLD = CONFIG_MAP.get("OR_THRESHOLD"); + USE_IDF = CONFIG_MAP.get("USE_IDF"); + IDF_THRESHOLD = CONFIG_MAP.get("IDF_THRESHOLD"); EXP_TYPE = CONFIG_MAP.get("EXP_TYPE"); NB_PARAMS = CONFIG_MAP.get("NB_PARAMS"); diff --git a/src/filter/.gitignore b/src/filter/.gitignore new file mode 100644 index 0000000..6b468b6 --- /dev/null +++ b/src/filter/.gitignore @@ -0,0 +1 @@ +*.class diff --git a/src/filter/InformedFilter.java b/src/filter/InformedFilter.java new file mode 100644 index 0000000..4b125db --- /dev/null +++ b/src/filter/InformedFilter.java @@ -0,0 +1,182 @@ +package filter; + +import weka.core.Attribute; +import weka.core.Instances; + +/** + * This class implements informed feature selection + * methods, to be used as filters after vector + * generation and pre-model building + * + * @author Hayda Almeida + * @since 2015 + * + */ +public class InformedFilter { + + private boolean verbose = true; + + /** + * Calculates oddsRatio of each feature + * in a given set of Instances + * + * @param data set of instances, read from ARFF file + * @return oddsRatio for each attribute in the matrix + */ + public double[] oddsRatio(Instances data, int threshold){ + + double[] oddsRatio = new double[data.numAttributes()]; + + + for(int i = 0; i < data.numAttributes()-1; i++ ){ + + double OR = 0; + + Attribute current = data.attribute(i); + double pos_docs = 0, //number of documents in class C + pos_oc = 0, //number of times term t occured in class C + pos_term_docs = 0, //number of docs in class C that have term + pos_not_docs = 0, //number of docs in class C that do not have term + neg_term_docs = 0, //number of docs not in class C with term + neg_not_docs = 0, //number of docs not in class C nor with term + neg_docs = 0; //number of documents not in class C + + for(int j = 0; j < data.size(); j++){ + + double current_value = data.instance(j).value(current); + 
double current_class = data.instance(j).classValue(); + + //class is positive + if(current_class < 1){ + pos_docs = pos_docs + 1; + + //the feature occurred in the document + if(current_value > 0){ + pos_oc = pos_oc + current_value; + pos_term_docs = pos_term_docs +1; + } + //the feature did not occur in positive docs + else pos_not_docs = pos_not_docs + 1; + } + //class is negative + else{ + neg_docs = neg_docs+1; + + //the feature occurred in the document + if(current_value > 0){ + neg_term_docs = neg_term_docs +1; + } + //the feature did not occur in negative docs + else neg_not_docs = neg_not_docs + 1; + } + + } + + OR = ( ( (pos_term_docs / pos_docs) / (pos_not_docs/ pos_docs) ) / + ( (neg_term_docs / neg_docs) / (neg_not_docs / neg_docs) ) ); + + // OR = (pos_term_docs / pos_not_docs) / (neg_term_docs / neg_not_docs); + + + //99% confidence: 2.575 + //95% confidence: 1.96 + double confidenceLow = Math.exp(Math.log(OR) - (1.96 * Math.sqrt((1/pos_term_docs) + (1/pos_not_docs) + (1/neg_term_docs) + (1/neg_not_docs)))); + double confidenceHigh = Math.exp(Math.log(OR) + (1.96 * Math.sqrt((1/pos_term_docs) + (1/pos_not_docs) + (1/neg_term_docs) + (1/neg_not_docs)))); + + //checking if OR value is within the confidence interval + //and if it satisfies the threshold + if( ((OR <= confidenceHigh) && (OR >= confidenceLow) + && !(OR == threshold)) + //checking if the confidence interval holds the null hypothesis (i.e., spans 1.0) + && !(confidenceLow <=1 && confidenceHigh >=1)) + oddsRatio[i] = OR; + else + oddsRatio[i] = 0; + + if(verbose){ + System.out.println("Attribute: "+ data.attribute(i).toString() +"\t\t OddsRatio: " + oddsRatio[i] + + "\tConfidenceLow: " + confidenceLow + "\tConfidenceHigh: "+ confidenceHigh); + } + } + + return oddsRatio; + } + + /** + * Calculates the inverse document frequency + * for each attribute in the dataset. 
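+ * The value is computed over the whole collection (positive and negative
+ * documents together) as idf(t) = ln(total_docs / docs_containing_t);
+ * attributes whose idf does not exceed the given threshold are zeroed out.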
+ * + * @param data instances + * @param threshold + * @return list of idfs for each attribute + */ + public double[] idf(Instances data, int threshold){ + + double[] idf = new double[data.numAttributes()]; + + for(int i = 0; i < data.numAttributes()-1; i++ ){ + + double idf_at = 0; + double idf_at2 = 0; + + Attribute current = data.attribute(i); + double pos_docs = 0, //number of documents in class C + pos_term_docs = 0, //number of docs in class C that have term + neg_term_docs = 0, //number of docs not in class C with term + neg_docs = 0; //number of documents not in class C + + for(int j = 0; j < data.size(); j++){ + + double current_value = data.instance(j).value(current); + double current_class = data.instance(j).classValue(); + + //class is positive + if(current_class < 1){ + pos_docs = pos_docs + 1; + + //the feature occurred in the document + if(current_value > 0){ + pos_term_docs = pos_term_docs +1; + } + } + else{ + //class is negative + neg_docs = neg_docs+1; + + //the feature occurred in the document + if(current_value > 0){ + neg_term_docs = neg_term_docs +1; + } + } + } + +// double idf_pos = Math.log((pos_docs)/(pos_term_docs)); +// double idf_neg = Math.log((neg_docs)/(neg_term_docs)); + + //check if the idf in the "positive" collection + //is greater than the idf in the "negative" collection +// if (idf_pos > idf_neg) +// idf_at = idf_pos; +// +// else idf_at = 0; + + idf_at = Math.log((pos_docs + neg_docs)/(pos_term_docs + neg_term_docs)); + + if(idf_at <= threshold) + idf[i] = 0; + else + idf[i] = idf_at; + } + + if(verbose){ + for(int i = 0; i < idf.length; i++){ + if(idf[i]>0) + System.out.println("Attribute: "+ data.attribute(i).toString()+ "\t\t\t IDF: " + idf[i]); + } + } + + return idf; + } + + +} diff --git a/src/filter/NaiveFilter.java b/src/filter/NaiveFilter.java new file mode 100644 index 0000000..db8a32e --- /dev/null +++ b/src/filter/NaiveFilter.java @@ -0,0 +1,117 @@ +package filter; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import org.apache.commons.lang3.StringUtils; +import configure.PathConstants; + +/** + * + * This class implements naive feature filtering methods + * to be used by the extractor processes pre-vector building + * + * @author Hayda Almeida + * @since 2015 + * + */ +public class NaiveFilter { + + private boolean verbose = true; + + /** + * Removes from feature list all features with + * frequency not statistically relevant (2 or less) + * @param list to be cleaned + */ + public void considerAnnotationOccurence(HashMap,Integer> list, PathConstants vars){ + //going over the list of annotations and removing the + //features with occurance lower than specified. 
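+ //e.g., with FEATURE_MIN_FREQ set to 2, any annotation feature
+ //whose count in the list is below 2 is removed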
+ + Iterator> iterator = list.keySet().iterator(); + + while(iterator.hasNext()){ + Map key = iterator.next(); + int valor = list.get(key).intValue(); + + if(valor < Integer.parseInt(vars.FEATURE_MIN_FREQ)){ + iterator.remove(); + } + } + } + + /** + * Removes from feature list all features with + * frequency not statistically relevant (2 or less) + * @param list to be cleaned + */ + public void considerNgramOccurence(HashMap list, PathConstants vars){ + //going over the list of annotations and removing the + //statistically not significant features - frequency less than 2 + Iterator iterator = list.values().iterator(); + + while(iterator.hasNext()){ + Integer key = iterator.next(); + + if(key < Integer.parseInt(vars.FEATURE_MIN_FREQ)){ + iterator.remove(); + } + } + } + + /** + * Removes stopwords from ngrams list + * + * @param str list of ngrams + * @param constants + * @return cleaned list of ngrams + */ + public String removeStopList(String[] str, PathConstants pathVar){ + + //stop-words file name + String pathStop = "stopList.txt"; + String[] stop = null; + StringBuilder cleaned = new StringBuilder(); + + try{ + + BufferedReader reader = new BufferedReader(new FileReader(pathStop)); + + String line = null; + //loading stop-words list + while((line = reader.readLine()) != null){ + stop = StringUtils.split(line,","); + line = reader.readLine(); + } + + reader.close(); + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + + //iteraing over text to be cleaned + for(int i = 0; i < str.length; i++){ + //iterating over stop-words list + for(int j = 0; j < stop.length; j++){ + + //when stop-word is encountered, replace it + if(str[i].equalsIgnoreCase(stop[j])){ + str[i] = str[i].replace(str[i],"*"); + } + } + //retrieve the text without stop-words replacements + if(!(str[i].contentEquals("*"))){ + cleaned.append(str[i]).append(" "); + } + } + return cleaned.toString().replace(" ", " "); + } + +} diff --git a/src/preprocessing/.gitignore b/src/preprocessing/.gitignore new file mode 100644 index 0000000..6b468b6 --- /dev/null +++ b/src/preprocessing/.gitignore @@ -0,0 +1 @@ +*.class diff --git a/src/preprocessing/ConcatXML.java b/src/preprocessing/ConcatXML.java new file mode 100644 index 0000000..89e255f --- /dev/null +++ b/src/preprocessing/ConcatXML.java @@ -0,0 +1,717 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + + +package preprocessing; + +import java.io.BufferedOutputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.FilenameFilter; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.PrintWriter; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.Date; +import java.util.List; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import configure.PathConstants; + +/** + * Generates a corpus from raw XML doc instances, + * so that features can be extracted from it + * + * @author Hayda Almeida + * @since 2014 + * + */ +public class ConcatXML{ + + private String tag1; + private String tag2; + private String tag3; + private String tag4; + private String id; + private String corpusTag; + private String corpusTagC; + + + public ConcatXML(){ + + this.setId("PMID"); + this.setTag1("(?s)<.*?xml.*?>"); + this.setTag2("(?s)<.*?!DOCTYPE.*?>"); + this.setTag3("(?s)<.*?corpus.*?>"); + this.seTag4("(?s)<.*?/corpus.*?>"); + this.setCorpusTag(""); + this.setCorpusTag(""); + } + + + + public static void main(String[] args) throws Exception { + + PathConstants pathVars = new PathConstants(); + + String xmlDir = ""; + if(Integer.parseInt(pathVars.EXP_TYPE)== 1) + xmlDir = "test"; + else xmlDir = "train"; + + String sourceDir = "", duplicatesDir = ""; + + Boolean dc = false, df = false, cl = false, cc = false; + + for(int i = 0; i < args.length; i++){ + try{ + if(args[i].matches("-dc")) dc = true; + if(args[i].matches("-df")) df = true; + if(args[i].matches("-cl")) cl = true; + if(args[i].matches("-cc")) cc = true; + } + catch(Exception e){ + System.out.println("Use: \n" + + "-tr -> train, -ts -> test; \n " + + "-dc -> check duplicates in corpus vs. 
folder; \n " + + "-df -> check duplicates in two folders; \n" + + "-cl -> clean a source folder; \n" + + "-cc -> concatenate files in a folder "); + System.exit(0); + }; + } + + String timeStamp = new SimpleDateFormat("yyyyMMdd_hh:mm").format(new Date()); + String trainCorpusPath = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAINING_FILE; + + sourceDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + xmlDir; + duplicatesDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.DUP_DIR; + + String concatCorpus = pathVars.HOME_DIR + pathVars.CORPUS_DIR +"triagecorpus_"+ xmlDir +"_"+timeStamp+".xml"; + String tagCorpus = concatCorpus; + + ConcatXML concat = new ConcatXML(); + + //================= Checking for duplicates =====================// + if(dc) concat.checkDupCorpus(trainCorpusPath, sourceDir); + if(df) concat.checkDupFolder(sourceDir, duplicatesDir); + + //================== Creating corpus ==========================// + if(cl){ + concat.cleanXML(sourceDir); + if(duplicatesDir.length()>1) + concat.cleanXML(duplicatesDir); + } + if(cc){ + concat.concatenateXML(sourceDir, "", concatCorpus); + concat.tagCorpus(tagCorpus); + } + } + + /** + * Returns the ID of a XML jsoup document + * @param doc a XML doc parsed by jsoup + * @return ID string + * @throws IOException + */ + public String returnID(Document doc) throws IOException{ + + String id = ""; + + Elements paper = doc.body().getElementsByTag("pubmedarticleset"); + + //fetching the paper ID - + //for all items in a paper, retrieve only PMIDs + for(Element e : paper.select(getId())){ + //only consider the ID if the parent is medline citation + if(e.parentNode().nodeName().contains("medline")){ + id = e.text(); + } + } + return id; + } + + /** + * Reads the file IDs in a folder and + * checks a second folder for duplicates. 
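+ * Files in the second folder whose ID already exists in the source
+ * folder are renamed with a ".duplicated" suffix and their IDs are
+ * listed in the console output.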
+ * + * @param dirSrc source folder + * @param dirDup folder to check for duplicates + */ + + public void checkDupFolder(String dirSrc, String dirDup){ + ArrayList sourceIDs = new ArrayList(); + ArrayList duplicated = new ArrayList(); + ArrayList dupIDs = new ArrayList(); + int ids = 0; + + if(dirSrc.contentEquals(dirDup)){ + System.out.println("Source and duplicates directories are the same.\n\n========================\n"); + } + else { + + File sourceDir = new File(dirSrc); + File[] srcXMLs = sourceDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + try{ + //for each file on the source dir + for (File xml : srcXMLs){ + + try{ + + String id = ""; + //Loading file + File input = new File(xml.getPath()); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); + + //fetching the document ID + id = returnID(doc); + + if(!id.isEmpty()){ + sourceIDs.add(id); + ids++; + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(Exception e){ + throw new RuntimeException(e); + } + + System.out.println(ids + " source file IDs encountered."); + ids = 0; + + File dupDir = new File(dirDup); + + File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + try{ + //for each file on the possibly duplicated dir + for (File xml : dupXMLs){ + + try{ + String id = ""; + //Loading file + File input = new File(xml.getPath()); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); + + //fetching the document ID + id = returnID(doc); + + if(!id.isEmpty()){ + dupIDs.add(id); + String dupFileID = id; + ids++; + + for(int j = 0; j < sourceIDs.size(); j++){ + if(sourceIDs.get(j).equalsIgnoreCase(dupFileID)){ + + //add ID to duplicated list + duplicated.add(dupFileID); + + //rename the original file + Path from = xml.toPath(); //convert from File to Path + Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path + Files.move(from, to, StandardCopyOption.REPLACE_EXISTING); + } + } + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(Exception e){ + throw new RuntimeException(e); + } + + //count number of existing papers on possibly duplicated folder + //just to make sure we are gathering all IDs + System.out.println(ids + " new file IDs encountered."); + ids = 0; + //sorting the list of duplicated IDs + Collections.sort(duplicated, new Comparator(){ + @Override + public int compare(String one, String two){ + return one.compareTo(two); + } + }); + + System.out.println("\nReaded source files: " + sourceIDs.size()); + System.out.println("Readed new files: " + dupIDs.size()); + + System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n"); + + System.out.println("\nDuplicated files IDs: "); + for(int i = 0; i < duplicated.size(); i++){ + System.out.println(duplicated.get(i)); + } + + System.out.println("\n========================\n"); + } + + + } + + /** + * Reads the corpus and checks the papers IDs + * to identify duplicates in case new papers + * are being concatenated to corpus. 
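+ * IDs (PMIDs) are read from each pubmedarticleset entry; new files whose
+ * ID is already present in the corpus are renamed with a ".duplicated"
+ * suffix, so they are skipped by the later concatenation step, which
+ * only picks up .xml files.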
+ * + * @param corpus path to current corpora to check + * @param dir path to folder with new files to be concatenated + */ + + public void checkDupCorpus(String corpus, String dir){ + ArrayList trainingIDs = new ArrayList(); + ArrayList duplicated = new ArrayList(); + ArrayList newFiles = new ArrayList(); + + int ids = 0; + + try + { + File input = new File(corpus); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); + Elements corp = doc.body().getElementsByTag("pubmedarticleset"); + + String id = ""; + + for(Element paper : corp){ + Document thisDoc = Jsoup.parseBodyFragment(paper.toString()); + + //fetching the document ID + id = returnID(thisDoc); + + if(!id.isEmpty()){ + trainingIDs.add(id); + ids++; + } + } + }catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + + System.out.println(ids + " training file IDs encountered."); + ids = 0; + + File corpusDir = new File(dir); + File[] newXMLs = corpusDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + try{ + //for each file on the corpus dir + for (File xml : newXMLs){ + + try{ + String id = ""; + //Loading file + File input = new File(xml.getPath()); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); + + //fetching the document ID + id = returnID(doc); + + if(!id.isEmpty()){ + + newFiles.add(id); + String newFileID = id; + ids++; + + + for(int j = 0; j < trainingIDs.size(); j++){ + if(trainingIDs.get(j).equalsIgnoreCase(newFileID)){ + + //add ID to duplicated list + duplicated.add(newFileID); + + //moving the original file + Path from = xml.toPath(); //convert from File to Path + Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path + Files.move(from, to, StandardCopyOption.REPLACE_EXISTING); + } + } + } + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(Exception e){ + throw new RuntimeException(e); + } + + //count number of existing papers on the training file + //just to make sure we are gathering all IDs + System.out.println(ids + " new file IDs encountered."); + ids = 0; + + + //sorting the list of duplicated IDs + Collections.sort(duplicated, new Comparator(){ + @Override + public int compare(String one, String two){ + return one.compareTo(two); + } + }); + + System.out.println("\nReaded training files: " + trainingIDs.size()); + System.out.println("Readed new files: " + newFiles.size()); + + System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n"); + + System.out.println("\nDuplicated files IDs: "); + for(int i = 0; i < duplicated.size(); i++){ + System.out.println(duplicated.get(i)); + } + + System.out.println("\n========================\n"); + + } + + + /** + * Reads and edits a list of XMLs files in a folder + * to remove XML and previous corpus tags, + * preparing the files to be concatenated. + * + * @param dir string with folder path + */ + + public void cleanXML(String dir){ + + //listing files on corpus dir + File sourceDir = new File(dir); + + File[] newXMLs = sourceDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + System.out.println("... 
Files list loaded."); + + try{ + //for each file on the corpus dir + for (File xml : newXMLs){ + + try{ + BufferedReader reader = new BufferedReader(new FileReader(xml.getPath())); + + String line = null; + ArrayList allLines = new ArrayList(); + String content = null; + + while((line = reader.readLine()) != null){ + content = line; + + //cleaning XML markups + content = content.replaceFirst(getTag1(), ""); + content = content.replaceFirst(getTag2(), ""); + //cleaning previous corpus tags + content = content.replaceFirst(getTag3(), ""); + content = content.replaceFirst(getTag4(), ""); + allLines.add(content); + } + + PrintWriter writer = new PrintWriter(xml.getPath()); + + for (String l : allLines){ + writer.println(l); + } + reader.close(); + writer.close(); + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(Exception e){ + throw new RuntimeException(e); + } + + System.out.println("... Files cleaned and saved."); + System.out.println("Ready for concatenation."); + System.out.println("\n========================\n"); + } + + + + /** + * Concatenates all XMLs in one folder or between two folders. + * @param sourceDir main directory with XML files. + * @param duplicDir second directory with duplicated XML files + * @param concatFile path name to saved concatenated corpus + */ + + public void concatenateXML(String sourceDir, String duplicDir, String concatFile){ + + final int BUFFER = 1024 << 8; + byte[] buffer = new byte[BUFFER]; + + //listing files on corpus dir + File srcDir = new File(sourceDir); + File[] srcXMLs = srcDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + File dupDir = new File(duplicDir); + File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name) { + return name.endsWith(".xml"); + } + }); + + System.out.println("... Files list loaded."); + + //defining the output file (concatenated) + File newCorpus = new File(concatFile); + + try{ + OutputStream output = new BufferedOutputStream(new FileOutputStream(newCorpus)); + + + //for each file on the corpus dir + for (File xmls : srcXMLs){ + InputStream input = new FileInputStream(xmls); + int count; + + //if the file is not empty/finished + try{ + while((count = input.read(buffer)) >= 0){ + + //write it on the concatenated final file + output.write(buffer, 0, count); + } + }finally{ + input.close(); + } + } + + if(dupXMLs != null){ + for(File xmld : dupXMLs){ + InputStream input = new FileInputStream(xmld); + int count; + + //if the file is not empty/finished + try{ + while((count = input.read(buffer)) >= 0){ + + //write it on the concatenated final file + output.write(buffer, 0, count); + } + }finally{ + input.close(); + } + } + } + output.flush(); + output.close(); + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(Exception e){ + throw new RuntimeException(e); + } + + System.out.println("... 
File concatenated and saved."); + System.out.println("Ready for corpus tagging."); + System.out.println("\n========================\n"); + } + + /** + * Inserts corpus tag on XML file + * + * @param pathToCorpus path to + * concatenated corpus + */ + + public void tagCorpus(String pathToCorpus){ + + //tagging as corpus + try{ + BufferedReader reader = new BufferedReader(new FileReader(pathToCorpus)); + + String line = null; + String edit = null; + List allLines = new ArrayList(); + + //adds tag at beggining of corpus + allLines.add(getCorpusTag()); + + while((line = reader.readLine()) != null){ + + allLines.add(line); + } + //adds tag at the end of corpus + allLines.add(getCorpusTagC()); + + System.out.println("... Corpus loaded and tagged."); + //re-writting the file + PrintWriter writer = new PrintWriter(pathToCorpus); + + for (String l : allLines){ + writer.println(l); + } + reader.close(); + writer.close(); + + System.out.println("... File saved as tagged corpus."); + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(IOException e){ + e.printStackTrace(); + } + } + + private String getCorpusTagC() { + return corpusTagC; + } + + private String getCorpusTag() { + // TODO Auto-generated method stub + return corpusTag; + } + + public String getTag1() { + return tag1; + } + + public void setTag1(String tag1) { + this.tag1 = tag1; + } + + public String getTag2() { + return tag2; + } + + public void setTag2(String tag2) { + this.tag2 = tag2; + } + + private String getTag4() { + // TODO Auto-generated method stub + return tag4; + } + + private String getTag3() { + // TODO Auto-generated method stub + return tag3; + } + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + private void setCorpusTag(String string) { + this.corpusTag = string; + + } + + private void seTag4(String string) { + this.tag4 = string; + + } + + private void setTag3(String string) { + this.tag3 = string; + + } + +} + + diff --git a/src/preprocessing/SampleCorpus.java b/src/preprocessing/SampleCorpus.java new file mode 100644 index 0000000..63613a8 --- /dev/null +++ b/src/preprocessing/SampleCorpus.java @@ -0,0 +1,237 @@ +package preprocessing; + +import java.io.File; +import java.io.FilenameFilter; +import java.nio.file.Files; +import java.nio.file.StandardCopyOption; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; + +import configure.PathConstants; + +/** + * Performs document instances sampling + * generating training and test files + * with specific balance input by user. 
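+ * As an illustration (collection size is hypothetical): for 1,000 documents,
+ * the arguments "-ts 20 10" select a test set of 200 documents of which 20
+ * are positive, and "-tr 50" requests a training set with a 50/50
+ * positive/negative balance.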
+ * + * @author Hayda Almeida + * @since 2015 + * + */ +public class SampleCorpus { + + public static void main(String[] args) throws Exception { + + PathConstants pathVars = new PathConstants(); + SampleCorpus sampling = new SampleCorpus(); + + String positiveDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.POS_DIR; + List positives = new LinkedList(); + + String negativeDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.NEG_DIR; + List negatives = new LinkedList(); + + //train or test sampling + Boolean tr = true, ts = true; + //% of test corpus WRT the collection, % positive on test set, % positive on training set + int percTs = 20, posTr = 50, posTs = 10; + + for(int i = 0; i < args.length; i++){ + try{ + if(args[i].matches("-tr")){ + tr = true; + posTr = Integer.parseInt(args[i+1]); + } + if(args[i].matches("-ts")){ + ts = true; + percTs = Integer.parseInt(args[i+1]); + posTs = Integer.parseInt(args[i+2]); + } + } + catch(Exception e){ + System.out.println(" Use: \n " + + "-tr -> (% of positives) to sample trainig set \n" + + "-ts -> (% of collection) (% of positives) to sample test set"); + System.exit(0); + }; + } + + positives = sampling.loadFiles(positiveDir); + negatives = sampling.loadFiles(negativeDir); + + if(tr) sampling.sampleTest(pathVars, positives, negatives, percTs, posTs); + + if(ts) sampling.sampleTrain(pathVars, positives, negatives, posTr); + + } + + /** + * Lists XML files within a folder + * @param dirSrc folder path + * @return returns list of file IDs + */ + public List loadFiles(String dirSrc){ + + List fileIDs = new LinkedList(); + + File sourceDir = new File(dirSrc); + File[] srcXMLs = sourceDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + fileIDs = new LinkedList(Arrays.asList(srcXMLs)); + + return fileIDs; + } + + /** + * Moves a specific number of files + * in a list from origin folder to a test folder + * @param pathVars + * @param files List of file IDs + * @param numFiles number of files to be moved + */ + public void moveFile(PathConstants pathVars, List files, int numFiles){ + + Iterator filesList = files.iterator(); + File testDir = new File(pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TEST_DIR); + + if(!testDir.exists()){ + try{ + testDir.mkdir(); + }catch(Exception e){ + System.out.println("Error creating Test folder."); + System.exit(0); + } + } + + while(filesList.hasNext() && numFiles > 0){ + try{ + File file = (File) filesList.next(); + File newFile = new File(testDir + "/" + file.getName()); + + Files.move(file.toPath(), newFile.toPath(), StandardCopyOption.REPLACE_EXISTING); + + filesList.remove(); + numFiles--; + } + catch(Exception e){ + System.out.println("Error moving files."); + System.exit(0); + } + } + + } + + /** + * Copies a specific number of files + * in a list from origin folder to a train folder + * @param pathVars + * @param files List of file IDs + * @param numFiles number of files to be moved + */ + public void copyFile(PathConstants pathVars, List files, int numFiles){ + + Iterator filesList = files.iterator(); + File trainDir = new File(pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAIN_DIR); + + if(!trainDir.exists()) + try{ + trainDir.mkdir(); + }catch(Exception e){ + System.out.println("Error creating Training folder."); + System.exit(0); + } + + while(filesList.hasNext() && numFiles > 0){ + try{ + File file = (File) filesList.next(); + File newFile = new File(trainDir + "/"+ file.getName()); + + 
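+ //decrement the counter so that, as in moveFile, at most numFiles instances are copied
+ numFiles--;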
Files.copy(file.toPath(), newFile.toPath(), StandardCopyOption.REPLACE_EXISTING); + } + catch(Exception e){ + System.out.println("Error copying files."); + System.exit(0); + } + } + + } + + /** + * Samples document instances from the collection + * to generate a test set. + * + * @param pathVars + * @param positives list of positive documents IDs + * @param negatives list of negative documents IDs + * @param total percentage of the document collection for test + * @param pos percentage of positive documents in the test set + */ + public void sampleTest(PathConstants pathVars, List positives, List negatives, int total, int pos){ + + int instances = positives.size() + negatives.size(); + int testSize = (instances * total) / 100; + int posSize = (testSize * pos) / 100; + int negSize = testSize - posSize; + + Collections.shuffle(negatives); + System.out.println("===== Test > Negative instances shuffled for test set."); + moveFile(pathVars, negatives, negSize); + System.out.println("===== Test > Negative instances moved to test folder. \n"); + + Collections.shuffle(positives); + System.out.println("===== Test > Positive instances shuffled for test set."); + moveFile(pathVars, positives, posSize); + System.out.println("===== Test > Positive instances moved to test folder. \n"); + + } + + /** + * Samples document instances from the collection + * to generate a training set. + * + * @param pathVars + * @param positives list of positive documents IDs + * @param negatives list of negative documents IDs + * @param pos percentage of positive documents in the training set + */ + public void sampleTrain(PathConstants pathVars, List positives, List negatives, int pos){ + + int trainSize = positives.size() + negatives.size(); + int posSize = (trainSize * pos) / 100; + int negSize = trainSize - posSize; + + if(positives.size() < posSize){ + System.out.println("Not enough positive instances for training set."); + System.exit(0); + } + else if(negatives.size() < negSize){ + System.out.println("Not enough negative instances for training set."); + System.exit(0); + } + else{ + Collections.shuffle(negatives); + System.out.println("===== Training > Negative instances shuffled for training set."); + copyFile(pathVars, negatives, negSize); + System.out.println("===== Training > Negative instances copied to training folder. \n"); + + Collections.shuffle(positives); + System.out.println("===== Training > Positive instances shuffled for training set."); + copyFile(pathVars, positives, posSize); + System.out.println("===== Training > Positive instances copied to training folder. \n"); + } + + } + + + + +}