diff --git a/config-sample.cfg b/config-sample.cfg index a9b3483..3c8295f 100644 --- a/config-sample.cfg +++ b/config-sample.cfg @@ -12,6 +12,18 @@ HOME_DIR=/. # corpus directory CORPUS_DIR=corpus/ # +# source documents directory +SOURCE_DIR=src/ +# +# duplicate documents directory +DUP_DIR=test/ +# +# positive instances directory +POS_DIR=positives/ +# +# negative instances directory +NEG_DIR=negatives/ +# # train directory TRAIN_DIR=train/ # @@ -61,6 +73,9 @@ NGRAM_FEATURES=ngrams_features.txt # Paper title n-grams feature list TITLE_NGRAMS=titleGrams.txt # +# Paper ID and class +DOC_IDS=docIDs.txt +# ################################################### ########################## FEATURE SETUP ########## # Extract size of abstract and title @@ -78,6 +93,9 @@ FEATURE_MIN_FREQ=2 # minimum length (in chars) to consider a feature FEATURE_MIN_LENGTH=3 # +# extract document IDs +USE_DOC_ID=true +# ############################# ######### ANNOTATIONS ####### # Extract annotation content @@ -109,6 +127,20 @@ NGRAM_SIZE=1 # Define weight of features #WEIGHT=3 # +################################################### +########################## FEATURE SELECTION SETUP ########## +# Enable Odds Ratio (OR) filtering +USE_ODDS_RATIO=false +# +# Define minimum OR threshold to keep attribute +OR_THRESHOLD=1 +# +# Enable inverse document frequency (idf) filtering +USE_IDF=false +# +# Define minimum idf threshold to keep attribute +IDF_THRESHOLD=1 +# ################################################# ########################### TASK SETUP ########## # experiment type : train = 0 / test = 1 diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000..1924ede --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,3 @@ +*.classpath +*.project +*.*~ diff --git a/src/analyse/Extractor.java b/src/analyse/Extractor.java index c97cfa7..8e91951 100644 --- a/src/analyse/Extractor.java +++ b/src/analyse/Extractor.java @@ -51,7 +51,7 @@ public class Extractor { //String pathFile; String id; - String endId; + protected String endId; String openFile; String endFile; String openAbst; @@ -95,7 +95,8 @@ public String removeSpecialChar(String str){ str = str.replace(")", ""); str = str.replace("(", ""); str = str.replace("\t\t", "\t"); - str = str.replace("-", ""); + //avoid losing ngrams because of hyphens between names + str = str.replace("-", " "); str = str.replace(" ", ""); return str; @@ -154,15 +155,15 @@ public String removeAbstractTags(String str){ //this order of removing tags matters to //exclude the first tag from the abstracts.
- str = str.replace("", ""); - str = str.replace("", ""); - str = str.replace("", ""); - str = str.replace("Copyright", ""); - str = str.replace("", ""); - str = str.replace("", ""); - str = str.replace("", ""); - str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("copyright", ""); + str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("", ""); return str; } diff --git a/src/analyse/FeatureExtractor.java b/src/analyse/FeatureExtractor.java index 4ca93aa..4d66d4f 100644 --- a/src/analyse/FeatureExtractor.java +++ b/src/analyse/FeatureExtractor.java @@ -31,6 +31,7 @@ this software and associated documentation files (the "Software"), to deal in package analyse; import java.io.BufferedReader; +import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; @@ -38,47 +39,48 @@ this software and associated documentation files (the "Software"), to deal in import java.util.HashMap; import java.util.Iterator; import java.util.Map; + import org.apache.commons.lang3.StringUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; + import configure.PathConstants; +import filter.NaiveFilter; /** * This class extracts and parses domain * annotation features from doc instances * - * @author halmeida + * @author Hayda Almeida + * @since 2014 + * */ public class FeatureExtractor extends Extractor{ public FeatureExtractor(){ - this.id = ""; - this.endId = ""; - this.endFile = ""; - this.openAbst = ""; - this.closeAbst = ""; - this.abstractLabel = ",Integer> abstract_count = new HashMap,Integer>(); @@ -90,189 +92,142 @@ public static void main(String[] args) { //store title features, type and count HashMap, Integer> title_count = new HashMap, Integer>(); //store title features, whole journal title content and classification - HashMap,String> title_content = new HashMap,String>(); - + HashMap,String> title_content = new HashMap,String>(); //store title content and EC numbers ArrayList ec_numbers = new ArrayList(); + + //store ID, class and features + HashMap PMIDs = new HashMap(); fextrac.initialize(); int jTitle = 0; try { - BufferedReader reader = new BufferedReader(new FileReader(AnCorpus)); - - //--------------------------- - // repeat until all lines of the file are read - //--------------------------- String line = null; String features = null; - // String id = null; - - - while((line = reader.readLine()) != null){ + //Loading file + File input = new File(AnCorpus); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); - line = line.replaceAll("\t",""); - line = line.replace("\"", ""); + Elements corpus = doc.body().getElementsByTag("pubmedarticleset"); - //find paper ID and store it - if (line.contains(fextrac.getid())){ - line = line.replace(fextrac.getid(), ""); - // id = line.replace(fextrac.getendId(), ""); + //Fetching elements - //continue reading - features = reader.readLine(); - features = features.replaceAll("\t",""); - - String journal = ""; + for(Element paper : corpus ){ - //continue reading until the end of file - while(!(features.contentEquals(fextrac.getendFile()))){ - - //find relevant doc section - Journal title - if(features.contains(fextrac.getOpenJournal())){ - - features = features.replace(fextrac.getOpenJournal(),""); - features = features.replace(fextrac.getCloseJournal(), ""); - features = 
fextrac.removeSpecialChar(features); - - //separating only the journal title content - journal = fextrac.removeTags(features); - //counting # of journal titles - jTitle++; - - features = reader.readLine(); - features = features.replaceAll("\t",""); - } - - //find relevant doc section - Article title - if(features.contains(fextrac.getOpenTitle())){ - - features = features.replace(fextrac.getOpenTitle(),""); - features = features.replace(fextrac.getCloseTitle(), ""); - features = fextrac.removeSpecialChar(features); - - //separating the title by annotations - String title_annotation = features; - - //extracting annotations and inserting them on lists - fextrac.annotations(title_annotation, title_count, title_type, pathVars); - fextrac.addContent(title_annotation, journal, title_content); - - features = reader.readLine(); - features = features.replaceAll("\t",""); - } - - if(features.contains(fextrac.getAbstractLabel())){ - - String temp = ""; - String newAbs = fextrac.getopenAbst(); - - //handling cases when the tag is already within abstract content - if(features.contains("")){ - temp = temp + fextrac.processAbstract(features); - } - else{ - do{ - temp = temp + fextrac.processAbstract(features); - features = reader.readLine(); - }while(!(features.contains(""))); - } - newAbs = newAbs + temp; - features = newAbs + fextrac.getcloseAbst(); - } - - //find relevant doc section - Abstract - if(features.contains(fextrac.getopenAbst())){ - - features = features.replace(fextrac.getopenAbst(),""); - features = features.replace(fextrac.getcloseAbst(), ""); - features = fextrac.removeSpecialChar(features); - - //handle lines in which abstract text tag - //is separated from the actual text - if(features.isEmpty()){ - features = reader.readLine(); - features = features.replaceAll("\t",""); - features = features.replace(fextrac.getopenAbst(),""); - features = features.replace(fextrac.getcloseAbst(), ""); - features = fextrac.removeSpecialChar(features); - } - - features = fextrac.removeAbstractTags(features); - - //gathering abstract annotations - String abstrac = features; - - //extract annotations and insert them on lists - fextrac.annotations(abstrac, abstract_count, abstract_type, pathVars); - - features = reader.readLine(); - features = features.replaceAll("\t",""); - //features = features.replaceAll("\\s+", ""); - } - - //identifying EC number - if(features.contains(fextrac.getOpenEC())){ - features = features.replace(fextrac.getOpenEC(), ""); - features = features.replace(fextrac.getCloseEC(), ""); - features = fextrac.removeSpecialChar(features); - - ec_numbers.add(features); - - features = reader.readLine(); - features = features.replaceAll("\t",""); - } - - //find classification of the document - if(features.contains(fextrac.getClassTag())){ - - //adding classification to the list of annotations - String classif = fextrac.getClassif(features); - fextrac.addClass(classif, abstract_type); - fextrac.addClass(classif, title_type); - fextrac.addClass(classif, title_content); - - features = reader.readLine(); - features = features.replaceAll("\t",""); - } + //Fetching elements + Elements journalTitle = paper.getElementsByTag(fextrac.getOpenJournal()); + Elements title = paper.getElementsByTag(fextrac.getOpenTitle()); + Elements abstractC = paper.getElementsByTag(fextrac.getopenAbst()); + Elements ECnumber = paper.getElementsByTag(fextrac.getOpenEC()); + Elements classDoc = paper.getElementsByTag(fextrac.getClassTag()); - features = reader.readLine(); - features = features.replaceAll("\t",""); + String 
journal = ""; + String docID = ""; + String label = ""; + ArrayList tempList = new ArrayList(); + StringBuffer sb = new StringBuffer(); + + //fetching the paper ID - + //for all items in a paper, retrieve only PMIDs + for(Element e : paper.select(fextrac.getid())){ + //only consider the ID if the parent is medline citation + if(e.parentNode().nodeName().contains("medline")){ + docID = e.text(); + } + } + //fetch the doc label as well + if(classDoc.hasText()){ + label = classDoc.text(); + } + + PMIDs.put(docID, label); - } + if(journalTitle.hasText()){ + jTitle++; + journal = journalTitle.toString(); + journal = fextrac.removeSpecialChar(journal); + journal = fextrac.removeTags(journal); } - } + String title_annotation = ""; + if(title.hasText()){ + title_annotation = title.toString(); + title_annotation = fextrac.removeSpecialChar(title_annotation); - reader.close(); + tempList.addAll(fextrac.annotations(title_annotation, title_count, title_type, featFilter, pathVars)); + fextrac.addContent(title_annotation, journal, title_content, featFilter); + } + String abstrac = ""; + if(abstractC.hasText()){ + abstrac = abstractC.toString(); + abstrac = fextrac.removeSpecialChar(abstrac); + abstrac = fextrac.removeAbstractTags(abstrac); + + tempList.addAll(fextrac.annotations(abstrac, abstract_count, abstract_type, featFilter, pathVars)); + } + + String ecnum = ""; + if(ECnumber.hasText()){ + for(Element number : ECnumber){ + ecnum = number.toString(); + if(ecnum.contains("EC")){ + ecnum = fextrac.removeSpecialChar(ecnum); + ecnum = fextrac.removeTags(ecnum); + ec_numbers.add(features); + } + } + } + + String triage = ""; + if(classDoc.hasText()){ + triage = classDoc.toString(); + triage = fextrac.removeSpecialChar(triage); + triage = fextrac.removeTags(triage); + + fextrac.addClass(triage, abstract_type); + fextrac.addClass(triage, title_type); + fextrac.addClass(triage, title_content); + } + +// for(int i = 0; i < tempList.size(); i++){ +// sb.append(tempList.get(i) + "-"); +// } +// +// PMIDs.put(docIDLabel, sb.toString()); + } + } + catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); - } - - - //Use for sample output - //System.out.println("\n===========TITLE==ANNOTATIONS============="); - //fextrac.displayList(title_count); - //fextrac.displayList(title_type); - //fextrac.displayList(title_content); - //System.out.println("\n========ABSTRACT==ANNOTATIONS============="); - //fextrac.displayList(abstract_count); - //fextrac.displayList(abstract_type); - - //Before exporting, take into account the - //occurence of all extracted features - fextrac.considerOccurence(abstract_count, pathVars); - fextrac.considerOccurence(title_count, pathVars); + } + if(verbose){ + //print list of extracted features + System.out.println("\n===========TITLE==ANNOTATIONS============="); + fextrac.displayList(title_count); + fextrac.displayList(title_type); + fextrac.displayList(title_content); + System.out.println("\n========ABSTRACT==ANNOTATIONS============="); + fextrac.displayList(abstract_count); + fextrac.displayList(abstract_type); + } - System.out.println("\n===========FEATURE==EXPORT==============="); + //filter features by occurence + featFilter.considerAnnotationOccurence(abstract_count, pathVars); + featFilter.considerAnnotationOccurence(title_count, pathVars); + + System.out.println("\n===========FEATURE==EXPORT==============="); + fextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.DOC_IDS, PMIDs); + System.out.println("..."+ 
PMIDs.size()+" document IDs listed."); fextrac.exportList(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.ECNUM_FEATURES, ec_numbers); System.out.println("..."+ ec_numbers.size()+" EC numbers saved."); fextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.ANNOTATION_FEATURES, abstract_count); @@ -319,35 +274,14 @@ private void addClass(String element, HashMap, String> list){ while(it.hasNext()){ Map str = it.next(); - if(list.get(str).contains("positive") || list.get(str).contains("negative")){ + if(list.get(str).contains(element)){ + //if(list.get(str).contains("positive") || list.get(str).contains("negative")){ } else list.put(str, element); } - } - - - /** - * Removes from feature list all features with - * frequency not statistically relevant (2 or less) - * @param list to be cleaned - */ - private void considerOccurence(HashMap,Integer> list, PathConstants vars){ - //going over the list of annotations and removing the - //features with occurance lower than specified. - - Iterator> iterator = list.keySet().iterator(); - - while(iterator.hasNext()){ - Map key = iterator.next(); - int valor = list.get(key).intValue(); - - if(valor < Integer.parseInt(vars.FEATURE_MIN_FREQ)){ - iterator.remove(); - } - } } - + /** * Extract the annotations from a determined section @@ -357,11 +291,11 @@ private void considerOccurence(HashMap,Integer> list, PathCon * @param count list that holds annotation, its type and its count * @param type list that holds annotation, its type and its classification */ - private void annotations(String annot, HashMap, Integer> count, HashMap,String> type, PathConstants pathVars) { + private ArrayList annotations(String annot, HashMap, Integer> count, HashMap,String> type, NaiveFilter filter, PathConstants pathVars) { HashMap features = loadAnnotationEntities(); PathConstants pathVar = new PathConstants(); NgramExtractor nextrac = new NgramExtractor(); - ArrayList content = new ArrayList(); + ArrayList content = new ArrayList(); //parsing the not edited text into HTML using Jsoup Document doc = Jsoup.parseBodyFragment(annot); @@ -394,7 +328,7 @@ private void annotations(String annot, HashMap, Integer> cou //if child is sentence (sentence inside of sentence), //then add annotations as ngrams on this if(features.get(child.nodeName()).contains("sentence")) { - content.addAll(nextrac.nGrams(child.text(), pathVar)); + content.addAll(nextrac.nGrams(child.text(), filter, pathVar)); insertAnnotation(content, an.nodeName(), count, type, pathVars); } //adding annotations on sentence as they are - no ngrams on this @@ -409,7 +343,7 @@ private void annotations(String annot, HashMap, Integer> cou tempAnnot.children().remove(); //splitting content in ngrams to whats left on the sentence - content.addAll(nextrac.nGrams(tempAnnot.text(), pathVar)); + content.addAll(nextrac.nGrams(tempAnnot.text(), filter, pathVar)); insertAnnotation(content, an.nodeName(), count, type, pathVars); } @@ -422,6 +356,7 @@ private void annotations(String annot, HashMap, Integer> cou } } + return content; } @@ -479,7 +414,7 @@ private void insertAnnotation(ArrayList content, String an_type, HashMap * @param list features used * */ - private void addContent(String annot, String wContent, HashMap,String> list) { + private void addContent(String annot, String wContent, HashMap,String> list, NaiveFilter filter) { HashMap features = loadAnnotationEntities(); ArrayList content = new ArrayList(); @@ -507,7 +442,7 @@ private void addContent(String annot, String wContent, HashMap ngram_count = new 
HashMap(); - //store abstract ngrams, count and "relevance(TBD)" - HashMap,Integer> ngrams = new HashMap,Integer>(); + //store abstract ngrams and doc ID + HashMap ngram_ID = new HashMap(); //store title ngrams and its count HashMap ngram_title_count = new HashMap(); //store title ngrams, count and "relevance(TBD)" HashMap,Integer> ngram_title = new HashMap,Integer>(); - + //store ID and label of documents + HashMap PMIDs = new HashMap(); + nextrac.initialize(); try - { - BufferedReader reader = new BufferedReader(new FileReader(AnCorpus)); - - //--------------------------- - // repeat until all lines - // of the file are read - //--------------------------- - String line = null; - String features = null; - String id = null; - - - while((line = reader.readLine()) != null){ - - line = line.replaceAll("\t",""); - line = line.replace("\"", ""); - - //find paper ID and store it - if (line.contains(nextrac.getid())){ - line = line.replace(nextrac.getid(), ""); - id = line.replace(nextrac.getendId(), ""); - - //keep reading the file - features = reader.readLine(); - features = features.replaceAll("\t",""); - - String tit_content = ""; + { + + //Loading file + File input = new File(AnCorpus); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); - //continue reading until the end of file - while(!(features.contentEquals(nextrac.getendFile()))){ + Elements corpus = doc.body().getElementsByTag("pubmedarticleset"); - String abstrac = ""; - - //find relevant doc section - Article title - if(features.contains(nextrac.getOpenTitle())){ - - //cleaning title content - features = features.replace(nextrac.getOpenTitle(),""); - features = features.replace(nextrac.getCloseTitle(), ""); - features = nextrac.removeSpecialChar(features); - tit_content = nextrac.removeTags(features); - - //extract n-grams from section - ArrayList title_c = nextrac.nGrams(tit_content, pathVars); - nextrac.addNGram(title_c, ngram_title_count,ngram_title, pathVars); + //Fetching elements + + for(Element paper : corpus ){ + + Elements journalTitle = paper.getElementsByTag(nextrac.getOpenJournal()); + Elements title = paper.getElementsByTag(nextrac.getOpenTitle()); + Elements abstractC = paper.getElementsByTag(nextrac.getopenAbst()); + Elements ECnumber = paper.getElementsByTag(nextrac.getOpenEC()); + Elements classDoc = paper.getElementsByTag(nextrac.getClassTag()); + + String journal = ""; + String docID = ""; + String label = ""; + int jTitle = 0; + + //fetching the paper ID - + //for all items in a paper, retrieve only PMIDs + for(Element e : paper.select(nextrac.getid())){ + //only consider the ID if the parent is medline citation + if(e.parentNode().nodeName().contains("medline")){ + docID = e.text(); + } + } + //fetch the doc label as well + if(classDoc.hasText()){ + label = classDoc.text(); + } - features = reader.readLine(); - features = features.replaceAll("\t",""); - } - + PMIDs.put(docID, label); - if(features.contains(nextrac.getAbstractLabel())){ - - String temp = ""; - String newAbs = nextrac.getopenAbst(); - - if(features.contains("")){ - temp = temp + nextrac.processAbstract(features); - } - else{ - do{ - temp = temp + nextrac.processAbstract(features); - features = reader.readLine(); - }while(!(features.contains(""))); - } - - newAbs = newAbs + temp; - features = newAbs + nextrac.getcloseAbst(); - } + //Extracting the Journal Title + if(journalTitle.hasText()){ + jTitle++; + journal = journalTitle.toString(); + journal = nextrac.removeSpecialChar(journal); + journal = nextrac.removeTags(journal); + } - 
//find relevant paper section - if(features.contains(nextrac.getopenAbst())){ - - features = features.replace(nextrac.getopenAbst(),""); - features = features.replace(nextrac.getcloseAbst(), ""); - features = features.replace("-", " "); - features = nextrac.removeSpecialChar(features); - - //handle lines in which abstract text tag - //is separated from the actual text - if(features.isEmpty()){ - features = reader.readLine(); - features = features.replaceAll("\t",""); - features = features.replace(nextrac.getopenAbst(),""); - features = features.replace(nextrac.getcloseAbst(), ""); - features = features.replace("-", " "); - features = nextrac.removeSpecialChar(features); - } - - //features = nextrac.removeSpecialChar(features); - abstrac = nextrac.removeTags(features); - abstrac = nextrac.removeAbstractTags(abstrac); - //extract n-grams from section - ArrayList abstract_c = nextrac.nGrams(abstrac, pathVars); - nextrac.addNGram(abstract_c, ngram_count, ngrams, pathVars); + String tit_content = ""; + //Extracting the Paper Title + if(title.hasText()){ + tit_content = title.toString(); + tit_content = nextrac.removeSpecialChar(tit_content); + tit_content = nextrac.removeTags(tit_content); - //keep reading file - features = reader.readLine(); - features = features.replaceAll("\t",""); - //features = features.replaceAll("\\s+", ""); - } - - features = reader.readLine(); - features = features.replaceAll("\t",""); - //features = features.replaceAll("\\s+", ""); - } + ArrayList title_c = nextrac.nGrams(tit_content, featFilter, pathVars); + nextrac.addNGram(title_c, ngram_title_count, pathVars); } - } - reader.close(); + String abstrac = ""; + //Extracting the Paper abstract + if(abstractC.hasText()){ + abstrac = abstractC.toString(); + abstrac = nextrac.removeTags(abstrac); + abstrac = nextrac.removeSpecialChar(abstrac); + abstrac = nextrac.removeAbstractTags(abstrac); + ArrayList abstract_c = nextrac.nGrams(abstrac, featFilter, pathVars); + nextrac.addNGram(abstract_c, ngram_count, pathVars); + } + } }catch (FileNotFoundException e) { e.printStackTrace(); @@ -202,19 +169,23 @@ public static void main(String[] args) { e.printStackTrace(); } - //print list of extracted n-grams - //System.out.println("\n========ABSTRACT==NGRAMS============="); - //nextrac.displayList(ngram_count); - //nextrac.displayList(ngram_title); - //System.out.println("\n===========TITLE==NGRAMS============="); - //nextrac.displayList(ngram_title_count); - + if(verbose){ + //print list of extracted n-grams + nextrac.displayList(PMIDs); + System.out.println("\n========ABSTRACT==NGRAMS============="); + nextrac.displayList(ngram_count); + nextrac.displayList(ngram_title); + System.out.println("\n===========TITLE==NGRAMS============="); + nextrac.displayList(ngram_title_count); + } - nextrac.considerOccurance(ngram_count, pathVars); - nextrac.considerOccurance(ngram_title_count, pathVars); + //filter features by occurence + featFilter.considerNgramOccurence(ngram_count, pathVars); + featFilter.considerNgramOccurence(ngram_title_count, pathVars); - - System.out.println("\n===========NGRAMS==EXPORT===============\n"); + System.out.println("\n===========NGRAMS==EXPORT===============\n"); + nextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.DOC_IDS, PMIDs); + System.out.println("..."+ PMIDs.size()+" document IDs listed."); nextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.NGRAM_FEATURES, ngram_count); System.out.println("..."+ ngram_count.size()+" unique Abstract ngrams saved."); 
nextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.TITLE_NGRAMS, ngram_title_count); @@ -223,80 +194,64 @@ public static void main(String[] args) { } - + /** - * Removes from feature list all features with - * frequency not statistically relevant (2 or less) - * @param list to be cleaned + * Inserts ngrams into list of features + * with a mapping for ngram count + * @param str relation of ngrams extracted + * @param list_count mapping for ngram counts + * @param pathVars */ - private void considerOccurance(HashMap list, PathConstants vars){ - //going over the list of annotations and removing the - //statistically not significant features - frequency less than 2 - - Iterator iterator = list.values().iterator(); - - while(iterator.hasNext()){ - Integer key = iterator.next(); - - if(key < Integer.parseInt(vars.FEATURE_MIN_FREQ)){ - iterator.remove(); - } - } - } - - private void addNGram(ArrayList str, HashMap list_count, HashMap,Integer> list, PathConstants pathVars){ + private void addNGram(ArrayList str, HashMap list_count, PathConstants pathVars){ + //iterating over ngram list for(int i = 0; i < str.size(); i++){ String currentNGram = str.get(i); + //checking existence of current ngram on list mapping if(list_count.containsKey(currentNGram)){ + //retrieve the amount of current ngrams on mapping int count = list_count.get(currentNGram); - list_count.put(currentNGram, count+1); - - /*if(list.containsKey(currentNGram)){ - int cnt = list.get(currentNGram).get(certainty); - list.get(currentNGram).put(certainty, cnt+1); - } - else{ - list.get(currentNGram).put(certainty, 1); - }*/ + //insert the updated count of ngrams + list_count.put(currentNGram, count+1); } else { + //insert ngram on mapping list if(currentNGram.length() >= Integer.parseInt(pathVars.FEATURE_MIN_LENGTH)){ list_count.put(currentNGram, 1); - - /* list.put(currentNGram, new HashMap()); - list.get(currentNGram).put(certainty, 1);*/ } } } } /** - * Extracts n-grams from the content field - * and populates mapping with n-gram +count - * @param str - * @param id - * @param gram - */ - - public ArrayList nGrams(String str, PathConstants pathVar){ + * Extracts n-grams from a given content field + * + * @param str text to extract ngrams + * @return list of extracted grams + */ + public ArrayList nGrams(String str, NaiveFilter filter, PathConstants pathVar){ - //cleaning further chars on sentence + //removing ASCII special characters str = str.replace("/", ""); - str = str.replace("\\", ""); - str = str.replace(" ", "-"); - //Tokenize the sentence + str = str.replace("\\", ""); + //str = str.replace("\n", " "); + str = str.replaceAll("\\s+"," "); + str = str.replace(" ", "-"); + + //Tokenizing the sentence String[] words = StringUtils.split(str,"-"); ArrayList ngramList = new ArrayList(); int ngram =Integer.parseInt(pathVar.NGRAM_SIZE); + //Stop-words removal if(Boolean.valueOf(pathVar.NGRAM_STOP)){ - words = StringUtils.split(removeStopList(words, pathVar)," "); - } - + words = StringUtils.split(filter.removeStopList(words, pathVar)," "); + } + + //extracting ngrams according to gram size (1, 2, 3) for(int i=0; i < words.length - (ngram - 1); i++){ switch(pathVar.NGRAM_SIZE){ case "1": @@ -314,94 +269,58 @@ public ArrayList nGrams(String str, PathConstants pathVar){ return ngramList; } - /** - * Removes the stopwords from ngrams list - * - * @param str list of ngrams - * @param pathVar constants from - * @return - */ +// /** +// * Removes stopwords from ngrams list +// * +// * @param str list of ngrams +// * @param 
constants +// * @return cleaned list of ngrams +// */ +// public String removeStopList(String[] str, PathConstants pathVar){ +// +// //stop-words file name +// String pathStop = "stopList.txt"; +// String[] stop = null; +// StringBuilder cleaned = new StringBuilder(); +// +// try{ +// +// BufferedReader reader = new BufferedReader(new FileReader(pathStop)); +// +// String line = null; +// //loading stop-words list +// while((line = reader.readLine()) != null){ +// stop = StringUtils.split(line,","); +// line = reader.readLine(); +// } +// +// reader.close(); +// +// }catch (FileNotFoundException e) { +// e.printStackTrace(); +// } catch (IOException e) { +// e.printStackTrace(); +// } +// +// //iteraing over text to be cleaned +// for(int i = 0; i < str.length; i++){ +// //iterating over stop-words list +// for(int j = 0; j < stop.length; j++){ +// +// //when stop-word is encountered, replace it +// if(str[i].equalsIgnoreCase(stop[j])){ +// str[i] = str[i].replace(str[i],"*"); +// } +// } +// //retrieve the text without stop-words replacements +// if(!(str[i].contentEquals("*"))){ +// cleaned.append(str[i]).append(" "); +// } +// } +// return cleaned.toString().replace(" ", " "); +// } - public String removeStopList(String[] str, PathConstants pathVar){ - - String pathStop = "stopList.txt"; - String[] stop = null; - StringBuilder cleaned = new StringBuilder(); - try{ - - BufferedReader reader = new BufferedReader(new FileReader(pathStop)); - - String line = null; - - while((line = reader.readLine()) != null){ - stop = StringUtils.split(line,","); - line = reader.readLine(); - } - - reader.close(); - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } - - for(int i = 0; i < str.length; i++){ - for(int j = 0; j < stop.length; j++){ - - if(str[i].equalsIgnoreCase(stop[j])){ - str[i] = str[i].replace(str[i],"*"); - } - } - if(!(str[i].contentEquals("*"))){ - cleaned.append(str[i]).append(" "); - } - } - return cleaned.toString().replace(" ", " "); - } - - /** - * Evaluates the level of certainty... - * TBD!!! - * @param list - * @return - */ - - public String getCertainty(HashMap> list){ - - ArrayList gramsAr = new ArrayList(list.entrySet()); - //String certainty; - - Iterator itr = gramsAr.iterator(); - while(itr.hasNext()){ - String str = itr.next().toString(); - String[] splitted = StringUtils.split(str,"="); - - int relevance = 0; - int count = 0; - - - try{ - count = list.get(splitted[0]).get(certainty); - } catch(Exception e){ - e.printStackTrace(); - } - - //relevance = count * getWeight(); - - if(relevance == 1) - list.get(splitted[0]).put("fairly relevant", list.get(splitted[0]).get(certainty)); - else if (relevance == 2) - list.get(splitted[0]).put("relevant", list.get(splitted[0]).get(certainty)); - else - list.get(splitted[0]).put("very relevant", list.get(splitted[0]).get(certainty)); - - } - - return certainty; - } - /** * Displays the keys and values of the * maps created with n-grams and counts. @@ -416,27 +335,6 @@ public void displayList(HashMap hash){ System.out.println("\n=======================================\n"); } - - /** - * Accessor and mutator methods for the export - * string with list values - so vector class - * can access its content. - * @return string with list of values. 
- */ - /*public static String getNgramCount() { - //ngramCount = exportContent(ngram_count); - return ngramCount; - } - public void setNgramCount(String ngramCount) { - this.ngramCount = ngramCount; - } - public static String getNgram() { - //ngram = exportContent(ngrams); - return ngram; - } - public void setNgram(String ngram) { - this.ngram = ngram; - } */ - + } diff --git a/src/arffmatrix/BuildModel.java b/src/arffmatrix/BuildModel.java index 65869e8..f8d0fac 100644 --- a/src/arffmatrix/BuildModel.java +++ b/src/arffmatrix/BuildModel.java @@ -54,6 +54,9 @@ this software and associated documentation files (the "Software"), to deal in * This class reads the corpus instances and uses * the CreateVector class to generate a model file (ARFF) * * + * @author Hayda Almeida, Marie-Jean Meurs + * @since 2014 + * */ public class BuildModel { @@ -91,8 +94,7 @@ public static void main(String[] args) { //by default String sortarffFileName = pathVars.HOME_DIR + pathVars.OUTPUT_MODEL + arffFileName; // default - // create file - //FileWriter fstream = new FileWriter(sortarffFileName); + // create file BufferedWriter out = new BufferedWriter(new FileWriter(sortarffFileName)); // load ARFF header and write it @@ -254,7 +256,7 @@ else if(Integer.parseInt(pathVars.EXP_TYPE) ==1) System.out.println("Abstract : " + abstracttext.toString() + "\n\n"); // end of if: collect data and write ARFF - String Arffline = vectorgenerator.getArffLine( + String Arffline = vectorgenerator.getArffLine(pmid, journaltitle, title, abstracttext, diff --git a/src/arffvector/CreateVector.java b/src/arffvector/CreateVector.java index ce81dee..b112ea5 100644 --- a/src/arffvector/CreateVector.java +++ b/src/arffvector/CreateVector.java @@ -54,6 +54,8 @@ this software and associated documentation files (the "Software"), to deal in * generated corpus to create a feature vector * (a matrix representation of the corpus) * + * @author Hayda Almeida, Marie-Jean Meurs + * @since 2014 * */ public class CreateVector { @@ -64,7 +66,8 @@ public class CreateVector { ArrayList ecnumbers = new ArrayList(); ArrayList titleGrams = new ArrayList(); ArrayList titleAnnot = new ArrayList(); - ArrayList nGrams = new ArrayList(); + ArrayList nGrams = new ArrayList(); + ArrayList docID = new ArrayList(); PathConstants pathVars = null; @@ -79,7 +82,7 @@ public class CreateVector { public CreateVector(PathConstants extVars) { - pathVars = extVars; + pathVars = extVars; String pathJournalT = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.JOURNAL_TITLE_FEATURES; try{ @@ -395,21 +398,27 @@ public String genArffHeader(PathConstants pathVars, int exp){ switch(exp){ case 0: - headerArff.append("% Weka training file - mycoCLAP triage - CSFG 2014\n\n"); + headerArff.append("% Weka training file - mycoCLAP triage - CSFG 2015\n\n"); break; case 1: - headerArff.append("% Weka test file - mycoCLAP triage - CSFG 2014\n\n"); + headerArff.append("% Weka test file - mycoCLAP triage - CSFG 2015\n\n"); break; } headerArff.append("@RELATION triage\n"); - if (Boolean.valueOf(pathVars.USE_TEXT_SIZE)){ + if(Boolean.valueOf(pathVars.USE_TEXT_SIZE)){ // writing the list of text sizes headerArff.append("@ATTRIBUTE sizeoftitle \tREAL \t\t%size of title\n"); headerArff.append("@ATTRIBUTE sizeoftext \tREAL \t\t%size of text\n"); } + if(Boolean.valueOf(pathVars.USE_DOC_ID)){ + //writing the docIDs + headerArff.append("@ATTRIBUTE docID \tREAL \t\t%PMID of paper\n"); + + } + if(Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)){ for(int i = 0; i < journalTitles.size(); i++){ // 
writing list of journal titles @@ -532,10 +541,11 @@ public String genArffHeader(PathConstants pathVars, int exp){ * @return String holding counts for all features found in a document */ - public String getArffLine(String jTitle, String title, String text, String ecnum, String classTriage, int exp){ + public String getArffLine(String paperID, String jTitle, String title, String text, String ecnum, String classTriage, int exp){ //String vectorArff = ""; StringBuilder vectorArff = new StringBuilder(); + paperID = removeSpecialChar(paperID.toLowerCase()); text = removeSpecialChar(text.toLowerCase()); title = removeSpecialChar(title.toLowerCase()); jTitle = removeSpecialChar(jTitle.toLowerCase()); @@ -562,6 +572,17 @@ public String getArffLine(String jTitle, String title, String text, String ecnum vectorArff.append(titlesize).append(",").append(abstractsize).append(","); } + //fill ID of documents + if(Boolean.valueOf(pathVars.USE_DOC_ID)){ + + if(paperID.length()>0){ + vectorArff.append(paperID).append(","); + } + else{ + vectorArff.append("0,"); + } + } + //fill values of journal titles if(Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)){ diff --git a/src/classifier/Trainer.java b/src/classifier/Trainer.java index 7417982..4ec0da2 100644 --- a/src/classifier/Trainer.java +++ b/src/classifier/Trainer.java @@ -30,8 +30,8 @@ this software and associated documentation files (the "Software"), to deal in package classifier; +import java.util.ArrayList; import java.util.Random; - import weka.attributeSelection.LatentSemanticAnalysis; import weka.attributeSelection.PrincipalComponents; import weka.attributeSelection.GainRatioAttributeEval; @@ -41,20 +41,30 @@ this software and associated documentation files (the "Software"), to deal in import weka.classifiers.CostMatrix; import weka.classifiers.Evaluation; import weka.classifiers.bayes.NaiveBayes; +import weka.classifiers.evaluation.NominalPrediction; +import weka.classifiers.evaluation.Prediction; +import weka.classifiers.evaluation.output.prediction.PlainText; import weka.classifiers.functions.LibSVM; import weka.classifiers.meta.AttributeSelectedClassifier; import weka.classifiers.meta.CostSensitiveClassifier; +import weka.classifiers.meta.FilteredClassifier; import weka.classifiers.trees.LMT; +import weka.core.Attribute; import weka.core.Instances; +import weka.core.Range; import weka.core.converters.ConverterUtils.DataSource; +import weka.filters.Filter; +import weka.filters.unsupervised.attribute.Remove; import configure.PathConstants; +import filter.InformedFilter; /** * Trains and tests a classifier, * executes k-fold cross validation on train data * and outputs the classification results. * - * @author halmeida + * @author Hayda Almeida + * @since 2014 * */ @@ -65,6 +75,8 @@ public class Trainer { double[][] ranking; String rank; + boolean verbose = false; + /** * @param args @@ -72,15 +84,40 @@ public class Trainer { */ public static void main(String[] args) throws Exception { + + String classifier= ""; + + for(int i = 0; i < args.length; i++){ + try{ + if(args[i].matches("-lmt")) + classifier = "lmt"; + if(args[i].matches("-svm")) + classifier = "svm"; + if(args[i].matches("-nb")) + classifier = "nb"; + } + catch(Exception e){ + System.out.println("A classifier must be given as argument. Use: \n" + + "-lmt -> a LMT classifier; \n " + + "-svm -> a SVM classifier; \n" + + "-nb -> a Naive Bayes classifier. 
"); + System.exit(0); + } + } + PathConstants pathVars = new PathConstants(); Trainer evaluator = new Trainer(); - + InformedFilter filter = new InformedFilter(); + Classifier cls; //Creating classifier - Classifier cls = (Classifier) new LMT(); - //Classifier cls = (Classifier) new NaiveBayes(); - //Classifier cls = (Classifier) new LibSVM(); - + if(classifier.contains("lmt")) + cls = (Classifier) new LMT(); + else if (classifier.contains("svm")) + cls = (Classifier) new LibSVM(); + else + cls = (Classifier) new NaiveBayes(); + //Loading train data DataSource sourceTrain = new DataSource(pathVars.HOME_DIR + pathVars.OUTPUT_MODEL + pathVars.TRAIN_DIR + pathVars.ARFF_TRAIN); Instances trainData = sourceTrain.getDataSet(); @@ -90,9 +127,7 @@ public static void main(String[] args) throws Exception { System.out.println("Class index set on training data."); System.out.println("Training data loaded. Number of instances: " + trainData.numInstances() + "\n"); - - //Executing k-fold cross validation - //train.crossFold(trainData, cls); + //Loading test data DataSource sourceTest = new DataSource(pathVars.HOME_DIR + pathVars.OUTPUT_MODEL + pathVars.TEST_DIR + pathVars.ARFF_TEST); @@ -103,64 +138,175 @@ public static void main(String[] args) throws Exception { System.out.println("Class index set on testing data."); System.out.println("Test data loaded. Number of instances: " + testData.numInstances() + "\n"); + + + //filter the file IDs, consider the new training set + Instances filteredTrainData = evaluator.filteredIDs(trainData); + Instances filteredTestData = evaluator.filteredIDs(testData); + + if(Boolean.valueOf(pathVars.USE_ODDS_RATIO)){ + //Calculate OddsRatio for all instances + double[] OR = evaluator.loadFeatureFilter(filteredTrainData, filter, 1, Integer.parseInt(pathVars.OR_THRESHOLD)); + + //Apply Odds Ratio filtering in instances + filteredTrainData = evaluator.applyFilter(pathVars.OR_THRESHOLD, OR, filteredTrainData); + filteredTestData = evaluator.applyFilter(pathVars.OR_THRESHOLD, OR, filteredTestData); + } + + if(Boolean.valueOf(pathVars.USE_IDF)){ + //Calculate idf for all instances + double[] idf = evaluator.loadFeatureFilter(filteredTrainData, filter, 2, Integer.parseInt(pathVars.IDF_THRESHOLD)); - //Creating filtered classifiers - //AttributeSelectedClassifier PCAclassifier = evaluator.setPCAFilter(cls); - //AttributeSelectedClassifier LSAclassifier = evaluator.setLSAFilter(cls); - //AttributeSelectedClassifier GRclassifier = evaluator.setGRFilter(cls); - //AttributeSelectedClassifier Corrclassifier = evaluator.setCorrFilter(cls); + //Apply idf filtering in instances + filteredTrainData = evaluator.applyFilter(pathVars.IDF_THRESHOLD, idf, filteredTrainData); + filteredTestData = evaluator.applyFilter(pathVars.IDF_THRESHOLD, idf, filteredTestData); + } //Training and testing classifier - evaluator.classify(trainData, testData, cls); + evaluator.classify(filteredTrainData, filteredTestData, cls, testData); - //Training and testing costSensitive classifier - //evaluator.classify(trainData, testData, evaluator.classifySensitive(cls)); + } + + /** + * Loads evaluation of attributes according + * to feature selection method provided. 
+ * + * @param data data instances + * @param filter informed filter instance + * @param method identifier for selection method + * @return + */ + private double[] loadFeatureFilter(Instances data, InformedFilter filter, int method, int threshold){ + + double[] values = new double[data.numAttributes()]; - //Executing k-fold cross validation on filtered classifiers - //evaluator.crossFold(trainData, PCAclassifier); - //evaluator.crossFold(trainData, LSAclassifier); + switch(method){ + case 1: + values = filter.oddsRatio(data, threshold); + break; + case 2: + values = filter.idf(data, threshold); + break; + } + + return values; } - /** - * Trains and tests a classifier when two separated - * datasets are provided. + * Uses evaluation of features according to + * selection method to remove attributes from + * the dataset before training phase. * - * @param train training data to build classifier - * @param test test data to evaluate classifier - * @param classif type of classifier applied + * @param threshold selection method threshold + * @param values evaluation of attributes according to method + * @param data dataset instances + * @return filtered dataset instances * @throws Exception - */ - public void classify(Instances train, Instances test, Classifier classif) throws Exception{ - - classif.buildClassifier(train); - Evaluation evaluateClassifier = new Evaluation(train); - evaluateClassifier.evaluateModel(classif, test); + */ + private Instances applyFilter(String threshold, double[] values, Instances data) throws Exception{ + int numberRemoved = 0; + + String indexRemove = ""; - stats(evaluateClassifier, classif); + for(int i = 0; i < values.length; i++){ + if(values[i] == 0){ + + int ind = i+1; + + if(indexRemove.length()==0) indexRemove = ind + ""; + else indexRemove = indexRemove + "," + ind; + + numberRemoved++; + } + } + + try{ + indexRemove = indexRemove.substring(0, indexRemove.length()-1); + //if(verbose) + System.out.println("\n = = = = => Filter removed " + numberRemoved +" attributes: " + indexRemove.toString() ); + } + catch (Exception e){ + System.out.println("\n = = = = => Filter threshold did not remove any attribute."); + } + + Remove remove = new Remove(); + remove.setAttributeIndices(indexRemove); + remove.setInvertSelection(false); + remove.setInputFormat(data); + + Instances dataSubset = Filter.useFilter(data, remove); + return dataSubset; } + /** - * Trains and tests a classifier using a - * provided Cost matrix + * Removes the ID attribute (index 1) + * from a given dataset * - * @param classif type of classifier to be trained - * @return CostSensitive classifier with costs and classifier + * @param data instances + * @return filtered dataset * @throws Exception */ - - public CostSensitiveClassifier classifySensitive(Classifier classif) throws Exception{ - CostSensitiveClassifier costSensitive = new CostSensitiveClassifier(); - CostMatrix matrix = new CostMatrix(2); - matrix.setElement(0, 1, 4); - matrix.setElement(1, 0, 1); - costSensitive.setClassifier(classif); - costSensitive.setCostMatrix(matrix); + private Instances filteredIDs(Instances data) throws Exception { + Remove remove = new Remove(); + //setting index to be removed + remove.setAttributeIndices("1"); + remove.setInvertSelection(false); + remove.setInputFormat(data); - return costSensitive; + Instances dataSubset = Filter.useFilter(data, remove); + return dataSubset; } - + + + /** + * Trains and tests a classifier when two separated + * datasets are provided. 
+ * + * @param train training data to build classifier + * @param test test data to evaluate classifier + * @param classif type of classifier applied + * @throws Exception + */ + public void classify(Instances filteredTrain, Instances filteredTest, Classifier classif, Instances test) throws Exception{ + + StringBuffer sb = new StringBuffer(); + PlainText prediction = new PlainText(); + Range attributesToShow = null; + prediction.setBuffer(sb); + prediction.setHeader(test); + prediction.setOutputDistribution(true); + + classif.buildClassifier(filteredTrain); + + Evaluation evaluateClassifier = new Evaluation(filteredTrain); + evaluateClassifier.evaluateModel(classif, filteredTest, prediction, attributesToShow, true); + //evaluateClassifier.evaluateModel(classif, filteredTest); + + stats(evaluateClassifier, classif); + + ArrayList output = evaluateClassifier.predictions(); + + if(verbose){ + for(int i = 0; i < output.size(); i++){ + double act = output.get(i).actual(); + String actual; + if(act == 1.0) actual = "negative"; else actual = "positive"; + + double pred = output.get(i).predicted(); + String predicted; + if(pred == 1.0) predicted = "negative"; else predicted = "positive"; + + String value = test.instance(i).toString(0); + + System.out.println("PMID: "+ value + "\t" + + "Actual: " + actual + "\t" + + "Predicted: " + predicted + ); + } } + } + /** * Outputs classifier results. @@ -169,7 +315,6 @@ public CostSensitiveClassifier classifySensitive(Classifier classif) throws Exce * @param classif type of classifier applied * @throws Exception */ - public void stats(Evaluation eval, Classifier classif) throws Exception{ System.out.println("Number of attributes: " + eval.getHeader().numAttributes()); System.out.println(eval.toSummaryString("\n======== RESULTS ========\n", false)); @@ -177,138 +322,167 @@ public void stats(Evaluation eval, Classifier classif) throws Exception{ System.out.println(eval.toMatrixString("\n\n======== Confusion Matrix ========\n")); } - /** - * Executes k-fold cross validation - * on a given dataset - * @param data training data provided - * @param classif type of classifier usedsearch - * @throws Exception - */ - - public void crossFold(Instances data, Classifier classif) throws Exception{ - - Random random = new Random(SEED); //creating seed number generator - Evaluation evaluateClassifier = new Evaluation(data); - - System.out.println("Classifier working...\n\n"); - //Classifier should not be trained when cross-validation is executed. - //because subsequent calls to buildClassifier method will return the same results always. - evaluateClassifier.crossValidateModel(classif, data, FOLDS, random); - - stats(evaluateClassifier, classif); - } - - /** - * Implements a Filtered GainRatio classifier, - * using the ranker as a search method. 
- * - * @param classif type of classifier to be used - * @return filtered classif with Correlation analysis - */ + //Training and testing costSensitive classifier + //evaluator.classify(trainData, testData, evaluator.classifySensitive(cls)); - public AttributeSelectedClassifier setGRFilter(Classifier classif){ - AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); - - //Creating evaluator and search method - GainRatioAttributeEval GR = new GainRatioAttributeEval(); - Ranker rank = new Ranker(); - //return the attributes with evaluation greater than 0 - double threshold = 0.0; - rank.setThreshold(threshold); - - //Setting GainRatio filtered classifier - fClassif.setClassifier(classif); - fClassif.setEvaluator(GR); - fClassif.setSearch(rank); - - return fClassif; - - } +// /** +// * Trains and tests a classifier using a +// * provided Cost matrix +// * +// * @param classif type of classifier to be trained +// * @return CostSensitive classifier with costs and classifier +// * @throws Exception +// */ +// public CostSensitiveClassifier classifySensitive(Classifier classif) throws Exception{ +// CostSensitiveClassifier costSensitive = new CostSensitiveClassifier(); +// CostMatrix matrix = new CostMatrix(2); +// matrix.setElement(0, 1, 4); +// matrix.setElement(1, 0, 1); +// costSensitive.setClassifier(classif); +// costSensitive.setCostMatrix(matrix); +// +// return costSensitive; +// } - /** - * Implements a Filtered Correlation classifier, - * using the ranker as a search method. - * - * @param classif type of classifier to be used - * @return filtered classif with Correlation analysis - */ + //Executing k-fold cross validation on filtered classifiers + //evaluator.crossFold(trainData, PCAclassifier); + //evaluator.crossFold(trainData, LSAclassifier); - public AttributeSelectedClassifier setCorrFilter(Classifier classif){ - AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); - - //Creating evaluator and search method - CorrelationAttributeEval Corr = new CorrelationAttributeEval(); - Ranker rank = new Ranker(); - - //return the attributes with evaluation greater than 0 - double threshold = 0.03; - rank.setThreshold(threshold); - - //Setting GainRatio filtered classifier - fClassif.setClassifier(classif); - fClassif.setEvaluator(Corr); - fClassif.setSearch(rank); - - return fClassif; - - } +// /** +// * Executes k-fold cross validation +// * on a given dataset +// * @param data training data provided +// * @param classif type of classifier usedsearch +// * @throws Exception +// */ +// public void crossFold(Instances data, Classifier classif) throws Exception{ +// +// Random random = new Random(SEED); //creating seed number generator +// Evaluation evaluateClassifier = new Evaluation(data); +// +// System.out.println("Classifier working...\n\n"); +// //Classifier should not be trained when cross-validation is executed. +// //because subsequent calls to buildClassifier method will return the same results always. +// evaluateClassifier.crossValidateModel(classif, data, FOLDS, random); +// +// stats(evaluateClassifier, classif); +// } - /** - * Implements a Filtered PCA classifier, - * using the ranker as a search method. 
- * - * @param classif type of classifier to be used - * @return filtered classif with PCA analysis config - */ - public AttributeSelectedClassifier setPCAFilter(Classifier classif){ - AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); - - //Creating evaluator and search method - PrincipalComponents PCA = new PrincipalComponents(); - PCA.setMaximumAttributeNames(-1); - Ranker rank = new Ranker(); - //return the attributes with evaluation greater than 0 - rank.setThreshold(0); - - //Setting the PCA classifier configurations - fClassif.setClassifier(classif); - fClassif.setEvaluator(PCA); - fClassif.setSearch(rank); - - return fClassif; - } - /** - * Implements a Filtered LSA classifier, - * using the ranker as a search method - * @param classif - * @return - */ + //Creating filtered classifiers + //AttributeSelectedClassifier PCAclassifier = evaluator.setPCAFilter(cls); + //AttributeSelectedClassifier LSAclassifier = evaluator.setLSAFilter(cls); + //AttributeSelectedClassifier GRclassifier = evaluator.setGRFilter(cls); + //AttributeSelectedClassifier Corrclassifier = evaluator.setCorrFilter(cls); - private AttributeSelectedClassifier setLSAFilter(Classifier classif) { - AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); - - //Creating evaluator - LatentSemanticAnalysis LSA = new LatentSemanticAnalysis(); - LSA.setMaximumAttributeNames(-1); - //value between 0 and 1 includes proportion of total latent variables - //greater than 1 = exact # of variables to include; - //less than or equal zero = include all; - //default = 0.95 (proportional) - double defaul = 0; - LSA.setRank(defaul); - //Creating search method - Ranker rank = new Ranker(); - rank.setThreshold(0); - - //Setting the LSA classifier configurations - fClassif.setClassifier(classif); - fClassif.setEvaluator(LSA); - fClassif.setSearch(rank); - - return fClassif; - } +// /** +// * Implements a Filtered GainRatio classifier, +// * using the ranker as a search method. +// * +// * @param classif type of classifier to be used +// * @return filtered classif with Correlation analysis +// */ +// public AttributeSelectedClassifier setGRFilter(Classifier classif){ +// AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); +// +// //Creating evaluator and search method +// GainRatioAttributeEval GR = new GainRatioAttributeEval(); +// Ranker rank = new Ranker(); +// //return the attributes with evaluation greater than 0 +// double threshold = 0.0; +// rank.setThreshold(threshold); +// +// //Setting GainRatio filtered classifier +// fClassif.setClassifier(classif); +// fClassif.setEvaluator(GR); +// fClassif.setSearch(rank); +// +// return fClassif; +// +// } +// +// /** +// * Implements a Filtered Correlation classifier, +// * using the ranker as a search method. 
+// * +// * @param classif type of classifier to be used +// * @return filtered classif with Correlation analysis +// */ +// public AttributeSelectedClassifier setCorrFilter(Classifier classif){ +// AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); +// +// //Creating evaluator and search method +// CorrelationAttributeEval Corr = new CorrelationAttributeEval(); +// Ranker rank = new Ranker(); +// +// //return the attributes with evaluation greater than 0 +// double threshold = 0.03; +// rank.setThreshold(threshold); +// +// //Setting GainRatio filtered classifier +// fClassif.setClassifier(classif); +// fClassif.setEvaluator(Corr); +// fClassif.setSearch(rank); +// +// return fClassif; +// +// } +// +// /** +// * Implements a Filtered PCA classifier, +// * using the ranker as a search method. +// * +// * @param classif type of classifier to be used +// * @return filtered classif with PCA analysis config +// */ +// public AttributeSelectedClassifier setPCAFilter(Classifier classif){ +// AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); +// +// //Creating evaluator and search method +// PrincipalComponents PCA = new PrincipalComponents(); +// PCA.setMaximumAttributeNames(-1); +// Ranker rank = new Ranker(); +// //return the attributes with evaluation greater than 0 +// rank.setThreshold(0); +// +// //Setting the PCA classifier configurations +// fClassif.setClassifier(classif); +// fClassif.setEvaluator(PCA); +// fClassif.setSearch(rank); +// +// return fClassif; +// } +// +// /** +// * Implements a Filtered LSA classifier, +// * using the ranker as a search method +// * @param classif +// * @return +// */ +// private AttributeSelectedClassifier setLSAFilter(Classifier classif) { +// AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); +// +// //Creating evaluator +// LatentSemanticAnalysis LSA = new LatentSemanticAnalysis(); +// LSA.setMaximumAttributeNames(-1); +// //value between 0 and 1 includes proportion of total latent variables +// //greater than 1 = exact # of variables to include; +// //less than or equal zero = include all; +// //default = 0.95 (proportional) +// double defaul = 0; +// LSA.setRank(defaul); +// //Creating search method +// Ranker rank = new Ranker(); +// rank.setThreshold(0); +// +// //Setting the LSA classifier configurations +// fClassif.setClassifier(classif); +// fClassif.setEvaluator(LSA); +// fClassif.setSearch(rank); +// +// return fClassif; +// } diff --git a/src/configure/PathConstants.java b/src/configure/PathConstants.java index 2103118..dab7b82 100644 --- a/src/configure/PathConstants.java +++ b/src/configure/PathConstants.java @@ -76,7 +76,11 @@ public PathConstants(String configfile) { //Input files public String HOME_DIR; - public String CORPUS_DIR; + public String CORPUS_DIR; + public String SOURCE_DIR; + public String DUP_DIR; + public String POS_DIR; + public String NEG_DIR; public String TRAIN_DIR; public String TEST_DIR; public String FEATURE_DIR; @@ -94,6 +98,7 @@ public PathConstants(String configfile) { public String TITLE_FEATURES; public String NGRAM_FEATURES; public String TITLE_NGRAMS; + public String DOC_IDS; //Feature setup public String USE_TEXT_SIZE; @@ -106,7 +111,7 @@ public PathConstants(String configfile) { public String USE_ANNOTATION_FEATURE; public String USE_ANNOTATION_TYPE; public String USE_TITLE_FEATURE; - + public String USE_DOC_ID; //Feature setup - Ngrams public String USE_NGRAM_FEATURE; @@ -115,6 +120,12 @@ public PathConstants(String configfile) { 
public String NGRAM_SIZE; public String USE_WEIGHTED_NGRAM; public String WEIGHT; + + //Feature filtering + public String USE_ODDS_RATIO; + public String OR_THRESHOLD; + public String USE_IDF; + public String IDF_THRESHOLD; //Task setup public String EXP_TYPE; @@ -139,6 +150,10 @@ private void initVars() { } HOME_DIR = CONFIG_MAP.get("HOME_DIR"); CORPUS_DIR = CONFIG_MAP.get("CORPUS_DIR"); + SOURCE_DIR = CONFIG_MAP.get("SOURCE_DIR"); + DUP_DIR = CONFIG_MAP.get("DUP_DIR"); + POS_DIR = CONFIG_MAP.get("POS_DIR"); + NEG_DIR = CONFIG_MAP.get("NEG_DIR"); TRAIN_DIR = CONFIG_MAP.get("TRAIN_DIR"); TEST_DIR = CONFIG_MAP.get("TEST_DIR"); FEATURE_DIR = CONFIG_MAP.get("FEATURE_DIR"); @@ -155,6 +170,7 @@ private void initVars() { TITLE_FEATURES = CONFIG_MAP.get("TITLE_FEATURES"); NGRAM_FEATURES = CONFIG_MAP.get("NGRAM_FEATURES"); TITLE_NGRAMS = CONFIG_MAP.get("TITLE_NGRAMS"); + DOC_IDS = CONFIG_MAP.get("DOC_IDS"); USE_TEXT_SIZE = CONFIG_MAP.get("USE_TEXT_SIZE"); USE_JOURNAL_TITLE_FEATURE = CONFIG_MAP.get("USE_JOURNAL_TITLE_FEATURE"); @@ -165,6 +181,7 @@ private void initVars() { USE_ANNOTATION_FEATURE = CONFIG_MAP.get("USE_ANNOTATION_FEATURE"); USE_ANNOTATION_TYPE = CONFIG_MAP.get("USE_ANNOTATION_TYPE"); USE_TITLE_FEATURE = CONFIG_MAP.get("USE_TITLE_FEATURE"); + USE_DOC_ID = CONFIG_MAP.get("USE_DOC_ID"); USE_NGRAM_FEATURE = CONFIG_MAP.get("USE_NGRAM_FEATURE"); USE_TITLE_NGRAMS = CONFIG_MAP.get("USE_TITLE_NGRAMS"); @@ -172,6 +189,11 @@ private void initVars() { NGRAM_SIZE = CONFIG_MAP.get("NGRAM_SIZE"); USE_WEIGHTED_NGRAM = CONFIG_MAP.get("USE_WEIGHTED_NGRAM"); WEIGHT = CONFIG_MAP.get("WEIGHT"); + + USE_ODDS_RATIO = CONFIG_MAP.get("USE_ODDS_RATIO"); + OR_THRESHOLD = CONFIG_MAP.get("OR_THRESHOLD"); + USE_IDF = CONFIG_MAP.get("USE_IDF"); + IDF_THRESHOLD = CONFIG_MAP.get("IDF_THRESHOLD"); EXP_TYPE = CONFIG_MAP.get("EXP_TYPE"); NB_PARAMS = CONFIG_MAP.get("NB_PARAMS"); diff --git a/src/filter/.gitignore b/src/filter/.gitignore new file mode 100644 index 0000000..6b468b6 --- /dev/null +++ b/src/filter/.gitignore @@ -0,0 +1 @@ +*.class diff --git a/src/filter/InformedFilter.java b/src/filter/InformedFilter.java new file mode 100644 index 0000000..4b125db --- /dev/null +++ b/src/filter/InformedFilter.java @@ -0,0 +1,182 @@ +package filter; + +import weka.core.Attribute; +import weka.core.Instances; + +/** + * This class implements informed feature selection + * methods, to be used as filters after vector + * generation and pre-model building + * + * @author Hayda Almeida + * @since 2015 + * + */ +public class InformedFilter { + + private boolean verbose = true; + + /** + * Calculates oddsRatio of each feature + * in a given set of Instances + * + * @param data set of instances, read from ARFF file + * @return oddsRatio for each attribute in the matrix + */ + public double[] oddsRatio(Instances data, int threshold){ + + double[] oddsRatio = new double[data.numAttributes()]; + + + for(int i = 0; i < data.numAttributes()-1; i++ ){ + + double OR = 0; + + Attribute current = data.attribute(i); + double pos_docs = 0, //number of documents in class C + pos_oc = 0, //number of times term t occured in class C + pos_term_docs = 0, //number of docs in class C that have term + pos_not_docs = 0, //number of docs in class C that do not have term + neg_term_docs = 0, //number of docs not in class C with term + neg_not_docs = 0, //number of docs not in class C nor with term + neg_docs = 0; //number of documents not in class C + + for(int j = 0; j < data.size(); j++){ + + double current_value = data.instance(j).value(current); + 
double current_class = data.instance(j).classValue(); + + //class is positive + if(current_class < 1){ + pos_docs = pos_docs + 1; + + //the feature occurred in the document + if(current_value > 0){ + pos_oc = pos_oc + current_value; + pos_term_docs = pos_term_docs +1; + } + //the feature did not occur in positive docs + else pos_not_docs = pos_not_docs + 1; + } + //class is negative + else{ + neg_docs = neg_docs+1; + + //the feature occurred in the document + if(current_value > 0){ + neg_term_docs = neg_term_docs +1; + } + //the feature did not occur in negative docs + else neg_not_docs = neg_not_docs + 1; + } + + } + + OR = ( ( (pos_term_docs / pos_docs) / (pos_not_docs/ pos_docs) ) / + ( (neg_term_docs / neg_docs) / (neg_not_docs / neg_docs) ) ); + + // OR = (pos_term_docs / pos_not_docs) / (neg_term_docs / neg_not_docs); + + + //99% confidence: 2.575 + //95% confidence: 1.96 + double confidenceLow = Math.exp(Math.log(OR) - (1.96 * Math.sqrt((1/pos_term_docs) + (1/pos_not_docs) + (1/neg_term_docs) + (1/neg_not_docs)))); + double confidenceHigh = Math.exp(Math.log(OR) + (1.96 * Math.sqrt((1/pos_term_docs) + (1/pos_not_docs) + (1/neg_term_docs) + (1/neg_not_docs)))); + + //checking if OR value is within the confidence interval + //and if it satisfies the threshold + if( ((OR <= confidenceHigh) && (OR >= confidenceLow) + && !(OR == threshold)) + //checking if the confidence interval holds the null hypothesis (i.e., spans 1.0) + && !(confidenceLow <=1 && confidenceHigh >=1)) + oddsRatio[i] = OR; + else + oddsRatio[i] = 0; + + if(verbose){ + System.out.println("Attribute: "+ data.attribute(i).toString() +"\t\t OddsRatio: " + oddsRatio[i] + + "\tConfidenceLow: " + confidenceLow + "\tConfidenceHigh: "+ confidenceHigh); + } + } + + return oddsRatio; + } + + /** + * Calculates the inverse document frequency + * for each attribute in the dataset. 
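+ * The value is computed over the whole collection (positive and negative
+ * documents together) as idf(t) = ln(total_docs / docs_containing_t);
+ * attributes whose idf does not exceed the given threshold are zeroed out.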
+ * + * @param data instances + * @param threshold + * @return list of idfs for each attribute + */ + public double[] idf(Instances data, int threshold){ + + double[] idf = new double[data.numAttributes()]; + + for(int i = 0; i < data.numAttributes()-1; i++ ){ + + double idf_at = 0; + double idf_at2 = 0; + + Attribute current = data.attribute(i); + double pos_docs = 0, //number of documents in class C + pos_term_docs = 0, //number of docs in class C that have term + neg_term_docs = 0, //number of docs not in class C with term + neg_docs = 0; //number of documents not in class C + + for(int j = 0; j < data.size(); j++){ + + double current_value = data.instance(j).value(current); + double current_class = data.instance(j).classValue(); + + //class is positive + if(current_class < 1){ + pos_docs = pos_docs + 1; + + //the feature occurred in the document + if(current_value > 0){ + pos_term_docs = pos_term_docs +1; + } + } + else{ + //class is negative + neg_docs = neg_docs+1; + + //the feature occurred in the document + if(current_value > 0){ + neg_term_docs = neg_term_docs +1; + } + } + } + +// double idf_pos = Math.log((pos_docs)/(pos_term_docs)); +// double idf_neg = Math.log((neg_docs)/(neg_term_docs)); + + //check if the idf in the "positive" collection + //is greater than the idf in the "negative" collection +// if (idf_pos > idf_neg) +// idf_at = idf_pos; +// +// else idf_at = 0; + + idf_at = Math.log((pos_docs + neg_docs)/(pos_term_docs + neg_term_docs)); + + if(idf_at <= threshold) + idf[i] = 0; + else + idf[i] = idf_at; + } + + if(verbose){ + for(int i = 0; i < idf.length; i++){ + if(idf[i]>0) + System.out.println("Attribute: "+ data.attribute(i).toString()+ "\t\t\t IDF: " + idf[i]); + } + } + + return idf; + } + + +} diff --git a/src/filter/NaiveFilter.java b/src/filter/NaiveFilter.java new file mode 100644 index 0000000..db8a32e --- /dev/null +++ b/src/filter/NaiveFilter.java @@ -0,0 +1,117 @@ +package filter; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import org.apache.commons.lang3.StringUtils; +import configure.PathConstants; + +/** + * + * This class implements naive feature filtering methods + * to be used by the extractor processes pre-vector building + * + * @author Hayda Almeida + * @since 2015 + * + */ +public class NaiveFilter { + + private boolean verbose = true; + + /** + * Removes from feature list all features with + * frequency not statistically relevant (2 or less) + * @param list to be cleaned + */ + public void considerAnnotationOccurence(HashMap,Integer> list, PathConstants vars){ + //going over the list of annotations and removing the + //features with occurance lower than specified. 
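+ //e.g., with FEATURE_MIN_FREQ set to 2, any annotation feature
+ //whose count in the list is below 2 is removed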
+ + Iterator> iterator = list.keySet().iterator(); + + while(iterator.hasNext()){ + Map key = iterator.next(); + int valor = list.get(key).intValue(); + + if(valor < Integer.parseInt(vars.FEATURE_MIN_FREQ)){ + iterator.remove(); + } + } + } + + /** + * Removes from feature list all features with + * frequency not statistically relevant (2 or less) + * @param list to be cleaned + */ + public void considerNgramOccurence(HashMap list, PathConstants vars){ + //going over the list of annotations and removing the + //statistically not significant features - frequency less than 2 + Iterator iterator = list.values().iterator(); + + while(iterator.hasNext()){ + Integer key = iterator.next(); + + if(key < Integer.parseInt(vars.FEATURE_MIN_FREQ)){ + iterator.remove(); + } + } + } + + /** + * Removes stopwords from ngrams list + * + * @param str list of ngrams + * @param constants + * @return cleaned list of ngrams + */ + public String removeStopList(String[] str, PathConstants pathVar){ + + //stop-words file name + String pathStop = "stopList.txt"; + String[] stop = null; + StringBuilder cleaned = new StringBuilder(); + + try{ + + BufferedReader reader = new BufferedReader(new FileReader(pathStop)); + + String line = null; + //loading stop-words list + while((line = reader.readLine()) != null){ + stop = StringUtils.split(line,","); + line = reader.readLine(); + } + + reader.close(); + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + + //iteraing over text to be cleaned + for(int i = 0; i < str.length; i++){ + //iterating over stop-words list + for(int j = 0; j < stop.length; j++){ + + //when stop-word is encountered, replace it + if(str[i].equalsIgnoreCase(stop[j])){ + str[i] = str[i].replace(str[i],"*"); + } + } + //retrieve the text without stop-words replacements + if(!(str[i].contentEquals("*"))){ + cleaned.append(str[i]).append(" "); + } + } + return cleaned.toString().replace(" ", " "); + } + +} diff --git a/src/preprocessing/.gitignore b/src/preprocessing/.gitignore new file mode 100644 index 0000000..6b468b6 --- /dev/null +++ b/src/preprocessing/.gitignore @@ -0,0 +1 @@ +*.class diff --git a/src/preprocessing/ConcatXML.java b/src/preprocessing/ConcatXML.java new file mode 100644 index 0000000..89e255f --- /dev/null +++ b/src/preprocessing/ConcatXML.java @@ -0,0 +1,717 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + + +package preprocessing; + +import java.io.BufferedOutputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.FilenameFilter; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.PrintWriter; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.Date; +import java.util.List; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import configure.PathConstants; + +/** + * Generates a corpus from raw XML doc instances, + * so that features can be extracted from it + * + * @author Hayda Almeida + * @since 2014 + * + */ +public class ConcatXML{ + + private String tag1; + private String tag2; + private String tag3; + private String tag4; + private String id; + private String corpusTag; + private String corpusTagC; + + + public ConcatXML(){ + + this.setId("PMID"); + this.setTag1("(?s)<.*?xml.*?>"); + this.setTag2("(?s)<.*?!DOCTYPE.*?>"); + this.setTag3("(?s)<.*?corpus.*?>"); + this.seTag4("(?s)<.*?/corpus.*?>"); + this.setCorpusTag(""); + this.setCorpusTag(""); + } + + + + public static void main(String[] args) throws Exception { + + PathConstants pathVars = new PathConstants(); + + String xmlDir = ""; + if(Integer.parseInt(pathVars.EXP_TYPE)== 1) + xmlDir = "test"; + else xmlDir = "train"; + + String sourceDir = "", duplicatesDir = ""; + + Boolean dc = false, df = false, cl = false, cc = false; + + for(int i = 0; i < args.length; i++){ + try{ + if(args[i].matches("-dc")) dc = true; + if(args[i].matches("-df")) df = true; + if(args[i].matches("-cl")) cl = true; + if(args[i].matches("-cc")) cc = true; + } + catch(Exception e){ + System.out.println("Use: \n" + + "-tr -> train, -ts -> test; \n " + + "-dc -> check duplicates in corpus vs. 
folder; \n " + + "-df -> check duplicates in two folders; \n" + + "-cl -> clean a source folder; \n" + + "-cc -> concatenate files in a folder "); + System.exit(0); + }; + } + + String timeStamp = new SimpleDateFormat("yyyyMMdd_hh:mm").format(new Date()); + String trainCorpusPath = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAINING_FILE; + + sourceDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + xmlDir; + duplicatesDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.DUP_DIR; + + String concatCorpus = pathVars.HOME_DIR + pathVars.CORPUS_DIR +"triagecorpus_"+ xmlDir +"_"+timeStamp+".xml"; + String tagCorpus = concatCorpus; + + ConcatXML concat = new ConcatXML(); + + //================= Checking for duplicates =====================// + if(dc) concat.checkDupCorpus(trainCorpusPath, sourceDir); + if(df) concat.checkDupFolder(sourceDir, duplicatesDir); + + //================== Creating corpus ==========================// + if(cl){ + concat.cleanXML(sourceDir); + if(duplicatesDir.length()>1) + concat.cleanXML(duplicatesDir); + } + if(cc){ + concat.concatenateXML(sourceDir, "", concatCorpus); + concat.tagCorpus(tagCorpus); + } + } + + /** + * Returns the ID of a XML jsoup document + * @param doc a XML doc parsed by jsoup + * @return ID string + * @throws IOException + */ + public String returnID(Document doc) throws IOException{ + + String id = ""; + + Elements paper = doc.body().getElementsByTag("pubmedarticleset"); + + //fetching the paper ID - + //for all items in a paper, retrieve only PMIDs + for(Element e : paper.select(getId())){ + //only consider the ID if the parent is medline citation + if(e.parentNode().nodeName().contains("medline")){ + id = e.text(); + } + } + return id; + } + + /** + * Reads the file IDs in a folder and + * checks a second folder for duplicates. 
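+ * Files in the second folder whose ID already exists in the source
+ * folder are renamed with a ".duplicated" suffix and their IDs are
+ * listed in the console output.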
+ * + * @param dirSrc source folder + * @param dirDup folder to check for duplicates + */ + + public void checkDupFolder(String dirSrc, String dirDup){ + ArrayList sourceIDs = new ArrayList(); + ArrayList duplicated = new ArrayList(); + ArrayList dupIDs = new ArrayList(); + int ids = 0; + + if(dirSrc.contentEquals(dirDup)){ + System.out.println("Source and duplicates directories are the same.\n\n========================\n"); + } + else { + + File sourceDir = new File(dirSrc); + File[] srcXMLs = sourceDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + try{ + //for each file on the source dir + for (File xml : srcXMLs){ + + try{ + + String id = ""; + //Loading file + File input = new File(xml.getPath()); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); + + //fetching the document ID + id = returnID(doc); + + if(!id.isEmpty()){ + sourceIDs.add(id); + ids++; + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(Exception e){ + throw new RuntimeException(e); + } + + System.out.println(ids + " source file IDs encountered."); + ids = 0; + + File dupDir = new File(dirDup); + + File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + try{ + //for each file on the possibly duplicated dir + for (File xml : dupXMLs){ + + try{ + String id = ""; + //Loading file + File input = new File(xml.getPath()); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); + + //fetching the document ID + id = returnID(doc); + + if(!id.isEmpty()){ + dupIDs.add(id); + String dupFileID = id; + ids++; + + for(int j = 0; j < sourceIDs.size(); j++){ + if(sourceIDs.get(j).equalsIgnoreCase(dupFileID)){ + + //add ID to duplicated list + duplicated.add(dupFileID); + + //rename the original file + Path from = xml.toPath(); //convert from File to Path + Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path + Files.move(from, to, StandardCopyOption.REPLACE_EXISTING); + } + } + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(Exception e){ + throw new RuntimeException(e); + } + + //count number of existing papers on possibly duplicated folder + //just to make sure we are gathering all IDs + System.out.println(ids + " new file IDs encountered."); + ids = 0; + //sorting the list of duplicated IDs + Collections.sort(duplicated, new Comparator(){ + @Override + public int compare(String one, String two){ + return one.compareTo(two); + } + }); + + System.out.println("\nReaded source files: " + sourceIDs.size()); + System.out.println("Readed new files: " + dupIDs.size()); + + System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n"); + + System.out.println("\nDuplicated files IDs: "); + for(int i = 0; i < duplicated.size(); i++){ + System.out.println(duplicated.get(i)); + } + + System.out.println("\n========================\n"); + } + + + } + + /** + * Reads the corpus and checks the papers IDs + * to identify duplicates in case new papers + * are being concatenated to corpus. 
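+ * IDs (PMIDs) are read from each pubmedarticleset entry; new files whose
+ * ID is already present in the corpus are renamed with a ".duplicated"
+ * suffix, so they are skipped by the later concatenation step, which
+ * only picks up .xml files.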
+ * + * @param corpus path to current corpora to check + * @param dir path to folder with new files to be concatenated + */ + + public void checkDupCorpus(String corpus, String dir){ + ArrayList trainingIDs = new ArrayList(); + ArrayList duplicated = new ArrayList(); + ArrayList newFiles = new ArrayList(); + + int ids = 0; + + try + { + File input = new File(corpus); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); + Elements corp = doc.body().getElementsByTag("pubmedarticleset"); + + String id = ""; + + for(Element paper : corp){ + Document thisDoc = Jsoup.parseBodyFragment(paper.toString()); + + //fetching the document ID + id = returnID(thisDoc); + + if(!id.isEmpty()){ + trainingIDs.add(id); + ids++; + } + } + }catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + + System.out.println(ids + " training file IDs encountered."); + ids = 0; + + File corpusDir = new File(dir); + File[] newXMLs = corpusDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + try{ + //for each file on the corpus dir + for (File xml : newXMLs){ + + try{ + String id = ""; + //Loading file + File input = new File(xml.getPath()); + //Jsoup parse + Document doc = Jsoup.parse(input, "UTF-8"); + + //fetching the document ID + id = returnID(doc); + + if(!id.isEmpty()){ + + newFiles.add(id); + String newFileID = id; + ids++; + + + for(int j = 0; j < trainingIDs.size(); j++){ + if(trainingIDs.get(j).equalsIgnoreCase(newFileID)){ + + //add ID to duplicated list + duplicated.add(newFileID); + + //moving the original file + Path from = xml.toPath(); //convert from File to Path + Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path + Files.move(from, to, StandardCopyOption.REPLACE_EXISTING); + } + } + } + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(Exception e){ + throw new RuntimeException(e); + } + + //count number of existing papers on the training file + //just to make sure we are gathering all IDs + System.out.println(ids + " new file IDs encountered."); + ids = 0; + + + //sorting the list of duplicated IDs + Collections.sort(duplicated, new Comparator(){ + @Override + public int compare(String one, String two){ + return one.compareTo(two); + } + }); + + System.out.println("\nReaded training files: " + trainingIDs.size()); + System.out.println("Readed new files: " + newFiles.size()); + + System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n"); + + System.out.println("\nDuplicated files IDs: "); + for(int i = 0; i < duplicated.size(); i++){ + System.out.println(duplicated.get(i)); + } + + System.out.println("\n========================\n"); + + } + + + /** + * Reads and edits a list of XMLs files in a folder + * to remove XML and previous corpus tags, + * preparing the files to be concatenated. + * + * @param dir string with folder path + */ + + public void cleanXML(String dir){ + + //listing files on corpus dir + File sourceDir = new File(dir); + + File[] newXMLs = sourceDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + System.out.println("... 
Files list loaded."); + + try{ + //for each file on the corpus dir + for (File xml : newXMLs){ + + try{ + BufferedReader reader = new BufferedReader(new FileReader(xml.getPath())); + + String line = null; + ArrayList allLines = new ArrayList(); + String content = null; + + while((line = reader.readLine()) != null){ + content = line; + + //cleaning XML markups + content = content.replaceFirst(getTag1(), ""); + content = content.replaceFirst(getTag2(), ""); + //cleaning previous corpus tags + content = content.replaceFirst(getTag3(), ""); + content = content.replaceFirst(getTag4(), ""); + allLines.add(content); + } + + PrintWriter writer = new PrintWriter(xml.getPath()); + + for (String l : allLines){ + writer.println(l); + } + reader.close(); + writer.close(); + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + + } + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(Exception e){ + throw new RuntimeException(e); + } + + System.out.println("... Files cleaned and saved."); + System.out.println("Ready for concatenation."); + System.out.println("\n========================\n"); + } + + + + /** + * Concatenates all XMLs in one folder or between two folders. + * @param sourceDir main directory with XML files. + * @param duplicDir second directory with duplicated XML files + * @param concatFile path name to saved concatenated corpus + */ + + public void concatenateXML(String sourceDir, String duplicDir, String concatFile){ + + final int BUFFER = 1024 << 8; + byte[] buffer = new byte[BUFFER]; + + //listing files on corpus dir + File srcDir = new File(sourceDir); + File[] srcXMLs = srcDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + File dupDir = new File(duplicDir); + File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name) { + return name.endsWith(".xml"); + } + }); + + System.out.println("... Files list loaded."); + + //defining the output file (concatenated) + File newCorpus = new File(concatFile); + + try{ + OutputStream output = new BufferedOutputStream(new FileOutputStream(newCorpus)); + + + //for each file on the corpus dir + for (File xmls : srcXMLs){ + InputStream input = new FileInputStream(xmls); + int count; + + //if the file is not empty/finished + try{ + while((count = input.read(buffer)) >= 0){ + + //write it on the concatenated final file + output.write(buffer, 0, count); + } + }finally{ + input.close(); + } + } + + if(dupXMLs != null){ + for(File xmld : dupXMLs){ + InputStream input = new FileInputStream(xmld); + int count; + + //if the file is not empty/finished + try{ + while((count = input.read(buffer)) >= 0){ + + //write it on the concatenated final file + output.write(buffer, 0, count); + } + }finally{ + input.close(); + } + } + } + output.flush(); + output.close(); + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(Exception e){ + throw new RuntimeException(e); + } + + System.out.println("... 
File concatenated and saved."); + System.out.println("Ready for corpus tagging."); + System.out.println("\n========================\n"); + } + + /** + * Inserts corpus tag on XML file + * + * @param pathToCorpus path to + * concatenated corpus + */ + + public void tagCorpus(String pathToCorpus){ + + //tagging as corpus + try{ + BufferedReader reader = new BufferedReader(new FileReader(pathToCorpus)); + + String line = null; + String edit = null; + List allLines = new ArrayList(); + + //adds tag at beggining of corpus + allLines.add(getCorpusTag()); + + while((line = reader.readLine()) != null){ + + allLines.add(line); + } + //adds tag at the end of corpus + allLines.add(getCorpusTagC()); + + System.out.println("... Corpus loaded and tagged."); + //re-writting the file + PrintWriter writer = new PrintWriter(pathToCorpus); + + for (String l : allLines){ + writer.println(l); + } + reader.close(); + writer.close(); + + System.out.println("... File saved as tagged corpus."); + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(IOException e){ + e.printStackTrace(); + } + } + + private String getCorpusTagC() { + return corpusTagC; + } + + private String getCorpusTag() { + // TODO Auto-generated method stub + return corpusTag; + } + + public String getTag1() { + return tag1; + } + + public void setTag1(String tag1) { + this.tag1 = tag1; + } + + public String getTag2() { + return tag2; + } + + public void setTag2(String tag2) { + this.tag2 = tag2; + } + + private String getTag4() { + // TODO Auto-generated method stub + return tag4; + } + + private String getTag3() { + // TODO Auto-generated method stub + return tag3; + } + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + private void setCorpusTag(String string) { + this.corpusTag = string; + + } + + private void seTag4(String string) { + this.tag4 = string; + + } + + private void setTag3(String string) { + this.tag3 = string; + + } + +} + + diff --git a/src/preprocessing/SampleCorpus.java b/src/preprocessing/SampleCorpus.java new file mode 100644 index 0000000..63613a8 --- /dev/null +++ b/src/preprocessing/SampleCorpus.java @@ -0,0 +1,237 @@ +package preprocessing; + +import java.io.File; +import java.io.FilenameFilter; +import java.nio.file.Files; +import java.nio.file.StandardCopyOption; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; + +import configure.PathConstants; + +/** + * Performs document instances sampling + * generating training and test files + * with specific balance input by user. 
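+ * As an illustration (collection size is hypothetical): for 1,000 documents,
+ * the arguments "-ts 20 10" select a test set of 200 documents of which 20
+ * are positive, and "-tr 50" requests a training set with a 50/50
+ * positive/negative balance.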
+ * + * @author Hayda Almeida + * @since 2015 + * + */ +public class SampleCorpus { + + public static void main(String[] args) throws Exception { + + PathConstants pathVars = new PathConstants(); + SampleCorpus sampling = new SampleCorpus(); + + String positiveDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.POS_DIR; + List positives = new LinkedList(); + + String negativeDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.NEG_DIR; + List negatives = new LinkedList(); + + //train or test sampling + Boolean tr = true, ts = true; + //% of test corpus WRT the collection, % positive on test set, % positive on training set + int percTs = 20, posTr = 50, posTs = 10; + + for(int i = 0; i < args.length; i++){ + try{ + if(args[i].matches("-tr")){ + tr = true; + posTr = Integer.parseInt(args[i+1]); + } + if(args[i].matches("-ts")){ + ts = true; + percTs = Integer.parseInt(args[i+1]); + posTs = Integer.parseInt(args[i+2]); + } + } + catch(Exception e){ + System.out.println(" Use: \n " + + "-tr -> (% of positives) to sample trainig set \n" + + "-ts -> (% of collection) (% of positives) to sample test set"); + System.exit(0); + }; + } + + positives = sampling.loadFiles(positiveDir); + negatives = sampling.loadFiles(negativeDir); + + if(tr) sampling.sampleTest(pathVars, positives, negatives, percTs, posTs); + + if(ts) sampling.sampleTrain(pathVars, positives, negatives, posTr); + + } + + /** + * Lists XML files within a folder + * @param dirSrc folder path + * @return returns list of file IDs + */ + public List loadFiles(String dirSrc){ + + List fileIDs = new LinkedList(); + + File sourceDir = new File(dirSrc); + File[] srcXMLs = sourceDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + fileIDs = new LinkedList(Arrays.asList(srcXMLs)); + + return fileIDs; + } + + /** + * Moves a specific number of files + * in a list from origin folder to a test folder + * @param pathVars + * @param files List of file IDs + * @param numFiles number of files to be moved + */ + public void moveFile(PathConstants pathVars, List files, int numFiles){ + + Iterator filesList = files.iterator(); + File testDir = new File(pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TEST_DIR); + + if(!testDir.exists()){ + try{ + testDir.mkdir(); + }catch(Exception e){ + System.out.println("Error creating Test folder."); + System.exit(0); + } + } + + while(filesList.hasNext() && numFiles > 0){ + try{ + File file = (File) filesList.next(); + File newFile = new File(testDir + "/" + file.getName()); + + Files.move(file.toPath(), newFile.toPath(), StandardCopyOption.REPLACE_EXISTING); + + filesList.remove(); + numFiles--; + } + catch(Exception e){ + System.out.println("Error moving files."); + System.exit(0); + } + } + + } + + /** + * Copies a specific number of files + * in a list from origin folder to a train folder + * @param pathVars + * @param files List of file IDs + * @param numFiles number of files to be moved + */ + public void copyFile(PathConstants pathVars, List files, int numFiles){ + + Iterator filesList = files.iterator(); + File trainDir = new File(pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAIN_DIR); + + if(!trainDir.exists()) + try{ + trainDir.mkdir(); + }catch(Exception e){ + System.out.println("Error creating Training folder."); + System.exit(0); + } + + while(filesList.hasNext() && numFiles > 0){ + try{ + File file = (File) filesList.next(); + File newFile = new File(trainDir + "/"+ file.getName()); + + 
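+ //decrement the counter so that, as in moveFile, at most numFiles instances are copied
+ numFiles--;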
Files.copy(file.toPath(), newFile.toPath(), StandardCopyOption.REPLACE_EXISTING); + } + catch(Exception e){ + System.out.println("Error copying files."); + System.exit(0); + } + } + + } + + /** + * Samples document instances from the collection + * to generate a test set. + * + * @param pathVars + * @param positives list of positive documents IDs + * @param negatives list of negative documents IDs + * @param total percentage of the document collection for test + * @param pos percentage of positive documents in the test set + */ + public void sampleTest(PathConstants pathVars, List positives, List negatives, int total, int pos){ + + int instances = positives.size() + negatives.size(); + int testSize = (instances * total) / 100; + int posSize = (testSize * pos) / 100; + int negSize = testSize - posSize; + + Collections.shuffle(negatives); + System.out.println("===== Test > Negative instances shuffled for test set."); + moveFile(pathVars, negatives, negSize); + System.out.println("===== Test > Negative instances moved to test folder. \n"); + + Collections.shuffle(positives); + System.out.println("===== Test > Positive instances shuffled for test set."); + moveFile(pathVars, positives, posSize); + System.out.println("===== Test > Positive instances moved to test folder. \n"); + + } + + /** + * Samples document instances from the collection + * to generate a training set. + * + * @param pathVars + * @param positives list of positive documents IDs + * @param negatives list of negative documents IDs + * @param pos percentage of positive documents in the training set + */ + public void sampleTrain(PathConstants pathVars, List positives, List negatives, int pos){ + + int trainSize = positives.size() + negatives.size(); + int posSize = (trainSize * pos) / 100; + int negSize = trainSize - posSize; + + if(positives.size() < posSize){ + System.out.println("Not enough positive instances for training set."); + System.exit(0); + } + else if(negatives.size() < negSize){ + System.out.println("Not enough negative instances for training set."); + System.exit(0); + } + else{ + Collections.shuffle(negatives); + System.out.println("===== Training > Negative instances shuffled for training set."); + copyFile(pathVars, negatives, negSize); + System.out.println("===== Training > Negative instances copied to training folder. \n"); + + Collections.shuffle(positives); + System.out.println("===== Training > Positive instances shuffled for training set."); + copyFile(pathVars, positives, posSize); + System.out.println("===== Training > Positive instances copied to training folder. \n"); + } + + } + + + + +}