diff --git a/config-sample.cfg b/config-sample.cfg index 3c8295f..137b6b9 100644 --- a/config-sample.cfg +++ b/config-sample.cfg @@ -7,14 +7,11 @@ ################################################## ########################### DIRECTORIES ########## # project home -HOME_DIR=/. +HOME_DIR=/home/usr/mycosort-pck-version/ # # corpus directory CORPUS_DIR=corpus/ # -# source documents directory -SOURCE_DIR=src/ -# # duplicate documents directory DUP_DIR=test/ # @@ -37,6 +34,23 @@ FEATURE_DIR=features/ OUTPUT_MODEL=arff/ # ################################################# +###################### CORPUS SAMPLING ########## +# true if training set must be sampled +SAMPLE_TRAIN=false +# +# true if test set must be sampled +SAMPLE_TEST=false +# +# % of test corpus WRT the collection +PERCT_TEST=15 +# +# % positive on training set +PERCT_POS_TRAIN=50 +# +# % positive on test set +PERCT_POS_TEST=10 +# +################################################# ########################## INPUT FILES ########## # training file TRAINING_FILE=/triagecorpus_train.xml diff --git a/jar/README b/jar/README deleted file mode 100644 index 9a9b435..0000000 --- a/jar/README +++ /dev/null @@ -1,7 +0,0 @@ -Please add to this folder the following libraries: -commons-lang3-3.2.1.jar -jsoup-1.7.3.jar -weka.jar -LibSVM.jar -LibSVM/libsvm.jar - diff --git a/jar/README~ b/jar/README~ deleted file mode 100644 index 56f2ce9..0000000 --- a/jar/README~ +++ /dev/null @@ -1,7 +0,0 @@ -Please add to this folder the following libraries: -commons-lang3-3.2.1.jar -jsoup-1.7.3.jar -weka.jar -LibSVM.jar -libsvm.jar - diff --git a/src/.gitignore b/src/.gitignore deleted file mode 100644 index 1924ede..0000000 --- a/src/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -*.classpath -*.project -*.*~ diff --git a/src/analyse/.gitignore b/src/analyse/.gitignore deleted file mode 100644 index 6b468b6..0000000 --- a/src/analyse/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.class diff --git a/src/analyse/ConcatXML.java b/src/analyse/ConcatXML.java deleted file mode 100644 index 9c24173..0000000 --- a/src/analyse/ConcatXML.java +++ /dev/null @@ -1,734 +0,0 @@ -/* - * The MIT License (MIT) - -Copyright (c) 2014 - -Hayda Almeida -Marie-Jean Meurs - -Concordia University -Tsang Lab - - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - - -package analyse; - -import java.io.BufferedOutputStream; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.FileReader; -import java.io.FilenameFilter; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.io.PrintWriter; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.nio.file.StandardCopyOption; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.Date; -import java.util.List; - -import configure.PathConstants; - -/** - * Generates a corpus from raw XML doc instances, - * so that features can be extracted from it - * - * @author halmeida - * - */ -public class ConcatXML extends Extractor{ - - private String tag1; - private String tag2; - private String tag3; - - - public ConcatXML(){ - this.id = ""; - this.tag2 = ""; - this.tag3 = ""; - } - - public static void main(String[] args) throws IOException { - - PathConstants pathVars = new PathConstants(); - - String timeStamp = new SimpleDateFormat("yyyyMMdd_hh:mm").format(new Date()); - - String trainCorpusPath = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAIN_DIR +pathVars.TRAINING_FILE; - String xmlDir = "train"; - String sourceDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + "all_nbs/"+ xmlDir; - String duplicatesDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + "/src"+ "/annotated_GH27-36_2013_12_31"; - - String concatCorpus = pathVars.HOME_DIR + pathVars.CORPUS_DIR +"triagecorpus_"+ xmlDir +"_"+timeStamp+".xml"; - String tagCorpus = concatCorpus; - - ConcatXML concat = new ConcatXML(); - - //================= Checking for duplicates =====================// - //concat.checkDupCorpus(trainCorpusPath, sourceDir); - //concat.checkDupFolder(sourceDir, duplicatesDir); - - - //================== Creating corpus ==========================// - concat.cleanXML(sourceDir); - //concat.cleanXML(duplicatesDir); - concat.concatenateXML(sourceDir, "", concatCorpus); - concat.tagCorpus(tagCorpus); - } - - /** - * Reads the file IDs in a folder and - * checks a second folder for duplicates. 
- * - * @param dirSrc source folder - * @param dirDup folder to check for duplicates - */ - - public void checkDupFolder(String dirSrc, String dirDup){ - ArrayList sourceIDs = new ArrayList(); - ArrayList duplicated = new ArrayList(); - ArrayList dupIDs = new ArrayList(); - int ids = 0; - - if(dirSrc.contentEquals(dirDup)){ - System.out.println("Source and duplicates directories are the same.\n\n========================\n"); - } - else { - - File sourceDir = new File(dirSrc); - File[] srcXMLs = sourceDir.listFiles(new FilenameFilter(){ - @Override - public boolean accept(File dir, String name){ - return name.endsWith(".xml"); - } - }); - - try{ - //for each file on the source dir - for (File xml : srcXMLs){ - - try{ - BufferedReader reader = new BufferedReader(new FileReader(xml.getPath())); - - String line = null; - - String id = null; - - while((line = reader.readLine()) != null){ - - line = line.replaceAll("\t",""); - line = line.replace("\"", ""); - - //get the IDs of the new files - if (line.contains(getid())){ - - line = line.substring(line.indexOf(">", ""); - - id = line.replace(getendId(), ""); - - sourceIDs.add(id); - - line = reader.readLine(); - line = line.replaceAll("\t",""); - } - - if(line.contains(getOpenJournal())){ - ids++; - } - - line = line.replaceAll("\t",""); - line = line.replace("\"", ""); - } - - reader.close(); - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - - } - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch(Exception e){ - throw new RuntimeException(e); - } - - System.out.println(ids + " source file IDs encountered."); - ids = 0; - - File dupDir = new File(dirDup); - - File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){ - @Override - public boolean accept(File dir, String name){ - return name.endsWith(".xml"); - } - }); - - try{ - //for each file on the possibly duplicated dir - for (File xml : dupXMLs){ - - try{ - BufferedReader reader = new BufferedReader(new FileReader(xml.getPath())); - - String line = null; - - String id = null; - - while((line = reader.readLine()) != null){ - - line = line.replaceAll("\t",""); - line = line.replace("\"", ""); - - //get the IDs of the new files - if (line.contains(getid())){ - - line = line.substring(line.indexOf(">", ""); - - id = line.replace(getendId(), ""); - - dupIDs.add(id); - String dupFileID = id; - - for(int j = 0; j < sourceIDs.size(); j++){ - if(sourceIDs.get(j).equalsIgnoreCase(dupFileID)){ - //moving the original file - Path from = xml.toPath(); //convert from File to Path - Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path - Files.move(from, to, StandardCopyOption.REPLACE_EXISTING); - } - } - - - line = reader.readLine(); - line = line.replaceAll("\t",""); - } - - if(line.contains(getOpenJournal())){ - ids++; - } - - line = line.replaceAll("\t",""); - line = line.replace("\"", ""); - } - - reader.close(); - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - - } - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch(Exception e){ - throw new RuntimeException(e); - } - - //count number of existing papers on possibly duplicated folder - //just to make sure we are gathering all IDs - System.out.println(ids + " new file IDs encountered."); - ids = 0; - - //for each possible duplicated ID, - //check if it exists on source folder ID list - //if yes, list the duplicated ones - for(int i = 0; i < dupIDs.size(); i++){ - for(int j = 0; j < sourceIDs.size(); j++){ - 
if(sourceIDs.get(j).equalsIgnoreCase(dupIDs.get(i))){ - duplicated.add(dupIDs.get(i)); - } - } - } - - //sorting the list of duplicated IDs - Collections.sort(duplicated, new Comparator(){ - @Override - public int compare(String one, String two){ - return one.compareTo(two); - } - }); - - System.out.println("\nReaded source files: " + sourceIDs.size()); - System.out.println("Readed new files: " + dupIDs.size()); - - System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n"); - - System.out.println("\nDuplicated files IDs: "); - for(int i = 0; i < duplicated.size(); i++){ - System.out.println(duplicated.get(i)); - } - - System.out.println("\n========================\n"); - } - - - } - - /** - * Reads the corpus and checks the papers IDs - * to identify duplicates in case new papers - * are being concatenated to corpus. - * - * @param corpus path to current corpora to check - * @param dir path to folder with new files to be concatenated - */ - - public void checkDupCorpus(String corpus, String dir){ - ArrayList trainingIDs = new ArrayList(); - ArrayList duplicated = new ArrayList(); - ArrayList newFiles = new ArrayList(); - - int ids = 0; - - try - { - BufferedReader reader = new BufferedReader(new FileReader(corpus)); - - String line = null; - String id = null; - - - while((line = reader.readLine()) != null){ - - line = line.replaceAll("\t",""); - line = line.replace("\"", ""); - - //on the previous training corpus - //find exact paper ID and store it - if (line.contains(getid())){ - - line = line.substring(line.indexOf(">", ""); - - id = line.replace(getendId(), ""); - - //insert paper ID to existing training file list - trainingIDs.add(id); - - line = reader.readLine(); - line = line.replaceAll("\t",""); - } - - //count number of existing papers on the training file - //just to make sure we are gathering all IDs - if(line.contains(getOpenJournal())){ - ids++; - } - - line = line.replaceAll("\t",""); - line = line.replace("\"", ""); - } - - reader.close(); - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } - - System.out.println(ids + " training file IDs encountered."); - ids = 0; - - File corpusDir = new File(dir); - File[] newXMLs = corpusDir.listFiles(new FilenameFilter(){ - @Override - public boolean accept(File dir, String name){ - return name.endsWith(".xml"); - } - }); - - try{ - //for each file on the corpus dir - for (File xml : newXMLs){ - - try{ - BufferedReader reader = new BufferedReader(new FileReader(xml.getPath())); - - String line = null; - - String id = null; - - while((line = reader.readLine()) != null){ - - line = line.replaceAll("\t",""); - line = line.replace("\"", ""); - - //get the IDs of the new files - if (line.contains(getid())){ - - line = line.substring(line.indexOf(">", ""); - - id = line.replace(getendId(), ""); - - newFiles.add(id); - String newFileID = id; - - for(int j = 0; j < trainingIDs.size(); j++){ - if(trainingIDs.get(j).equalsIgnoreCase(newFileID)){ - //moving the original file - Path from = xml.toPath(); //convert from File to Path - Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path - Files.move(from, to, StandardCopyOption.REPLACE_EXISTING); - } - } - - - line = reader.readLine(); - line = line.replaceAll("\t",""); - } - - if(line.contains(getOpenJournal())){ - ids++; - } - - line = line.replaceAll("\t",""); - line = line.replace("\"", ""); - } - - reader.close(); - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - - } - 
- }catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch(Exception e){ - throw new RuntimeException(e); - } - - //count number of existing papers on the training file - //just to make sure we are gathering all IDs - System.out.println(ids + " new file IDs encountered."); - ids = 0; - - //for each new ID, check if it exists on training file ID list - //if yes, list the duplicated ones - for(int i = 0; i < newFiles.size(); i++){ - for(int j = 0; j < trainingIDs.size(); j++){ - if(trainingIDs.get(j).equalsIgnoreCase(newFiles.get(i))){ - duplicated.add(newFiles.get(i)); - } - } - } - - //sorting the list of duplicated IDs - Collections.sort(duplicated, new Comparator(){ - @Override - public int compare(String one, String two){ - return one.compareTo(two); - } - }); - - System.out.println("\nReaded training files: " + trainingIDs.size()); - System.out.println("Readed new files: " + newFiles.size()); - - System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n"); - - System.out.println("\nDuplicated files IDs: "); - for(int i = 0; i < duplicated.size(); i++){ - System.out.println(duplicated.get(i)); - } - - System.out.println("\n========================\n"); - - } - - - /** - * Reads and edits a list of XMLs files in a folder - * to remove XML and previous corpus tags, - * preparing the files to be concatenated. - * - * @param dir string with folder path - */ - - public void cleanXML(String dir){ - - //listing files on corpus dir - File sourceDir = new File(dir); - - File[] newXMLs = sourceDir.listFiles(new FilenameFilter(){ - @Override - public boolean accept(File dir, String name){ - return name.endsWith(".xml"); - } - }); - - System.out.println("... Files list loaded."); - - try{ - //for each file on the corpus dir - for (File xml : newXMLs){ - - try{ - BufferedReader reader = new BufferedReader(new FileReader(xml.getPath())); - - String line = null; - ArrayList allLines = new ArrayList(); - String content = null; - - while((line = reader.readLine()) != null){ - content = line; - - //cleaning XML markups - if(content.contains(getTag1())){ - content = content.replace(getTag1(), ""); - allLines.add(content); - } - if(content.contains(getTag2())){ - content = content.replace(getTag2(), ""); - allLines.add(content); - } - if(content.contains(getTag3())){ - content = content.replace(getTag3(), ""); - allLines.add(content); - } - - //cleaning previous corpus tags - if(content.contains(getOpenFile())){ - content = content.replace(getOpenFile(), ""); - allLines.add(content); - } - if(content.contains(getendFile())){ - content = content.replace(getendFile(), ""); - allLines.add(content); - } - - allLines.add(content); - } - - PrintWriter writer = new PrintWriter(xml.getPath()); - - for (String l : allLines){ - writer.println(l); - } - reader.close(); - writer.close(); - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - - } - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch(Exception e){ - throw new RuntimeException(e); - } - - System.out.println("... Files cleaned and saved."); - System.out.println("Ready for concatenation."); - System.out.println("\n========================\n"); - } - - /** - * Concatenates all XMLs in one folder or between two folders. - * @param sourceDir main directory with XML files. 
- * @param duplicDir second directory with duplicated XML files - * @param concatFile path name to saved concatenated corpus - */ - - public void concatenateXML(String sourceDir, String duplicDir, String concatFile){ - - final int BUFFER = 1024 << 8; - byte[] buffer = new byte[BUFFER]; - - //listing files on corpus dir - File srcDir = new File(sourceDir); - File[] srcXMLs = srcDir.listFiles(new FilenameFilter(){ - @Override - public boolean accept(File dir, String name){ - return name.endsWith(".xml"); - } - }); - - File dupDir = new File(duplicDir); - File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){ - @Override - public boolean accept(File dir, String name) { - return name.endsWith(".xml"); - } - }); - - System.out.println("... Files list loaded."); - - //defining the output file (concatenated) - File newCorpus = new File(concatFile); - - try{ - OutputStream output = new BufferedOutputStream(new FileOutputStream(newCorpus)); - - - //for each file on the corpus dir - for (File xmls : srcXMLs){ - InputStream input = new FileInputStream(xmls); - int count; - - //if the file is not empty/finished - try{ - while((count = input.read(buffer)) >= 0){ - - //write it on the concatenated final file - output.write(buffer, 0, count); - } - }finally{ - input.close(); - } - } - - if(dupXMLs != null){ - for(File xmld : dupXMLs){ - InputStream input = new FileInputStream(xmld); - int count; - - //if the file is not empty/finished - try{ - while((count = input.read(buffer)) >= 0){ - - //write it on the concatenated final file - output.write(buffer, 0, count); - } - }finally{ - input.close(); - } - } - } - output.flush(); - output.close(); - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch(Exception e){ - throw new RuntimeException(e); - } - - System.out.println("... File concatenated and saved."); - System.out.println("Ready for corpus tagging."); - System.out.println("\n========================\n"); - } - - /** - * Inserts corpus tag on XML file - * - * @param pathToCorpus path to - * concatenated corpus - */ - - public void tagCorpus(String pathToCorpus){ - - //tagging as corpus - try{ - BufferedReader reader = new BufferedReader(new FileReader(pathToCorpus)); - - String line = null; - String edit = null; - List allLines = new ArrayList(); - - //adds tag at beggining of corpus - allLines.add(getOpenFile()); - - while((line = reader.readLine()) != null){ - - allLines.add(line); - } - //adds tag at the end of corpus - allLines.add(getendFile()); - - System.out.println("... Corpus loaded and tagged."); - //re-writting the file - PrintWriter writer = new PrintWriter(pathToCorpus); - - for (String l : allLines){ - writer.println(l); - } - reader.close(); - writer.close(); - - System.out.println("... 
File saved as tagged corpus."); - } - catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch(IOException e){ - e.printStackTrace(); - } - } - - public String getTag1() { - return tag1; - } - - public void setTag1(String tag1) { - this.tag1 = tag1; - } - - public String getTag2() { - return tag2; - } - - public void setTag2(String tag2) { - this.tag2 = tag2; - } - - public String getTag3() { - return tag3; - } - - public void setTag3(String tag3) { - this.tag3 = tag3; - } - - -} - - diff --git a/src/analyse/Extractor.java b/src/analyse/Extractor.java deleted file mode 100644 index 8e91951..0000000 --- a/src/analyse/Extractor.java +++ /dev/null @@ -1,443 +0,0 @@ -/* - * The MIT License (MIT) - -Copyright (c) 2014 - -Hayda Almeida -Marie-Jean Meurs - -Concordia University -Tsang Lab - - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package analyse; - -import java.io.BufferedWriter; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.io.UnsupportedEncodingException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Iterator; -/** - * Implements common tools to FeatureExtractor - * and NgramExtractor classes that are used to - * extract features from doc instances - * - * @author halmeida - * - */ -public class Extractor { - - //String pathFile; - String id; - protected String endId; - String openFile; - String endFile; - String openAbst; - String closeAbst; - String abstractLabel; - String openEC; - String closeEC; - String classTag; - String openTitle; - String closeTitle; - String openJournal; - String closeJournal; - String copyR; - String closeCopyR; - - /** - * Replaces special characters to clean - * text for tokenizing. 
- *
- * @param str text to be cleaned
- * @return string with cleaned text
- */
-	public String removeSpecialChar(String str){
-		str = str.replace("}", "");
-		str = str.replace("{", "");
-		str = str.replace("]", "");
-		str = str.replace("[", "");
-		str = str.replace("#", "");
-		str = str.replace("*", "");
-		str = str.replace(">", "");
-		str = str.replace("&apos", "");
-		str = str.replace("%", "");
-		str = str.replace("&quot;", "");
-		str = str.replace("&", "");
-		str = str.replace("=", "");
-		str = str.replace("?", "");
-		str = str.replace(";", "");
-		str = str.replace(":", "");
-		str = str.replace(",", "");
-		str = str.replace(".", "");
-		str = str.replace(")", "");
-		str = str.replace("(", "");
-		str = str.replace("\t\t", "\t");
-		//losing ngrams because of hifen between names
-		str = str.replace("-", " ");
-		str = str.replace("  ", " ");
-
-		return str;
-	}
-
-	/**
-	 * Handles external tags (and multiple abstract
-	 * text tags) present in a single paper
-	 * @param str abstract content
-	 * @return string without external tags
-	 */
-
-	public String processAbstract(String str){
-		str = str.replace("  ", " ");
-		String[] remove = str.split("");
-		StringBuilder sb = new StringBuilder();
-		String temp = "";
-		String abstrac = "";
-
-		for(int i = 0; i < remove.length; i++){
-			temp = temp + remove[i];
-
-			if(temp.contains("<")){
-				do{
-					i++;
-				}while(!(remove[i]).equalsIgnoreCase(">"));
-			}
-			if(temp.contains("Copyright ")){
-				temp = "";
-				do{
-					i++;
-					//an exception here can mean that a copyright information
-					//tag content did not ended with a period
-				}while(!(remove[i]).equalsIgnoreCase("."));
-			}
-			else sb.append(remove[i]);
-		}
-
-		abstrac = sb.toString();
-		abstrac = removeAbstractTags(abstrac);
-
-		return abstrac;
-	}
-
-
-	/**
-	 * Removes specific tags encountered on Abstract texts.
-	 * This is used to clean the abstract text before
-	 * processing the feature count on the model.
-	 * @param str
-	 * @return
-	 */
-
-	public String removeAbstractTags(String str){
-		//this order of removing tags matters to
-		//exclude the first tag from the abstracts.
-
-		str = str.replace("", "");
-		str = str.replace("", "");
-		str = str.replace("", "");
-		str = str.replace("copyright", "");
-		str = str.replace("", "");
-		str = str.replace("", "");
-		str = str.replace("", "");
-		str = str.replace("", "");
-
-		return str;
-	}
-
-
-	/**
-	 * Removes the markup annotations of a
-	 * text field, and keeps its content
-	 *
-	 * @param str text containing markups
-	 * @return string with cleaned text
-	 */
-	public String removeTags(String str) {
-		String[] remove = str.split("");
-		StringBuilder sb = new StringBuilder();
-
-		for(int i = 0; i < remove.length; i++){
-
-			if(remove[i].equalsIgnoreCase("<")){
-				do{
-					i++;
-				}
-				while(!(remove[i].equalsIgnoreCase(">")));
-			}
-			else sb.append(remove[i]);
-		}
-
-		return sb.toString();
-	}
-
-
-	/**
-	 * Displays the keys and values of the
-	 * maps created.
- * - * @param hash HashMap containing list, - * values, counts - */ - public void displayList(HashMap hash){ - Iterator itr = hash.keySet().iterator(); - int sum = 0; - while(itr.hasNext()){ - Object str = itr.next(); - System.out.println("key: "+str+"\t value: "+hash.get(str)); - } - } - - - /** - * Exports hashmap of values extracted - * from dataset to external file - * - * @param location folder, file name and file extension - * @param list values to be exported - */ - public void exportFile(String location, HashMap list){ - - String SEPARATOR = "\t"; - StringBuffer line = new StringBuffer(); - Iterator itr = list.keySet().iterator(); - - try{ - BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(location), "UTF-8")); - - while(itr.hasNext()){ - Object str = itr.next(); - if(str != null){ - line.append(str).append(SEPARATOR).append(list.get(str)); - if(line.toString().contains("=")) - line.replace(line.indexOf("="), line.indexOf("=")+1,SEPARATOR); - //handling specificities from title content extraction - if(line.toString().contains(",")) - line.replace(line.indexOf(","), line.indexOf(",")+1,SEPARATOR); - } - if(itr.hasNext()){ - //writer.newLine(); - line.append("\n"); - } - writer.write(removeSpecialChar(line.toString())); - line.replace(0, line.length(), ""); - //writer.newLine(); - } - writer.flush(); - writer.close(); - } - catch(UnsupportedEncodingException e){ - e.printStackTrace(); - } - catch(FileNotFoundException e){ - e.printStackTrace(); - } - catch(IOException e){ - e.printStackTrace(); - } - - - //} - } - - - /** - * Exports list of values extracted - * from dataset to a string variable - * - * @param list list of values to be exported - * @return string containing values on list - * @deprecated - */ - public String exportContent(HashMap list){ - String SEPARATOR = "\t"; - Iterator itr = list.keySet().iterator(); - StringBuffer export = new StringBuffer(); - //try{ - while(itr.hasNext()){ - String str = itr.next(); - if(str != null){ - export.append(str).append(SEPARATOR).append(list.get(str)); - - if(export.toString().contains("=")) - export.replace(export.indexOf("="), export.indexOf("=")+1,SEPARATOR); - } - - if(itr.hasNext()){ - export.append("\n"); - } - } - /*} - catch(Exception e){ - - }*/ - - return removeSpecialChar(export.toString()); - } - - - /** - * Exports list of values extracted - * from dataset to external file - * - * @param location folder, file name and file extension - * @param list list of values to be exported - * - */ - public void exportList(String location, ArrayList list){ - - String SEPARATOR = "\n"; - StringBuffer line = new StringBuffer(); - - try{ - BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(location), "UTF-8")); - - for(int i = 0; i < list.size(); i++){ - String str = list.get(i); - if(str != null){ - line.append(str).append(SEPARATOR); - } - } - writer.write(removeSpecialChar(line.toString())); - - writer.flush(); - writer.close(); - } - catch(UnsupportedEncodingException e){ - e.printStackTrace(); - } - catch(FileNotFoundException e){ - e.printStackTrace(); - } - catch(IOException e){ - e.printStackTrace(); - } - - } - - - public void initialize(){ - - } - - - /** - * Accessors and mutators methods - * for Extractor variables. 
- * @return - */ - /*public String getPathFile() { - return pathFile; - } - public void setPathFile(String pathFile) { - this.pathFile = pathFile; - }*/ - public String getid() { - return id; - } - public void setid(String id) { - this.id = id; - } - public String getendId() { - return endId; - } - public void setendId(String endId) { - this.endId = endId; - } - public String getOpenFile() { - return openFile; - } - public void setOpenFile(String openFile) { - this.openFile = openFile; - } - public String getendFile() { - return endFile; - } - public void setendFile(String endFile) { - this.endFile = endFile; - } - public String getopenAbst() { - return openAbst; - } - public void setopenAbst(String openAbst) { - this.openAbst = openAbst; - } - public String getcloseAbst() { - return closeAbst; - } - public void setcloseAbst(String closeAbst) { - this.closeAbst = closeAbst; - } - public String getOpenEC() { - return openEC; - } - public void setOpenEC(String openEC) { - this.openEC = openEC; - } - public String getCloseEC() { - return closeEC; - } - public void setCloseEC(String closeEC) { - this.closeEC = closeEC; - } - public String getAbstractLabel() { - return abstractLabel; - } - public void setAbstractLabel(String abstractLabel) { - this.abstractLabel = abstractLabel; - } - public String getClassTag() { - return classTag; - } - public void setClassTag(String classTag) { - this.classTag = classTag; - } - public String getOpenTitle() { - return openTitle; - } - public void setOpenTitle(String titleTag) { - this.openTitle = titleTag; - } - public String getCloseTitle() { - return closeTitle; - } - public void setCloseTitle(String closeTitle) { - this.closeTitle = closeTitle; - } - public String getOpenJournal() { - return openJournal; - } - public void setOpenJournal(String openJournal) { - this.openJournal = openJournal; - } - public String getCloseJournal() { - return closeJournal; - } - public void setCloseJournal(String closeJournal) { - this.closeJournal = closeJournal; - } - -} \ No newline at end of file diff --git a/src/analyse/FeatureExtractor.java b/src/analyse/FeatureExtractor.java deleted file mode 100644 index 4d66d4f..0000000 --- a/src/analyse/FeatureExtractor.java +++ /dev/null @@ -1,526 +0,0 @@ -/* - * The MIT License (MIT) - -Copyright (c) 2014 - -Hayda Almeida -Marie-Jean Meurs - -Concordia University -Tsang Lab - - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package analyse; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; - -import org.apache.commons.lang3.StringUtils; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; - -import configure.PathConstants; -import filter.NaiveFilter; - - -/** - * This class extracts and parses domain - * annotation features from doc instances - * - * @author Hayda Almeida - * @since 2014 - * - */ - -public class FeatureExtractor extends Extractor{ - - public FeatureExtractor(){ - - this.id = "PMID"; - this.openAbst = "AbstractText"; - this.abstractLabel = "AbstractText "; - this.openEC = "RegistryNumber"; - this.classTag = "TRIAGE"; - this.openJournal = "Title"; - this.openTitle = "ArticleTitle"; - } - - - public static void main(String[] args) { - - PathConstants pathVars = new PathConstants(); - boolean verbose = false; - - String AnCorpus = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAIN_DIR + pathVars.TRAINING_FILE; - FeatureExtractor fextrac = new FeatureExtractor(); - NaiveFilter featFilter = new NaiveFilter(); - - //store all features, type and count - HashMap,Integer> abstract_count = new HashMap,Integer>(); - //store all features, type and classification - HashMap,String> abstract_type = new HashMap,String>(); - - //store title features, type and classification - HashMap,String> title_type = new HashMap,String>(); - //store title features, type and count - HashMap, Integer> title_count = new HashMap, Integer>(); - //store title features, whole journal title content and classification - HashMap,String> title_content = new HashMap,String>(); - //store title content and EC numbers - ArrayList ec_numbers = new ArrayList(); - - //store ID, class and features - HashMap PMIDs = new HashMap(); - - fextrac.initialize(); - int jTitle = 0; - - try - { - String line = null; - String features = null; - //Loading file - File input = new File(AnCorpus); - //Jsoup parse - Document doc = Jsoup.parse(input, "UTF-8"); - - Elements corpus = doc.body().getElementsByTag("pubmedarticleset"); - - //Fetching elements - - for(Element paper : corpus ){ - - //Fetching elements - Elements journalTitle = paper.getElementsByTag(fextrac.getOpenJournal()); - Elements title = paper.getElementsByTag(fextrac.getOpenTitle()); - Elements abstractC = paper.getElementsByTag(fextrac.getopenAbst()); - Elements ECnumber = paper.getElementsByTag(fextrac.getOpenEC()); - Elements classDoc = paper.getElementsByTag(fextrac.getClassTag()); - - String journal = ""; - String docID = ""; - String label = ""; - ArrayList tempList = new ArrayList(); - StringBuffer sb = new StringBuffer(); - - //fetching the paper ID - - //for all items in a paper, retrieve only PMIDs - for(Element e : paper.select(fextrac.getid())){ - //only consider the ID if the parent is medline citation - if(e.parentNode().nodeName().contains("medline")){ - docID = e.text(); - } - } - //fetch the doc label as well - if(classDoc.hasText()){ - label = classDoc.text(); - } - - PMIDs.put(docID, label); - - if(journalTitle.hasText()){ - - jTitle++; - journal = journalTitle.toString(); - journal = fextrac.removeSpecialChar(journal); - journal = fextrac.removeTags(journal); - } - - String title_annotation = ""; - if(title.hasText()){ - title_annotation = title.toString(); - title_annotation = 
fextrac.removeSpecialChar(title_annotation); - - tempList.addAll(fextrac.annotations(title_annotation, title_count, title_type, featFilter, pathVars)); - fextrac.addContent(title_annotation, journal, title_content, featFilter); - } - - String abstrac = ""; - if(abstractC.hasText()){ - abstrac = abstractC.toString(); - abstrac = fextrac.removeSpecialChar(abstrac); - abstrac = fextrac.removeAbstractTags(abstrac); - - tempList.addAll(fextrac.annotations(abstrac, abstract_count, abstract_type, featFilter, pathVars)); - } - - String ecnum = ""; - if(ECnumber.hasText()){ - for(Element number : ECnumber){ - ecnum = number.toString(); - if(ecnum.contains("EC")){ - ecnum = fextrac.removeSpecialChar(ecnum); - ecnum = fextrac.removeTags(ecnum); - ec_numbers.add(features); - } - } - } - - String triage = ""; - if(classDoc.hasText()){ - triage = classDoc.toString(); - triage = fextrac.removeSpecialChar(triage); - triage = fextrac.removeTags(triage); - - fextrac.addClass(triage, abstract_type); - fextrac.addClass(triage, title_type); - fextrac.addClass(triage, title_content); - } - -// for(int i = 0; i < tempList.size(); i++){ -// sb.append(tempList.get(i) + "-"); -// } -// -// PMIDs.put(docIDLabel, sb.toString()); - } - - } - - catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch (IOException e) { - e.printStackTrace(); - } - - if(verbose){ - //print list of extracted features - System.out.println("\n===========TITLE==ANNOTATIONS============="); - fextrac.displayList(title_count); - fextrac.displayList(title_type); - fextrac.displayList(title_content); - System.out.println("\n========ABSTRACT==ANNOTATIONS============="); - fextrac.displayList(abstract_count); - fextrac.displayList(abstract_type); - } - - //filter features by occurence - featFilter.considerAnnotationOccurence(abstract_count, pathVars); - featFilter.considerAnnotationOccurence(title_count, pathVars); - - System.out.println("\n===========FEATURE==EXPORT==============="); - fextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.DOC_IDS, PMIDs); - System.out.println("..."+ PMIDs.size()+" document IDs listed."); - fextrac.exportList(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.ECNUM_FEATURES, ec_numbers); - System.out.println("..."+ ec_numbers.size()+" EC numbers saved."); - fextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.ANNOTATION_FEATURES, abstract_count); - System.out.println("..."+ abstract_count.size()+" unique Abstract annotations saved."); - fextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.TITLE_FEATURES, title_count); - System.out.println("..."+ title_count.size() +" unique Title annotations saved."); - fextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.JOURNAL_TITLE_FEATURES, title_content); - System.out.println("..."+jTitle+" Journal titles saved."); - System.out.println("\n=========================================\n"); - - } - - /** - * Identifies the classification on doc - * - * @param clas text containing classification (after char removal) - * @return classification of doc - */ - private String getClassif(String clas) { - - //parsing the not edited text into HTML using Jsoup - Document doc = Jsoup.parseBodyFragment(clas); - //saving the text as an Jsoup element, with a main tag (the HTML body), - //attributes and child nodes (TRIAGE tags) - Element text = doc.body(); - - Elements classification = text.getElementsByTag("TRIAGE"); - - return classification.text(); - } - - /** - * Inserts the classification - * on the list of 
features - * - * @param class information to insert on list - * @param list list of features used - */ - private void addClass(String element, HashMap, String> list){ - //going over list to insert - //classif on document instances - Iterator>it = list.keySet().iterator(); - - while(it.hasNext()){ - Map str = it.next(); - - if(list.get(str).contains(element)){ - //if(list.get(str).contains("positive") || list.get(str).contains("negative")){ - - } - else list.put(str, element); - } - } - - - /** - * Extract the annotations from a determined section - * of the document and add them to the specified lists. - * - * @param annotation cleaned and splitted line with annotation - * @param count list that holds annotation, its type and its count - * @param type list that holds annotation, its type and its classification - */ - private ArrayList annotations(String annot, HashMap, Integer> count, HashMap,String> type, NaiveFilter filter, PathConstants pathVars) { - HashMap features = loadAnnotationEntities(); - PathConstants pathVar = new PathConstants(); - NgramExtractor nextrac = new NgramExtractor(); - ArrayList content = new ArrayList(); - - //parsing the not edited text into HTML using Jsoup - Document doc = Jsoup.parseBodyFragment(annot); - //saving the text as an Jsoup element, with a main tag (the HTML body), - //attributes and child nodes (annotation tags) - Element annotations = doc.body(); - - //iterating over list of entities - for(Map.Entry value : features.entrySet()){ - - String an_type = value.getKey(); - String an_level = value.getValue(); - - //for each entity, find the annotations on abstract - Elements annots = annotations.getElementsByTag(an_type); - - //for each annotation found, - for(Element an : annots){ - - //grabbing annotation content: - //if the annotation is made on the sentence level: - if(an_level.contains("sentence")){ - - //checkingh if sentence contains inner annotations - if(an.childNodeSize() != 0){ - - //going over list of inner annotations - for(Element child : an.children()){ - - //if child is sentence (sentence inside of sentence), - //then add annotations as ngrams on this - if(features.get(child.nodeName()).contains("sentence")) { - content.addAll(nextrac.nGrams(child.text(), filter, pathVar)); - insertAnnotation(content, an.nodeName(), count, type, pathVars); - } - //adding annotations on sentence as they are - no ngrams on this - else { - content.add(child.text()); - insertAnnotation(content, an.nodeName(), count, type, pathVars); - } - } - - //removing inner annotations from sentence, they are already added - Element tempAnnot = an.clone(); - tempAnnot.children().remove(); - - //splitting content in ngrams to whats left on the sentence - content.addAll(nextrac.nGrams(tempAnnot.text(), filter, pathVar)); - insertAnnotation(content, an.nodeName(), count, type, pathVars); - } - - } - else { - //keeping original annotation content for other cases - content.add(an.text()); - insertAnnotation(content, an.nodeName(), count, type, pathVars); - } - } - - } - return content; - - } - - - /** - * Insert annotation (or ngram list of annotation) - * on lists, used on @annotations method - * @param content content of annotation - * @param an_type type extracted from text (entity) - * @param count list of annotations and their count - * @param type list of annotations and their type - */ - private void insertAnnotation(ArrayList content, String an_type, HashMap, Integer> count, HashMap,String> type, PathConstants pathVars){ - - //iterating over list of annotations - 
for(int i = 0; i < content.size(); i++){ - - if(content.get(i).length() >= Integer.parseInt(pathVars.FEATURE_MIN_LENGTH)){ - - //creating the list key as: content - type mapping - Map an_content = new HashMap(); - an_content.put(content.get(i), an_type); - - //for each annotation (or ngram on annotation) - //insert content and related type - if(count.containsKey(an_content)){ - try{ - int cnt = count.get(an_content); - count.put(an_content, cnt+1); - - }catch(Exception e){ - count.put(an_content, 1); - } - } - else{ - count.put(an_content, 1); - } - //populating list of feature_an_types, with: - //feature--an_type--class - type.put(an_content, ""); - } - } - - content.clear(); - - } - - - /** - * Inserts the text (e.g.title) content into - * a list of features (e.g.title features) - * - * @param annot text with the annotations to be handled - * @param wContent whole field to be added on the list of features - * @param list features used - * - */ - private void addContent(String annot, String wContent, HashMap,String> list, NaiveFilter filter) { - - HashMap features = loadAnnotationEntities(); - ArrayList content = new ArrayList(); - NgramExtractor nextrac = new NgramExtractor(); - PathConstants pathVar = new PathConstants(); - - //parsing not edited text into HTML using Jsoup - Document doc = Jsoup.parseBodyFragment(annot); - //saving the text as an Jsoup element, with a main tag (the HTML body), - //attributes and child nodes (annotation tags) - Element annotations = doc.body(); - - //iterating over annotation types - for(Map.Entry value : features.entrySet()){ - - String an_type = value.getKey(); - String an_level = value.getValue(); - - //for each annotation type, find all related annotations on the abstract - Elements annots = annotations.getElementsByTag(an_type); - - //for each annotation type, - for(Element an : annots){ - - //grab annotation content - if(an_level.contains("sentence")) - //splitting in ngrams for sentence level annotations - content = nextrac.nGrams(an.text(), filter, pathVar); - else - //keeping original annotation for other cases - content.add(an.text()); - - //iterating over list of annotations - for(int i = 0; i < content.size(); i++){ - - Map an_content = new HashMap(); - an_content.put(content.get(i), wContent); - - //populating list of feature_an_types, with: - //feature--an_type--class - list.put(an_content, ""); - } - content.clear(); - } - } - } - - - /** - * Loads list of entities from external file - * - * @param str list of entities - * @param pathVar constants from - * @return - */ - public HashMap loadAnnotationEntities(){ - - String pathEntities = "entities.txt"; - HashMap values = new HashMap(); - - try{ - BufferedReader reader = new BufferedReader(new FileReader(pathEntities)); - - String line = null; - - while((line = reader.readLine()) != null){ - - String[] value = StringUtils.split(line, " "); - values.put(value[0].toLowerCase(), value[1].toLowerCase()); - } - - reader.close(); - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } - //String[] entities = values.toArray(new String[values.size()]); - - return values; - } - - - /** - * Handles the content of annotations; when - * there is multiple elements, they are - * concatenated after extracted - * - * @param str list of annotation elements - * @return single string with all elements - */ - public String contentToString(String[] str){ - String cont = ""; - - for(int i = 0; i < str.length; i++){ - if(cont.contentEquals("")){ - cont 
= cont + str[i]; - } - else cont = cont+" "+ str[i]; - - } - - return cont; - } - - - -} diff --git a/src/analyse/NgramExtractor.java b/src/analyse/NgramExtractor.java deleted file mode 100644 index c101c25..0000000 --- a/src/analyse/NgramExtractor.java +++ /dev/null @@ -1,340 +0,0 @@ -/* - * The MIT License (MIT) - -Copyright (c) 2014 - -Hayda Almeida -Marie-Jean Meurs - -Concordia University -Tsang Lab - - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package analyse; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Map; - -import org.apache.commons.lang3.StringUtils; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; - -import configure.PathConstants; -import filter.NaiveFilter; - -/** - * This class extracts and parses n-grams - * from XML doc instances. 
- * - * @author Hayda Almeida - * @since 2014 - * - */ - -public class NgramExtractor extends Extractor{ - - public NgramExtractor(){ - - //defining relevant paper text fields - this.id = "PMID"; - this.openJournal = "Title"; - this.openAbst = "AbstractText"; - this.openEC = "RegistryNumber"; - this.classTag = "TRIAGE"; - this.openTitle = "ArticleTitle"; - } - - - public static void main(String[] args) { - - PathConstants pathVars = new PathConstants(); - boolean verbose = false; - - String AnCorpus = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAIN_DIR +pathVars.TRAINING_FILE; - NgramExtractor nextrac = new NgramExtractor(); - NaiveFilter featFilter = new NaiveFilter(); - - //store abstract ngrams and its count - HashMap ngram_count = new HashMap(); - //store abstract ngrams and doc ID - HashMap ngram_ID = new HashMap(); - //store title ngrams and its count - HashMap ngram_title_count = new HashMap(); - //store title ngrams, count and "relevance(TBD)" - HashMap,Integer> ngram_title = new HashMap,Integer>(); - //store ID and label of documents - HashMap PMIDs = new HashMap(); - - nextrac.initialize(); - - try - { - - //Loading file - File input = new File(AnCorpus); - //Jsoup parse - Document doc = Jsoup.parse(input, "UTF-8"); - - Elements corpus = doc.body().getElementsByTag("pubmedarticleset"); - - //Fetching elements - - for(Element paper : corpus ){ - - Elements journalTitle = paper.getElementsByTag(nextrac.getOpenJournal()); - Elements title = paper.getElementsByTag(nextrac.getOpenTitle()); - Elements abstractC = paper.getElementsByTag(nextrac.getopenAbst()); - Elements ECnumber = paper.getElementsByTag(nextrac.getOpenEC()); - Elements classDoc = paper.getElementsByTag(nextrac.getClassTag()); - - String journal = ""; - String docID = ""; - String label = ""; - int jTitle = 0; - - //fetching the paper ID - - //for all items in a paper, retrieve only PMIDs - for(Element e : paper.select(nextrac.getid())){ - //only consider the ID if the parent is medline citation - if(e.parentNode().nodeName().contains("medline")){ - docID = e.text(); - } - } - //fetch the doc label as well - if(classDoc.hasText()){ - label = classDoc.text(); - } - - PMIDs.put(docID, label); - - //Extracting the Journal Title - if(journalTitle.hasText()){ - jTitle++; - journal = journalTitle.toString(); - journal = nextrac.removeSpecialChar(journal); - journal = nextrac.removeTags(journal); - } - - String tit_content = ""; - //Extracting the Paper Title - if(title.hasText()){ - tit_content = title.toString(); - tit_content = nextrac.removeSpecialChar(tit_content); - tit_content = nextrac.removeTags(tit_content); - - ArrayList title_c = nextrac.nGrams(tit_content, featFilter, pathVars); - nextrac.addNGram(title_c, ngram_title_count, pathVars); - } - - String abstrac = ""; - //Extracting the Paper abstract - if(abstractC.hasText()){ - abstrac = abstractC.toString(); - abstrac = nextrac.removeTags(abstrac); - abstrac = nextrac.removeSpecialChar(abstrac); - abstrac = nextrac.removeAbstractTags(abstrac); - - ArrayList abstract_c = nextrac.nGrams(abstrac, featFilter, pathVars); - nextrac.addNGram(abstract_c, ngram_count, pathVars); - } - } - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } - - if(verbose){ - //print list of extracted n-grams - nextrac.displayList(PMIDs); - System.out.println("\n========ABSTRACT==NGRAMS============="); - nextrac.displayList(ngram_count); - nextrac.displayList(ngram_title); - 
System.out.println("\n===========TITLE==NGRAMS============="); - nextrac.displayList(ngram_title_count); - } - - //filter features by occurence - featFilter.considerNgramOccurence(ngram_count, pathVars); - featFilter.considerNgramOccurence(ngram_title_count, pathVars); - - System.out.println("\n===========NGRAMS==EXPORT===============\n"); - nextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.DOC_IDS, PMIDs); - System.out.println("..."+ PMIDs.size()+" document IDs listed."); - nextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.NGRAM_FEATURES, ngram_count); - System.out.println("..."+ ngram_count.size()+" unique Abstract ngrams saved."); - nextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.TITLE_NGRAMS, ngram_title_count); - System.out.println("... "+ ngram_title_count.size() +" unique Title ngrams saved."); - System.out.println("\n========================================\n"); - - } - - - /** - * Inserts ngrams into list of features - * with a mapping for ngram count - * @param str relation of ngrams extracted - * @param list_count mapping for ngram counts - * @param pathVars - */ - - private void addNGram(ArrayList str, HashMap list_count, PathConstants pathVars){ - - //iterating over ngram list - for(int i = 0; i < str.size(); i++){ - String currentNGram = str.get(i); - - //checking existence of current ngram on list mapping - if(list_count.containsKey(currentNGram)){ - //retrieve the amount of current ngrams on mapping - int count = list_count.get(currentNGram); - //insert the updated count of ngrams - list_count.put(currentNGram, count+1); - } - else { - //insert ngram on mapping list - if(currentNGram.length() >= Integer.parseInt(pathVars.FEATURE_MIN_LENGTH)){ - list_count.put(currentNGram, 1); - } - } - } - } - - /** - * Extracts n-grams from a given content field - * - * @param str text to extract ngrams - * @return list of extracted grams - */ - public ArrayList nGrams(String str, NaiveFilter filter, PathConstants pathVar){ - - //removing ASCII special characters - str = str.replace("/", ""); - str = str.replace("\\", ""); - //str = str.replace("\n", " "); - str = str.replaceAll("\\s+"," "); - str = str.replace(" ", "-"); - - //Tokenizing the sentence - String[] words = StringUtils.split(str,"-"); - ArrayList ngramList = new ArrayList(); - - int ngram =Integer.parseInt(pathVar.NGRAM_SIZE); - - //Stop-words removal - if(Boolean.valueOf(pathVar.NGRAM_STOP)){ - words = StringUtils.split(filter.removeStopList(words, pathVar)," "); - } - - //extracting ngrams according to gram size (1, 2, 3) - for(int i=0; i < words.length - (ngram - 1); i++){ - switch(pathVar.NGRAM_SIZE){ - case "1": - ngramList.add(words[i].toLowerCase()); - break; - case "2": - ngramList.add(words[i].toLowerCase()+" "+words[i+1].toLowerCase()); - break; - case "3": - ngramList.add(words[i].toLowerCase()+" "+words[i+1].toLowerCase()+" "+words[i+2].toLowerCase()); - break; - } - } - - return ngramList; - } - -// /** -// * Removes stopwords from ngrams list -// * -// * @param str list of ngrams -// * @param constants -// * @return cleaned list of ngrams -// */ -// public String removeStopList(String[] str, PathConstants pathVar){ -// -// //stop-words file name -// String pathStop = "stopList.txt"; -// String[] stop = null; -// StringBuilder cleaned = new StringBuilder(); -// -// try{ -// -// BufferedReader reader = new BufferedReader(new FileReader(pathStop)); -// -// String line = null; -// //loading stop-words list -// while((line = reader.readLine()) != null){ 
-// stop = StringUtils.split(line,","); -// line = reader.readLine(); -// } -// -// reader.close(); -// -// }catch (FileNotFoundException e) { -// e.printStackTrace(); -// } catch (IOException e) { -// e.printStackTrace(); -// } -// -// //iteraing over text to be cleaned -// for(int i = 0; i < str.length; i++){ -// //iterating over stop-words list -// for(int j = 0; j < stop.length; j++){ -// -// //when stop-word is encountered, replace it -// if(str[i].equalsIgnoreCase(stop[j])){ -// str[i] = str[i].replace(str[i],"*"); -// } -// } -// //retrieve the text without stop-words replacements -// if(!(str[i].contentEquals("*"))){ -// cleaned.append(str[i]).append(" "); -// } -// } -// return cleaned.toString().replace(" ", " "); -// } - - - /** - * Displays the keys and values of the - * maps created with n-grams and counts. - * @param hash HashMap containing n-grams - */ - @Override - public void displayList(HashMap hash){ - super.displayList(hash); - //sum = sum + hash.get(str); - System.out.println("\n=======================================\n"); - System.out.println("Number of unique n-grams: "+hash.size()); - System.out.println("\n=======================================\n"); - } - - - -} diff --git a/src/arffmatrix/.gitignore b/src/arffmatrix/.gitignore deleted file mode 100644 index ec5761d..0000000 --- a/src/arffmatrix/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -/buildmodel.class -/buildtest.class diff --git a/src/arffmatrix/BuildModel.class b/src/arffmatrix/BuildModel.class deleted file mode 100644 index 0be977c..0000000 Binary files a/src/arffmatrix/BuildModel.class and /dev/null differ diff --git a/src/arffmatrix/BuildModel.java b/src/arffmatrix/BuildModel.java deleted file mode 100644 index f8d0fac..0000000 --- a/src/arffmatrix/BuildModel.java +++ /dev/null @@ -1,301 +0,0 @@ -/* - * The MIT License (MIT) - -Copyright (c) 2014 - -Hayda Almeida -Marie-Jean Meurs - -Concordia University -Tsang Lab - - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- -*** -* This class re-uses https://code.google.com/p/deft2013/source/browse/trunk/src/corpus/buildmodel.java -* The code authors: Eric Charton http://www.echarton.com twitter.com/ericcharton -* Marie-Jean Meurs http://mjmrsc.com/research/ twitter.com/mjmrsc -* -* This software is free to use, modify and redistribute under Creative Commons by-nc/3.0 License Term -* http://creativecommons.org/licenses/by-nc/3.0/ -*/ - -package arffmatrix; - -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.FileWriter; -import java.io.IOException; -import java.text.SimpleDateFormat; -import java.util.Date; -import analyse.Extractor; -import arffvector.CreateVector; -import configure.PathConstants; - -/** - * This class reads the corpus instances and uses - * the CreateVector class to generate a model file (ARFF) * - * - * @author Hayda Almeida, Marie-Jean Meurs - * @since 2014 - * - */ - -public class BuildModel { - - private static String configfile = null; - - public static void main(String[] args) { - - //----------------------------------- - // instantiate classes of constants - // and configuration file. - //----------------------------------- - - PathConstants pathVars; - - if (configfile == null){ - pathVars = new PathConstants(); - } - else{ - pathVars = new PathConstants(configfile); - } - - Extractor model = new Extractor(); - model.initialize(); - CreateVector vectorgenerator = new CreateVector(pathVars); - String attributes = vectorgenerator.informFeatures(pathVars); - System.out.println("Features loaded ..."); - - // name output ARFF files - String timeStamp = new SimpleDateFormat("yyyyMMdd_hh:mm").format(new Date()); - String arffFileName = "triage" + pathVars.EXP_TYPE + attributes +"_"+ timeStamp + ".arff"; - - try - { - //by default - String sortarffFileName = pathVars.HOME_DIR + pathVars.OUTPUT_MODEL + arffFileName; // default - - // create file - BufferedWriter out = new BufferedWriter(new FileWriter(sortarffFileName)); - - // load ARFF header and write it - String outHeaderArff = vectorgenerator.genArffHeader(pathVars,Integer.parseInt(pathVars.EXP_TYPE)); - //System.out.println(outHeaderArff); // verbose - out.write(outHeaderArff + "\n"); - - // reader for corpus - BufferedReader reader = null; - //train corpus - if(Integer.parseInt(pathVars.EXP_TYPE) == 0) - reader = new BufferedReader(new FileReader(pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAIN_DIR + pathVars.TRAINING_FILE)); - //test corpus - else if(Integer.parseInt(pathVars.EXP_TYPE) ==1) - reader = new BufferedReader(new FileReader(pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TEST_DIR + pathVars.TEST_FILE)); - - //-------------------------------------------- - // repeat until all lines have been read - // from the file - //-------------------------------------------- - String text = null; - String content = null; - - String abstracttext = ""; - String journaltitle = ""; - String title = ""; - String ecnumber = ""; - String classtriage = ""; - int hasText = 0; - int journaltitlecount = 0; - int abstracttitlecount = 0; - int abstracttextcount = 0; - int positivecount = 0; - int negativecount = 0; - - - while ((text = reader.readLine()) != null) { - - // detect a PubMed abstract - if (text.contains("")){ - - // Reinitialize journal title - journaltitle = ""; - - // Reinitialize abstract title - title = ""; - - // Reinitialize abstract text - abstracttext = ""; - - // Reinitialize hasText to false - hasText = 0; - - String 
pmid = text.replaceFirst(".*", ""); - pmid = pmid.replace("", ""); - System.out.println("PMID : " + pmid); - - // continue to read - content = reader.readLine(); - content = content.replaceAll("\t", ""); - content = content.replaceFirst("\\s+", ""); - - while ( ! content.contentEquals("") ) { - - if (content.contains("")){ - - journaltitlecount++; - - content = content.replace("<Title>", ""); - content = content.replace("", ""); - journaltitle = content; - System.out.println("Journal title : " + content); - } - - if (content.contains("")){ - - abstracttitlecount++; - - content = content.replace("", ""); - content = content.replace("", ""); - title = content; - System.out.println("Paper title : " + content); - } - - - if (content.contains("")){ - - abstracttextcount++; - hasText = 1; // use it to indicate if the abstract has some text or not - - content = content.replace("", ""); - - //checks if there are empty lines after AbstractText tag - //and keeps reading until finds the abstract content - while(content.isEmpty()){ - content = reader.readLine(); - } - abstracttext = abstracttext + content; - // clean - abstracttext = model.removeAbstractTags(abstracttext); - - - content = reader.readLine(); - // converting toLowerCase is not relevant in bio context - // because it introduces ambiguities (ie Gene name / Enzyme alias) - // abstracttext = abstracttext.toLowerCase(); - } - - if (content.contains("")){ - temp = temp + model.processAbstract(content); - } - else{ - do{ - temp = temp + model.processAbstract(content); - content = reader.readLine(); - }while(!(content.contains(""))); - } - - newAbs = newAbs + temp; - content = newAbs + ""; - - abstracttext = content; - abstracttext = model.removeAbstractTags(abstracttext); - - content = reader.readLine(); - - } - - if (content.contains("EC ")){ - content = content.replace("EC ", ""); - content = content.replace("", ""); - ecnumber = content; - } - - if (content.contains("")){ - - content = content.replace("", ""); - content = content.replace("", ""); - classtriage = content; - if(content.contains("positive")){ - positivecount++; - } - if(content.contains("negative")){ - negativecount++; - } - System.out.println("Triage classification : " + content); - } - - content = reader.readLine(); - content = content.replaceAll("\t", ""); - content = content.replaceFirst("\\s+", ""); - } - - System.out.println("Abstract : " + abstracttext.toString() + "\n\n"); - - // end of if: collect data and write ARFF - String Arffline = vectorgenerator.getArffLine(pmid, - journaltitle, - title, - abstracttext, - ecnumber, - classtriage, - Integer.parseInt(pathVars.EXP_TYPE) - ); - - Arffline = Arffline + "\n"; - // write line on disc - out.write(Arffline); - // out.write(id + " " + Arffline + "\n"); // - } - - } - - System.out.println( - "Abstracts processed: " + abstracttitlecount - + "\t with text content: " + abstracttextcount - + "\t from " + journaltitlecount + " journals" - + "\nTotal of: \n" + positivecount + " positive" - + "\t and " + negativecount + " negative documents"); - out.write("\n"); - out.close(); - - reader.close(); - - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } - - } - - - -} - - - diff --git a/src/arffvector/.gitignore b/src/arffvector/.gitignore deleted file mode 100644 index bdc0ba3..0000000 --- a/src/arffvector/.gitignore +++ /dev/null @@ -1,7 +0,0 @@ -/buildvector.class -/FeatureVector.class -/CreateVector.class -/CreateWeightedVector.class -/ArbitraryWeight.class 
-/CountsWeightedVector.class -/ArbitraryWeightedVector.class diff --git a/src/arffvector/CreateVector.java b/src/arffvector/CreateVector.java deleted file mode 100644 index b112ea5..0000000 --- a/src/arffvector/CreateVector.java +++ /dev/null @@ -1,893 +0,0 @@ -/* - * The MIT License (MIT) - -Copyright (c) 2014 - -Hayda Almeida -Marie-Jean Meurs - -Concordia University -Tsang Lab - - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -*** -* This class re-uses https://code.google.com/p/deft2013/source/browse/trunk/src/vector/buildvector.java -* The code authors: Eric Charton http://www.echarton.com twitter.com/ericcharton -* Marie-Jean Meurs http://mjmrsc.com/research/ twitter.com/mjmrsc -* -* This software is free to use, modify and redistribute under Creative Commons by-nc/3.0 License Term -* http://creativecommons.org/licenses/by-nc/3.0/ -*/ - - - - -package arffvector; - -import java.io.BufferedReader; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.IOException; -import java.util.ArrayList; -import org.apache.commons.lang3.StringUtils; -import configure.PathConstants; - -/** - * Uses the features extracted and the - * generated corpus to create a feature vector - * (a matrix representation of the corpus) - * - * @author Hayda Almeida, Marie-Jean Meurs - * @since 2014 - * - */ -public class CreateVector { - - ArrayList annotations = new ArrayList(); - ArrayList annotationsType = new ArrayList(); - ArrayList journalTitles = new ArrayList(); - ArrayList ecnumbers = new ArrayList(); - ArrayList titleGrams = new ArrayList(); - ArrayList titleAnnot = new ArrayList(); - ArrayList nGrams = new ArrayList(); - ArrayList docID = new ArrayList(); - - PathConstants pathVars = null; - - /** - * Constructor to load all features extracted - * from training files. These features will be - * used to generate the ARFF header and the - * ARFF vector lines. 
- * - * @param extVars Variables holding system paths - */ - - public CreateVector(PathConstants extVars) { - - pathVars = extVars; - - String pathJournalT = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.JOURNAL_TITLE_FEATURES; - try{ - String journalT = ""; - - //receiving journal title - BufferedReader reader = new BufferedReader(new FileReader(pathJournalT)); - int featcount = 0; - while (( journalT = reader.readLine()) != null) { - - if (Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)){ - - String[] features = StringUtils.split(journalT,"\n"); - - for(int i = 0; i < features.length; i++){ - - String[] featurename = StringUtils.split(features[i],"\t"); - - //checking for journal titles duplicates - if(featurename[1] != "" && !(journalTitles.contains(featurename[1]))){ - journalTitles.add(featurename[1]); - } - } - } - if ( featcount >= Integer.parseInt(pathVars.NB_PARAMS) && Integer.parseInt(pathVars.NB_PARAMS) != -1 ) { break;} - - } - reader.close(); - } - catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch (IOException e) { - e.printStackTrace(); - } - - String pathAnnotations = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.ANNOTATION_FEATURES; - String pathTitleAnnot = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.TITLE_FEATURES; - - try{ - String abstAnnot = ""; - String tAnnot = ""; - - //receiving abstract annotations (features) - BufferedReader reader = new BufferedReader(new FileReader(pathAnnotations)); - BufferedReader readerT = new BufferedReader(new FileReader(pathTitleAnnot)); - - int featcount = 0; - - while (( abstAnnot = reader.readLine()) != null) { - - if (Boolean.valueOf(pathVars.USE_ANNOTATION_FEATURE)){ - String[] features = StringUtils.split(abstAnnot,"\n"); - - for(int i = 0; i < features.length; i++){ - - String[] featurename = StringUtils.split(features[i],"\t"); - - //checking for duplicate abstract annotations - if(featurename[0] != "" && !(annotations.contains(featurename[0]))){ - annotations.add(featurename[0]); - } - } - } - if ( featcount >= Integer.parseInt(pathVars.NB_PARAMS) && Integer.parseInt(pathVars.NB_PARAMS) != -1 ) { break;} - } - - - if(!(Boolean.valueOf(pathVars.USE_TITLE_FEATURE))){ - while((tAnnot = readerT.readLine()) != null){ - - String[] features = StringUtils.split(tAnnot,"\n"); - - for(int i = 0; i < features.length; i++){ - - String[] featurename = StringUtils.split(features[i],"\t"); - - //checking for duplicate annotations - if(featurename[0] != "" && !(annotations.contains(featurename[0]))){ - annotations.add(featurename[0]); - } - } - - } - - } - - reader.close(); - readerT.close(); - } - catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch (IOException e) { - e.printStackTrace(); - } - - try{ - String abstAnType = ""; - - //receiving abstract annotation types - BufferedReader reader = new BufferedReader(new FileReader(pathAnnotations)); - int featcount = 0; - while (( abstAnType = reader.readLine()) != null) { - - if (Boolean.valueOf(pathVars.USE_ANNOTATION_TYPE)){ - - String[] features = StringUtils.split(abstAnType,"\n"); - - for(int i = 0; i < features.length; i++){ - - String[] featurename = StringUtils.split(features[i],"\t"); - - //checking for duplicate abstract annotation types - if(featurename[1] != "" && !(annotationsType.contains(featurename[1]))){ - annotationsType.add(featurename[1]); - } - - } - } - if ( featcount >= Integer.parseInt(pathVars.NB_PARAMS) && Integer.parseInt(pathVars.NB_PARAMS) != -1 ) { break;} - - } - reader.close(); - } - catch 
(FileNotFoundException e) { - e.printStackTrace(); - } - catch (IOException e) { - e.printStackTrace(); - } - - - - try{ - String titAnnot = ""; - - //receiving title annotations (features) - BufferedReader reader = new BufferedReader(new FileReader(pathTitleAnnot)); - // int featcount = 0; - while (( titAnnot = reader.readLine()) != null) { - - if(Boolean.valueOf(pathVars.USE_TITLE_FEATURE)){ - - //String titAnnot = FeatureExtractor.getTitCount(); - - String[] features = StringUtils.split(titAnnot,"\n"); - - for(int i = 0; i < features.length; i++){ - String[] featurename = StringUtils.split(features[i],"\t"); - - //checking for duplicate title annotations - if(!(titleAnnot.contains(featurename[0]))){ - titleAnnot.add(featurename[0]); - } - } - } - } - reader.close(); - } - catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch (IOException e) { - e.printStackTrace(); - } - - - String pathECNumFeatures = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.ECNUM_FEATURES; - - try{ - String ECNum = ""; - - //receiving EC numbers (features) - BufferedReader reader = new BufferedReader(new FileReader(pathECNumFeatures)); - // int featcount = 0; - while ((ECNum = reader.readLine()) != null) { - - if(Boolean.valueOf(pathVars.USE_ECNUM_FEATURE)){ - - //String titAnnot = FeatureExtractor.getTitCount(); - - String[] features = StringUtils.split(ECNum,"\n"); - - for(int i = 0; i < features.length; i++){ - String[] featurename = StringUtils.split(features[i],"\t"); - - //checking for duplicate EC numbers - if(!(ecnumbers.contains(featurename[0]))){ - ecnumbers.add(featurename[0]); - } - } - } - } - reader.close(); - } - catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch (IOException e) { - e.printStackTrace(); - } - - - String pathTitleGrams = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.TITLE_NGRAMS; - - - try{ - String titCont = ""; - // String grams = ""; - - //receiving title ngrams - BufferedReader reader = new BufferedReader(new FileReader(pathTitleGrams)); - - int featcount = 0; - while (( titCont = reader.readLine()) != null) { - - if(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)){ - - String[] content = StringUtils.split(titCont,"\n"); - - for(int i = 0; i < content.length; i++){ - String[] featurename = StringUtils.split(content[i],"\t"); - - //check for duplicate title ngrams - if(!(titleGrams.contains(featurename[0]))){ - titleGrams.add(featurename[0]); - } - } - } - } - - reader.close(); - - } - catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch (IOException e) { - e.printStackTrace(); - } - - String pathNgrams = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.NGRAM_FEATURES; - try{ - String grams = ""; - String tgrams = ""; - - //receiving ngrams - BufferedReader reader = new BufferedReader(new FileReader(pathNgrams)); - BufferedReader readerT = new BufferedReader(new FileReader(pathTitleGrams)); - - // int featcount = 0; - while (( grams = reader.readLine()) != null) { - - if(Boolean.valueOf(pathVars.USE_NGRAM_FEATURE)){ - - String[] features = StringUtils.split(grams,"\n"); - - for(int i = 0; i < features.length; i++){ - String[] featurename = StringUtils.split(features[i],"\t"); - - //check for duplicate abstract ngrams - if(!(nGrams.contains(featurename[0]))){ - nGrams.add(featurename[0]); - } - } - } - - } - - //if not using title grams separately, - // then insert title grams with abstract grams. 
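// For reference: each loader in this constructor splits a feature-file line on
// "\t" and keeps a single column (column 0 for ngrams and annotations, column 1
// for journal titles and annotation types). A plausible NGRAM_FEATURES line,
// with illustrative values rather than real corpus data, would therefore be
//
// enzyme activity	12
//
// feature string first, corpus count after the tab.
// The fallback below folds the title ngrams into the abstract ngram list when
// title ngrams are not used as a separate feature set: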
- if (!(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS))){ - while (( tgrams = readerT.readLine()) != null) { - - String[] features = StringUtils.split(tgrams,"\n"); - - for(int i = 0; i < features.length; i++){ - String[] featurename = StringUtils.split(features[i],"\t"); - - //check for duplicate ngrams - if(!(nGrams.contains(featurename[0]))){ - nGrams.add(featurename[0]); - } - } - } - } - - reader.close(); - readerT.close(); - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch (IOException e) { - e.printStackTrace(); - } - } - - /** - * Gathers the list of features, according to - * experimental configurations. The list of - * features will be written on the ARFF header. - * - * @param pathVars Variables holding system paths - * @param exp experiment type: train or test - * @return a String containing the ARFF header - */ - - public String genArffHeader(PathConstants pathVars, int exp){ - - StringBuilder headerArff = new StringBuilder(); - - switch(exp){ - case 0: - headerArff.append("% Weka training file - mycoCLAP triage - CSFG 2015\n\n"); - break; - case 1: - headerArff.append("% Weka test file - mycoCLAP triage - CSFG 2015\n\n"); - break; - } - - headerArff.append("@RELATION triage\n"); - - if(Boolean.valueOf(pathVars.USE_TEXT_SIZE)){ - // writing the list of text sizes - headerArff.append("@ATTRIBUTE sizeoftitle \tREAL \t\t%size of title\n"); - headerArff.append("@ATTRIBUTE sizeoftext \tREAL \t\t%size of text\n"); - } - - if(Boolean.valueOf(pathVars.USE_DOC_ID)){ - //writing the docIDs - headerArff.append("@ATTRIBUTE docID \tREAL \t\t%PMID of paper\n"); - - } - - if(Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)){ - for(int i = 0; i < journalTitles.size(); i++){ - // writing list of journal titles - String feature = journalTitles.get(i); - String namefeature = feature.replaceAll("\\s", "-"); - namefeature = namefeature.replaceAll("[,:=+']", "-"); - namefeature = namefeature.replaceAll("<|>", ""); - String ref = "journalTitle" + String.valueOf(i) + namefeature; - - headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); - - } - } - - if (Boolean.valueOf(pathVars.USE_ANNOTATION_FEATURE)){ - // writing list of annotation features - for(int i = 0; i < annotations.size(); i++){ - - String feature = annotations.get(i); - String namefeature = feature.replaceAll("\\s", "-"); - namefeature = namefeature.replaceAll("[,:=+']", "-"); - namefeature = namefeature.replaceAll("<|>", ""); - String ref = "annotation" + String.valueOf(i) + namefeature; - - headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); - - } - } - - if(Boolean.valueOf(pathVars.USE_ANNOTATION_TYPE)){ - // writing list of annotation entities - for(int i = 0; i < annotationsType.size(); i++){ - String feature = annotationsType.get(i); - String namefeature = feature.replaceAll("\\s", "-"); - namefeature = namefeature.replaceAll("[,:=+']", "-"); - namefeature = namefeature.replaceAll("<|>", ""); - String ref = "annotationType" + String.valueOf(i) + namefeature; - - headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); - - } - } - - if(Boolean.valueOf(pathVars.USE_TITLE_FEATURE)){ - // write list of title features - for( int i = 0; i < titleAnnot.size(); i++){ - - String feature = titleAnnot.get(i); - String namefeature = feature.replaceAll("\\s", "-"); - namefeature = namefeature.replaceAll("[,:=+']", "-"); - namefeature = namefeature.replaceAll("<|>", ""); - String ref = "titleAnnot" + String.valueOf(i) + namefeature; - - headerArff.append("@ATTRIBUTE 
" + ref + "\tREAL \t\t%" + feature + "\n"); - - } - - } - - if(Boolean.valueOf(pathVars.USE_ECNUM_FEATURE)){ - // writing list of EC numbers - for(int i = 0; i < ecnumbers.size(); i++){ - String feature = ecnumbers.get(i); - String namefeature = feature.replaceAll("\\s", "-"); - namefeature = namefeature.replaceAll("[,:=+']", "-"); - namefeature = namefeature.replaceAll("<|>", ""); - String ref = "ECnumber" + String.valueOf(i) + namefeature; - - headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); - } - } - - if (Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)){ - // writing list of ngrams on titles - for( int i = 0; i < titleGrams.size(); i++){ - - String feature = titleGrams.get(i); - String namefeature = feature.replaceAll("\\s", "-"); - namefeature = namefeature.replaceAll("[,:=+']", "-"); - namefeature = namefeature.replaceAll("<|>", ""); - String ref = "titleNgram" + String.valueOf(i) + namefeature; - - headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); - - } - } - - if (Boolean.valueOf(pathVars.USE_NGRAM_FEATURE)){ - // write list of ngrams - for(int i = 0; i < nGrams.size(); i++){ - - String feature = nGrams.get(i); - String namefeature = feature.replaceAll("\\s", "-"); - namefeature = namefeature.replaceAll("[,:=+']", "-"); - String ref = "Ngram" + String.valueOf(i) + namefeature; - - headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); - - } - } - - // writing the dataset classes - headerArff.append("@ATTRIBUTE class {positive, negative}\n"); - headerArff.append("@DATA\n"); - - return headerArff.toString(); - } - - /** - * Iterates over the list of features and - * counts number of features containing - * on a given document. - * - * @param jTitle title of journal - * @param title title of paper - * @param text abstract content - * @param ecnum paper EC numbers - * @param classTriage triage classification: positive or negative - * @param exp experiment type: train or test - * @return String holding counts for all features found in a document - */ - - public String getArffLine(String paperID, String jTitle, String title, String text, String ecnum, String classTriage, int exp){ - //String vectorArff = ""; - StringBuilder vectorArff = new StringBuilder(); - - paperID = removeSpecialChar(paperID.toLowerCase()); - text = removeSpecialChar(text.toLowerCase()); - title = removeSpecialChar(title.toLowerCase()); - jTitle = removeSpecialChar(jTitle.toLowerCase()); - ecnum = removeSpecialChar(ecnum); - - int emptyabs = 0; - - // fill title and text sizes (number of words) - // annotation markups do not matter because - // they do not introduce blank spaces hence - // they do not modify the number of words found - if (Boolean.valueOf(pathVars.USE_TEXT_SIZE)){ - - String[] titleGrams = StringUtils.split(title," "); - int titlesize = titleGrams.length; - - String[] abstractcontent = StringUtils.split(text," "); - int abstractsize = abstractcontent.length; - - if(abstractsize == 1){ - emptyabs++; - } - - vectorArff.append(titlesize).append(",").append(abstractsize).append(","); - } - - //fill ID of documents - if(Boolean.valueOf(pathVars.USE_DOC_ID)){ - - if(paperID.length()>0){ - vectorArff.append(paperID).append(","); - } - else{ - vectorArff.append("0,"); - } - } - - //fill values of journal titles - if(Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)){ - - for(int i = 0; i < journalTitles.size(); i++){ - String jfeat = ""; - int jfeatcount = 0; - jfeat = journalTitles.get(i).replaceFirst(" ", ""); - - if(jTitle.contains(jfeat)){ 
- jfeatcount = StringUtils.countMatches(jTitle, jfeat); - vectorArff.append(jfeatcount).append(","); - } - else{ - vectorArff.append("0,"); - } - } - } - - // fill values of annotation types taken into account - // either only the abstract or abstract and title - // adds on vector the count of occurrences - if (Boolean.valueOf(pathVars.USE_ANNOTATION_FEATURE)){ - - for(int i = 0; i < annotations.size(); i++){ - String anfeat = ""; - int anfeatcount = 0; - anfeat = annotations.get(i).replaceFirst(" ", "").toLowerCase(); - - //in case the text has current annotation - if (text.contains(anfeat)){ - //check the count of the annotation - if((Boolean.valueOf(pathVars.USE_TITLE_FEATURE))){ - anfeatcount = StringUtils.countMatches(text, anfeat); - } - //adding title annot count to annotations - else if (!(Boolean.valueOf(pathVars.USE_TITLE_FEATURE))){ - anfeatcount = StringUtils.countMatches(text, anfeat); - //in case title has annotation, add to count - if(title.contains(anfeat)){ - anfeatcount = anfeatcount + StringUtils.countMatches(title, anfeat); - } - } - vectorArff.append(anfeatcount).append(","); - } - //handles the case that only the title (but not abstract) has current annotation - else if((!(Boolean.valueOf(pathVars.USE_TITLE_FEATURE)))){ - if(title.contains(anfeat)){ - anfeatcount = StringUtils.countMatches(title, anfeat); - } - vectorArff.append(anfeatcount).append(","); - } - else{ - vectorArff.append("0,"); - } - } - } - - //fill values of abstract annotation types - if(Boolean.valueOf(pathVars.USE_ANNOTATION_TYPE)){ - - for(int i = 0; i < annotationsType.size(); i++){ - String antype = ""; - int antypecount = 0; - antype = annotationsType.get(i).replaceFirst(" ", "").toLowerCase(); - - if (text.contains(antype)){ - //divided by 2 to match occurance - //(count considers open and close tags) - antypecount = (StringUtils.countMatches(text, antype))/2; - vectorArff.append(antypecount).append(","); - } - else{ - vectorArff.append("0,"); - } - } - - } - - //fill values of title annotations - if (Boolean.valueOf(pathVars.USE_TITLE_FEATURE)){ - - for( int i =0; i < titleAnnot.size(); i++){ - String titfeat = ""; - int titfeatcount = 0; - titfeat = titleAnnot.get(i).replaceFirst(" ", "").toLowerCase(); - - if (title.contains(titfeat)){ - titfeatcount = StringUtils.countMatches(title, titfeat); - vectorArff.append(titfeatcount).append(","); - } - else{ - vectorArff.append("0,"); - } - } - } - - if(Boolean.valueOf(pathVars.USE_ECNUM_FEATURE)){ - - for(int i = 0; i < ecnumbers.size(); i++){ - String ecfeat = ""; - int ecnumcount = 0; - ecfeat = ecnumbers.get(i); - - if(ecnum.contains(ecfeat)){ - ecnumcount = StringUtils.countMatches(ecnum, ecfeat); - vectorArff.append(ecnumcount).append(","); - } - else{ - vectorArff.append("0,"); - } - } - } - - // fill only values of title ngrams - if(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)){ - - String cleanTitle = removeTags(title.toLowerCase()); - - for( int i =0; i < titleGrams.size(); i++){ - String titgram = ""; - int titgramcount = 0; - titgram = titleGrams.get(i).toLowerCase(); - - //in case the title has current ngram - if (cleanTitle.contains(titgram)){ - //check the count of the ngram - titgramcount = StringUtils.countMatches(cleanTitle, titgram); - - //adding weight to current ngram count - if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)){ - titgramcount = applyWeight(titgramcount, Integer.parseInt(pathVars.WEIGHT)); - } - vectorArff.append(titgramcount).append(","); - } - else{ - vectorArff.append("0,"); - } - } - } - - // fill values of 
ngrams
- if (Boolean.valueOf(pathVars.USE_NGRAM_FEATURE)){
- String cleanText = removeTags(text.toLowerCase());
- String cleanTitle = removeTags(title.toLowerCase());
-
- for( int i = 0; i < nGrams.size(); i++){
- String ngramfeat = "";
- int ngramcount = 0;
- ngramfeat = nGrams.get(i).toLowerCase();
-
- //in case the text has current ngram
- if (cleanText.contains(ngramfeat)){
- //check the count of the ngram
- if(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)){
- ngramcount = StringUtils.countMatches(cleanText, ngramfeat);
-
- //adding weight to current ngram count
- if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)){
- ngramcount = applyWeight(ngramcount, Integer.parseInt(pathVars.WEIGHT));
- }
- }
- //checking if title ngrams should be added to the count
- else if(!(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS))){
- ngramcount = StringUtils.countMatches(cleanText, ngramfeat);
-
- //in case title has ngram, add to count
- if(cleanTitle.contains(ngramfeat)){
- ngramcount += StringUtils.countMatches(cleanTitle, ngramfeat);
- }
-
- //adding weight to current ngram count
- if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)){
- ngramcount = applyWeight(ngramcount, Integer.parseInt(pathVars.WEIGHT));
- }
- }
-
- vectorArff.append(ngramcount).append(",");
- }
- //handles the case that only the title (but not abstract) has current ngram
- else if (!(cleanText.contains(ngramfeat))){
- //in case only the title has the ngram, add to count
- if(cleanTitle.contains(ngramfeat)){
- ngramcount = StringUtils.countMatches(cleanTitle, ngramfeat);
-
- //adding weight to ngram count
- if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)){
- ngramcount = applyWeight(ngramcount, Integer.parseInt(pathVars.WEIGHT));
- }
- }
- vectorArff.append(ngramcount).append(",");
- }
- else{
- vectorArff.append("0,");
- }
- }
- }
-
-
- //if(exp == 0){
- if (classTriage.contains("positive")){
- vectorArff.append("positive");
- //vectorArff.append("?");
- }
- else {
- vectorArff.append("negative");
- //vectorArff.append("?");
- }
- //}
-
- /*else if (exp == 1){
- vectorArff.append("?");
- } */
-
- return vectorArff.toString();
- }
-
- /**
- * Cleans a given String from special characters
- *
- * @param str String to be cleaned
- * @return String without special characters
- */
-
- public String removeSpecialChar(String str){
- str = str.replace("}", "");
- str = str.replace("{", "");
- str = str.replace("]", "");
- str = str.replace("[", "");
- str = str.replace("#", "");
- str = str.replace("*", "");
- str = str.replace(">", "");
- str = str.replace("&quot;", "");
- str = str.replace("&apos", "");
- str = str.replace("%", "");
- str = str.replace("/", "");
- str = str.replace("\\", "");
- str = str.replace("&", "");
- str = str.replace("=", "");
- str = str.replace("?", "");
- str = str.replace(",", "");
- str = str.replace(":", "");
- str = str.replace(";", "");
- str = str.replace(".", "");
- str = str.replace(")", "");
- str = str.replace("(", "");
- str = str.replace("\t\t", "\t");
- str = str.replace("-", "");
- str = str.replace(" ", "");
-
- return str;
- }
-
- /**
- * Strips annotation markup from a given String
- *
- * @param str String to be cleaned
- * @return String without tags
- */
- public String removeTags(String str){
- String[] remove = StringUtils.split(str,"");
- StringBuilder sb = new StringBuilder();
-
- for(int i = 0; i < remove.length; i++){
-
- if(remove[i].equalsIgnoreCase("<")){
- do{
- i++;
- }
- while(!(remove[i].equalsIgnoreCase(">")));
- }
- else sb.append(remove[i]);
- }
-
- return sb.toString();
- }
-
- public int applyWeight(int count, int weight){
-
- if(weight > 0){
- count = count * weight;
- }
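// For example, with USE_WEIGHTED_NGRAM=true and WEIGHT=3 in the config, a raw
// ngram count of 2 is scored as 6 in the ARFF line; a weight of 0 or less
// leaves the count unchanged (illustrative values, not project defaults).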
- return count; - } - - - public String informFeatures(PathConstants pathVars){ - String value = ""; - if(Boolean.valueOf(pathVars.USE_ANNOTATION_FEATURE)) - value = value + "_annotations"; - if(Boolean.valueOf(pathVars.USE_ANNOTATION_TYPE)) - value = value + "_types"; - if(Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)) - value = value + "_journal"; - if(Boolean.valueOf(pathVars.USE_TITLE_FEATURE) || Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)) - value = value + "_title"; - if(Boolean.valueOf(pathVars.USE_ECNUM_FEATURE)) - value = value + "_ecnum"; - if(Boolean.valueOf(pathVars.USE_NGRAM_FEATURE)) - value = value + "_ngrams_size"+ pathVars.NGRAM_SIZE; - if(Boolean.valueOf(pathVars.USE_NGRAM_FEATURE) && Boolean.valueOf(pathVars.NGRAM_STOP)) - value = value + "_stopwords"; - if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)) - value = value + "_weight"+ pathVars.WEIGHT; - - return value; - } - - -} diff --git a/src/classifier/.gitignore b/src/classifier/.gitignore deleted file mode 100644 index b92cc15..0000000 --- a/src/classifier/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -/test.class -/train.class -/Trainer.class diff --git a/src/classifier/Trainer.java b/src/classifier/Trainer.java deleted file mode 100644 index 4ec0da2..0000000 --- a/src/classifier/Trainer.java +++ /dev/null @@ -1,489 +0,0 @@ -/* - * The MIT License (MIT) - -Copyright (c) 2014 - -Hayda Almeida -Marie-Jean Meurs - -Concordia University -Tsang Lab - - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - - -package classifier; -import java.util.ArrayList; -import java.util.Random; -import weka.attributeSelection.LatentSemanticAnalysis; -import weka.attributeSelection.PrincipalComponents; -import weka.attributeSelection.GainRatioAttributeEval; -import weka.attributeSelection.CorrelationAttributeEval; -import weka.attributeSelection.Ranker; -import weka.classifiers.Classifier; -import weka.classifiers.CostMatrix; -import weka.classifiers.Evaluation; -import weka.classifiers.bayes.NaiveBayes; -import weka.classifiers.evaluation.NominalPrediction; -import weka.classifiers.evaluation.Prediction; -import weka.classifiers.evaluation.output.prediction.PlainText; -import weka.classifiers.functions.LibSVM; -import weka.classifiers.meta.AttributeSelectedClassifier; -import weka.classifiers.meta.CostSensitiveClassifier; -import weka.classifiers.meta.FilteredClassifier; -import weka.classifiers.trees.LMT; -import weka.core.Attribute; -import weka.core.Instances; -import weka.core.Range; -import weka.core.converters.ConverterUtils.DataSource; -import weka.filters.Filter; -import weka.filters.unsupervised.attribute.Remove; -import configure.PathConstants; -import filter.InformedFilter; - -/** - * Trains and tests a classifier, - * executes k-fold cross validation on train data - * and outputs the classification results. - * - * @author Hayda Almeida - * @since 2014 - * - */ - -public class Trainer { - - public static int SEED = 1; //the seed for randomizing the data - public static int FOLDS = 5; //the # of folds to generate - double[][] ranking; - String rank; - - boolean verbose = false; - - - /** - * @param args - * @throws Exception - */ - public static void main(String[] args) throws Exception { - - - String classifier= ""; - - for(int i = 0; i < args.length; i++){ - try{ - if(args[i].matches("-lmt")) - classifier = "lmt"; - if(args[i].matches("-svm")) - classifier = "svm"; - if(args[i].matches("-nb")) - classifier = "nb"; - } - catch(Exception e){ - System.out.println("A classifier must be given as argument. Use: \n" - + "-lmt -> a LMT classifier; \n " - + "-svm -> a SVM classifier; \n" - + "-nb -> a Naive Bayes classifier. "); - System.exit(0); - } - } - - PathConstants pathVars = new PathConstants(); - Trainer evaluator = new Trainer(); - InformedFilter filter = new InformedFilter(); - Classifier cls; - - //Creating classifier - if(classifier.contains("lmt")) - cls = (Classifier) new LMT(); - else if (classifier.contains("svm")) - cls = (Classifier) new LibSVM(); - else - cls = (Classifier) new NaiveBayes(); - - //Loading train data - DataSource sourceTrain = new DataSource(pathVars.HOME_DIR + pathVars.OUTPUT_MODEL + pathVars.TRAIN_DIR + pathVars.ARFF_TRAIN); - Instances trainData = sourceTrain.getDataSet(); - - //Flagging the class index on the training data - trainData.setClassIndex(trainData.numAttributes()-1); - System.out.println("Class index set on training data."); - - System.out.println("Training data loaded. Number of instances: " + trainData.numInstances() + "\n"); - - - //Loading test data - DataSource sourceTest = new DataSource(pathVars.HOME_DIR + pathVars.OUTPUT_MODEL + pathVars.TEST_DIR + pathVars.ARFF_TEST); - Instances testData = sourceTest.getDataSet(); - - //Flagging the class index on the training data - testData.setClassIndex(trainData.numAttributes()-1); - System.out.println("Class index set on testing data."); - - System.out.println("Test data loaded. 
Number of instances: " + testData.numInstances() + "\n"); - - - //filter the file IDs, consider the new training set - Instances filteredTrainData = evaluator.filteredIDs(trainData); - Instances filteredTestData = evaluator.filteredIDs(testData); - - if(Boolean.valueOf(pathVars.USE_ODDS_RATIO)){ - //Calculate OddsRatio for all instances - double[] OR = evaluator.loadFeatureFilter(filteredTrainData, filter, 1, Integer.parseInt(pathVars.OR_THRESHOLD)); - - //Apply Odds Ratio filtering in instances - filteredTrainData = evaluator.applyFilter(pathVars.OR_THRESHOLD, OR, filteredTrainData); - filteredTestData = evaluator.applyFilter(pathVars.OR_THRESHOLD, OR, filteredTestData); - } - - if(Boolean.valueOf(pathVars.USE_IDF)){ - //Calculate idf for all instances - double[] idf = evaluator.loadFeatureFilter(filteredTrainData, filter, 2, Integer.parseInt(pathVars.IDF_THRESHOLD)); - - //Apply idf filtering in instances - filteredTrainData = evaluator.applyFilter(pathVars.IDF_THRESHOLD, idf, filteredTrainData); - filteredTestData = evaluator.applyFilter(pathVars.IDF_THRESHOLD, idf, filteredTestData); - } - - //Training and testing classifier - evaluator.classify(filteredTrainData, filteredTestData, cls, testData); - - } - - /** - * Loads evaluation of attributes according - * to feature selection method provided. - * - * @param data data instances - * @param filter informed filter instance - * @param method identifier for selection method - * @return - */ - private double[] loadFeatureFilter(Instances data, InformedFilter filter, int method, int threshold){ - - double[] values = new double[data.numAttributes()]; - - switch(method){ - - case 1: - values = filter.oddsRatio(data, threshold); - break; - case 2: - values = filter.idf(data, threshold); - break; - } - - return values; - } - - /** - * Uses evaluation of features according to - * selection method to remove attributes from - * the dataset before training phase. 
- * - * @param threshold selection method threshold - * @param values evaluation of attributes according to method - * @param data dataset instances - * @return filtered dataset instances - * @throws Exception - */ - private Instances applyFilter(String threshold, double[] values, Instances data) throws Exception{ - int numberRemoved = 0; - - String indexRemove = ""; - - for(int i = 0; i < values.length; i++){ - if(values[i] == 0){ - - int ind = i+1; - - if(indexRemove.length()==0) indexRemove = ind + ""; - else indexRemove = indexRemove + "," + ind; - - numberRemoved++; - } - } - - try{ - indexRemove = indexRemove.substring(0, indexRemove.length()-1); - //if(verbose) - System.out.println("\n = = = = => Filter removed " + numberRemoved +" attributes: " + indexRemove.toString() ); - } - catch (Exception e){ - System.out.println("\n = = = = => Filter threshold did not remove any attribute."); - } - - Remove remove = new Remove(); - remove.setAttributeIndices(indexRemove); - remove.setInvertSelection(false); - remove.setInputFormat(data); - - Instances dataSubset = Filter.useFilter(data, remove); - return dataSubset; - } - - - /** - * Removes the ID attribute (index 1) - * from a given dataset - * - * @param data instances - * @return filtered dataset - * @throws Exception - */ - private Instances filteredIDs(Instances data) throws Exception { - Remove remove = new Remove(); - //setting index to be removed - remove.setAttributeIndices("1"); - remove.setInvertSelection(false); - remove.setInputFormat(data); - - Instances dataSubset = Filter.useFilter(data, remove); - return dataSubset; - } - - - /** - * Trains and tests a classifier when two separated - * datasets are provided. - * - * @param train training data to build classifier - * @param test test data to evaluate classifier - * @param classif type of classifier applied - * @throws Exception - */ - public void classify(Instances filteredTrain, Instances filteredTest, Classifier classif, Instances test) throws Exception{ - - StringBuffer sb = new StringBuffer(); - PlainText prediction = new PlainText(); - Range attributesToShow = null; - prediction.setBuffer(sb); - prediction.setHeader(test); - prediction.setOutputDistribution(true); - - classif.buildClassifier(filteredTrain); - - Evaluation evaluateClassifier = new Evaluation(filteredTrain); - evaluateClassifier.evaluateModel(classif, filteredTest, prediction, attributesToShow, true); - //evaluateClassifier.evaluateModel(classif, filteredTest); - - stats(evaluateClassifier, classif); - - ArrayList output = evaluateClassifier.predictions(); - - if(verbose){ - for(int i = 0; i < output.size(); i++){ - double act = output.get(i).actual(); - String actual; - if(act == 1.0) actual = "negative"; else actual = "positive"; - - double pred = output.get(i).predicted(); - String predicted; - if(pred == 1.0) predicted = "negative"; else predicted = "positive"; - - String value = test.instance(i).toString(0); - - System.out.println("PMID: "+ value + "\t" + - "Actual: " + actual + "\t" + - "Predicted: " + predicted - ); - } } - } - - - /** - * Outputs classifier results. 
- * - * @param eval Evaluation model built by a classifier - * @param classif type of classifier applied - * @throws Exception - */ - public void stats(Evaluation eval, Classifier classif) throws Exception{ - System.out.println("Number of attributes: " + eval.getHeader().numAttributes()); - System.out.println(eval.toSummaryString("\n======== RESULTS ========\n", false)); - System.out.println(eval.toClassDetailsString("\n\n======== Detailed accuracy by class ========\n")); - System.out.println(eval.toMatrixString("\n\n======== Confusion Matrix ========\n")); - } - - - //Training and testing costSensitive classifier - //evaluator.classify(trainData, testData, evaluator.classifySensitive(cls)); - -// /** -// * Trains and tests a classifier using a -// * provided Cost matrix -// * -// * @param classif type of classifier to be trained -// * @return CostSensitive classifier with costs and classifier -// * @throws Exception -// */ -// public CostSensitiveClassifier classifySensitive(Classifier classif) throws Exception{ -// CostSensitiveClassifier costSensitive = new CostSensitiveClassifier(); -// CostMatrix matrix = new CostMatrix(2); -// matrix.setElement(0, 1, 4); -// matrix.setElement(1, 0, 1); -// costSensitive.setClassifier(classif); -// costSensitive.setCostMatrix(matrix); -// -// return costSensitive; -// } - - //Executing k-fold cross validation on filtered classifiers - //evaluator.crossFold(trainData, PCAclassifier); - //evaluator.crossFold(trainData, LSAclassifier); - -// /** -// * Executes k-fold cross validation -// * on a given dataset -// * @param data training data provided -// * @param classif type of classifier usedsearch -// * @throws Exception -// */ -// public void crossFold(Instances data, Classifier classif) throws Exception{ -// -// Random random = new Random(SEED); //creating seed number generator -// Evaluation evaluateClassifier = new Evaluation(data); -// -// System.out.println("Classifier working...\n\n"); -// //Classifier should not be trained when cross-validation is executed. -// //because subsequent calls to buildClassifier method will return the same results always. -// evaluateClassifier.crossValidateModel(classif, data, FOLDS, random); -// -// stats(evaluateClassifier, classif); -// } - - - //Creating filtered classifiers - //AttributeSelectedClassifier PCAclassifier = evaluator.setPCAFilter(cls); - //AttributeSelectedClassifier LSAclassifier = evaluator.setLSAFilter(cls); - //AttributeSelectedClassifier GRclassifier = evaluator.setGRFilter(cls); - //AttributeSelectedClassifier Corrclassifier = evaluator.setCorrFilter(cls); - -// /** -// * Implements a Filtered GainRatio classifier, -// * using the ranker as a search method. -// * -// * @param classif type of classifier to be used -// * @return filtered classif with Correlation analysis -// */ -// public AttributeSelectedClassifier setGRFilter(Classifier classif){ -// AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); -// -// //Creating evaluator and search method -// GainRatioAttributeEval GR = new GainRatioAttributeEval(); -// Ranker rank = new Ranker(); -// //return the attributes with evaluation greater than 0 -// double threshold = 0.0; -// rank.setThreshold(threshold); -// -// //Setting GainRatio filtered classifier -// fClassif.setClassifier(classif); -// fClassif.setEvaluator(GR); -// fClassif.setSearch(rank); -// -// return fClassif; -// -// } -// -// /** -// * Implements a Filtered Correlation classifier, -// * using the ranker as a search method. 
-// * -// * @param classif type of classifier to be used -// * @return filtered classif with Correlation analysis -// */ -// public AttributeSelectedClassifier setCorrFilter(Classifier classif){ -// AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); -// -// //Creating evaluator and search method -// CorrelationAttributeEval Corr = new CorrelationAttributeEval(); -// Ranker rank = new Ranker(); -// -// //return the attributes with evaluation greater than 0 -// double threshold = 0.03; -// rank.setThreshold(threshold); -// -// //Setting GainRatio filtered classifier -// fClassif.setClassifier(classif); -// fClassif.setEvaluator(Corr); -// fClassif.setSearch(rank); -// -// return fClassif; -// -// } -// -// /** -// * Implements a Filtered PCA classifier, -// * using the ranker as a search method. -// * -// * @param classif type of classifier to be used -// * @return filtered classif with PCA analysis config -// */ -// public AttributeSelectedClassifier setPCAFilter(Classifier classif){ -// AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); -// -// //Creating evaluator and search method -// PrincipalComponents PCA = new PrincipalComponents(); -// PCA.setMaximumAttributeNames(-1); -// Ranker rank = new Ranker(); -// //return the attributes with evaluation greater than 0 -// rank.setThreshold(0); -// -// //Setting the PCA classifier configurations -// fClassif.setClassifier(classif); -// fClassif.setEvaluator(PCA); -// fClassif.setSearch(rank); -// -// return fClassif; -// } -// -// /** -// * Implements a Filtered LSA classifier, -// * using the ranker as a search method -// * @param classif -// * @return -// */ -// private AttributeSelectedClassifier setLSAFilter(Classifier classif) { -// AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); -// -// //Creating evaluator -// LatentSemanticAnalysis LSA = new LatentSemanticAnalysis(); -// LSA.setMaximumAttributeNames(-1); -// //value between 0 and 1 includes proportion of total latent variables -// //greater than 1 = exact # of variables to include; -// //less than or equal zero = include all; -// //default = 0.95 (proportional) -// double defaul = 0; -// LSA.setRank(defaul); -// //Creating search method -// Ranker rank = new Ranker(); -// rank.setThreshold(0); -// -// //Setting the LSA classifier configurations -// fClassif.setClassifier(classif); -// fClassif.setEvaluator(LSA); -// fClassif.setSearch(rank); -// -// return fClassif; -// } - - - -} diff --git a/src/configure/.gitignore b/src/configure/.gitignore deleted file mode 100644 index 26ecd44..0000000 --- a/src/configure/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -/DeprecatedVariables.class -/PathConstants.class diff --git a/src/configure/PathConstants.java b/src/configure/PathConstants.java deleted file mode 100644 index dab7b82..0000000 --- a/src/configure/PathConstants.java +++ /dev/null @@ -1,202 +0,0 @@ -/* - * The MIT License (MIT) - -Copyright (c) 2014 - -Hayda Almeida -Marie-Jean Meurs - -Concordia University -Tsang Lab - - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -The above copyright notice and this permission notice shall be 
included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*** -* This class re-uses https://code.google.com/p/semlinker/source/browse/trunk/src/configure/NistKBPConfiguration.java -* The code authors: Eric Charton http://www.echarton.com twitter.com/ericcharton -* Marie-Jean Meurs http://mjmrsc.com/research/ twitter.com/mjmrsc -* -* This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License -* as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. -*/ - -package configure; - -import java.io.BufferedReader; -import java.io.FileReader; -import java.io.IOException; -import java.util.HashMap; -import java.util.logging.Level; -import java.util.logging.Logger; - -/** - * - * Variables used by the software - * - * @author Marie-Jean Meurs - * @since 2013 - * - */ -public class PathConstants { - - /** - * Default constructor - */ - public PathConstants() { - initVars(); - } - - /** - * Constructor with custom parameter file. - * @param configfile - */ - public PathConstants(String configfile) { - CONFIG_FILE = configfile; - initVars(); - } - - - public static String CONFIG_FILE = "config.cfg"; - public HashMap CONFIG_MAP = new HashMap(); - - //Input files - public String HOME_DIR; - public String CORPUS_DIR; - public String SOURCE_DIR; - public String DUP_DIR; - public String POS_DIR; - public String NEG_DIR; - public String TRAIN_DIR; - public String TEST_DIR; - public String FEATURE_DIR; - public String OUTPUT_MODEL; - public String TRAINING_FILE; - public String TEST_FILE; - public String ARFF_TRAIN; - public String ARFF_TEST; - public String STOP_LIST; - - //Output files - public String JOURNAL_TITLE_FEATURES; - public String ECNUM_FEATURES; - public String ANNOTATION_FEATURES; - public String TITLE_FEATURES; - public String NGRAM_FEATURES; - public String TITLE_NGRAMS; - public String DOC_IDS; - - //Feature setup - public String USE_TEXT_SIZE; - public String USE_JOURNAL_TITLE_FEATURE; - public String USE_ECNUM_FEATURE; - public String FEATURE_MIN_FREQ; - public String FEATURE_MIN_LENGTH; - - //Feature setup - Annotations - public String USE_ANNOTATION_FEATURE; - public String USE_ANNOTATION_TYPE; - public String USE_TITLE_FEATURE; - public String USE_DOC_ID; - - //Feature setup - Ngrams - public String USE_NGRAM_FEATURE; - public String USE_TITLE_NGRAMS; - public String NGRAM_STOP; - public String NGRAM_SIZE; - public String USE_WEIGHTED_NGRAM; - public String WEIGHT; - - //Feature filtering - public String USE_ODDS_RATIO; - public String OR_THRESHOLD; - public String USE_IDF; - public String IDF_THRESHOLD; - - //Task setup - public String EXP_TYPE; - public String NB_PARAMS; - - - private void initVars() { - String text = null; - - try { - BufferedReader reader = new BufferedReader(new FileReader(CONFIG_FILE)); - while ((text = reader.readLine()) != null) { - if (! 
text.startsWith("#")) { - String label = text.split("=")[0]; - String value = text.split("=")[1]; - CONFIG_MAP.put(label, value); - } - } - reader.close(); - } catch (IOException ex) { - Logger.getLogger(PathConstants.class.getName()).log(Level.SEVERE, null, ex); - } - HOME_DIR = CONFIG_MAP.get("HOME_DIR"); - CORPUS_DIR = CONFIG_MAP.get("CORPUS_DIR"); - SOURCE_DIR = CONFIG_MAP.get("SOURCE_DIR"); - DUP_DIR = CONFIG_MAP.get("DUP_DIR"); - POS_DIR = CONFIG_MAP.get("POS_DIR"); - NEG_DIR = CONFIG_MAP.get("NEG_DIR"); - TRAIN_DIR = CONFIG_MAP.get("TRAIN_DIR"); - TEST_DIR = CONFIG_MAP.get("TEST_DIR"); - FEATURE_DIR = CONFIG_MAP.get("FEATURE_DIR"); - OUTPUT_MODEL = CONFIG_MAP.get("OUTPUT_MODEL"); - TRAINING_FILE = CONFIG_MAP.get("TRAINING_FILE"); - TEST_FILE = CONFIG_MAP.get("TEST_FILE"); - ARFF_TRAIN = CONFIG_MAP.get("ARFF_TRAIN"); - ARFF_TEST = CONFIG_MAP.get("ARFF_TEST"); - STOP_LIST = CONFIG_MAP.get("STOP_LIST"); - - JOURNAL_TITLE_FEATURES = CONFIG_MAP.get("JOURNAL_TITLE_FEATURES"); - ECNUM_FEATURES = CONFIG_MAP.get("ECNUM_FEATURES"); - ANNOTATION_FEATURES = CONFIG_MAP.get("ANNOTATION_FEATURES"); - TITLE_FEATURES = CONFIG_MAP.get("TITLE_FEATURES"); - NGRAM_FEATURES = CONFIG_MAP.get("NGRAM_FEATURES"); - TITLE_NGRAMS = CONFIG_MAP.get("TITLE_NGRAMS"); - DOC_IDS = CONFIG_MAP.get("DOC_IDS"); - - USE_TEXT_SIZE = CONFIG_MAP.get("USE_TEXT_SIZE"); - USE_JOURNAL_TITLE_FEATURE = CONFIG_MAP.get("USE_JOURNAL_TITLE_FEATURE"); - USE_ECNUM_FEATURE = CONFIG_MAP.get("USE_ECNUM_FEATURE"); - FEATURE_MIN_FREQ = CONFIG_MAP.get("FEATURE_MIN_FREQ"); - FEATURE_MIN_LENGTH = CONFIG_MAP.get("FEATURE_MIN_LENGTH"); - - USE_ANNOTATION_FEATURE = CONFIG_MAP.get("USE_ANNOTATION_FEATURE"); - USE_ANNOTATION_TYPE = CONFIG_MAP.get("USE_ANNOTATION_TYPE"); - USE_TITLE_FEATURE = CONFIG_MAP.get("USE_TITLE_FEATURE"); - USE_DOC_ID = CONFIG_MAP.get("USE_DOC_ID"); - - USE_NGRAM_FEATURE = CONFIG_MAP.get("USE_NGRAM_FEATURE"); - USE_TITLE_NGRAMS = CONFIG_MAP.get("USE_TITLE_NGRAMS"); - NGRAM_STOP = CONFIG_MAP.get("NGRAM_STOP"); - NGRAM_SIZE = CONFIG_MAP.get("NGRAM_SIZE"); - USE_WEIGHTED_NGRAM = CONFIG_MAP.get("USE_WEIGHTED_NGRAM"); - WEIGHT = CONFIG_MAP.get("WEIGHT"); - - USE_ODDS_RATIO = CONFIG_MAP.get("USE_ODDS_RATIO"); - OR_THRESHOLD = CONFIG_MAP.get("OR_THRESHOLD"); - USE_IDF = CONFIG_MAP.get("USE_IDF"); - IDF_THRESHOLD = CONFIG_MAP.get("IDF_THRESHOLD"); - - EXP_TYPE = CONFIG_MAP.get("EXP_TYPE"); - NB_PARAMS = CONFIG_MAP.get("NB_PARAMS"); - - } -} diff --git a/src/filter/.gitignore b/src/filter/.gitignore deleted file mode 100644 index 6b468b6..0000000 --- a/src/filter/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.class diff --git a/src/filter/InformedFilter.java b/src/filter/InformedFilter.java deleted file mode 100644 index 4b125db..0000000 --- a/src/filter/InformedFilter.java +++ /dev/null @@ -1,182 +0,0 @@ -package filter; - -import weka.core.Attribute; -import weka.core.Instances; - -/** - * This class implements informed feature selection - * methods, to be used as filters after vector - * generation and pre-model building - * - * @author Hayda Almeida - * @since 2015 - * - */ -public class InformedFilter { - - private boolean verbose = true; - - /** - * Calculates oddsRatio of each feature - * in a given set of Instances - * - * @param data set of instances, read from ARFF file - * @return oddsRatio for each attribute in the matrix - */ - public double[] oddsRatio(Instances data, int threshold){ - - double[] oddsRatio = new double[data.numAttributes()]; - - - for(int i = 0; i < data.numAttributes()-1; i++ ){ - - double OR = 0; - - 
Attribute current = data.attribute(i); - double pos_docs = 0, //number of documents in class C - pos_oc = 0, //number of times term t occured in class C - pos_term_docs = 0, //number of docs in class C that have term - pos_not_docs = 0, //number of docs in class C that do not have term - neg_term_docs = 0, //number of docs not in class C with term - neg_not_docs = 0, //number of docs not in class C nor with term - neg_docs = 0; //number of documents not in class C - - for(int j = 0; j < data.size(); j++){ - - double current_value = data.instance(j).value(current); - double current_class = data.instance(j).classValue(); - - //class is positive - if(current_class < 1){ - pos_docs = pos_docs + 1; - - //the feature occurred in the document - if(current_value > 0){ - pos_oc = pos_oc + current_value; - pos_term_docs = pos_term_docs +1; - } - //the feature did not occur in positive docs - else pos_not_docs = pos_not_docs + 1; - } - //class is negative - else{ - neg_docs = neg_docs+1; - - //the feature occurred in the document - if(current_value > 0){ - neg_term_docs = neg_term_docs +1; - } - //the feature did not occur in negative docs - else neg_not_docs = neg_not_docs + 1; - } - - } - - OR = ( ( (pos_term_docs / pos_docs) / (pos_not_docs/ pos_docs) ) / - ( (neg_term_docs / neg_docs) / (neg_not_docs / neg_docs) ) ); - - // OR = (pos_term_docs / pos_not_docs) / (neg_term_docs / neg_not_docs); - - - //99% confidence: 2.575 - //95% confidence: 1.96 - double confidenceLow = Math.exp(Math.log(OR) - (1.96 * Math.sqrt((1/pos_term_docs) + (1/pos_not_docs) + (1/neg_term_docs) + (1/neg_not_docs)))); - double confidenceHigh = Math.exp(Math.log(OR) + (1.96 * Math.sqrt((1/pos_term_docs) + (1/pos_not_docs) + (1/neg_term_docs) + (1/neg_not_docs)))); - - //checking if OR value is within the confidence interval - //and if it satisfies the threshold - if( ((OR <= confidenceHigh) && (OR >= confidenceLow) - && !(OR == threshold)) - //checking if the confidence interval holds the null hypothesis (i.e., spans 1.0) - && !(confidenceLow <=1 && confidenceHigh >=1)) - oddsRatio[i] = OR; - else - oddsRatio[i] = 0; - - if(verbose){ - System.out.println("Attribute: "+ data.attribute(i).toString() +"\t\t OddsRatio: " + oddsRatio[i] + - "\tConfidenceLow: " + confidenceLow + "\tConfidenceHigh: "+ confidenceHigh); - } - } - - return oddsRatio; - } - - /** - * Calculates the inverse document frequency - * for each attribute in the dataset. 
- * - * @param data instances - * @param threshold - * @return list of idfs for each attribute - */ - public double[] idf(Instances data, int threshold){ - - double[] idf = new double[data.numAttributes()]; - - for(int i = 0; i < data.numAttributes()-1; i++ ){ - - double idf_at = 0; - double idf_at2 = 0; - - Attribute current = data.attribute(i); - double pos_docs = 0, //number of documents in class C - pos_term_docs = 0, //number of docs in class C that have term - neg_term_docs = 0, //number of docs not in class C with term - neg_docs = 0; //number of documents not in class C - - for(int j = 0; j < data.size(); j++){ - - double current_value = data.instance(j).value(current); - double current_class = data.instance(j).classValue(); - - //class is positive - if(current_class < 1){ - pos_docs = pos_docs + 1; - - //the feature occurred in the document - if(current_value > 0){ - pos_term_docs = pos_term_docs +1; - } - } - else{ - //class is negative - neg_docs = neg_docs+1; - - //the feature occurred in the document - if(current_value > 0){ - neg_term_docs = neg_term_docs +1; - } - } - } - -// double idf_pos = Math.log((pos_docs)/(pos_term_docs)); -// double idf_neg = Math.log((neg_docs)/(neg_term_docs)); - - //check if the idf in the "positive" collection - //is greater than the idf in the "negative" collection -// if (idf_pos > idf_neg) -// idf_at = idf_pos; -// -// else idf_at = 0; - - idf_at = Math.log((pos_docs + neg_docs)/(pos_term_docs + neg_term_docs)); - - if(idf_at <= threshold) - idf[i] = 0; - else - idf[i] = idf_at; - } - - if(verbose){ - for(int i = 0; i < idf.length; i++){ - if(idf[i]>0) - System.out.println("Attribute: "+ data.attribute(i).toString()+ "\t\t\t IDF: " + idf[i]); - } - } - - return idf; - } - - -} diff --git a/src/filter/NaiveFilter.java b/src/filter/NaiveFilter.java deleted file mode 100644 index db8a32e..0000000 --- a/src/filter/NaiveFilter.java +++ /dev/null @@ -1,117 +0,0 @@ -package filter; - -import java.io.BufferedReader; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.IOException; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; -import org.apache.commons.lang3.StringUtils; -import configure.PathConstants; - -/** - * - * This class implements naive feature filtering methods - * to be used by the extractor processes pre-vector building - * - * @author Hayda Almeida - * @since 2015 - * - */ -public class NaiveFilter { - - private boolean verbose = true; - - /** - * Removes from feature list all features with - * frequency not statistically relevant (2 or less) - * @param list to be cleaned - */ - public void considerAnnotationOccurence(HashMap,Integer> list, PathConstants vars){ - //going over the list of annotations and removing the - //features with occurance lower than specified. 
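- 		//e.g. with FEATURE_MIN_FREQ=3, an annotation feature observed only
- 		//twice across the corpus is dropped from the map before vectors are built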
- - Iterator> iterator = list.keySet().iterator(); - - while(iterator.hasNext()){ - Map key = iterator.next(); - int valor = list.get(key).intValue(); - - if(valor < Integer.parseInt(vars.FEATURE_MIN_FREQ)){ - iterator.remove(); - } - } - } - - /** - * Removes from feature list all features with - * frequency not statistically relevant (2 or less) - * @param list to be cleaned - */ - public void considerNgramOccurence(HashMap list, PathConstants vars){ - //going over the list of annotations and removing the - //statistically not significant features - frequency less than 2 - Iterator iterator = list.values().iterator(); - - while(iterator.hasNext()){ - Integer key = iterator.next(); - - if(key < Integer.parseInt(vars.FEATURE_MIN_FREQ)){ - iterator.remove(); - } - } - } - - /** - * Removes stopwords from ngrams list - * - * @param str list of ngrams - * @param constants - * @return cleaned list of ngrams - */ - public String removeStopList(String[] str, PathConstants pathVar){ - - //stop-words file name - String pathStop = "stopList.txt"; - String[] stop = null; - StringBuilder cleaned = new StringBuilder(); - - try{ - - BufferedReader reader = new BufferedReader(new FileReader(pathStop)); - - String line = null; - //loading stop-words list - while((line = reader.readLine()) != null){ - stop = StringUtils.split(line,","); - line = reader.readLine(); - } - - reader.close(); - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } - - //iteraing over text to be cleaned - for(int i = 0; i < str.length; i++){ - //iterating over stop-words list - for(int j = 0; j < stop.length; j++){ - - //when stop-word is encountered, replace it - if(str[i].equalsIgnoreCase(stop[j])){ - str[i] = str[i].replace(str[i],"*"); - } - } - //retrieve the text without stop-words replacements - if(!(str[i].contentEquals("*"))){ - cleaned.append(str[i]).append(" "); - } - } - return cleaned.toString().replace(" ", " "); - } - -} diff --git a/src/preprocessing/.gitignore b/src/preprocessing/.gitignore deleted file mode 100644 index 6b468b6..0000000 --- a/src/preprocessing/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.class diff --git a/src/preprocessing/ConcatXML.java b/src/preprocessing/ConcatXML.java deleted file mode 100644 index 89e255f..0000000 --- a/src/preprocessing/ConcatXML.java +++ /dev/null @@ -1,717 +0,0 @@ -/* - * The MIT License (MIT) - -Copyright (c) 2014 - -Hayda Almeida -Marie-Jean Meurs - -Concordia University -Tsang Lab - - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS -FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER -IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - - -package preprocessing; - -import java.io.BufferedOutputStream; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.FileReader; -import java.io.FilenameFilter; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.io.PrintWriter; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.nio.file.StandardCopyOption; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.Date; -import java.util.List; - -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; - -import configure.PathConstants; - -/** - * Generates a corpus from raw XML doc instances, - * so that features can be extracted from it - * - * @author Hayda Almeida - * @since 2014 - * - */ -public class ConcatXML{ - - private String tag1; - private String tag2; - private String tag3; - private String tag4; - private String id; - private String corpusTag; - private String corpusTagC; - - - public ConcatXML(){ - - this.setId("PMID"); - this.setTag1("(?s)<.*?xml.*?>"); - this.setTag2("(?s)<.*?!DOCTYPE.*?>"); - this.setTag3("(?s)<.*?corpus.*?>"); - this.seTag4("(?s)<.*?/corpus.*?>"); - this.setCorpusTag(""); - this.setCorpusTag(""); - } - - - - public static void main(String[] args) throws Exception { - - PathConstants pathVars = new PathConstants(); - - String xmlDir = ""; - if(Integer.parseInt(pathVars.EXP_TYPE)== 1) - xmlDir = "test"; - else xmlDir = "train"; - - String sourceDir = "", duplicatesDir = ""; - - Boolean dc = false, df = false, cl = false, cc = false; - - for(int i = 0; i < args.length; i++){ - try{ - if(args[i].matches("-dc")) dc = true; - if(args[i].matches("-df")) df = true; - if(args[i].matches("-cl")) cl = true; - if(args[i].matches("-cc")) cc = true; - } - catch(Exception e){ - System.out.println("Use: \n" - + "-tr -> train, -ts -> test; \n " - + "-dc -> check duplicates in corpus vs. 
folder; \n " - + "-df -> check duplicates in two folders; \n" - + "-cl -> clean a source folder; \n" - + "-cc -> concatenate files in a folder "); - System.exit(0); - }; - } - - String timeStamp = new SimpleDateFormat("yyyyMMdd_hh:mm").format(new Date()); - String trainCorpusPath = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAINING_FILE; - - sourceDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + xmlDir; - duplicatesDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.DUP_DIR; - - String concatCorpus = pathVars.HOME_DIR + pathVars.CORPUS_DIR +"triagecorpus_"+ xmlDir +"_"+timeStamp+".xml"; - String tagCorpus = concatCorpus; - - ConcatXML concat = new ConcatXML(); - - //================= Checking for duplicates =====================// - if(dc) concat.checkDupCorpus(trainCorpusPath, sourceDir); - if(df) concat.checkDupFolder(sourceDir, duplicatesDir); - - //================== Creating corpus ==========================// - if(cl){ - concat.cleanXML(sourceDir); - if(duplicatesDir.length()>1) - concat.cleanXML(duplicatesDir); - } - if(cc){ - concat.concatenateXML(sourceDir, "", concatCorpus); - concat.tagCorpus(tagCorpus); - } - } - - /** - * Returns the ID of a XML jsoup document - * @param doc a XML doc parsed by jsoup - * @return ID string - * @throws IOException - */ - public String returnID(Document doc) throws IOException{ - - String id = ""; - - Elements paper = doc.body().getElementsByTag("pubmedarticleset"); - - //fetching the paper ID - - //for all items in a paper, retrieve only PMIDs - for(Element e : paper.select(getId())){ - //only consider the ID if the parent is medline citation - if(e.parentNode().nodeName().contains("medline")){ - id = e.text(); - } - } - return id; - } - - /** - * Reads the file IDs in a folder and - * checks a second folder for duplicates. 
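For reference, a minimal sketch of how returnID above resolves a paper's PMID with jsoup; the XML fragment and class name are invented for illustration. jsoup's HTML parser lowercases tag names, which is why the selector and the parent check are lowercase even though the source tags are not.

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;

    public class ReturnIdSketch {
        public static void main(String[] args) {
            //toy fragment: the PMID under MedlineCitation identifies the paper;
            //PMIDs elsewhere (e.g. in a reference list) must be ignored
            String xml = "<PubmedArticleSet><MedlineCitation><PMID>12345</PMID>"
                       + "</MedlineCitation></PubmedArticleSet>";

            Document doc = Jsoup.parse(xml);
            String id = "";
            for (Element e : doc.select("pmid")) {
                //keep only the PMID whose parent is the MedlineCitation element
                if (e.parentNode().nodeName().contains("medline")) {
                    id = e.text();
                }
            }
            System.out.println(id); //prints 12345
        }
    }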
- * - * @param dirSrc source folder - * @param dirDup folder to check for duplicates - */ - - public void checkDupFolder(String dirSrc, String dirDup){ - ArrayList sourceIDs = new ArrayList(); - ArrayList duplicated = new ArrayList(); - ArrayList dupIDs = new ArrayList(); - int ids = 0; - - if(dirSrc.contentEquals(dirDup)){ - System.out.println("Source and duplicates directories are the same.\n\n========================\n"); - } - else { - - File sourceDir = new File(dirSrc); - File[] srcXMLs = sourceDir.listFiles(new FilenameFilter(){ - @Override - public boolean accept(File dir, String name){ - return name.endsWith(".xml"); - } - }); - - try{ - //for each file on the source dir - for (File xml : srcXMLs){ - - try{ - - String id = ""; - //Loading file - File input = new File(xml.getPath()); - //Jsoup parse - Document doc = Jsoup.parse(input, "UTF-8"); - - //fetching the document ID - id = returnID(doc); - - if(!id.isEmpty()){ - sourceIDs.add(id); - ids++; - } - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - - } - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch(Exception e){ - throw new RuntimeException(e); - } - - System.out.println(ids + " source file IDs encountered."); - ids = 0; - - File dupDir = new File(dirDup); - - File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){ - @Override - public boolean accept(File dir, String name){ - return name.endsWith(".xml"); - } - }); - - try{ - //for each file on the possibly duplicated dir - for (File xml : dupXMLs){ - - try{ - String id = ""; - //Loading file - File input = new File(xml.getPath()); - //Jsoup parse - Document doc = Jsoup.parse(input, "UTF-8"); - - //fetching the document ID - id = returnID(doc); - - if(!id.isEmpty()){ - dupIDs.add(id); - String dupFileID = id; - ids++; - - for(int j = 0; j < sourceIDs.size(); j++){ - if(sourceIDs.get(j).equalsIgnoreCase(dupFileID)){ - - //add ID to duplicated list - duplicated.add(dupFileID); - - //rename the original file - Path from = xml.toPath(); //convert from File to Path - Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path - Files.move(from, to, StandardCopyOption.REPLACE_EXISTING); - } - } - } - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - } - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch(Exception e){ - throw new RuntimeException(e); - } - - //count number of existing papers on possibly duplicated folder - //just to make sure we are gathering all IDs - System.out.println(ids + " new file IDs encountered."); - ids = 0; - //sorting the list of duplicated IDs - Collections.sort(duplicated, new Comparator(){ - @Override - public int compare(String one, String two){ - return one.compareTo(two); - } - }); - - System.out.println("\nReaded source files: " + sourceIDs.size()); - System.out.println("Readed new files: " + dupIDs.size()); - - System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n"); - - System.out.println("\nDuplicated files IDs: "); - for(int i = 0; i < duplicated.size(); i++){ - System.out.println(duplicated.get(i)); - } - - System.out.println("\n========================\n"); - } - - - } - - /** - * Reads the corpus and checks the papers IDs - * to identify duplicates in case new papers - * are being concatenated to corpus. 
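Both duplicate checks scan the source-ID list once per candidate file, a nested loop over the two collections. A set-based containment test does the same job with one lookup per file; a minimal sketch with invented IDs, not the class's own method:

    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;

    public class DupCheckSketch {
        public static void main(String[] args) {
            //IDs gathered from the existing corpus (hypothetical values)
            Set<String> sourceIDs = new HashSet<String>(
                    Arrays.asList("12345", "67890", "24680"));

            //IDs of the files about to be concatenated
            List<String> newIDs = Arrays.asList("13579", "67890");

            for (String id : newIDs) {
                if (sourceIDs.contains(id)) {
                    //in ConcatXML this is where the file is renamed
                    //to <name>.xml.duplicated and skipped
                    System.out.println(id + " is a duplicate");
                }
            }
        }
    }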
- * - * @param corpus path to current corpora to check - * @param dir path to folder with new files to be concatenated - */ - - public void checkDupCorpus(String corpus, String dir){ - ArrayList trainingIDs = new ArrayList(); - ArrayList duplicated = new ArrayList(); - ArrayList newFiles = new ArrayList(); - - int ids = 0; - - try - { - File input = new File(corpus); - //Jsoup parse - Document doc = Jsoup.parse(input, "UTF-8"); - Elements corp = doc.body().getElementsByTag("pubmedarticleset"); - - String id = ""; - - for(Element paper : corp){ - Document thisDoc = Jsoup.parseBodyFragment(paper.toString()); - - //fetching the document ID - id = returnID(thisDoc); - - if(!id.isEmpty()){ - trainingIDs.add(id); - ids++; - } - } - }catch (FileNotFoundException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } - - System.out.println(ids + " training file IDs encountered."); - ids = 0; - - File corpusDir = new File(dir); - File[] newXMLs = corpusDir.listFiles(new FilenameFilter(){ - @Override - public boolean accept(File dir, String name){ - return name.endsWith(".xml"); - } - }); - - try{ - //for each file on the corpus dir - for (File xml : newXMLs){ - - try{ - String id = ""; - //Loading file - File input = new File(xml.getPath()); - //Jsoup parse - Document doc = Jsoup.parse(input, "UTF-8"); - - //fetching the document ID - id = returnID(doc); - - if(!id.isEmpty()){ - - newFiles.add(id); - String newFileID = id; - ids++; - - - for(int j = 0; j < trainingIDs.size(); j++){ - if(trainingIDs.get(j).equalsIgnoreCase(newFileID)){ - - //add ID to duplicated list - duplicated.add(newFileID); - - //moving the original file - Path from = xml.toPath(); //convert from File to Path - Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path - Files.move(from, to, StandardCopyOption.REPLACE_EXISTING); - } - } - } - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - } - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch(Exception e){ - throw new RuntimeException(e); - } - - //count number of existing papers on the training file - //just to make sure we are gathering all IDs - System.out.println(ids + " new file IDs encountered."); - ids = 0; - - - //sorting the list of duplicated IDs - Collections.sort(duplicated, new Comparator(){ - @Override - public int compare(String one, String two){ - return one.compareTo(two); - } - }); - - System.out.println("\nReaded training files: " + trainingIDs.size()); - System.out.println("Readed new files: " + newFiles.size()); - - System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n"); - - System.out.println("\nDuplicated files IDs: "); - for(int i = 0; i < duplicated.size(); i++){ - System.out.println(duplicated.get(i)); - } - - System.out.println("\n========================\n"); - - } - - - /** - * Reads and edits a list of XMLs files in a folder - * to remove XML and previous corpus tags, - * preparing the files to be concatenated. - * - * @param dir string with folder path - */ - - public void cleanXML(String dir){ - - //listing files on corpus dir - File sourceDir = new File(dir); - - File[] newXMLs = sourceDir.listFiles(new FilenameFilter(){ - @Override - public boolean accept(File dir, String name){ - return name.endsWith(".xml"); - } - }); - - System.out.println("... 
Files list loaded."); - - try{ - //for each file on the corpus dir - for (File xml : newXMLs){ - - try{ - BufferedReader reader = new BufferedReader(new FileReader(xml.getPath())); - - String line = null; - ArrayList allLines = new ArrayList(); - String content = null; - - while((line = reader.readLine()) != null){ - content = line; - - //cleaning XML markups - content = content.replaceFirst(getTag1(), ""); - content = content.replaceFirst(getTag2(), ""); - //cleaning previous corpus tags - content = content.replaceFirst(getTag3(), ""); - content = content.replaceFirst(getTag4(), ""); - allLines.add(content); - } - - PrintWriter writer = new PrintWriter(xml.getPath()); - - for (String l : allLines){ - writer.println(l); - } - reader.close(); - writer.close(); - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - - } - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch(Exception e){ - throw new RuntimeException(e); - } - - System.out.println("... Files cleaned and saved."); - System.out.println("Ready for concatenation."); - System.out.println("\n========================\n"); - } - - - - /** - * Concatenates all XMLs in one folder or between two folders. - * @param sourceDir main directory with XML files. - * @param duplicDir second directory with duplicated XML files - * @param concatFile path name to saved concatenated corpus - */ - - public void concatenateXML(String sourceDir, String duplicDir, String concatFile){ - - final int BUFFER = 1024 << 8; - byte[] buffer = new byte[BUFFER]; - - //listing files on corpus dir - File srcDir = new File(sourceDir); - File[] srcXMLs = srcDir.listFiles(new FilenameFilter(){ - @Override - public boolean accept(File dir, String name){ - return name.endsWith(".xml"); - } - }); - - File dupDir = new File(duplicDir); - File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){ - @Override - public boolean accept(File dir, String name) { - return name.endsWith(".xml"); - } - }); - - System.out.println("... Files list loaded."); - - //defining the output file (concatenated) - File newCorpus = new File(concatFile); - - try{ - OutputStream output = new BufferedOutputStream(new FileOutputStream(newCorpus)); - - - //for each file on the corpus dir - for (File xmls : srcXMLs){ - InputStream input = new FileInputStream(xmls); - int count; - - //if the file is not empty/finished - try{ - while((count = input.read(buffer)) >= 0){ - - //write it on the concatenated final file - output.write(buffer, 0, count); - } - }finally{ - input.close(); - } - } - - if(dupXMLs != null){ - for(File xmld : dupXMLs){ - InputStream input = new FileInputStream(xmld); - int count; - - //if the file is not empty/finished - try{ - while((count = input.read(buffer)) >= 0){ - - //write it on the concatenated final file - output.write(buffer, 0, count); - } - }finally{ - input.close(); - } - } - } - output.flush(); - output.close(); - - }catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch(Exception e){ - throw new RuntimeException(e); - } - - System.out.println("... 
File concatenated and saved."); - System.out.println("Ready for corpus tagging."); - System.out.println("\n========================\n"); - } - - /** - * Inserts corpus tag on XML file - * - * @param pathToCorpus path to - * concatenated corpus - */ - - public void tagCorpus(String pathToCorpus){ - - //tagging as corpus - try{ - BufferedReader reader = new BufferedReader(new FileReader(pathToCorpus)); - - String line = null; - String edit = null; - List allLines = new ArrayList(); - - //adds tag at beggining of corpus - allLines.add(getCorpusTag()); - - while((line = reader.readLine()) != null){ - - allLines.add(line); - } - //adds tag at the end of corpus - allLines.add(getCorpusTagC()); - - System.out.println("... Corpus loaded and tagged."); - //re-writting the file - PrintWriter writer = new PrintWriter(pathToCorpus); - - for (String l : allLines){ - writer.println(l); - } - reader.close(); - writer.close(); - - System.out.println("... File saved as tagged corpus."); - } - catch (FileNotFoundException e) { - e.printStackTrace(); - } - catch(IOException e){ - e.printStackTrace(); - } - } - - private String getCorpusTagC() { - return corpusTagC; - } - - private String getCorpusTag() { - // TODO Auto-generated method stub - return corpusTag; - } - - public String getTag1() { - return tag1; - } - - public void setTag1(String tag1) { - this.tag1 = tag1; - } - - public String getTag2() { - return tag2; - } - - public void setTag2(String tag2) { - this.tag2 = tag2; - } - - private String getTag4() { - // TODO Auto-generated method stub - return tag4; - } - - private String getTag3() { - // TODO Auto-generated method stub - return tag3; - } - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - private void setCorpusTag(String string) { - this.corpusTag = string; - - } - - private void seTag4(String string) { - this.tag4 = string; - - } - - private void setTag3(String string) { - this.tag3 = string; - - } - -} - - diff --git a/src/preprocessing/SampleCorpus.java b/src/preprocessing/SampleCorpus.java deleted file mode 100644 index 63613a8..0000000 --- a/src/preprocessing/SampleCorpus.java +++ /dev/null @@ -1,237 +0,0 @@ -package preprocessing; - -import java.io.File; -import java.io.FilenameFilter; -import java.nio.file.Files; -import java.nio.file.StandardCopyOption; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; - -import configure.PathConstants; - -/** - * Performs document instances sampling - * generating training and test files - * with specific balance input by user. 
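One note on the ConcatXML pipeline above: the corpus tag literals in the flattened constructor appear as empty strings; assuming they were <corpus> and </corpus>, the clean-then-tag round trip looks roughly like this sketch (class name and sample strings invented):

    import java.util.ArrayList;
    import java.util.List;

    public class CorpusTagSketch {
        public static void main(String[] args) {
            //one raw file as cleanXML sees it: XML declaration first
            String raw = "<?xml version=\"1.0\"?><PubmedArticleSet>...</PubmedArticleSet>";

            //tag1 and tag2 from the ConcatXML constructor strip the
            //declaration and any DOCTYPE, leaving only document content
            String cleaned = raw.replaceFirst("(?s)<.*?xml.*?>", "")
                                .replaceFirst("(?s)<.*?!DOCTYPE.*?>", "");

            //concatenateXML appends the cleaned files; tagCorpus then wraps
            //the result in a single root element (assumed tag names)
            List<String> corpus = new ArrayList<String>();
            corpus.add("<corpus>");
            corpus.add(cleaned);
            corpus.add("</corpus>");

            for (String line : corpus) System.out.println(line);
        }
    }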
- *
- * @author Hayda Almeida
- * @since 2015
- *
- */
- public class SampleCorpus {
-
-     public static void main(String[] args) throws Exception {
-
-         PathConstants pathVars = new PathConstants();
-         SampleCorpus sampling = new SampleCorpus();
-
-         String positiveDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.POS_DIR;
-         List<File> positives = new LinkedList<File>();
-
-         String negativeDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.NEG_DIR;
-         List<File> negatives = new LinkedList<File>();
-
-         //train or test sampling
-         Boolean tr = true, ts = true;
-         //% of test corpus WRT the collection, % positive on training set, % positive on test set
-         int percTs = 20, posTr = 50, posTs = 10;
-
-         for(int i = 0; i < args.length; i++){
-             try{
-                 if(args[i].matches("-tr")){
-                     tr = true;
-                     posTr = Integer.parseInt(args[i+1]);
-                 }
-                 if(args[i].matches("-ts")){
-                     ts = true;
-                     percTs = Integer.parseInt(args[i+1]);
-                     posTs = Integer.parseInt(args[i+2]);
-                 }
-             }
-             catch(Exception e){
-                 System.out.println(" Use: \n "
-                     + "-tr -> (% of positives) to sample training set \n"
-                     + "-ts -> (% of collection) (% of positives) to sample test set");
-                 System.exit(0);
-             }
-         }
-
-         positives = sampling.loadFiles(positiveDir);
-         negatives = sampling.loadFiles(negativeDir);
-
-         //sample the test set first: sampleTest moves files out of the
-         //collection, so the training set is drawn from what remains
-         if(ts) sampling.sampleTest(pathVars, positives, negatives, percTs, posTs);
-
-         if(tr) sampling.sampleTrain(pathVars, positives, negatives, posTr);
-
-     }
-
-     /**
-      * Lists XML files within a folder
-      * @param dirSrc folder path
-      * @return list of XML files found in the folder
-      */
-     public List<File> loadFiles(String dirSrc){
-
-         File sourceDir = new File(dirSrc);
-         File[] srcXMLs = sourceDir.listFiles(new FilenameFilter(){
-             @Override
-             public boolean accept(File dir, String name){
-                 return name.endsWith(".xml");
-             }
-         });
-
-         return new LinkedList<File>(Arrays.asList(srcXMLs));
-     }
-
-     /**
-      * Moves a specific number of files
-      * in a list from origin folder to a test folder
-      * @param pathVars
-      * @param files list of files
-      * @param numFiles number of files to be moved
-      */
-     public void moveFile(PathConstants pathVars, List<File> files, int numFiles){
-
-         Iterator<File> filesList = files.iterator();
-         File testDir = new File(pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TEST_DIR);
-
-         if(!testDir.exists()){
-             try{
-                 testDir.mkdir();
-             }catch(Exception e){
-                 System.out.println("Error creating Test folder.");
-                 System.exit(0);
-             }
-         }
-
-         while(filesList.hasNext() && numFiles > 0){
-             try{
-                 File file = filesList.next();
-                 File newFile = new File(testDir + "/" + file.getName());
-
-                 Files.move(file.toPath(), newFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
-
-                 //remove the moved file from the collection list so it
-                 //cannot be drawn again when the training set is sampled
-                 filesList.remove();
-                 numFiles--;
-             }
-             catch(Exception e){
-                 System.out.println("Error moving files.");
-                 System.exit(0);
-             }
-         }
-
-     }
-
-     /**
-      * Copies a specific number of files
-      * in a list from origin folder to a train folder
-      * @param pathVars
-      * @param files list of files
-      * @param numFiles number of files to be copied
-      */
-     public void copyFile(PathConstants pathVars, List<File> files, int numFiles){
-
-         Iterator<File> filesList = files.iterator();
-         File trainDir = new File(pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAIN_DIR);
-
-         if(!trainDir.exists()){
-             try{
-                 trainDir.mkdir();
-             }catch(Exception e){
-                 System.out.println("Error creating Training folder.");
-                 System.exit(0);
-             }
-         }
-
-         while(filesList.hasNext() && numFiles > 0){
-             try{
-                 File file = filesList.next();
-                 File newFile = new File(trainDir + "/" + file.getName());
-
-                 Files.copy(file.toPath(), newFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
-
-                 //decrement the counter; otherwise every file in the list
-                 //is copied and numFiles is ignored
-                 numFiles--;
-             }
-             catch(Exception e){
-                 System.out.println("Error copying files.");
-                 System.exit(0);
-             }
-         }
-
-     }
-
-     /**
-      * Samples document instances from the collection
-      * to generate a test set.
-      *
-      * @param pathVars
-      * @param positives list of positive documents
-      * @param negatives list of negative documents
-      * @param total percentage of the document collection for test
-      * @param pos percentage of positive documents in the test set
-      */
-     public void sampleTest(PathConstants pathVars, List<File> positives, List<File> negatives, int total, int pos){
-
-         int instances = positives.size() + negatives.size();
-         int testSize = (instances * total) / 100;
-         int posSize = (testSize * pos) / 100;
-         int negSize = testSize - posSize;
-
-         Collections.shuffle(negatives);
-         System.out.println("===== Test > Negative instances shuffled for test set.");
-         moveFile(pathVars, negatives, negSize);
-         System.out.println("===== Test > Negative instances moved to test folder. \n");
-
-         Collections.shuffle(positives);
-         System.out.println("===== Test > Positive instances shuffled for test set.");
-         moveFile(pathVars, positives, posSize);
-         System.out.println("===== Test > Positive instances moved to test folder. \n");
-
-     }
-
-     /**
-      * Samples document instances from the collection
-      * to generate a training set.
-      *
-      * @param pathVars
-      * @param positives list of positive documents
-      * @param negatives list of negative documents
-      * @param pos percentage of positive documents in the training set
-      */
-     public void sampleTrain(PathConstants pathVars, List<File> positives, List<File> negatives, int pos){
-
-         int trainSize = positives.size() + negatives.size();
-         int posSize = (trainSize * pos) / 100;
-         int negSize = trainSize - posSize;
-
-         if(positives.size() < posSize){
-             System.out.println("Not enough positive instances for training set.");
-             System.exit(0);
-         }
-         else if(negatives.size() < negSize){
-             System.out.println("Not enough negative instances for training set.");
-             System.exit(0);
-         }
-         else{
-             Collections.shuffle(negatives);
-             System.out.println("===== Training > Negative instances shuffled for training set.");
-             copyFile(pathVars, negatives, negSize);
-             System.out.println("===== Training > Negative instances copied to training folder. \n");
-
-             Collections.shuffle(positives);
-             System.out.println("===== Training > Positive instances shuffled for training set.");
-             copyFile(pathVars, positives, posSize);
-             System.out.println("===== Training > Positive instances copied to training folder. \n");
-         }
-
-     }
-
- }
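A worked example of the sampling arithmetic, on an invented collection of 1000 positive and 1000 negative documents, with -ts handled by sampleTest and -tr by sampleTrain as above (integer division, as in the code):

    java preprocessing.SampleCorpus -ts 20 10 -tr 60

    sampleTest:   testSize = (2000 * 20) / 100 = 400
                  posSize  = (400 * 10) / 100  =  40 positives moved to the test folder
                  negSize  = 400 - 40          = 360 negatives moved to the test folder

    sampleTrain:  960 positives and 640 negatives remain after the move;
                  trainSize = 1600
                  posSize   = (1600 * 60) / 100 = 960 positives copied to the training folder
                  negSize   = 1600 - 960        = 640 negatives copied to the training folder

Requesting more positives or negatives than remain (say -tr 70 here, which would need 1120 positives) trips the "Not enough ... instances" guard and the program exits.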