diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..038e6d4
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,27 @@
+The MIT License (MIT)
+
+Copyright (c) 2014
+
+Hayda Almeida
+Marie-Jean Meurs
+
+Concordia University
+Tsang Lab
+
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..76992cf
--- /dev/null
+++ b/README.md
@@ -0,0 +1,12 @@
+# mycoSORT
+
+A machine learning system for supporting the triage of biological literature.
+
+
+
+
+
+
+
+
+
diff --git a/config-sample.cfg b/config-sample.cfg
new file mode 100644
index 0000000..a9b3483
--- /dev/null
+++ b/config-sample.cfg
@@ -0,0 +1,118 @@
+#################################################
+#
+#
+# Configuration file for mycoSORT
+#
+#
+##################################################
+########################### DIRECTORIES ##########
+# project home
+HOME_DIR=/.
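+#
+# NOTE: the directory and file values below are concatenated onto
+# HOME_DIR at runtime (e.g. HOME_DIR + CORPUS_DIR + TRAIN_DIR +
+# TRAINING_FILE, as done in BuildModel), so with HOME_DIR=/. all
+# paths resolve relative to the directory the tool is launched from.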
+#
+# corpus directory
+CORPUS_DIR=corpus/
+#
+# train directory
+TRAIN_DIR=train/
+#
+# test directory
+TEST_DIR=test/
+#
+# feature directory
+FEATURE_DIR=features/
+#
+# output directory for arff files
+OUTPUT_MODEL=arff/
+#
+#################################################
+########################## INPUT FILES ##########
+# training file
+TRAINING_FILE=/triagecorpus_train.xml
+#
+# test file
+TEST_FILE=/triagecorpus_test.xml
+#
+# arff training file
+ARFF_TRAIN=triage0.arff
+#
+# arff testing file
+ARFF_TEST=triage1.arff
+#
+# stopwords list
+STOP_LIST=stopList.txt
+#
+##################################################
+########################## OUTPUT FILES ##########
+# EC numbers feature list
+ECNUM_FEATURES=ecnumbers.txt
+#
+# Journal title feature list
+JOURNAL_TITLE_FEATURES=journaltitles.txt
+#
+# Abstract annotations feature list
+ANNOTATION_FEATURES=annotations.txt
+#
+# Paper title annotations feature list
+TITLE_FEATURES=titleAnnotations.txt
+#
+# Abstract ngrams feature list
+NGRAM_FEATURES=ngrams_features.txt
+#
+# Paper title n-grams feature list
+TITLE_NGRAMS=titleGrams.txt
+#
+###################################################
+########################## FEATURE SETUP ##########
+# Extract size of abstract and title
+USE_TEXT_SIZE=false
+#
+# Extract journal title of publication
+USE_JOURNAL_TITLE_FEATURE=false
+#
+# Extract EC Numbers
+USE_ECNUM_FEATURE=true
+#
+# minimum frequency to consider a feature
+FEATURE_MIN_FREQ=2
+#
+# minimum length (in chars) to consider a feature
+FEATURE_MIN_LENGTH=3
+#
+#############################
+######### ANNOTATIONS #######
+# Extract annotation content
+USE_ANNOTATION_FEATURE=true
+#
+# Extract annotation entities
+USE_ANNOTATION_TYPE=true
+#
+# Extract annotations from title separately
+USE_TITLE_FEATURE=false
+#
+#############################
+########## N-GRAMS ##########
+# Extract ngrams
+USE_NGRAM_FEATURE=false
+#
+# Extract ngrams from title separately
+USE_TITLE_NGRAMS=false
+#
+# use of stopwords list on n-grams
+NGRAM_STOP=true
+#
+# Define size of extracted n-grams
+NGRAM_SIZE=1
+#
+# Apply weights to ngrams
+#USE_WEIGHTED_NGRAM=false
+#
+# Define weight of features
+#WEIGHT=3
+#
+#################################################
+########################### TASK SETUP ##########
+# experiment type : train = 0 / test = 1
+EXP_TYPE=0
+#
+# limit the number of features used: keep the top N, or -1 to use all features in the file
+NB_PARAMS=-1
diff --git a/entities.txt b/entities.txt
new file mode 100644
index 0000000..7714e43
--- /dev/null
+++ b/entities.txt
@@ -0,0 +1,23 @@
+annotation_type annotation_level
+AccessionNumber entity
+ActivityAssayConditions sentence
+Assay entity
+Buffer entity
+Characterization entity
+Enzyme entity
+Expression sentence
+Family entity
+Fungus entity
+Gene entity
+Glycoside_Hydrolase entity
+Glycosylation sentence
+Kinetics sentence
+Laccase entity
+Lipase entity
+Peroxidase entity
+pH sentence
+ProductAnalysis sentence
+Temperature sentence
+SpecificActivity sentence
+Substrate entity
+SubstrateSpecificity sentence
\ No newline at end of file
diff --git a/jar/README b/jar/README
new file mode 100644
index 0000000..9a9b435
--- /dev/null
+++ b/jar/README
@@ -0,0 +1,7 @@
+Please add to this folder the following libraries:
+commons-lang3-3.2.1.jar
+jsoup-1.7.3.jar
+weka.jar
+LibSVM.jar
+LibSVM/libsvm.jar
+
diff --git a/jar/README~ b/jar/README~
new file mode 100644
index 0000000..56f2ce9
--- /dev/null
+++ b/jar/README~
@@ -0,0 +1,7 @@
+Please add to this folder the following libraries:
+commons-lang3-3.2.1.jar
+jsoup-1.7.3.jar
+weka.jar
+LibSVM.jar
+libsvm.jar
+
diff --git a/src/analyse/.gitignore b/src/analyse/.gitignore
new file mode 100644
index 0000000..6b468b6
--- /dev/null
+++ b/src/analyse/.gitignore
@@ -0,0 +1 @@
+*.class
diff --git a/src/analyse/ConcatXML.java b/src/analyse/ConcatXML.java
new file mode 100644
index 0000000..9c24173
--- /dev/null
+++ b/src/analyse/ConcatXML.java
@@ -0,0 +1,734 @@
+/*
+ * The MIT License (MIT)
+
+Copyright (c) 2014
+
+Hayda Almeida
+Marie-Jean Meurs
+
+Concordia University
+Tsang Lab
+
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+package analyse;
+
+import java.io.BufferedOutputStream;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.PrintWriter;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardCopyOption;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Date;
+import java.util.List;
+
+import configure.PathConstants;
+
+/**
+ * Generates a corpus from raw XML doc instances,
+ * so that features can be extracted from it
+ *
+ * @author halmeida
+ *
+ */
+public class ConcatXML extends Extractor{
+
+	private String tag1;
+	private String tag2;
+	private String tag3;
+
+
+	public ConcatXML(){
+		this.id = "";
+		//tag1 must be initialized as well, otherwise getTag1()
+		//returns null and cleanXML() throws a NullPointerException
+		this.tag1 = "";
+		this.tag2 = "";
+		this.tag3 = "";
+	}
+
+	public static void main(String[] args) throws IOException {
+
+		PathConstants pathVars = new PathConstants();
+
+		String timeStamp = new SimpleDateFormat("yyyyMMdd_hh:mm").format(new Date());
+
+		String trainCorpusPath = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAIN_DIR +pathVars.TRAINING_FILE;
+		String xmlDir = "train";
+		String sourceDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + "all_nbs/"+ xmlDir;
+		String duplicatesDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + "/src"+ "/annotated_GH27-36_2013_12_31";
+
+		String concatCorpus = pathVars.HOME_DIR + pathVars.CORPUS_DIR +"triagecorpus_"+ xmlDir +"_"+timeStamp+".xml";
+		String tagCorpus = concatCorpus;
+
+		ConcatXML concat = new ConcatXML();
+
+		//================= Checking for duplicates =====================//
+		//concat.checkDupCorpus(trainCorpusPath, sourceDir);
+		//concat.checkDupFolder(sourceDir, duplicatesDir);
+
+
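+		//The corpus build below runs in three steps: cleanXML() strips
+		//stray XML markup and leftover corpus tags from each file,
+		//concatenateXML() streams every file into a single output file,
+		//and tagCorpus() wraps the result in the corpus-level open/close tags.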
+		//================== Creating corpus ==========================//
+		concat.cleanXML(sourceDir);
+		//concat.cleanXML(duplicatesDir);
+		concat.concatenateXML(sourceDir, "", concatCorpus);
+		concat.tagCorpus(tagCorpus);
+	}
+
+	/**
+	 * Reads the file IDs in a folder and
+	 * checks a second folder for duplicates.
+	 *
+	 * @param dirSrc source folder
+	 * @param dirDup folder to check for duplicates
+	 */
+
+	public void checkDupFolder(String dirSrc, String dirDup){
+		ArrayList<String> sourceIDs = new ArrayList<String>();
+		ArrayList<String> duplicated = new ArrayList<String>();
+		ArrayList<String> dupIDs = new ArrayList<String>();
+		int ids = 0;
+
+		if(dirSrc.contentEquals(dirDup)){
+			System.out.println("Source and duplicates directories are the same.\n\n========================\n");
+		}
+		else {
+
+			File sourceDir = new File(dirSrc);
+			File[] srcXMLs = sourceDir.listFiles(new FilenameFilter(){
+				@Override
+				public boolean accept(File dir, String name){
+					return name.endsWith(".xml");
+				}
+			});
+
+			try{
+				//for each file on the source dir
+				for (File xml : srcXMLs){
+
+					try{
+						BufferedReader reader = new BufferedReader(new FileReader(xml.getPath()));
+
+						String line = null;
+
+						String id = null;
+
+						while((line = reader.readLine()) != null){
+
+							line = line.replaceAll("\t","");
+							line = line.replace("\"", "");
+
+							//get the IDs of the new files:
+							//strip the open and close ID tags, keeping only the ID value
+							if (line.contains(getid())){
+
+								line = line.substring(line.indexOf(getid()));
+								line = line.replace(getid(), "");
+
+								id = line.replace(getendId(), "");
+
+								sourceIDs.add(id);
+
+								line = reader.readLine();
+								line = line.replaceAll("\t","");
+							}
+
+							if(line.contains(getOpenJournal())){
+								ids++;
+							}
+
+							line = line.replaceAll("\t","");
+							line = line.replace("\"", "");
+						}
+
+						reader.close();
+
+					}catch (FileNotFoundException e) {
+						e.printStackTrace();
+					}
+
+				}
+
+			}catch (FileNotFoundException e) {
+				e.printStackTrace();
+			}
+			catch(Exception e){
+				throw new RuntimeException(e);
+			}
+
+			System.out.println(ids + " source file IDs encountered.");
+			ids = 0;
+
+			File dupDir = new File(dirDup);
+
+			File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){
+				@Override
+				public boolean accept(File dir, String name){
+					return name.endsWith(".xml");
+				}
+			});
+
+			try{
+				//for each file on the possibly duplicated dir
+				for (File xml : dupXMLs){
+
+					try{
+						BufferedReader reader = new BufferedReader(new FileReader(xml.getPath()));
+
+						String line = null;
+
+						String id = null;
+
+						while((line = reader.readLine()) != null){
+
+							line = line.replaceAll("\t","");
+							line = line.replace("\"", "");
+
+							//get the IDs of the new files
+							if (line.contains(getid())){
+
+								line = line.substring(line.indexOf(getid()));
+								line = line.replace(getid(), "");
+
+								id = line.replace(getendId(), "");
+
+								dupIDs.add(id);
+								String dupFileID = id;
+
+								for(int j = 0; j < sourceIDs.size(); j++){
+									if(sourceIDs.get(j).equalsIgnoreCase(dupFileID)){
+										//moving the original file
+										Path from = xml.toPath(); //convert from File to Path
+										Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path
+										Files.move(from, to, StandardCopyOption.REPLACE_EXISTING);
+									}
+								}
+
+
+								line = reader.readLine();
+								line = line.replaceAll("\t","");
+							}
+
+							if(line.contains(getOpenJournal())){
+								ids++;
+							}
+
+							line = line.replaceAll("\t","");
+							line = line.replace("\"", "");
+						}
+
+						reader.close();
+
+					}catch (FileNotFoundException e) {
+						e.printStackTrace();
+					}
+
+				}
+
+			}catch (FileNotFoundException e) {
+				e.printStackTrace();
+			}
+			catch(Exception e){
+				throw new RuntimeException(e);
+			}
+
+			//count number of existing papers on possibly duplicated folder
+			//just to make sure we are gathering all IDs
System.out.println(ids + " new file IDs encountered."); + ids = 0; + + //for each possible duplicated ID, + //check if it exists on source folder ID list + //if yes, list the duplicated ones + for(int i = 0; i < dupIDs.size(); i++){ + for(int j = 0; j < sourceIDs.size(); j++){ + if(sourceIDs.get(j).equalsIgnoreCase(dupIDs.get(i))){ + duplicated.add(dupIDs.get(i)); + } + } + } + + //sorting the list of duplicated IDs + Collections.sort(duplicated, new Comparator(){ + @Override + public int compare(String one, String two){ + return one.compareTo(two); + } + }); + + System.out.println("\nReaded source files: " + sourceIDs.size()); + System.out.println("Readed new files: " + dupIDs.size()); + + System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n"); + + System.out.println("\nDuplicated files IDs: "); + for(int i = 0; i < duplicated.size(); i++){ + System.out.println(duplicated.get(i)); + } + + System.out.println("\n========================\n"); + } + + + } + + /** + * Reads the corpus and checks the papers IDs + * to identify duplicates in case new papers + * are being concatenated to corpus. + * + * @param corpus path to current corpora to check + * @param dir path to folder with new files to be concatenated + */ + + public void checkDupCorpus(String corpus, String dir){ + ArrayList trainingIDs = new ArrayList(); + ArrayList duplicated = new ArrayList(); + ArrayList newFiles = new ArrayList(); + + int ids = 0; + + try + { + BufferedReader reader = new BufferedReader(new FileReader(corpus)); + + String line = null; + String id = null; + + + while((line = reader.readLine()) != null){ + + line = line.replaceAll("\t",""); + line = line.replace("\"", ""); + + //on the previous training corpus + //find exact paper ID and store it + if (line.contains(getid())){ + + line = line.substring(line.indexOf(">", ""); + + id = line.replace(getendId(), ""); + + //insert paper ID to existing training file list + trainingIDs.add(id); + + line = reader.readLine(); + line = line.replaceAll("\t",""); + } + + //count number of existing papers on the training file + //just to make sure we are gathering all IDs + if(line.contains(getOpenJournal())){ + ids++; + } + + line = line.replaceAll("\t",""); + line = line.replace("\"", ""); + } + + reader.close(); + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + + System.out.println(ids + " training file IDs encountered."); + ids = 0; + + File corpusDir = new File(dir); + File[] newXMLs = corpusDir.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name){ + return name.endsWith(".xml"); + } + }); + + try{ + //for each file on the corpus dir + for (File xml : newXMLs){ + + try{ + BufferedReader reader = new BufferedReader(new FileReader(xml.getPath())); + + String line = null; + + String id = null; + + while((line = reader.readLine()) != null){ + + line = line.replaceAll("\t",""); + line = line.replace("\"", ""); + + //get the IDs of the new files + if (line.contains(getid())){ + + line = line.substring(line.indexOf(">", ""); + + id = line.replace(getendId(), ""); + + newFiles.add(id); + String newFileID = id; + + for(int j = 0; j < trainingIDs.size(); j++){ + if(trainingIDs.get(j).equalsIgnoreCase(newFileID)){ + //moving the original file + Path from = xml.toPath(); //convert from File to Path + Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path + Files.move(from, to, StandardCopyOption.REPLACE_EXISTING); + } + } + + + line = 
+							line = line.replaceAll("\t","");
+						}
+
+						if(line.contains(getOpenJournal())){
+							ids++;
+						}
+
+						line = line.replaceAll("\t","");
+						line = line.replace("\"", "");
+					}
+
+					reader.close();
+
+				}catch (FileNotFoundException e) {
+					e.printStackTrace();
+				}
+
+			}
+
+		}catch (FileNotFoundException e) {
+			e.printStackTrace();
+		}
+		catch(Exception e){
+			throw new RuntimeException(e);
+		}
+
+		//count number of existing papers on the training file
+		//just to make sure we are gathering all IDs
+		System.out.println(ids + " new file IDs encountered.");
+		ids = 0;
+
+		//for each new ID, check if it exists on training file ID list
+		//if yes, list the duplicated ones
+		for(int i = 0; i < newFiles.size(); i++){
+			for(int j = 0; j < trainingIDs.size(); j++){
+				if(trainingIDs.get(j).equalsIgnoreCase(newFiles.get(i))){
+					duplicated.add(newFiles.get(i));
+				}
+			}
+		}
+
+		//sorting the list of duplicated IDs
+		Collections.sort(duplicated, new Comparator<String>(){
+			@Override
+			public int compare(String one, String two){
+				return one.compareTo(two);
+			}
+		});
+
+		System.out.println("\nRead training files: " + trainingIDs.size());
+		System.out.println("Read new files: " + newFiles.size());
+
+		System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n");
+
+		System.out.println("\nDuplicated files IDs: ");
+		for(int i = 0; i < duplicated.size(); i++){
+			System.out.println(duplicated.get(i));
+		}
+
+		System.out.println("\n========================\n");
+
+	}
+
+
+	/**
+	 * Reads and edits the XML files in a folder
+	 * to remove XML markup and previous corpus tags,
+	 * preparing the files to be concatenated.
+	 *
+	 * @param dir string with folder path
+	 */
+
+	public void cleanXML(String dir){
+
+		//listing files on corpus dir
+		File sourceDir = new File(dir);
+
+		File[] newXMLs = sourceDir.listFiles(new FilenameFilter(){
+			@Override
+			public boolean accept(File dir, String name){
+				return name.endsWith(".xml");
+			}
+		});
+
+		System.out.println("... Files list loaded.");
+
+		try{
+			//for each file on the corpus dir
+			for (File xml : newXMLs){
+
+				try{
+					BufferedReader reader = new BufferedReader(new FileReader(xml.getPath()));
+
+					String line = null;
+					ArrayList<String> allLines = new ArrayList<String>();
+					String content = null;
+
+					while((line = reader.readLine()) != null){
+						content = line;
+
+						//cleaning XML markups
+						if(content.contains(getTag1())){
+							content = content.replace(getTag1(), "");
+						}
+						if(content.contains(getTag2())){
+							content = content.replace(getTag2(), "");
+						}
+						if(content.contains(getTag3())){
+							content = content.replace(getTag3(), "");
+						}
+
+						//cleaning previous corpus tags
+						if(content.contains(getOpenFile())){
+							content = content.replace(getOpenFile(), "");
+						}
+						if(content.contains(getendFile())){
+							content = content.replace(getendFile(), "");
+						}
+
+						//each cleaned line is stored exactly once
+						allLines.add(content);
+					}
+
+					PrintWriter writer = new PrintWriter(xml.getPath());
+
+					for (String l : allLines){
+						writer.println(l);
+					}
+					reader.close();
+					writer.close();
+
+				}catch (FileNotFoundException e) {
+					e.printStackTrace();
+				}
+
+			}
+
+		}catch (FileNotFoundException e) {
+			e.printStackTrace();
+		}
+		catch(Exception e){
+			throw new RuntimeException(e);
+		}
+
+		System.out.println("... Files cleaned and saved.");
+		System.out.println("Ready for concatenation.");
+		System.out.println("\n========================\n");
+	}
+
+	/**
+	 * Concatenates all XML files in one folder, or across two folders.
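+	 * Files are appended byte-for-byte through a buffered stream, so each
+	 * input must already be a clean standalone fragment (see cleanXML).
+	 * An illustrative call, mirroring the paths built in main():
+	 * {@code concat.concatenateXML(sourceDir, "", concatCorpus)}
+	 *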
+	 * @param sourceDir main directory with XML files.
+	 * @param duplicDir second directory with duplicated XML files
+	 * @param concatFile path name of the saved concatenated corpus
+	 */
+
+	public void concatenateXML(String sourceDir, String duplicDir, String concatFile){
+
+		final int BUFFER = 1024 << 8;
+		byte[] buffer = new byte[BUFFER];
+
+		//listing files on corpus dir
+		File srcDir = new File(sourceDir);
+		File[] srcXMLs = srcDir.listFiles(new FilenameFilter(){
+			@Override
+			public boolean accept(File dir, String name){
+				return name.endsWith(".xml");
+			}
+		});
+
+		File dupDir = new File(duplicDir);
+		File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){
+			@Override
+			public boolean accept(File dir, String name) {
+				return name.endsWith(".xml");
+			}
+		});
+
+		System.out.println("... Files list loaded.");
+
+		//defining the output file (concatenated)
+		File newCorpus = new File(concatFile);
+
+		try{
+			OutputStream output = new BufferedOutputStream(new FileOutputStream(newCorpus));
+
+
+			//for each file on the corpus dir
+			for (File xmls : srcXMLs){
+				InputStream input = new FileInputStream(xmls);
+				int count;
+
+				//while the file has not been fully read
+				try{
+					while((count = input.read(buffer)) >= 0){
+
+						//write it on the concatenated final file
+						output.write(buffer, 0, count);
+					}
+				}finally{
+					input.close();
+				}
+			}
+
+			if(dupXMLs != null){
+				for(File xmld : dupXMLs){
+					InputStream input = new FileInputStream(xmld);
+					int count;
+
+					//while the file has not been fully read
+					try{
+						while((count = input.read(buffer)) >= 0){
+
+							//write it on the concatenated final file
+							output.write(buffer, 0, count);
+						}
+					}finally{
+						input.close();
+					}
+				}
+			}
+			output.flush();
+			output.close();
+
+		}catch (FileNotFoundException e) {
+			e.printStackTrace();
+		}
+		catch(Exception e){
+			throw new RuntimeException(e);
+		}
+
+		System.out.println("... File concatenated and saved.");
+		System.out.println("Ready for corpus tagging.");
+		System.out.println("\n========================\n");
+	}
+
+	/**
+	 * Inserts the corpus tags in the XML file
+	 *
+	 * @param pathToCorpus path to
+	 * concatenated corpus
+	 */
+
+	public void tagCorpus(String pathToCorpus){
+
+		//tagging as corpus
+		try{
+			BufferedReader reader = new BufferedReader(new FileReader(pathToCorpus));
+
+			String line = null;
+			List<String> allLines = new ArrayList<String>();
+
+			//adds tag at beginning of corpus
+			allLines.add(getOpenFile());
+
+			while((line = reader.readLine()) != null){
+
+				allLines.add(line);
+			}
+			//adds tag at the end of corpus
+			allLines.add(getendFile());
+
+			System.out.println("... Corpus loaded and tagged.");
+			//re-writing the file
+			PrintWriter writer = new PrintWriter(pathToCorpus);
+
+			for (String l : allLines){
+				writer.println(l);
+			}
+			reader.close();
+			writer.close();
+
+			System.out.println("...
File saved as tagged corpus."); + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch(IOException e){ + e.printStackTrace(); + } + } + + public String getTag1() { + return tag1; + } + + public void setTag1(String tag1) { + this.tag1 = tag1; + } + + public String getTag2() { + return tag2; + } + + public void setTag2(String tag2) { + this.tag2 = tag2; + } + + public String getTag3() { + return tag3; + } + + public void setTag3(String tag3) { + this.tag3 = tag3; + } + + +} + + diff --git a/src/analyse/Extractor.java b/src/analyse/Extractor.java new file mode 100644 index 0000000..c97cfa7 --- /dev/null +++ b/src/analyse/Extractor.java @@ -0,0 +1,442 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package analyse; + +import java.io.BufferedWriter; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +/** + * Implements common tools to FeatureExtractor + * and NgramExtractor classes that are used to + * extract features from doc instances + * + * @author halmeida + * + */ +public class Extractor { + + //String pathFile; + String id; + String endId; + String openFile; + String endFile; + String openAbst; + String closeAbst; + String abstractLabel; + String openEC; + String closeEC; + String classTag; + String openTitle; + String closeTitle; + String openJournal; + String closeJournal; + String copyR; + String closeCopyR; + + /** + * Replaces special characters to clean + * text for tokenizing. 
+ * + * @param str text to be cleaned + * @return string with cleaned text + */ + public String removeSpecialChar(String str){ + str = str.replace("}", ""); + str = str.replace("{", ""); + str = str.replace("]", ""); + str = str.replace("[", ""); + str = str.replace("#", ""); + str = str.replace("*", ""); + str = str.replace(">", ""); + str = str.replace("&apos", ""); + str = str.replace("%", ""); + str = str.replace(""", ""); + str = str.replace("&", ""); + str = str.replace("=", ""); + str = str.replace("?", ""); + str = str.replace(";", ""); + str = str.replace(":", ""); + str = str.replace(",", ""); + str = str.replace(".", ""); + str = str.replace(")", ""); + str = str.replace("(", ""); + str = str.replace("\t\t", "\t"); + str = str.replace("-", ""); + str = str.replace(" ", ""); + + return str; + } + + /** + * Handles external tags (and multiple abstract + * text tags) present in a single paper + * @param str abstract content + * @return string without external tags + */ + + public String processAbstract(String str){ + str = str.replace(" ", ""); + String[] remove = str.split(""); + StringBuilder sb = new StringBuilder(); + String temp = ""; + String abstrac = ""; + + for(int i = 0; i < remove.length; i++){ + temp = temp + remove[i]; + + if(temp.contains(""))); + } + if(temp.contains("Copyright ")){ + temp = ""; + do{ + i++; + //an exception here can mean that a copyright information + //tag content did not ended with a period + }while(!(remove[i]).equalsIgnoreCase(".")); + } + else sb.append(remove[i]); + } + + abstrac = sb.toString(); + abstrac = removeAbstractTags(abstrac); + + return abstrac; + } + + + /** + * Removes specific tags encountered on Abstract texts. + * This is used to clean the abstract text before + * processing the feature count on the model. + * @param str + * @return + */ + + public String removeAbstractTags(String str){ + //this order of removing tags matters to + //exclude the first tag from the abstracts. + + str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("Copyright", ""); + str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("", ""); + str = str.replace("", ""); + + return str; + } + + + /** + * Removes the markup annotations of a + * text field, and keeps its content + * + * @param str text containing markups + * @return string with cleaned text + */ + public String removeTags(String str) { + String[] remove = str.split(""); + StringBuilder sb = new StringBuilder(); + + for(int i = 0; i < remove.length; i++){ + + if(remove[i].equalsIgnoreCase("<")){ + do{ + i++; + } + while(!(remove[i].equalsIgnoreCase(">"))); + } + else sb.append(remove[i]); + } + + return sb.toString(); + } + + + /** + * Displays the keys and values of the + * maps created. 
+ * + * @param hash HashMap containing list, + * values, counts + */ + public void displayList(HashMap hash){ + Iterator itr = hash.keySet().iterator(); + int sum = 0; + while(itr.hasNext()){ + Object str = itr.next(); + System.out.println("key: "+str+"\t value: "+hash.get(str)); + } + } + + + /** + * Exports hashmap of values extracted + * from dataset to external file + * + * @param location folder, file name and file extension + * @param list values to be exported + */ + public void exportFile(String location, HashMap list){ + + String SEPARATOR = "\t"; + StringBuffer line = new StringBuffer(); + Iterator itr = list.keySet().iterator(); + + try{ + BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(location), "UTF-8")); + + while(itr.hasNext()){ + Object str = itr.next(); + if(str != null){ + line.append(str).append(SEPARATOR).append(list.get(str)); + if(line.toString().contains("=")) + line.replace(line.indexOf("="), line.indexOf("=")+1,SEPARATOR); + //handling specificities from title content extraction + if(line.toString().contains(",")) + line.replace(line.indexOf(","), line.indexOf(",")+1,SEPARATOR); + } + if(itr.hasNext()){ + //writer.newLine(); + line.append("\n"); + } + writer.write(removeSpecialChar(line.toString())); + line.replace(0, line.length(), ""); + //writer.newLine(); + } + writer.flush(); + writer.close(); + } + catch(UnsupportedEncodingException e){ + e.printStackTrace(); + } + catch(FileNotFoundException e){ + e.printStackTrace(); + } + catch(IOException e){ + e.printStackTrace(); + } + + + //} + } + + + /** + * Exports list of values extracted + * from dataset to a string variable + * + * @param list list of values to be exported + * @return string containing values on list + * @deprecated + */ + public String exportContent(HashMap list){ + String SEPARATOR = "\t"; + Iterator itr = list.keySet().iterator(); + StringBuffer export = new StringBuffer(); + //try{ + while(itr.hasNext()){ + String str = itr.next(); + if(str != null){ + export.append(str).append(SEPARATOR).append(list.get(str)); + + if(export.toString().contains("=")) + export.replace(export.indexOf("="), export.indexOf("=")+1,SEPARATOR); + } + + if(itr.hasNext()){ + export.append("\n"); + } + } + /*} + catch(Exception e){ + + }*/ + + return removeSpecialChar(export.toString()); + } + + + /** + * Exports list of values extracted + * from dataset to external file + * + * @param location folder, file name and file extension + * @param list list of values to be exported + * + */ + public void exportList(String location, ArrayList list){ + + String SEPARATOR = "\n"; + StringBuffer line = new StringBuffer(); + + try{ + BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(location), "UTF-8")); + + for(int i = 0; i < list.size(); i++){ + String str = list.get(i); + if(str != null){ + line.append(str).append(SEPARATOR); + } + } + writer.write(removeSpecialChar(line.toString())); + + writer.flush(); + writer.close(); + } + catch(UnsupportedEncodingException e){ + e.printStackTrace(); + } + catch(FileNotFoundException e){ + e.printStackTrace(); + } + catch(IOException e){ + e.printStackTrace(); + } + + } + + + public void initialize(){ + + } + + + /** + * Accessors and mutators methods + * for Extractor variables. 
+ * @return + */ + /*public String getPathFile() { + return pathFile; + } + public void setPathFile(String pathFile) { + this.pathFile = pathFile; + }*/ + public String getid() { + return id; + } + public void setid(String id) { + this.id = id; + } + public String getendId() { + return endId; + } + public void setendId(String endId) { + this.endId = endId; + } + public String getOpenFile() { + return openFile; + } + public void setOpenFile(String openFile) { + this.openFile = openFile; + } + public String getendFile() { + return endFile; + } + public void setendFile(String endFile) { + this.endFile = endFile; + } + public String getopenAbst() { + return openAbst; + } + public void setopenAbst(String openAbst) { + this.openAbst = openAbst; + } + public String getcloseAbst() { + return closeAbst; + } + public void setcloseAbst(String closeAbst) { + this.closeAbst = closeAbst; + } + public String getOpenEC() { + return openEC; + } + public void setOpenEC(String openEC) { + this.openEC = openEC; + } + public String getCloseEC() { + return closeEC; + } + public void setCloseEC(String closeEC) { + this.closeEC = closeEC; + } + public String getAbstractLabel() { + return abstractLabel; + } + public void setAbstractLabel(String abstractLabel) { + this.abstractLabel = abstractLabel; + } + public String getClassTag() { + return classTag; + } + public void setClassTag(String classTag) { + this.classTag = classTag; + } + public String getOpenTitle() { + return openTitle; + } + public void setOpenTitle(String titleTag) { + this.openTitle = titleTag; + } + public String getCloseTitle() { + return closeTitle; + } + public void setCloseTitle(String closeTitle) { + this.closeTitle = closeTitle; + } + public String getOpenJournal() { + return openJournal; + } + public void setOpenJournal(String openJournal) { + this.openJournal = openJournal; + } + public String getCloseJournal() { + return closeJournal; + } + public void setCloseJournal(String closeJournal) { + this.closeJournal = closeJournal; + } + +} \ No newline at end of file diff --git a/src/analyse/FeatureExtractor.java b/src/analyse/FeatureExtractor.java new file mode 100644 index 0000000..4ca93aa --- /dev/null +++ b/src/analyse/FeatureExtractor.java @@ -0,0 +1,591 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package analyse; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import org.apache.commons.lang3.StringUtils; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import configure.PathConstants; + + +/** + * This class extracts and parses domain + * annotation features from doc instances + * + * @author halmeida + */ + +public class FeatureExtractor extends Extractor{ + + public FeatureExtractor(){ + + this.id = ""; + this.endId = ""; + this.endFile = ""; + this.openAbst = ""; + this.closeAbst = ""; + this.abstractLabel = ",Integer> abstract_count = new HashMap,Integer>(); + //store all features, type and classification + HashMap,String> abstract_type = new HashMap,String>(); + + //store title features, type and classification + HashMap,String> title_type = new HashMap,String>(); + //store title features, type and count + HashMap, Integer> title_count = new HashMap, Integer>(); + //store title features, whole journal title content and classification + HashMap,String> title_content = new HashMap,String>(); + + //store title content and EC numbers + ArrayList ec_numbers = new ArrayList(); + + fextrac.initialize(); + int jTitle = 0; + + try + { + BufferedReader reader = new BufferedReader(new FileReader(AnCorpus)); + + //--------------------------- + // repeat until all lines of the file are read + //--------------------------- + String line = null; + String features = null; + // String id = null; + + + while((line = reader.readLine()) != null){ + + line = line.replaceAll("\t",""); + line = line.replace("\"", ""); + + //find paper ID and store it + if (line.contains(fextrac.getid())){ + line = line.replace(fextrac.getid(), ""); + // id = line.replace(fextrac.getendId(), ""); + + //continue reading + features = reader.readLine(); + features = features.replaceAll("\t",""); + + String journal = ""; + + //continue reading until the end of file + while(!(features.contentEquals(fextrac.getendFile()))){ + + //find relevant doc section - Journal title + if(features.contains(fextrac.getOpenJournal())){ + + features = features.replace(fextrac.getOpenJournal(),""); + features = features.replace(fextrac.getCloseJournal(), ""); + features = fextrac.removeSpecialChar(features); + + //separating only the journal title content + journal = fextrac.removeTags(features); + //counting # of journal titles + jTitle++; + + features = reader.readLine(); + features = features.replaceAll("\t",""); + } + + //find relevant doc section - Article title + if(features.contains(fextrac.getOpenTitle())){ + + features = features.replace(fextrac.getOpenTitle(),""); + features = features.replace(fextrac.getCloseTitle(), ""); + features = fextrac.removeSpecialChar(features); + + //separating the title by annotations + String title_annotation = features; + + //extracting annotations and inserting them on lists + fextrac.annotations(title_annotation, title_count, title_type, pathVars); + fextrac.addContent(title_annotation, journal, title_content); + + features = reader.readLine(); + features = features.replaceAll("\t",""); + } + + if(features.contains(fextrac.getAbstractLabel())){ + + String temp = ""; + String newAbs = fextrac.getopenAbst(); + + //handling cases when the tag is already within abstract content + if(features.contains("")){ + temp = temp + 
fextrac.processAbstract(features); + } + else{ + do{ + temp = temp + fextrac.processAbstract(features); + features = reader.readLine(); + }while(!(features.contains(""))); + } + newAbs = newAbs + temp; + features = newAbs + fextrac.getcloseAbst(); + } + + //find relevant doc section - Abstract + if(features.contains(fextrac.getopenAbst())){ + + features = features.replace(fextrac.getopenAbst(),""); + features = features.replace(fextrac.getcloseAbst(), ""); + features = fextrac.removeSpecialChar(features); + + //handle lines in which abstract text tag + //is separated from the actual text + if(features.isEmpty()){ + features = reader.readLine(); + features = features.replaceAll("\t",""); + features = features.replace(fextrac.getopenAbst(),""); + features = features.replace(fextrac.getcloseAbst(), ""); + features = fextrac.removeSpecialChar(features); + } + + features = fextrac.removeAbstractTags(features); + + //gathering abstract annotations + String abstrac = features; + + //extract annotations and insert them on lists + fextrac.annotations(abstrac, abstract_count, abstract_type, pathVars); + + features = reader.readLine(); + features = features.replaceAll("\t",""); + //features = features.replaceAll("\\s+", ""); + } + + //identifying EC number + if(features.contains(fextrac.getOpenEC())){ + features = features.replace(fextrac.getOpenEC(), ""); + features = features.replace(fextrac.getCloseEC(), ""); + features = fextrac.removeSpecialChar(features); + + ec_numbers.add(features); + + features = reader.readLine(); + features = features.replaceAll("\t",""); + } + + //find classification of the document + if(features.contains(fextrac.getClassTag())){ + + //adding classification to the list of annotations + String classif = fextrac.getClassif(features); + fextrac.addClass(classif, abstract_type); + fextrac.addClass(classif, title_type); + fextrac.addClass(classif, title_content); + + features = reader.readLine(); + features = features.replaceAll("\t",""); + } + + features = reader.readLine(); + features = features.replaceAll("\t",""); + + } + + } + + } + + reader.close(); + + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + + //Use for sample output + //System.out.println("\n===========TITLE==ANNOTATIONS============="); + //fextrac.displayList(title_count); + //fextrac.displayList(title_type); + //fextrac.displayList(title_content); + //System.out.println("\n========ABSTRACT==ANNOTATIONS============="); + //fextrac.displayList(abstract_count); + //fextrac.displayList(abstract_type); + + //Before exporting, take into account the + //occurence of all extracted features + fextrac.considerOccurence(abstract_count, pathVars); + fextrac.considerOccurence(title_count, pathVars); + + + System.out.println("\n===========FEATURE==EXPORT==============="); + fextrac.exportList(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.ECNUM_FEATURES, ec_numbers); + System.out.println("..."+ ec_numbers.size()+" EC numbers saved."); + fextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.ANNOTATION_FEATURES, abstract_count); + System.out.println("..."+ abstract_count.size()+" unique Abstract annotations saved."); + fextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.TITLE_FEATURES, title_count); + System.out.println("..."+ title_count.size() +" unique Title annotations saved."); + fextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.JOURNAL_TITLE_FEATURES, title_content); + 
System.out.println("..."+jTitle+" Journal titles saved."); + System.out.println("\n=========================================\n"); + + } + + /** + * Identifies the classification on doc + * + * @param clas text containing classification (after char removal) + * @return classification of doc + */ + private String getClassif(String clas) { + + //parsing the not edited text into HTML using Jsoup + Document doc = Jsoup.parseBodyFragment(clas); + //saving the text as an Jsoup element, with a main tag (the HTML body), + //attributes and child nodes (TRIAGE tags) + Element text = doc.body(); + + Elements classification = text.getElementsByTag("TRIAGE"); + + return classification.text(); + } + + /** + * Inserts the classification + * on the list of features + * + * @param class information to insert on list + * @param list list of features used + */ + private void addClass(String element, HashMap, String> list){ + //going over list to insert + //classif on document instances + Iterator>it = list.keySet().iterator(); + + while(it.hasNext()){ + Map str = it.next(); + + if(list.get(str).contains("positive") || list.get(str).contains("negative")){ + + } + else list.put(str, element); + } + } + + + /** + * Removes from feature list all features with + * frequency not statistically relevant (2 or less) + * @param list to be cleaned + */ + private void considerOccurence(HashMap,Integer> list, PathConstants vars){ + //going over the list of annotations and removing the + //features with occurance lower than specified. + + Iterator> iterator = list.keySet().iterator(); + + while(iterator.hasNext()){ + Map key = iterator.next(); + int valor = list.get(key).intValue(); + + if(valor < Integer.parseInt(vars.FEATURE_MIN_FREQ)){ + iterator.remove(); + } + } + } + + + /** + * Extract the annotations from a determined section + * of the document and add them to the specified lists. 
+	 * @param annot cleaned and split line with the annotation
+	 * @param count list that holds annotation, its type and its count
+	 * @param type list that holds annotation, its type and its classification
+	 */
+	private void annotations(String annot, HashMap<Map<String,String>, Integer> count, HashMap<Map<String,String>,String> type, PathConstants pathVars) {
+		HashMap<String,String> features = loadAnnotationEntities();
+		PathConstants pathVar = new PathConstants();
+		NgramExtractor nextrac = new NgramExtractor();
+		ArrayList<String> content = new ArrayList<String>();
+
+		//parsing the unedited text as HTML using Jsoup
+		Document doc = Jsoup.parseBodyFragment(annot);
+		//saving the text as a Jsoup element, with a main tag (the HTML body),
+		//attributes and child nodes (annotation tags)
+		Element annotations = doc.body();
+
+		//iterating over list of entities
+		for(Map.Entry<String,String> value : features.entrySet()){
+
+			String an_type = value.getKey();
+			String an_level = value.getValue();
+
+			//for each entity, find the annotations on abstract
+			Elements annots = annotations.getElementsByTag(an_type);
+
+			//for each annotation found,
+			for(Element an : annots){
+
+				//grabbing annotation content:
+				//if the annotation is made on the sentence level:
+				if(an_level.contains("sentence")){
+
+					//checking if the sentence contains inner annotations
+					if(an.childNodeSize() != 0){
+
+						//going over list of inner annotations
+						for(Element child : an.children()){
+
+							//if child is sentence (sentence inside of sentence),
+							//then add the annotations as ngrams
+							if(features.get(child.nodeName()).contains("sentence")) {
+								content.addAll(nextrac.nGrams(child.text(), pathVar));
+								insertAnnotation(content, an.nodeName(), count, type, pathVars);
+							}
+							//adding annotations on sentence as they are - no ngrams here
+							else {
+								content.add(child.text());
+								insertAnnotation(content, an.nodeName(), count, type, pathVars);
+							}
+						}
+
+						//removing inner annotations from sentence, they are already added
+						Element tempAnnot = an.clone();
+						tempAnnot.children().remove();
+
+						//splitting what is left of the sentence into ngrams
+						content.addAll(nextrac.nGrams(tempAnnot.text(), pathVar));
+						insertAnnotation(content, an.nodeName(), count, type, pathVars);
+					}
+
+				}
+				else {
+					//keeping original annotation content for other cases
+					content.add(an.text());
+					insertAnnotation(content, an.nodeName(), count, type, pathVars);
+				}
+			}
+
+		}
+
+	}
+
+
+	/**
+	 * Insert an annotation (or the ngram list of an annotation)
+	 * on the lists, used by the annotations() method
+	 * @param content content of annotation
+	 * @param an_type type extracted from text (entity)
+	 * @param count list of annotations and their count
+	 * @param type list of annotations and their type
+	 */
+	private void insertAnnotation(ArrayList<String> content, String an_type, HashMap<Map<String,String>, Integer> count, HashMap<Map<String,String>,String> type, PathConstants pathVars){
+
+		//iterating over list of annotations
+		for(int i = 0; i < content.size(); i++){
+
+			if(content.get(i).length() >= Integer.parseInt(pathVars.FEATURE_MIN_LENGTH)){
+
+				//creating the list key as: content - type mapping
+				Map<String,String> an_content = new HashMap<String,String>();
+				an_content.put(content.get(i), an_type);
+
+				//for each annotation (or ngram on annotation)
+				//insert content and related type
+				if(count.containsKey(an_content)){
+					try{
+						int cnt = count.get(an_content);
+						count.put(an_content, cnt+1);
+
+					}catch(Exception e){
+						count.put(an_content, 1);
+					}
+				}
+				else{
+					count.put(an_content, 1);
+				}
+				//populating list of feature_an_types, with:
+				//feature--an_type--class
+				type.put(an_content, "");
+			}
+		}
+
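+		//the same list instance is passed in and reused by the caller
+		//across annotations, so empty it once its entries are stored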
+		content.clear();
+
+	}
+
+
+	/**
+	 * Inserts the text (e.g. title) content into
+	 * a list of features (e.g. title features)
+	 *
+	 * @param annot text with the annotations to be handled
+	 * @param wContent whole field to be added on the list of features
+	 * @param list list of features used
+	 *
+	 */
+	private void addContent(String annot, String wContent, HashMap<Map<String,String>,String> list) {
+
+		HashMap<String,String> features = loadAnnotationEntities();
+		ArrayList<String> content = new ArrayList<String>();
+		NgramExtractor nextrac = new NgramExtractor();
+		PathConstants pathVar = new PathConstants();
+
+		//parsing the unedited text as HTML using Jsoup
+		Document doc = Jsoup.parseBodyFragment(annot);
+		//saving the text as a Jsoup element, with a main tag (the HTML body),
+		//attributes and child nodes (annotation tags)
+		Element annotations = doc.body();
+
+		//iterating over annotation types
+		for(Map.Entry<String,String> value : features.entrySet()){
+
+			String an_type = value.getKey();
+			String an_level = value.getValue();
+
+			//for each annotation type, find all related annotations on the abstract
+			Elements annots = annotations.getElementsByTag(an_type);
+
+			//for each annotation type,
+			for(Element an : annots){
+
+				//grab annotation content
+				if(an_level.contains("sentence"))
+					//splitting in ngrams for sentence level annotations
+					content = nextrac.nGrams(an.text(), pathVar);
+				else
+					//keeping original annotation for other cases
+					content.add(an.text());
+
+				//iterating over list of annotations
+				for(int i = 0; i < content.size(); i++){
+
+					Map<String,String> an_content = new HashMap<String,String>();
+					an_content.put(content.get(i), wContent);
+
+					//populating list of feature_an_types, with:
+					//feature--an_type--class
+					list.put(an_content, "");
+				}
+				content.clear();
+			}
+		}
+	}
+
+
+	/**
+	 * Loads the list of entities from the
+	 * external file entities.txt
+	 *
+	 * @return map of entity names to their annotation levels
+	 */
+	public HashMap<String,String> loadAnnotationEntities(){
+
+		String pathEntities = "entities.txt";
+		HashMap<String,String> values = new HashMap<String,String>();
+
+		try{
+			BufferedReader reader = new BufferedReader(new FileReader(pathEntities));
+
+			String line = null;
+
+			while((line = reader.readLine()) != null){
+
+				String[] value = StringUtils.split(line, " ");
+				values.put(value[0].toLowerCase(), value[1].toLowerCase());
+			}
+
+			reader.close();
+
+		}catch (FileNotFoundException e) {
+			e.printStackTrace();
+		} catch (IOException e) {
+			e.printStackTrace();
+		}
+		//String[] entities = values.toArray(new String[values.size()]);
+
+		return values;
+	}
+
+
+	/**
+	 * Handles the content of annotations; when
+	 * there are multiple elements, they are
+	 * concatenated after extraction
+	 *
+	 * @param str list of annotation elements
+	 * @return single string with all elements
+	 */
+	public String contentToString(String[] str){
+		String cont = "";
+
+		for(int i = 0; i < str.length; i++){
+			if(cont.contentEquals("")){
+				cont = cont + str[i];
+			}
+			else cont = cont+" "+ str[i];
+
+		}
+
+		return cont;
+	}
+
+
+
+}
diff --git a/src/analyse/NgramExtractor.java b/src/analyse/NgramExtractor.java
new file mode 100644
index 0000000..e3a8085
--- /dev/null
+++ b/src/analyse/NgramExtractor.java
@@ -0,0 +1,442 @@
+/*
+ * The MIT License (MIT)
+
+Copyright (c) 2014
+
+Hayda Almeida
+Marie-Jean Meurs
+
+Concordia University
+Tsang Lab
+
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute,
sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package analyse; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; +import configure.PathConstants; + +/** + * This class extracts and parses n-grams + * from doc instances. + * + * @author halmeida + */ + +public class NgramExtractor extends Extractor{ + + public NgramExtractor(){ + this.id = ""; + this.endId = ""; + this.endFile = ""; + this.openAbst = ""; + this.closeAbst = ""; + this.abstractLabel = " ngram_count = new HashMap(); + //store abstract ngrams, count and "relevance(TBD)" + HashMap,Integer> ngrams = new HashMap,Integer>(); + //store title ngrams and its count + HashMap ngram_title_count = new HashMap(); + //store title ngrams, count and "relevance(TBD)" + HashMap,Integer> ngram_title = new HashMap,Integer>(); + + nextrac.initialize(); + + try + { + BufferedReader reader = new BufferedReader(new FileReader(AnCorpus)); + + //--------------------------- + // repeat until all lines + // of the file are read + //--------------------------- + String line = null; + String features = null; + String id = null; + + + while((line = reader.readLine()) != null){ + + line = line.replaceAll("\t",""); + line = line.replace("\"", ""); + + //find paper ID and store it + if (line.contains(nextrac.getid())){ + line = line.replace(nextrac.getid(), ""); + id = line.replace(nextrac.getendId(), ""); + + //keep reading the file + features = reader.readLine(); + features = features.replaceAll("\t",""); + + String tit_content = ""; + + //continue reading until the end of file + while(!(features.contentEquals(nextrac.getendFile()))){ + + String abstrac = ""; + + //find relevant doc section - Article title + if(features.contains(nextrac.getOpenTitle())){ + + //cleaning title content + features = features.replace(nextrac.getOpenTitle(),""); + features = features.replace(nextrac.getCloseTitle(), ""); + features = nextrac.removeSpecialChar(features); + tit_content = nextrac.removeTags(features); + + //extract n-grams from section + ArrayList title_c = nextrac.nGrams(tit_content, pathVars); + nextrac.addNGram(title_c, ngram_title_count,ngram_title, pathVars); + + features = reader.readLine(); + features = features.replaceAll("\t",""); + } + + + if(features.contains(nextrac.getAbstractLabel())){ + + String temp = ""; + String newAbs = nextrac.getopenAbst(); + + if(features.contains("")){ + temp = temp + nextrac.processAbstract(features); + } + else{ + do{ + temp = temp + nextrac.processAbstract(features); + features = reader.readLine(); + }while(!(features.contains(""))); + } + + newAbs = newAbs + temp; + features = newAbs + 
nextrac.getcloseAbst(); + } + + //find relevant paper section + if(features.contains(nextrac.getopenAbst())){ + + features = features.replace(nextrac.getopenAbst(),""); + features = features.replace(nextrac.getcloseAbst(), ""); + features = features.replace("-", " "); + features = nextrac.removeSpecialChar(features); + + //handle lines in which abstract text tag + //is separated from the actual text + if(features.isEmpty()){ + features = reader.readLine(); + features = features.replaceAll("\t",""); + features = features.replace(nextrac.getopenAbst(),""); + features = features.replace(nextrac.getcloseAbst(), ""); + features = features.replace("-", " "); + features = nextrac.removeSpecialChar(features); + } + + //features = nextrac.removeSpecialChar(features); + abstrac = nextrac.removeTags(features); + abstrac = nextrac.removeAbstractTags(abstrac); + //extract n-grams from section + ArrayList abstract_c = nextrac.nGrams(abstrac, pathVars); + nextrac.addNGram(abstract_c, ngram_count, ngrams, pathVars); + + //keep reading file + features = reader.readLine(); + features = features.replaceAll("\t",""); + //features = features.replaceAll("\\s+", ""); + } + + features = reader.readLine(); + features = features.replaceAll("\t",""); + //features = features.replaceAll("\\s+", ""); + } + } + } + + reader.close(); + + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + + //print list of extracted n-grams + //System.out.println("\n========ABSTRACT==NGRAMS============="); + //nextrac.displayList(ngram_count); + //nextrac.displayList(ngram_title); + //System.out.println("\n===========TITLE==NGRAMS============="); + //nextrac.displayList(ngram_title_count); + + + nextrac.considerOccurance(ngram_count, pathVars); + nextrac.considerOccurance(ngram_title_count, pathVars); + + + System.out.println("\n===========NGRAMS==EXPORT===============\n"); + nextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.NGRAM_FEATURES, ngram_count); + System.out.println("..."+ ngram_count.size()+" unique Abstract ngrams saved."); + nextrac.exportFile(pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.TITLE_NGRAMS, ngram_title_count); + System.out.println("... 
"+ ngram_title_count.size() +" unique Title ngrams saved."); + System.out.println("\n========================================\n"); + + } + + + /** + * Removes from feature list all features with + * frequency not statistically relevant (2 or less) + * @param list to be cleaned + */ + + private void considerOccurance(HashMap list, PathConstants vars){ + //going over the list of annotations and removing the + //statistically not significant features - frequency less than 2 + + Iterator iterator = list.values().iterator(); + + while(iterator.hasNext()){ + Integer key = iterator.next(); + + if(key < Integer.parseInt(vars.FEATURE_MIN_FREQ)){ + iterator.remove(); + } + } + } + + private void addNGram(ArrayList str, HashMap list_count, HashMap,Integer> list, PathConstants pathVars){ + + for(int i = 0; i < str.size(); i++){ + String currentNGram = str.get(i); + + if(list_count.containsKey(currentNGram)){ + int count = list_count.get(currentNGram); + list_count.put(currentNGram, count+1); + + /*if(list.containsKey(currentNGram)){ + int cnt = list.get(currentNGram).get(certainty); + list.get(currentNGram).put(certainty, cnt+1); + } + else{ + list.get(currentNGram).put(certainty, 1); + }*/ + } + else { + if(currentNGram.length() >= Integer.parseInt(pathVars.FEATURE_MIN_LENGTH)){ + list_count.put(currentNGram, 1); + + /* list.put(currentNGram, new HashMap()); + list.get(currentNGram).put(certainty, 1);*/ + } + } + } + } + + /** + * Extracts n-grams from the content field + * and populates mapping with n-gram +count + * @param str + * @param id + * @param gram + */ + + public ArrayList nGrams(String str, PathConstants pathVar){ + + //cleaning further chars on sentence + str = str.replace("/", ""); + str = str.replace("\\", ""); + str = str.replace(" ", "-"); + //Tokenize the sentence + String[] words = StringUtils.split(str,"-"); + ArrayList ngramList = new ArrayList(); + + int ngram =Integer.parseInt(pathVar.NGRAM_SIZE); + + if(Boolean.valueOf(pathVar.NGRAM_STOP)){ + words = StringUtils.split(removeStopList(words, pathVar)," "); + } + + for(int i=0; i < words.length - (ngram - 1); i++){ + switch(pathVar.NGRAM_SIZE){ + case "1": + ngramList.add(words[i].toLowerCase()); + break; + case "2": + ngramList.add(words[i].toLowerCase()+" "+words[i+1].toLowerCase()); + break; + case "3": + ngramList.add(words[i].toLowerCase()+" "+words[i+1].toLowerCase()+" "+words[i+2].toLowerCase()); + break; + } + } + + return ngramList; + } + + /** + * Removes the stopwords from ngrams list + * + * @param str list of ngrams + * @param pathVar constants from + * @return + */ + + public String removeStopList(String[] str, PathConstants pathVar){ + + String pathStop = "stopList.txt"; + String[] stop = null; + StringBuilder cleaned = new StringBuilder(); + + try{ + + BufferedReader reader = new BufferedReader(new FileReader(pathStop)); + + String line = null; + + while((line = reader.readLine()) != null){ + stop = StringUtils.split(line,","); + line = reader.readLine(); + } + + reader.close(); + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + + for(int i = 0; i < str.length; i++){ + for(int j = 0; j < stop.length; j++){ + + if(str[i].equalsIgnoreCase(stop[j])){ + str[i] = str[i].replace(str[i],"*"); + } + } + if(!(str[i].contentEquals("*"))){ + cleaned.append(str[i]).append(" "); + } + } + return cleaned.toString().replace(" ", " "); + } + + /** + * Evaluates the level of certainty... + * TBD!!! 
+ * @param list map of n-grams to relevance counts
+ * @return the certainty label
+ */
+ public String getCertainty(HashMap<String,HashMap<String,Integer>> list){
+
+ //placeholder for the certainty label; the weighting
+ //logic below is still incomplete (see TBD above)
+ String certainty = "";
+
+ ArrayList gramsAr = new ArrayList(list.entrySet());
+
+ Iterator itr = gramsAr.iterator();
+ while(itr.hasNext()){
+ String str = itr.next().toString();
+ String[] splitted = StringUtils.split(str,"=");
+
+ int relevance = 0;
+ int count = 0;
+
+ try{
+ count = list.get(splitted[0]).get(certainty);
+ } catch(Exception e){
+ e.printStackTrace();
+ }
+
+ //relevance = count * getWeight();
+
+ if(relevance == 1)
+ list.get(splitted[0]).put("fairly relevant", list.get(splitted[0]).get(certainty));
+ else if (relevance == 2)
+ list.get(splitted[0]).put("relevant", list.get(splitted[0]).get(certainty));
+ else
+ list.get(splitted[0]).put("very relevant", list.get(splitted[0]).get(certainty));
+ }
+
+ return certainty;
+ }
+
+ /**
+ * Displays the keys and values of the
+ * maps created with n-grams and counts.
+ * @param hash HashMap containing n-grams
+ */
+ @Override
+ public void displayList(HashMap hash){
+ super.displayList(hash);
+ //sum = sum + hash.get(str);
+ System.out.println("\n=======================================\n");
+ System.out.println("Number of unique n-grams: "+hash.size());
+ System.out.println("\n=======================================\n");
+ }
+
+ /**
+ * Accessor and mutator methods for the export
+ * string with list values, so the vector class
+ * can access its content.
+ * @return string with list of values.
+ */
+ /*public static String getNgramCount() {
+ //ngramCount = exportContent(ngram_count);
+ return ngramCount;
+ }
+ public void setNgramCount(String ngramCount) {
+ this.ngramCount = ngramCount;
+ }
+ public static String getNgram() {
+ //ngram = exportContent(ngrams);
+ return ngram;
+ }
+ public void setNgram(String ngram) {
+ this.ngram = ngram;
+ } */
+
+}
diff --git a/src/arffmatrix/.gitignore b/src/arffmatrix/.gitignore
new file mode 100644
index 0000000..ec5761d
--- /dev/null
+++ b/src/arffmatrix/.gitignore
@@ -0,0 +1,2 @@
+/buildmodel.class
+/buildtest.class
diff --git a/src/arffmatrix/BuildModel.class b/src/arffmatrix/BuildModel.class
new file mode 100644
index 0000000..0be977c
Binary files /dev/null and b/src/arffmatrix/BuildModel.class differ
diff --git a/src/arffmatrix/BuildModel.java b/src/arffmatrix/BuildModel.java
new file mode 100644
index 0000000..65869e8
--- /dev/null
+++ b/src/arffmatrix/BuildModel.java
@@ -0,0 +1,299 @@
+/*
+ * The MIT License (MIT)
+
+Copyright (c) 2014
+
+Hayda Almeida
+Marie-Jean Meurs
+
+Concordia University
+Tsang Lab
+
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +*** +* This class re-uses https://code.google.com/p/deft2013/source/browse/trunk/src/corpus/buildmodel.java +* The code authors: Eric Charton http://www.echarton.com twitter.com/ericcharton +* Marie-Jean Meurs http://mjmrsc.com/research/ twitter.com/mjmrsc +* +* This software is free to use, modify and redistribute under Creative Commons by-nc/3.0 License Term +* http://creativecommons.org/licenses/by-nc/3.0/ +*/ + +package arffmatrix; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.text.SimpleDateFormat; +import java.util.Date; +import analyse.Extractor; +import arffvector.CreateVector; +import configure.PathConstants; + +/** + * This class reads the corpus instances and uses + * the CreateVector class to generate a model file (ARFF) * + * + */ + +public class BuildModel { + + private static String configfile = null; + + public static void main(String[] args) { + + //----------------------------------- + // instantiate classes of constants + // and configuration file. + //----------------------------------- + + PathConstants pathVars; + + if (configfile == null){ + pathVars = new PathConstants(); + } + else{ + pathVars = new PathConstants(configfile); + } + + Extractor model = new Extractor(); + model.initialize(); + CreateVector vectorgenerator = new CreateVector(pathVars); + String attributes = vectorgenerator.informFeatures(pathVars); + System.out.println("Features loaded ..."); + + // name output ARFF files + String timeStamp = new SimpleDateFormat("yyyyMMdd_hh:mm").format(new Date()); + String arffFileName = "triage" + pathVars.EXP_TYPE + attributes +"_"+ timeStamp + ".arff"; + + try + { + //by default + String sortarffFileName = pathVars.HOME_DIR + pathVars.OUTPUT_MODEL + arffFileName; // default + + // create file + //FileWriter fstream = new FileWriter(sortarffFileName); + BufferedWriter out = new BufferedWriter(new FileWriter(sortarffFileName)); + + // load ARFF header and write it + String outHeaderArff = vectorgenerator.genArffHeader(pathVars,Integer.parseInt(pathVars.EXP_TYPE)); + //System.out.println(outHeaderArff); // verbose + out.write(outHeaderArff + "\n"); + + // reader for corpus + BufferedReader reader = null; + //train corpus + if(Integer.parseInt(pathVars.EXP_TYPE) == 0) + reader = new BufferedReader(new FileReader(pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAIN_DIR + pathVars.TRAINING_FILE)); + //test corpus + else if(Integer.parseInt(pathVars.EXP_TYPE) ==1) + reader = new BufferedReader(new FileReader(pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TEST_DIR + pathVars.TEST_FILE)); + + //-------------------------------------------- + // repeat until all lines have been read + // from the file + //-------------------------------------------- + String text = null; + String content = null; + + String abstracttext = ""; + String journaltitle = ""; + String title = ""; + String ecnumber = ""; + String classtriage = ""; + int hasText = 0; + int journaltitlecount = 0; + int abstracttitlecount = 0; + int abstracttextcount = 0; + int positivecount = 0; + int negativecount = 0; + + + while ((text = reader.readLine()) != null) { + + // detect a PubMed 
abstract
+ // (note: the XML tag literals in this block were lost when the
+ // diff was extracted; <PMID>, <Title> and <AbstractText> are
+ // recovered from surviving code and comments, while the other
+ // tag names below are best-guess placeholders)
+ if (text.contains("<PubmedArticle>")){
+
+ // Reinitialize journal title
+ journaltitle = "";
+
+ // Reinitialize abstract title
+ title = "";
+
+ // Reinitialize abstract text
+ abstracttext = "";
+
+ // Reinitialize hasText to false
+ hasText = 0;
+
+ // accumulators for annotated abstract content
+ // (declarations missing in the extracted diff)
+ String temp = "";
+ String newAbs = "";
+
+ String pmid = text.replaceFirst(".*<PMID>", "");
+ pmid = pmid.replace("</PMID>", "");
+ System.out.println("PMID : " + pmid);
+
+ // continue to read
+ content = reader.readLine();
+ content = content.replaceAll("\t", "");
+ content = content.replaceFirst("\\s+", "");
+
+ while ( ! content.contentEquals("</PubmedArticle>") ) {
+
+ if (content.contains("<Title>")){
+
+ journaltitlecount++;
+
+ content = content.replace("<Title>", "");
+ content = content.replace("</Title>", "");
+ journaltitle = content;
+ System.out.println("Journal title : " + content);
+ }
+
+ if (content.contains("<ArticleTitle>")){
+
+ abstracttitlecount++;
+
+ content = content.replace("<ArticleTitle>", "");
+ content = content.replace("</ArticleTitle>", "");
+ title = content;
+ System.out.println("Paper title : " + content);
+ }
+
+ if (content.contains("<AbstractText>")){
+
+ abstracttextcount++;
+ hasText = 1; // use it to indicate if the abstract has some text or not
+
+ content = content.replace("<AbstractText>", "");
+
+ //checks if there are empty lines after AbstractText tag
+ //and keeps reading until finds the abstract content
+ while(content.isEmpty()){
+ content = reader.readLine();
+ }
+ abstracttext = abstracttext + content;
+ // clean
+ abstracttext = model.removeAbstractTags(abstracttext);
+
+ content = reader.readLine();
+ // converting toLowerCase is not relevant in bio context
+ // because it introduces ambiguities (ie Gene name / Enzyme alias)
+ // abstracttext = abstracttext.toLowerCase();
+ }
+
+ // NOTE: the guard condition of this block was lost in
+ // extraction; hasText is assumed here so the sentence-level
+ // processing only runs for abstracts with text content
+ if (hasText == 1){
+
+ if (content.contains("</AbstractText>")){
+ temp = temp + model.processAbstract(content);
+ }
+ else{
+ do{
+ temp = temp + model.processAbstract(content);
+ content = reader.readLine();
+ }while(!(content.contains("</AbstractText>")));
+ }
+
+ newAbs = newAbs + temp;
+ content = newAbs + "</AbstractText>";
+
+ abstracttext = content;
+ abstracttext = model.removeAbstractTags(abstracttext);
+
+ content = reader.readLine();
+ }
+
+ if (content.contains("EC ")){
+ content = content.replace("EC ", "");
+ content = content.replace("", ""); // FIXME: closing tag literal lost in extraction
+ ecnumber = content;
+ }
+
+ // FIXME: the classification tag literal was lost in extraction;
+ // "<TriageClass>" below is a placeholder, not the original tag
+ if (content.contains("<TriageClass>")){
+
+ content = content.replace("<TriageClass>", "");
+ content = content.replace("</TriageClass>", "");
+ classtriage = content;
+ if(content.contains("positive")){
+ positivecount++;
+ }
+ if(content.contains("negative")){
+ negativecount++;
+ }
+ System.out.println("Triage classification : " + content);
+ }
+
+ content = reader.readLine();
+ content = content.replaceAll("\t", "");
+ content = content.replaceFirst("\\s+", "");
+ }
+
+ System.out.println("Abstract : " + abstracttext.toString() + "\n\n");
+
+ // end of article: collect data and write the ARFF line
+ String Arffline = vectorgenerator.getArffLine(
+ journaltitle,
+ title,
+ abstracttext,
+ ecnumber,
+ classtriage,
+ Integer.parseInt(pathVars.EXP_TYPE)
+ );
+
+ Arffline = Arffline + "\n";
+ // write line on disc
+ out.write(Arffline);
+ // out.write(id + " " + Arffline + "\n");
+ }
+
+ }
+
+ System.out.println(
+ "Abstracts processed: " + abstracttitlecount
+ + "\t with text content: " + abstracttextcount
+ + "\t from " + journaltitlecount + " journals"
+ + "\nTotal of: \n" + positivecount + " positive"
+ + "\t and " + negativecount + " negative documents");
+ out.write("\n");
+ out.close();
+
+ reader.close();
+
+ }catch (FileNotFoundException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+
+ }
+
+}
diff --git a/src/arffvector/.gitignore
b/src/arffvector/.gitignore new file mode 100644 index 0000000..bdc0ba3 --- /dev/null +++ b/src/arffvector/.gitignore @@ -0,0 +1,7 @@ +/buildvector.class +/FeatureVector.class +/CreateVector.class +/CreateWeightedVector.class +/ArbitraryWeight.class +/CountsWeightedVector.class +/ArbitraryWeightedVector.class diff --git a/src/arffvector/CreateVector.java b/src/arffvector/CreateVector.java new file mode 100644 index 0000000..ce81dee --- /dev/null +++ b/src/arffvector/CreateVector.java @@ -0,0 +1,872 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +*** +* This class re-uses https://code.google.com/p/deft2013/source/browse/trunk/src/vector/buildvector.java +* The code authors: Eric Charton http://www.echarton.com twitter.com/ericcharton +* Marie-Jean Meurs http://mjmrsc.com/research/ twitter.com/mjmrsc +* +* This software is free to use, modify and redistribute under Creative Commons by-nc/3.0 License Term +* http://creativecommons.org/licenses/by-nc/3.0/ +*/ + + + + +package arffvector; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import org.apache.commons.lang3.StringUtils; +import configure.PathConstants; + +/** + * Uses the features extracted and the + * generated corpus to create a feature vector + * (a matrix representation of the corpus) + * + * + */ +public class CreateVector { + + ArrayList annotations = new ArrayList(); + ArrayList annotationsType = new ArrayList(); + ArrayList journalTitles = new ArrayList(); + ArrayList ecnumbers = new ArrayList(); + ArrayList titleGrams = new ArrayList(); + ArrayList titleAnnot = new ArrayList(); + ArrayList nGrams = new ArrayList(); + + PathConstants pathVars = null; + + /** + * Constructor to load all features extracted + * from training files. These features will be + * used to generate the ARFF header and the + * ARFF vector lines. 
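+ *
+ * Each feature file is read as one tab-separated entry per
+ * line. As an illustration (hypothetical values, not taken
+ * from a real feature file), an annotation feature file
+ * could look like:
+ * <pre>
+ * laccase	Enzyme
+ * glucose	Substrate
+ * </pre>
+ * where the loaders below read column 0 for annotation
+ * content, and column 1 for annotation types and journal titles.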
+ * + * @param extVars Variables holding system paths + */ + + public CreateVector(PathConstants extVars) { + + pathVars = extVars; + + String pathJournalT = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.JOURNAL_TITLE_FEATURES; + try{ + String journalT = ""; + + //receiving journal title + BufferedReader reader = new BufferedReader(new FileReader(pathJournalT)); + int featcount = 0; + while (( journalT = reader.readLine()) != null) { + + if (Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)){ + + String[] features = StringUtils.split(journalT,"\n"); + + for(int i = 0; i < features.length; i++){ + + String[] featurename = StringUtils.split(features[i],"\t"); + + //checking for journal titles duplicates + if(featurename[1] != "" && !(journalTitles.contains(featurename[1]))){ + journalTitles.add(featurename[1]); + } + } + } + if ( featcount >= Integer.parseInt(pathVars.NB_PARAMS) && Integer.parseInt(pathVars.NB_PARAMS) != -1 ) { break;} + + } + reader.close(); + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + String pathAnnotations = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.ANNOTATION_FEATURES; + String pathTitleAnnot = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.TITLE_FEATURES; + + try{ + String abstAnnot = ""; + String tAnnot = ""; + + //receiving abstract annotations (features) + BufferedReader reader = new BufferedReader(new FileReader(pathAnnotations)); + BufferedReader readerT = new BufferedReader(new FileReader(pathTitleAnnot)); + + int featcount = 0; + + while (( abstAnnot = reader.readLine()) != null) { + + if (Boolean.valueOf(pathVars.USE_ANNOTATION_FEATURE)){ + String[] features = StringUtils.split(abstAnnot,"\n"); + + for(int i = 0; i < features.length; i++){ + + String[] featurename = StringUtils.split(features[i],"\t"); + + //checking for duplicate abstract annotations + if(featurename[0] != "" && !(annotations.contains(featurename[0]))){ + annotations.add(featurename[0]); + } + } + } + if ( featcount >= Integer.parseInt(pathVars.NB_PARAMS) && Integer.parseInt(pathVars.NB_PARAMS) != -1 ) { break;} + } + + + if(!(Boolean.valueOf(pathVars.USE_TITLE_FEATURE))){ + while((tAnnot = readerT.readLine()) != null){ + + String[] features = StringUtils.split(tAnnot,"\n"); + + for(int i = 0; i < features.length; i++){ + + String[] featurename = StringUtils.split(features[i],"\t"); + + //checking for duplicate annotations + if(featurename[0] != "" && !(annotations.contains(featurename[0]))){ + annotations.add(featurename[0]); + } + } + + } + + } + + reader.close(); + readerT.close(); + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + try{ + String abstAnType = ""; + + //receiving abstract annotation types + BufferedReader reader = new BufferedReader(new FileReader(pathAnnotations)); + int featcount = 0; + while (( abstAnType = reader.readLine()) != null) { + + if (Boolean.valueOf(pathVars.USE_ANNOTATION_TYPE)){ + + String[] features = StringUtils.split(abstAnType,"\n"); + + for(int i = 0; i < features.length; i++){ + + String[] featurename = StringUtils.split(features[i],"\t"); + + //checking for duplicate abstract annotation types + if(featurename[1] != "" && !(annotationsType.contains(featurename[1]))){ + annotationsType.add(featurename[1]); + } + + } + } + if ( featcount >= Integer.parseInt(pathVars.NB_PARAMS) && Integer.parseInt(pathVars.NB_PARAMS) != -1 ) { break;} + + } + reader.close(); + } + catch 
(FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + + + try{ + String titAnnot = ""; + + //receiving title annotations (features) + BufferedReader reader = new BufferedReader(new FileReader(pathTitleAnnot)); + // int featcount = 0; + while (( titAnnot = reader.readLine()) != null) { + + if(Boolean.valueOf(pathVars.USE_TITLE_FEATURE)){ + + //String titAnnot = FeatureExtractor.getTitCount(); + + String[] features = StringUtils.split(titAnnot,"\n"); + + for(int i = 0; i < features.length; i++){ + String[] featurename = StringUtils.split(features[i],"\t"); + + //checking for duplicate title annotations + if(!(titleAnnot.contains(featurename[0]))){ + titleAnnot.add(featurename[0]); + } + } + } + } + reader.close(); + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + + String pathECNumFeatures = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.ECNUM_FEATURES; + + try{ + String ECNum = ""; + + //receiving EC numbers (features) + BufferedReader reader = new BufferedReader(new FileReader(pathECNumFeatures)); + // int featcount = 0; + while ((ECNum = reader.readLine()) != null) { + + if(Boolean.valueOf(pathVars.USE_ECNUM_FEATURE)){ + + //String titAnnot = FeatureExtractor.getTitCount(); + + String[] features = StringUtils.split(ECNum,"\n"); + + for(int i = 0; i < features.length; i++){ + String[] featurename = StringUtils.split(features[i],"\t"); + + //checking for duplicate EC numbers + if(!(ecnumbers.contains(featurename[0]))){ + ecnumbers.add(featurename[0]); + } + } + } + } + reader.close(); + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + + String pathTitleGrams = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.TITLE_NGRAMS; + + + try{ + String titCont = ""; + // String grams = ""; + + //receiving title ngrams + BufferedReader reader = new BufferedReader(new FileReader(pathTitleGrams)); + + int featcount = 0; + while (( titCont = reader.readLine()) != null) { + + if(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)){ + + String[] content = StringUtils.split(titCont,"\n"); + + for(int i = 0; i < content.length; i++){ + String[] featurename = StringUtils.split(content[i],"\t"); + + //check for duplicate title ngrams + if(!(titleGrams.contains(featurename[0]))){ + titleGrams.add(featurename[0]); + } + } + } + } + + reader.close(); + + } + catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + + String pathNgrams = pathVars.HOME_DIR + pathVars.FEATURE_DIR + pathVars.NGRAM_FEATURES; + try{ + String grams = ""; + String tgrams = ""; + + //receiving ngrams + BufferedReader reader = new BufferedReader(new FileReader(pathNgrams)); + BufferedReader readerT = new BufferedReader(new FileReader(pathTitleGrams)); + + // int featcount = 0; + while (( grams = reader.readLine()) != null) { + + if(Boolean.valueOf(pathVars.USE_NGRAM_FEATURE)){ + + String[] features = StringUtils.split(grams,"\n"); + + for(int i = 0; i < features.length; i++){ + String[] featurename = StringUtils.split(features[i],"\t"); + + //check for duplicate abstract ngrams + if(!(nGrams.contains(featurename[0]))){ + nGrams.add(featurename[0]); + } + } + } + + } + + //if not using title grams separately, + // then insert title grams with abstract grams. 
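+ // e.g. with USE_TITLE_NGRAMS=false a title bigram such as
+ // "xylanase activity" (hypothetical) is merged into the shared
+ // nGrams list; with USE_TITLE_NGRAMS=true it stays in titleGrams
+ // and gets its own titleNgram attribute in genArffHeader below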
+ if (!(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS))){ + while (( tgrams = readerT.readLine()) != null) { + + String[] features = StringUtils.split(tgrams,"\n"); + + for(int i = 0; i < features.length; i++){ + String[] featurename = StringUtils.split(features[i],"\t"); + + //check for duplicate ngrams + if(!(nGrams.contains(featurename[0]))){ + nGrams.add(featurename[0]); + } + } + } + } + + reader.close(); + readerT.close(); + + }catch (FileNotFoundException e) { + e.printStackTrace(); + } + catch (IOException e) { + e.printStackTrace(); + } + } + + /** + * Gathers the list of features, according to + * experimental configurations. The list of + * features will be written on the ARFF header. + * + * @param pathVars Variables holding system paths + * @param exp experiment type: train or test + * @return a String containing the ARFF header + */ + + public String genArffHeader(PathConstants pathVars, int exp){ + + StringBuilder headerArff = new StringBuilder(); + + switch(exp){ + case 0: + headerArff.append("% Weka training file - mycoCLAP triage - CSFG 2014\n\n"); + break; + case 1: + headerArff.append("% Weka test file - mycoCLAP triage - CSFG 2014\n\n"); + break; + } + + headerArff.append("@RELATION triage\n"); + + if (Boolean.valueOf(pathVars.USE_TEXT_SIZE)){ + // writing the list of text sizes + headerArff.append("@ATTRIBUTE sizeoftitle \tREAL \t\t%size of title\n"); + headerArff.append("@ATTRIBUTE sizeoftext \tREAL \t\t%size of text\n"); + } + + if(Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)){ + for(int i = 0; i < journalTitles.size(); i++){ + // writing list of journal titles + String feature = journalTitles.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + namefeature = namefeature.replaceAll("<|>", ""); + String ref = "journalTitle" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + + } + } + + if (Boolean.valueOf(pathVars.USE_ANNOTATION_FEATURE)){ + // writing list of annotation features + for(int i = 0; i < annotations.size(); i++){ + + String feature = annotations.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + namefeature = namefeature.replaceAll("<|>", ""); + String ref = "annotation" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + + } + } + + if(Boolean.valueOf(pathVars.USE_ANNOTATION_TYPE)){ + // writing list of annotation entities + for(int i = 0; i < annotationsType.size(); i++){ + String feature = annotationsType.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + namefeature = namefeature.replaceAll("<|>", ""); + String ref = "annotationType" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + + } + } + + if(Boolean.valueOf(pathVars.USE_TITLE_FEATURE)){ + // write list of title features + for( int i = 0; i < titleAnnot.size(); i++){ + + String feature = titleAnnot.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + namefeature = namefeature.replaceAll("<|>", ""); + String ref = "titleAnnot" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + + } + + } + + if(Boolean.valueOf(pathVars.USE_ECNUM_FEATURE)){ + // writing list of EC numbers + 
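+ // (illustrative) a stored EC number such as 3.2.1.4 yields the
+ // header line: @ATTRIBUTE ECnumber03.2.1.4	REAL		%3.2.1.4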
for(int i = 0; i < ecnumbers.size(); i++){ + String feature = ecnumbers.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + namefeature = namefeature.replaceAll("<|>", ""); + String ref = "ECnumber" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + } + } + + if (Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)){ + // writing list of ngrams on titles + for( int i = 0; i < titleGrams.size(); i++){ + + String feature = titleGrams.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + namefeature = namefeature.replaceAll("<|>", ""); + String ref = "titleNgram" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + + } + } + + if (Boolean.valueOf(pathVars.USE_NGRAM_FEATURE)){ + // write list of ngrams + for(int i = 0; i < nGrams.size(); i++){ + + String feature = nGrams.get(i); + String namefeature = feature.replaceAll("\\s", "-"); + namefeature = namefeature.replaceAll("[,:=+']", "-"); + String ref = "Ngram" + String.valueOf(i) + namefeature; + + headerArff.append("@ATTRIBUTE " + ref + "\tREAL \t\t%" + feature + "\n"); + + } + } + + // writing the dataset classes + headerArff.append("@ATTRIBUTE class {positive, negative}\n"); + headerArff.append("@DATA\n"); + + return headerArff.toString(); + } + + /** + * Iterates over the list of features and + * counts number of features containing + * on a given document. + * + * @param jTitle title of journal + * @param title title of paper + * @param text abstract content + * @param ecnum paper EC numbers + * @param classTriage triage classification: positive or negative + * @param exp experiment type: train or test + * @return String holding counts for all features found in a document + */ + + public String getArffLine(String jTitle, String title, String text, String ecnum, String classTriage, int exp){ + //String vectorArff = ""; + StringBuilder vectorArff = new StringBuilder(); + + text = removeSpecialChar(text.toLowerCase()); + title = removeSpecialChar(title.toLowerCase()); + jTitle = removeSpecialChar(jTitle.toLowerCase()); + ecnum = removeSpecialChar(ecnum); + + int emptyabs = 0; + + // fill title and text sizes (number of words) + // annotation markups do not matter because + // they do not introduce blank spaces hence + // they do not modify the number of words found + if (Boolean.valueOf(pathVars.USE_TEXT_SIZE)){ + + String[] titleGrams = StringUtils.split(title," "); + int titlesize = titleGrams.length; + + String[] abstractcontent = StringUtils.split(text," "); + int abstractsize = abstractcontent.length; + + if(abstractsize == 1){ + emptyabs++; + } + + vectorArff.append(titlesize).append(",").append(abstractsize).append(","); + } + + //fill values of journal titles + if(Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)){ + + for(int i = 0; i < journalTitles.size(); i++){ + String jfeat = ""; + int jfeatcount = 0; + jfeat = journalTitles.get(i).replaceFirst(" ", ""); + + if(jTitle.contains(jfeat)){ + jfeatcount = StringUtils.countMatches(jTitle, jfeat); + vectorArff.append(jfeatcount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + } + + // fill values of annotation types taken into account + // either only the abstract or abstract and title + // adds on vector the count of occurrences + if (Boolean.valueOf(pathVars.USE_ANNOTATION_FEATURE)){ + + for(int i = 0; i < 
annotations.size(); i++){ + String anfeat = ""; + int anfeatcount = 0; + anfeat = annotations.get(i).replaceFirst(" ", "").toLowerCase(); + + //in case the text has current annotation + if (text.contains(anfeat)){ + //check the count of the annotation + if((Boolean.valueOf(pathVars.USE_TITLE_FEATURE))){ + anfeatcount = StringUtils.countMatches(text, anfeat); + } + //adding title annot count to annotations + else if (!(Boolean.valueOf(pathVars.USE_TITLE_FEATURE))){ + anfeatcount = StringUtils.countMatches(text, anfeat); + //in case title has annotation, add to count + if(title.contains(anfeat)){ + anfeatcount = anfeatcount + StringUtils.countMatches(title, anfeat); + } + } + vectorArff.append(anfeatcount).append(","); + } + //handles the case that only the title (but not abstract) has current annotation + else if((!(Boolean.valueOf(pathVars.USE_TITLE_FEATURE)))){ + if(title.contains(anfeat)){ + anfeatcount = StringUtils.countMatches(title, anfeat); + } + vectorArff.append(anfeatcount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + } + + //fill values of abstract annotation types + if(Boolean.valueOf(pathVars.USE_ANNOTATION_TYPE)){ + + for(int i = 0; i < annotationsType.size(); i++){ + String antype = ""; + int antypecount = 0; + antype = annotationsType.get(i).replaceFirst(" ", "").toLowerCase(); + + if (text.contains(antype)){ + //divided by 2 to match occurance + //(count considers open and close tags) + antypecount = (StringUtils.countMatches(text, antype))/2; + vectorArff.append(antypecount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + + } + + //fill values of title annotations + if (Boolean.valueOf(pathVars.USE_TITLE_FEATURE)){ + + for( int i =0; i < titleAnnot.size(); i++){ + String titfeat = ""; + int titfeatcount = 0; + titfeat = titleAnnot.get(i).replaceFirst(" ", "").toLowerCase(); + + if (title.contains(titfeat)){ + titfeatcount = StringUtils.countMatches(title, titfeat); + vectorArff.append(titfeatcount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + } + + if(Boolean.valueOf(pathVars.USE_ECNUM_FEATURE)){ + + for(int i = 0; i < ecnumbers.size(); i++){ + String ecfeat = ""; + int ecnumcount = 0; + ecfeat = ecnumbers.get(i); + + if(ecnum.contains(ecfeat)){ + ecnumcount = StringUtils.countMatches(ecnum, ecfeat); + vectorArff.append(ecnumcount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + } + + // fill only values of title ngrams + if(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)){ + + String cleanTitle = removeTags(title.toLowerCase()); + + for( int i =0; i < titleGrams.size(); i++){ + String titgram = ""; + int titgramcount = 0; + titgram = titleGrams.get(i).toLowerCase(); + + //in case the title has current ngram + if (cleanTitle.contains(titgram)){ + //check the count of the ngram + titgramcount = StringUtils.countMatches(cleanTitle, titgram); + + //adding weight to current ngram count + if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)){ + titgramcount = applyWeight(titgramcount, Integer.parseInt(pathVars.WEIGHT)); + } + vectorArff.append(titgramcount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + } + + // fill values of ngrams + if (Boolean.valueOf(pathVars.USE_NGRAM_FEATURE)){ + String cleanText = removeTags(text.toLowerCase()); + String cleanTitle = removeTags(title.toLowerCase()); + + for( int i = 0; i < nGrams.size(); i++){ + String ngramfeat = ""; + int ngramcount = 0; + ngramfeat = nGrams.get(i).toLowerCase(); + + //in case the text has current ngram + if (cleanText.contains(ngramfeat)){ + //check 
the count of the ngram + if(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)){ + ngramcount = StringUtils.countMatches(cleanText, ngramfeat); + + //adding weight to current ngram count + if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)){ + ngramcount = applyWeight(ngramcount, Integer.parseInt(pathVars.WEIGHT)); + } + } + //checking if title ngrams should be added to the count + else if(!(Boolean.valueOf(pathVars.USE_TITLE_NGRAMS))){ + ngramcount = StringUtils.countMatches(cleanText, ngramfeat); + + //in case title has ngram, add to count + if(cleanTitle.contains(ngramfeat)){ + ngramcount += StringUtils.countMatches(cleanTitle, ngramfeat); + } + + //adding weight to current ngram count + if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)){ + ngramcount = applyWeight(ngramcount, Integer.parseInt(pathVars.WEIGHT)); + } + } + + vectorArff.append(ngramcount).append(","); + } + ////handles the case that only the title (but not abstract) has current ngram + else if (!(cleanText.contains(ngramfeat))){ + //in case only the title has the ngram, add to count + if(cleanTitle.contains(ngramfeat)){ + ngramcount = StringUtils.countMatches(cleanTitle, ngramfeat); + + //adding weight to ngram count + if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)){ + ngramcount = applyWeight(ngramcount, Integer.parseInt(pathVars.WEIGHT)); + } + } + vectorArff.append(ngramcount).append(","); + } + else{ + vectorArff.append("0,"); + } + } + } + + + //if(exp == 0){ + if (classTriage.contains("positive")){ + vectorArff.append("positive"); + //vectorArff.append("?"); + } + else { + vectorArff.append("negative"); + //vectorArff.append("?"); + } + //} + + /*else if (exp == 1){ + vectorArff.append("?"); + } */ + + return vectorArff.toString(); + } + + /** + * Cleans a given String from special characters + * + * @param str String to be cleaned + * @return String without special characters + */ + + public String removeSpecialChar(String str){ + str = str.replace("}", ""); + str = str.replace("{", ""); + str = str.replace("]", ""); + str = str.replace("[", ""); + str = str.replace("#", ""); + str = str.replace("*", ""); + str = str.replace(">", ""); + str = str.replace(""", ""); + str = str.replace("&apos", ""); + str = str.replace("%", ""); + str = str.replace("/", ""); + str = str.replace("\\", ""); + str = str.replace("&", ""); + str = str.replace("=", ""); + str = str.replace("?", ""); + str = str.replace(",", ""); + str = str.replace(":", ""); + str = str.replace(";", ""); + str = str.replace(".", ""); + str = str.replace(")", ""); + str = str.replace("(", ""); + str = str.replace("\t\t", "\t"); + str = str.replace("-", ""); + str = str.replace(" ", ""); + + return str; + } + + /** + * + * @param str + * @return + */ + public String removeTags(String str){ + String[] remove = StringUtils.split(str,""); + StringBuilder sb = new StringBuilder(); + + for(int i = 0; i < remove.length; i++){ + + if(remove[i].equalsIgnoreCase("<")){ + do{ + i++; + } + while(!(remove[i].equalsIgnoreCase(">"))); + } + else sb.append(remove[i]); + } + + return sb.toString(); + } + + public int applyWeight(int count, int weight){ + + if(weight > 0){ + count = count * weight; + } + return count; + } + + + public String informFeatures(PathConstants pathVars){ + String value = ""; + if(Boolean.valueOf(pathVars.USE_ANNOTATION_FEATURE)) + value = value + "_annotations"; + if(Boolean.valueOf(pathVars.USE_ANNOTATION_TYPE)) + value = value + "_types"; + if(Boolean.valueOf(pathVars.USE_JOURNAL_TITLE_FEATURE)) + value = value + "_journal"; + 
if(Boolean.valueOf(pathVars.USE_TITLE_FEATURE) || Boolean.valueOf(pathVars.USE_TITLE_NGRAMS)) + value = value + "_title"; + if(Boolean.valueOf(pathVars.USE_ECNUM_FEATURE)) + value = value + "_ecnum"; + if(Boolean.valueOf(pathVars.USE_NGRAM_FEATURE)) + value = value + "_ngrams_size"+ pathVars.NGRAM_SIZE; + if(Boolean.valueOf(pathVars.USE_NGRAM_FEATURE) && Boolean.valueOf(pathVars.NGRAM_STOP)) + value = value + "_stopwords"; + if(Boolean.valueOf(pathVars.USE_WEIGHTED_NGRAM)) + value = value + "_weight"+ pathVars.WEIGHT; + + return value; + } + + +} diff --git a/src/classifier/.gitignore b/src/classifier/.gitignore new file mode 100644 index 0000000..b92cc15 --- /dev/null +++ b/src/classifier/.gitignore @@ -0,0 +1,3 @@ +/test.class +/train.class +/Trainer.class diff --git a/src/classifier/Trainer.java b/src/classifier/Trainer.java new file mode 100644 index 0000000..7417982 --- /dev/null +++ b/src/classifier/Trainer.java @@ -0,0 +1,315 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + +package classifier; +import java.util.Random; + +import weka.attributeSelection.LatentSemanticAnalysis; +import weka.attributeSelection.PrincipalComponents; +import weka.attributeSelection.GainRatioAttributeEval; +import weka.attributeSelection.CorrelationAttributeEval; +import weka.attributeSelection.Ranker; +import weka.classifiers.Classifier; +import weka.classifiers.CostMatrix; +import weka.classifiers.Evaluation; +import weka.classifiers.bayes.NaiveBayes; +import weka.classifiers.functions.LibSVM; +import weka.classifiers.meta.AttributeSelectedClassifier; +import weka.classifiers.meta.CostSensitiveClassifier; +import weka.classifiers.trees.LMT; +import weka.core.Instances; +import weka.core.converters.ConverterUtils.DataSource; +import configure.PathConstants; + +/** + * Trains and tests a classifier, + * executes k-fold cross validation on train data + * and outputs the classification results. 
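+ *
+ * A minimal usage sketch (illustrative; assumes the train and test
+ * ARFF files configured in config.cfg already exist):
+ * <pre>
+ * PathConstants pathVars = new PathConstants();
+ * Trainer evaluator = new Trainer();
+ * DataSource sourceTrain = new DataSource(pathVars.HOME_DIR
+ *     + pathVars.OUTPUT_MODEL + pathVars.TRAIN_DIR + pathVars.ARFF_TRAIN);
+ * Instances trainData = sourceTrain.getDataSet();
+ * trainData.setClassIndex(trainData.numAttributes() - 1);
+ * DataSource sourceTest = new DataSource(pathVars.HOME_DIR
+ *     + pathVars.OUTPUT_MODEL + pathVars.TEST_DIR + pathVars.ARFF_TEST);
+ * Instances testData = sourceTest.getDataSet();
+ * testData.setClassIndex(testData.numAttributes() - 1);
+ * evaluator.classify(trainData, testData, new LMT());
+ * </pre>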
+ *
+ * @author halmeida
+ *
+ */
+
+public class Trainer {
+
+ public static int SEED = 1; //the seed for randomizing the data
+ public static int FOLDS = 5; //the # of folds to generate
+ double[][] ranking;
+ String rank;
+
+ /**
+ * @param args
+ * @throws Exception
+ */
+ public static void main(String[] args) throws Exception {
+
+ PathConstants pathVars = new PathConstants();
+ Trainer evaluator = new Trainer();
+
+ //Creating classifier
+ Classifier cls = (Classifier) new LMT();
+ //Classifier cls = (Classifier) new NaiveBayes();
+ //Classifier cls = (Classifier) new LibSVM();
+
+ //Loading train data
+ DataSource sourceTrain = new DataSource(pathVars.HOME_DIR + pathVars.OUTPUT_MODEL + pathVars.TRAIN_DIR + pathVars.ARFF_TRAIN);
+ Instances trainData = sourceTrain.getDataSet();
+
+ //Flagging the class index on the training data
+ trainData.setClassIndex(trainData.numAttributes()-1);
+ System.out.println("Class index set on training data.");
+
+ System.out.println("Training data loaded. Number of instances: " + trainData.numInstances() + "\n");
+
+ //Executing k-fold cross validation
+ //evaluator.crossFold(trainData, cls);
+
+ //Loading test data
+ DataSource sourceTest = new DataSource(pathVars.HOME_DIR + pathVars.OUTPUT_MODEL + pathVars.TEST_DIR + pathVars.ARFF_TEST);
+ Instances testData = sourceTest.getDataSet();
+
+ //Flagging the class index on the test data
+ testData.setClassIndex(testData.numAttributes()-1);
+ System.out.println("Class index set on testing data.");
+
+ System.out.println("Test data loaded. Number of instances: " + testData.numInstances() + "\n");
+
+ //Creating filtered classifiers
+ //AttributeSelectedClassifier PCAclassifier = evaluator.setPCAFilter(cls);
+ //AttributeSelectedClassifier LSAclassifier = evaluator.setLSAFilter(cls);
+ //AttributeSelectedClassifier GRclassifier = evaluator.setGRFilter(cls);
+ //AttributeSelectedClassifier Corrclassifier = evaluator.setCorrFilter(cls);
+
+ //Training and testing classifier
+ evaluator.classify(trainData, testData, cls);
+
+ //Training and testing costSensitive classifier
+ //evaluator.classify(trainData, testData, evaluator.classifySensitive(cls));
+
+ //Executing k-fold cross validation on filtered classifiers
+ //evaluator.crossFold(trainData, PCAclassifier);
+ //evaluator.crossFold(trainData, LSAclassifier);
+
+ }
+
+ /**
+ * Trains and tests a classifier when two separate
+ * datasets are provided.
+ *
+ * @param train training data to build the classifier
+ * @param test test data to evaluate the classifier
+ * @param classif type of classifier applied
+ * @throws Exception
+ */
+ public void classify(Instances train, Instances test, Classifier classif) throws Exception{
+
+ classif.buildClassifier(train);
+ Evaluation evaluateClassifier = new Evaluation(train);
+ evaluateClassifier.evaluateModel(classif, test);
+
+ stats(evaluateClassifier, classif);
+ }
+
+ /**
+ * Wraps a classifier in a cost-sensitive classifier
+ * built on a fixed cost matrix.
+ *
+ * @param classif type of classifier to be wrapped
+ * @return CostSensitive classifier with costs and classifier
+ * @throws Exception
+ */
+ public CostSensitiveClassifier classifySensitive(Classifier classif) throws Exception{
+ CostSensitiveClassifier costSensitive = new CostSensitiveClassifier();
+ CostMatrix matrix = new CostMatrix(2);
+ //misclassifying a positive document (row 0) as negative costs 4,
+ //misclassifying a negative document (row 1) as positive costs 1
+ matrix.setElement(0, 1, 4);
+ matrix.setElement(1, 0, 1);
+ costSensitive.setClassifier(classif);
+ costSensitive.setCostMatrix(matrix);
+
+ return costSensitive;
+ }
+
+ /**
+ * Outputs classifier results.
+ *
+ * @param eval Evaluation model built by a classifier
+ * @param classif type of classifier applied
+ * @throws Exception
+ */
+ public void stats(Evaluation eval, Classifier classif) throws Exception{
+ System.out.println("Number of attributes: " + eval.getHeader().numAttributes());
+ System.out.println(eval.toSummaryString("\n======== RESULTS ========\n", false));
+ System.out.println(eval.toClassDetailsString("\n\n======== Detailed accuracy by class ========\n"));
+ System.out.println(eval.toMatrixString("\n\n======== Confusion Matrix ========\n"));
+ }
+
+ /**
+ * Executes k-fold cross validation
+ * on a given dataset.
+ * @param data training data provided
+ * @param classif type of classifier used
+ * @throws Exception
+ */
+ public void crossFold(Instances data, Classifier classif) throws Exception{
+
+ Random random = new Random(SEED); //creating seed number generator
+ Evaluation evaluateClassifier = new Evaluation(data);
+
+ System.out.println("Classifier working...\n\n");
+ //The classifier must not be trained before cross-validation is executed,
+ //because subsequent calls to the buildClassifier method would always return the same results.
+ evaluateClassifier.crossValidateModel(classif, data, FOLDS, random);
+
+ stats(evaluateClassifier, classif);
+ }
+
+ /**
+ * Implements a filtered GainRatio classifier,
+ * using the ranker as a search method.
+ *
+ * @param classif type of classifier to be used
+ * @return filtered classif with GainRatio analysis
+ */
+ public AttributeSelectedClassifier setGRFilter(Classifier classif){
+ AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier();
+
+ //Creating evaluator and search method
+ GainRatioAttributeEval GR = new GainRatioAttributeEval();
+ Ranker rank = new Ranker();
+ //return the attributes with evaluation greater than 0
+ double threshold = 0.0;
+ rank.setThreshold(threshold);
+
+ //Setting GainRatio filtered classifier
+ fClassif.setClassifier(classif);
+ fClassif.setEvaluator(GR);
+ fClassif.setSearch(rank);
+
+ return fClassif;
+ }
+
+ /**
+ * Implements a filtered Correlation classifier,
+ * using the ranker as a search method.
+ *
+ * @param classif type of classifier to be used
+ * @return filtered classif with Correlation analysis
+ */
+ public AttributeSelectedClassifier setCorrFilter(Classifier classif){
+ AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier();
+
+ //Creating evaluator and search method
+ CorrelationAttributeEval Corr = new CorrelationAttributeEval();
+ Ranker rank = new Ranker();
+
+ //return the attributes with evaluation greater than 0.03
+ double threshold = 0.03;
+ rank.setThreshold(threshold);
+
+ //Setting Correlation filtered classifier
+ fClassif.setClassifier(classif);
+ fClassif.setEvaluator(Corr);
+ fClassif.setSearch(rank);
+
+ return fClassif;
+ }
+
+ /**
+ * Implements a filtered PCA classifier,
+ * using the ranker as a search method.
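+ * For instance (illustrative), the filtered classifier can be
+ * cross-validated in place of the plain one:
+ * <pre>
+ * AttributeSelectedClassifier pca = evaluator.setPCAFilter(new LMT());
+ * evaluator.crossFold(trainData, pca);
+ * </pre>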
+ * + * @param classif type of classifier to be used + * @return filtered classif with PCA analysis config + */ + public AttributeSelectedClassifier setPCAFilter(Classifier classif){ + AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); + + //Creating evaluator and search method + PrincipalComponents PCA = new PrincipalComponents(); + PCA.setMaximumAttributeNames(-1); + Ranker rank = new Ranker(); + //return the attributes with evaluation greater than 0 + rank.setThreshold(0); + + //Setting the PCA classifier configurations + fClassif.setClassifier(classif); + fClassif.setEvaluator(PCA); + fClassif.setSearch(rank); + + return fClassif; + } + + /** + * Implements a Filtered LSA classifier, + * using the ranker as a search method + * @param classif + * @return + */ + + private AttributeSelectedClassifier setLSAFilter(Classifier classif) { + AttributeSelectedClassifier fClassif = new AttributeSelectedClassifier(); + + //Creating evaluator + LatentSemanticAnalysis LSA = new LatentSemanticAnalysis(); + LSA.setMaximumAttributeNames(-1); + //value between 0 and 1 includes proportion of total latent variables + //greater than 1 = exact # of variables to include; + //less than or equal zero = include all; + //default = 0.95 (proportional) + double defaul = 0; + LSA.setRank(defaul); + //Creating search method + Ranker rank = new Ranker(); + rank.setThreshold(0); + + //Setting the LSA classifier configurations + fClassif.setClassifier(classif); + fClassif.setEvaluator(LSA); + fClassif.setSearch(rank); + + return fClassif; + } + + + +} diff --git a/src/configure/.gitignore b/src/configure/.gitignore new file mode 100644 index 0000000..26ecd44 --- /dev/null +++ b/src/configure/.gitignore @@ -0,0 +1,2 @@ +/DeprecatedVariables.class +/PathConstants.class diff --git a/src/configure/PathConstants.java b/src/configure/PathConstants.java new file mode 100644 index 0000000..2103118 --- /dev/null +++ b/src/configure/PathConstants.java @@ -0,0 +1,180 @@ +/* + * The MIT License (MIT) + +Copyright (c) 2014 + +Hayda Almeida +Marie-Jean Meurs + +Concordia University +Tsang Lab + + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*** +* This class re-uses https://code.google.com/p/semlinker/source/browse/trunk/src/configure/NistKBPConfiguration.java +* The code authors: Eric Charton http://www.echarton.com twitter.com/ericcharton +* Marie-Jean Meurs http://mjmrsc.com/research/ twitter.com/mjmrsc +* +* This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License +* as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. +*/ + +package configure; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.logging.Level; +import java.util.logging.Logger; + +/** + * + * Variables used by the software + * + * @author Marie-Jean Meurs + * @since 2013 + * + */ +public class PathConstants { + + /** + * Default constructor + */ + public PathConstants() { + initVars(); + } + + /** + * Constructor with custom parameter file. + * @param configfile + */ + public PathConstants(String configfile) { + CONFIG_FILE = configfile; + initVars(); + } + + + public static String CONFIG_FILE = "config.cfg"; + public HashMap CONFIG_MAP = new HashMap(); + + //Input files + public String HOME_DIR; + public String CORPUS_DIR; + public String TRAIN_DIR; + public String TEST_DIR; + public String FEATURE_DIR; + public String OUTPUT_MODEL; + public String TRAINING_FILE; + public String TEST_FILE; + public String ARFF_TRAIN; + public String ARFF_TEST; + public String STOP_LIST; + + //Output files + public String JOURNAL_TITLE_FEATURES; + public String ECNUM_FEATURES; + public String ANNOTATION_FEATURES; + public String TITLE_FEATURES; + public String NGRAM_FEATURES; + public String TITLE_NGRAMS; + + //Feature setup + public String USE_TEXT_SIZE; + public String USE_JOURNAL_TITLE_FEATURE; + public String USE_ECNUM_FEATURE; + public String FEATURE_MIN_FREQ; + public String FEATURE_MIN_LENGTH; + + //Feature setup - Annotations + public String USE_ANNOTATION_FEATURE; + public String USE_ANNOTATION_TYPE; + public String USE_TITLE_FEATURE; + + + //Feature setup - Ngrams + public String USE_NGRAM_FEATURE; + public String USE_TITLE_NGRAMS; + public String NGRAM_STOP; + public String NGRAM_SIZE; + public String USE_WEIGHTED_NGRAM; + public String WEIGHT; + + //Task setup + public String EXP_TYPE; + public String NB_PARAMS; + + + private void initVars() { + String text = null; + + try { + BufferedReader reader = new BufferedReader(new FileReader(CONFIG_FILE)); + while ((text = reader.readLine()) != null) { + if (! 
text.startsWith("#")) { + String label = text.split("=")[0]; + String value = text.split("=")[1]; + CONFIG_MAP.put(label, value); + } + } + reader.close(); + } catch (IOException ex) { + Logger.getLogger(PathConstants.class.getName()).log(Level.SEVERE, null, ex); + } + HOME_DIR = CONFIG_MAP.get("HOME_DIR"); + CORPUS_DIR = CONFIG_MAP.get("CORPUS_DIR"); + TRAIN_DIR = CONFIG_MAP.get("TRAIN_DIR"); + TEST_DIR = CONFIG_MAP.get("TEST_DIR"); + FEATURE_DIR = CONFIG_MAP.get("FEATURE_DIR"); + OUTPUT_MODEL = CONFIG_MAP.get("OUTPUT_MODEL"); + TRAINING_FILE = CONFIG_MAP.get("TRAINING_FILE"); + TEST_FILE = CONFIG_MAP.get("TEST_FILE"); + ARFF_TRAIN = CONFIG_MAP.get("ARFF_TRAIN"); + ARFF_TEST = CONFIG_MAP.get("ARFF_TEST"); + STOP_LIST = CONFIG_MAP.get("STOP_LIST"); + + JOURNAL_TITLE_FEATURES = CONFIG_MAP.get("JOURNAL_TITLE_FEATURES"); + ECNUM_FEATURES = CONFIG_MAP.get("ECNUM_FEATURES"); + ANNOTATION_FEATURES = CONFIG_MAP.get("ANNOTATION_FEATURES"); + TITLE_FEATURES = CONFIG_MAP.get("TITLE_FEATURES"); + NGRAM_FEATURES = CONFIG_MAP.get("NGRAM_FEATURES"); + TITLE_NGRAMS = CONFIG_MAP.get("TITLE_NGRAMS"); + + USE_TEXT_SIZE = CONFIG_MAP.get("USE_TEXT_SIZE"); + USE_JOURNAL_TITLE_FEATURE = CONFIG_MAP.get("USE_JOURNAL_TITLE_FEATURE"); + USE_ECNUM_FEATURE = CONFIG_MAP.get("USE_ECNUM_FEATURE"); + FEATURE_MIN_FREQ = CONFIG_MAP.get("FEATURE_MIN_FREQ"); + FEATURE_MIN_LENGTH = CONFIG_MAP.get("FEATURE_MIN_LENGTH"); + + USE_ANNOTATION_FEATURE = CONFIG_MAP.get("USE_ANNOTATION_FEATURE"); + USE_ANNOTATION_TYPE = CONFIG_MAP.get("USE_ANNOTATION_TYPE"); + USE_TITLE_FEATURE = CONFIG_MAP.get("USE_TITLE_FEATURE"); + + USE_NGRAM_FEATURE = CONFIG_MAP.get("USE_NGRAM_FEATURE"); + USE_TITLE_NGRAMS = CONFIG_MAP.get("USE_TITLE_NGRAMS"); + NGRAM_STOP = CONFIG_MAP.get("NGRAM_STOP"); + NGRAM_SIZE = CONFIG_MAP.get("NGRAM_SIZE"); + USE_WEIGHTED_NGRAM = CONFIG_MAP.get("USE_WEIGHTED_NGRAM"); + WEIGHT = CONFIG_MAP.get("WEIGHT"); + + EXP_TYPE = CONFIG_MAP.get("EXP_TYPE"); + NB_PARAMS = CONFIG_MAP.get("NB_PARAMS"); + + } +} diff --git a/stopList.txt b/stopList.txt new file mode 100644 index 0000000..d42a69c --- /dev/null +++ b/stopList.txt @@ -0,0 +1 @@ +a,about,again,all,almost,also,although,always,among,an,and,another,any,are,as,at,be,because,been,before,being,between,both,but,by,can,could,did,do,does,done,due,during,each,either,enough,especially,etc,for,found,from,further,had,has,have,having,here,how,however,i,if,in,into,is,it,its,itself,just,kg,km,made,mainly,make,may,mg,might,ml,mm,most,mostly,must,nearly,neither,no,nor,obtained,of,often,on,our,overall,perhaps,pmid,quite,rather,really,regarding,seem,seen,several,should,show,showed,shown,shows,significantly,since,so,some,such,than,that,the,their,theirs,them,then,there,therefore,these,they,this,those,through,thus,to,upon,use,used,using,various,very,was,we,were,what,when,which,while,with,within,without,would