diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..038e6d4
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,27 @@
+The MIT License (MIT)
+
+Copyright (c) 2014
+
+Hayda Almeida
+Marie-Jean Meurs
+
+Concordia University
+Tsang Lab
+
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..76992cf
--- /dev/null
+++ b/README.md
@@ -0,0 +1,12 @@
+# mycoSORT
+
+A machine learning system for supporting the triage of biological literature.
+
+
+
+
+
+
+
+
+
diff --git a/config-sample.cfg b/config-sample.cfg
new file mode 100644
index 0000000..a9b3483
--- /dev/null
+++ b/config-sample.cfg
@@ -0,0 +1,118 @@
+#################################################
+#
+#
+# Configuration file for mycoSORT
+#
+#
+##################################################
+########################### DIRECTORIES ##########
+# project home
+HOME_DIR=/.
+#
+# corpus directory
+CORPUS_DIR=corpus/
+#
+# train directory
+TRAIN_DIR=train/
+#
+# test directory
+TEST_DIR=test/
+#
+# feature directory
+FEATURE_DIR=features/
+#
+# output directory for arff files
+OUTPUT_MODEL=arff/
+#
+#################################################
+########################## INPUT FILES ##########
+# training file
+TRAINING_FILE=/triagecorpus_train.xml
+#
+# test file
+TEST_FILE=/triagecorpus_test.xml
+#
+# arff training file
+ARFF_TRAIN=triage0.arff
+#
+# arff testing file
+ARFF_TEST=triage1.arff
+#
+# stopwords list
+STOP_LIST=stopList.txt
+#
+##################################################
+########################## OUTPUT FILES ##########
+# EC numbers feature list
+ECNUM_FEATURES=ecnumbers.txt
+#
+# Journal title feature list
+JOURNAL_TITLE_FEATURES=journaltitles.txt
+#
+# Abstract annotations feature list
+ANNOTATION_FEATURES=annotations.txt
+#
+# Paper title annotations feature list
+TITLE_FEATURES=titleAnnotations.txt
+#
+# Abstract ngrams feature list
+NGRAM_FEATURES=ngrams_features.txt
+#
+# Paper title n-grams feature list
+TITLE_NGRAMS=titleGrams.txt
+#
+###################################################
+########################## FEATURE SETUP ##########
+# Extract size of abstract and title
+USE_TEXT_SIZE=false
+#
+# Extract Journal of publication
+USE_JOURNAL_TITLE_FEATURE=false
+#
+# Extract EC Numbers
+USE_ECNUM_FEATURE=true
+#
+# minimum frequency to consider a feature
+FEATURE_MIN_FREQ=2
+#
+# minimum length (in chars) to consider a feature
+FEATURE_MIN_LENGTH=3
+#
+#############################
+######### ANNOTATIONS #######
+# Extract annotation content
+USE_ANNOTATION_FEATURE=true
+#
+# Extract annotation entities
+USE_ANNOTATION_TYPE=true
+#
+# Extract annotations from title separately
+USE_TITLE_FEATURE=false
+#
+#############################
+########## N-GRAMS ##########
+# Extract ngrams
+USE_NGRAM_FEATURE=false
+#
+# Extract ngrams from title separately
+USE_TITLE_NGRAMS=false
+#
+# Apply the stopwords list to n-grams
+NGRAM_STOP=true
+#
+# Define size of extracted n-grams
+NGRAM_SIZE=1
+#
+# Apply weights to ngrams
+#USE_WEIGHTED_NGRAM=false
+#
+# Define weight of features
+#WEIGHT=3
+#
+#################################################
+########################### TASK SETUP ##########
+# experiment type : train = 0 / test = 1
+EXP_TYPE=0
+#
+# Limit the number of parameters: keep only the top N, or -1 to use all
+NB_PARAMS=-1
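These keys are consumed in the sources below through configure.PathConstants. As a minimal illustration only, the KEY=value format above can also be loaded with java.util.Properties; the class and key handling here are a sketch, not the project's actual loader:

```java
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Properties;

// Sketch: read config-sample.cfg and pull a few of the keys defined above.
public class ConfigSketch {
    public static void main(String[] args) throws IOException {
        Properties cfg = new Properties();
        try (FileInputStream in = new FileInputStream("config-sample.cfg")) {
            cfg.load(in); // '#' lines are treated as comments
        }
        String corpus = cfg.getProperty("HOME_DIR") + cfg.getProperty("CORPUS_DIR");
        boolean useNgrams = Boolean.parseBoolean(cfg.getProperty("USE_NGRAM_FEATURE"));
        int ngramSize = Integer.parseInt(cfg.getProperty("NGRAM_SIZE"));
        System.out.println(corpus + " | ngrams=" + useNgrams + " | n=" + ngramSize);
    }
}
```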
diff --git a/entities.txt b/entities.txt
new file mode 100644
index 0000000..7714e43
--- /dev/null
+++ b/entities.txt
@@ -0,0 +1,23 @@
+annotation_type annotation_level
+AccessionNumber entity
+ActivityAssayConditions sentence
+Assay entity
+Buffer entity
+Characterization entity
+Enzyme entity
+Expression sentence
+Family entity
+Fungus entity
+Gene entity
+Glycoside_Hydrolase entity
+Glycosylation sentence
+Kinetics sentence
+Laccase entity
+Lipase entity
+Peroxidase entity
+pH sentence
+ProductAnalysis sentence
+Temperature sentence
+SpecificActivity sentence
+Substrate entity
+SubstrateSpecificity sentence
\ No newline at end of file
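entities.txt maps each annotation type to the level at which it is annotated (entity or sentence). A minimal sketch for reading it, assuming whitespace-separated columns and the header line shown above:

```java
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

// Sketch: load entities.txt into a type -> level map.
public class EntitiesSketch {
    public static void main(String[] args) throws IOException {
        Map<String, String> levelByType = new HashMap<String, String>();
        try (BufferedReader reader = new BufferedReader(new FileReader("entities.txt"))) {
            reader.readLine(); // skip the annotation_type/annotation_level header
            String line;
            while ((line = reader.readLine()) != null) {
                String[] cols = line.trim().split("\\s+");
                if (cols.length == 2) {
                    levelByType.put(cols[0], cols[1]);
                }
            }
        }
        System.out.println(levelByType.get("Enzyme")); // prints: entity
    }
}
```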
diff --git a/jar/README b/jar/README
new file mode 100644
index 0000000..9a9b435
--- /dev/null
+++ b/jar/README
@@ -0,0 +1,7 @@
+Please add to this folder the following libraries:
+commons-lang3-3.2.1.jar
+jsoup-1.7.3.jar
+weka.jar
+LibSVM.jar
+LibSVM/libsvm.jar
+
diff --git a/src/analyse/.gitignore b/src/analyse/.gitignore
new file mode 100644
index 0000000..6b468b6
--- /dev/null
+++ b/src/analyse/.gitignore
@@ -0,0 +1 @@
+*.class
diff --git a/src/analyse/ConcatXML.java b/src/analyse/ConcatXML.java
new file mode 100644
index 0000000..9c24173
--- /dev/null
+++ b/src/analyse/ConcatXML.java
@@ -0,0 +1,734 @@
+/*
+ * The MIT License (MIT)
+
+Copyright (c) 2014
+
+Hayda Almeida
+Marie-Jean Meurs
+
+Concordia University
+Tsang Lab
+
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+package analyse;
+
+import java.io.BufferedOutputStream;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.PrintWriter;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardCopyOption;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Date;
+import java.util.List;
+
+import configure.PathConstants;
+
+/**
+ * Generates a corpus from raw XML doc instances,
+ * so that features can be extracted from it
+ *
+ * @author halmeida
+ *
+ */
+public class ConcatXML extends Extractor{
+
+ private String tag1;
+ private String tag2;
+ private String tag3;
+
+
+ public ConcatXML(){
+ //NOTE: the tag literals assigned here were lost from this listing
+ this.id = "";
+ this.tag1 = "";
+ this.tag2 = "";
+ this.tag3 = "";
+ }
+
+ public static void main(String[] args) throws IOException {
+
+ PathConstants pathVars = new PathConstants();
+
+ String timeStamp = new SimpleDateFormat("yyyyMMdd_hh:mm").format(new Date());
+
+ String trainCorpusPath = pathVars.HOME_DIR + pathVars.CORPUS_DIR + pathVars.TRAIN_DIR +pathVars.TRAINING_FILE;
+ String xmlDir = "train";
+ String sourceDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + "all_nbs/"+ xmlDir;
+ String duplicatesDir = pathVars.HOME_DIR + pathVars.CORPUS_DIR + "/src"+ "/annotated_GH27-36_2013_12_31";
+
+ String concatCorpus = pathVars.HOME_DIR + pathVars.CORPUS_DIR +"triagecorpus_"+ xmlDir +"_"+timeStamp+".xml";
+ String tagCorpus = concatCorpus;
+
+ ConcatXML concat = new ConcatXML();
+
+ //================= Checking for duplicates =====================//
+ //concat.checkDupCorpus(trainCorpusPath, sourceDir);
+ //concat.checkDupFolder(sourceDir, duplicatesDir);
+
+
+ //================== Creating corpus ==========================//
+ concat.cleanXML(sourceDir);
+ //concat.cleanXML(duplicatesDir);
+ concat.concatenateXML(sourceDir, "", concatCorpus);
+ concat.tagCorpus(tagCorpus);
+ }
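+
+ /*
+ * Illustrative invocation only (paths and classpath are assumptions,
+ * with the libraries from jar/ on the classpath and a config file
+ * following config-sample.cfg):
+ *
+ * java -cp bin:jar/* analyse.ConcatXML
+ *
+ * The pipeline cleans the per-paper XMLs, concatenates them into a
+ * single corpus file, and wraps the result in corpus tags.
+ */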
+
+ /**
+ * Reads the file IDs in a folder and
+ * checks a second folder for duplicates.
+ *
+ * @param dirSrc source folder
+ * @param dirDup folder to check for duplicates
+ */
+
+ public void checkDupFolder(String dirSrc, String dirDup){
+ ArrayList<String> sourceIDs = new ArrayList<String>();
+ ArrayList<String> duplicated = new ArrayList<String>();
+ ArrayList<String> dupIDs = new ArrayList<String>();
+ int ids = 0;
+
+ if(dirSrc.contentEquals(dirDup)){
+ System.out.println("Source and duplicates directories are the same.\n\n========================\n");
+ }
+ else {
+
+ File sourceDir = new File(dirSrc);
+ File[] srcXMLs = sourceDir.listFiles(new FilenameFilter(){
+ @Override
+ public boolean accept(File dir, String name){
+ return name.endsWith(".xml");
+ }
+ });
+
+ try{
+ //for each file on the source dir
+ for (File xml : srcXMLs){
+
+ try{
+ BufferedReader reader = new BufferedReader(new FileReader(xml.getPath()));
+
+ String line = null;
+
+ String id = null;
+
+ while((line = reader.readLine()) != null){
+
+ line = line.replaceAll("\t","");
+ line = line.replace("\"", "");
+
+ //get the IDs of the new files
+ if (line.contains(getid())){
+
+ line = line.substring(line.indexOf(">") + 1);
+
+ id = line.replace(getendId(), "");
+
+ sourceIDs.add(id);
+
+ line = reader.readLine();
+ line = line.replaceAll("\t","");
+ }
+
+ if(line.contains(getOpenJournal())){
+ ids++;
+ }
+
+ line = line.replaceAll("\t","");
+ line = line.replace("\"", "");
+ }
+
+ reader.close();
+
+ }catch (FileNotFoundException e) {
+ e.printStackTrace();
+ }
+
+ }
+
+ }catch (FileNotFoundException e) {
+ e.printStackTrace();
+ }
+ catch(Exception e){
+ throw new RuntimeException(e);
+ }
+
+ System.out.println(ids + " source file IDs encountered.");
+ ids = 0;
+
+ File dupDir = new File(dirDup);
+
+ File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){
+ @Override
+ public boolean accept(File dir, String name){
+ return name.endsWith(".xml");
+ }
+ });
+
+ try{
+ //for each file on the possibly duplicated dir
+ for (File xml : dupXMLs){
+
+ try{
+ BufferedReader reader = new BufferedReader(new FileReader(xml.getPath()));
+
+ String line = null;
+
+ String id = null;
+
+ while((line = reader.readLine()) != null){
+
+ line = line.replaceAll("\t","");
+ line = line.replace("\"", "");
+
+ //get the IDs of the new files
+ if (line.contains(getid())){
+
+ line = line.substring(line.indexOf(">") + 1);
+
+ id = line.replace(getendId(), "");
+
+ dupIDs.add(id);
+ String dupFileID = id;
+
+ for(int j = 0; j < sourceIDs.size(); j++){
+ if(sourceIDs.get(j).equalsIgnoreCase(dupFileID)){
+ //moving the original file
+ Path from = xml.toPath(); //convert from File to Path
+ Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path
+ Files.move(from, to, StandardCopyOption.REPLACE_EXISTING);
+ }
+ }
+
+
+ line = reader.readLine();
+ line = line.replaceAll("\t","");
+ }
+
+ if(line.contains(getOpenJournal())){
+ ids++;
+ }
+
+ line = line.replaceAll("\t","");
+ line = line.replace("\"", "");
+ }
+
+ reader.close();
+
+ }catch (FileNotFoundException e) {
+ e.printStackTrace();
+ }
+
+ }
+
+ }catch (FileNotFoundException e) {
+ e.printStackTrace();
+ }
+ catch(Exception e){
+ throw new RuntimeException(e);
+ }
+
+ //count number of existing papers on possibly duplicated folder
+ //just to make sure we are gathering all IDs
+ System.out.println(ids + " new file IDs encountered.");
+ ids = 0;
+
+ //for each possible duplicated ID,
+ //check if it exists on source folder ID list
+ //if yes, list the duplicated ones
+ for(int i = 0; i < dupIDs.size(); i++){
+ for(int j = 0; j < sourceIDs.size(); j++){
+ if(sourceIDs.get(j).equalsIgnoreCase(dupIDs.get(i))){
+ duplicated.add(dupIDs.get(i));
+ }
+ }
+ }
+
+ //sorting the list of duplicated IDs
+ Collections.sort(duplicated, new Comparator<String>(){
+ @Override
+ public int compare(String one, String two){
+ return one.compareTo(two);
+ }
+ });
+
+ System.out.println("\nReaded source files: " + sourceIDs.size());
+ System.out.println("Readed new files: " + dupIDs.size());
+
+ System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n");
+
+ System.out.println("\nDuplicated files IDs: ");
+ for(int i = 0; i < duplicated.size(); i++){
+ System.out.println(duplicated.get(i));
+ }
+
+ System.out.println("\n========================\n");
+ }
+
+
+ }
+
+ /**
+ * Reads the corpus and checks the papers IDs
+ * to identify duplicates in case new papers
+ * are being concatenated to corpus.
+ *
+ * @param corpus path to current corpora to check
+ * @param dir path to folder with new files to be concatenated
+ */
+
+ public void checkDupCorpus(String corpus, String dir){
+ ArrayList<String> trainingIDs = new ArrayList<String>();
+ ArrayList<String> duplicated = new ArrayList<String>();
+ ArrayList<String> newFiles = new ArrayList<String>();
+
+ int ids = 0;
+
+ try
+ {
+ BufferedReader reader = new BufferedReader(new FileReader(corpus));
+
+ String line = null;
+ String id = null;
+
+
+ while((line = reader.readLine()) != null){
+
+ line = line.replaceAll("\t","");
+ line = line.replace("\"", "");
+
+ //on the previous training corpus
+ //find exact paper ID and store it
+ if (line.contains(getid())){
+
+ line = line.substring(line.indexOf(">") + 1);
+
+ id = line.replace(getendId(), "");
+
+ //insert paper ID to existing training file list
+ trainingIDs.add(id);
+
+ line = reader.readLine();
+ line = line.replaceAll("\t","");
+ }
+
+ //count number of existing papers on the training file
+ //just to make sure we are gathering all IDs
+ if(line.contains(getOpenJournal())){
+ ids++;
+ }
+
+ line = line.replaceAll("\t","");
+ line = line.replace("\"", "");
+ }
+
+ reader.close();
+
+ }catch (FileNotFoundException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+
+ System.out.println(ids + " training file IDs encountered.");
+ ids = 0;
+
+ File corpusDir = new File(dir);
+ File[] newXMLs = corpusDir.listFiles(new FilenameFilter(){
+ @Override
+ public boolean accept(File dir, String name){
+ return name.endsWith(".xml");
+ }
+ });
+
+ try{
+ //for each file on the corpus dir
+ for (File xml : newXMLs){
+
+ try{
+ BufferedReader reader = new BufferedReader(new FileReader(xml.getPath()));
+
+ String line = null;
+
+ String id = null;
+
+ while((line = reader.readLine()) != null){
+
+ line = line.replaceAll("\t","");
+ line = line.replace("\"", "");
+
+ //get the IDs of the new files
+ if (line.contains(getid())){
+
+ line = line.substring(line.indexOf(">") + 1);
+
+ id = line.replace(getendId(), "");
+
+ newFiles.add(id);
+ String newFileID = id;
+
+ for(int j = 0; j < trainingIDs.size(); j++){
+ if(trainingIDs.get(j).equalsIgnoreCase(newFileID)){
+ //moving the original file
+ Path from = xml.toPath(); //convert from File to Path
+ Path to = Paths.get(xml.toPath()+".duplicated"); //convert from String to Path
+ Files.move(from, to, StandardCopyOption.REPLACE_EXISTING);
+ }
+ }
+
+
+ line = reader.readLine();
+ line = line.replaceAll("\t","");
+ }
+
+ if(line.contains(getOpenJournal())){
+ ids++;
+ }
+
+ line = line.replaceAll("\t","");
+ line = line.replace("\"", "");
+ }
+
+ reader.close();
+
+ }catch (FileNotFoundException e) {
+ e.printStackTrace();
+ }
+
+ }
+
+ }catch (FileNotFoundException e) {
+ e.printStackTrace();
+ }
+ catch(Exception e){
+ throw new RuntimeException(e);
+ }
+
+ //count number of existing papers on the training file
+ //just to make sure we are gathering all IDs
+ System.out.println(ids + " new file IDs encountered.");
+ ids = 0;
+
+ //for each new ID, check if it exists on training file ID list
+ //if yes, list the duplicated ones
+ for(int i = 0; i < newFiles.size(); i++){
+ for(int j = 0; j < trainingIDs.size(); j++){
+ if(trainingIDs.get(j).equalsIgnoreCase(newFiles.get(i))){
+ duplicated.add(newFiles.get(i));
+ }
+ }
+ }
+
+ //sorting the list of duplicated IDs
+ Collections.sort(duplicated, new Comparator<String>(){
+ @Override
+ public int compare(String one, String two){
+ return one.compareTo(two);
+ }
+ });
+
+ System.out.println("\nReaded training files: " + trainingIDs.size());
+ System.out.println("Readed new files: " + newFiles.size());
+
+ System.out.println("\nDuplicated files renamed: " + duplicated.size()+"\n");
+
+ System.out.println("\nDuplicated files IDs: ");
+ for(int i = 0; i < duplicated.size(); i++){
+ System.out.println(duplicated.get(i));
+ }
+
+ System.out.println("\n========================\n");
+
+ }
+
+
+ /**
+ * Reads and edits a list of XMLs files in a folder
+ * to remove XML and previous corpus tags,
+ * preparing the files to be concatenated.
+ *
+ * @param dir string with folder path
+ */
+
+ public void cleanXML(String dir){
+
+ //listing files on corpus dir
+ File sourceDir = new File(dir);
+
+ File[] newXMLs = sourceDir.listFiles(new FilenameFilter(){
+ @Override
+ public boolean accept(File dir, String name){
+ return name.endsWith(".xml");
+ }
+ });
+
+ System.out.println("... Files list loaded.");
+
+ try{
+ //for each file on the corpus dir
+ for (File xml : newXMLs){
+
+ try{
+ BufferedReader reader = new BufferedReader(new FileReader(xml.getPath()));
+
+ String line = null;
+ ArrayList<String> allLines = new ArrayList<String>();
+ String content = null;
+
+ while((line = reader.readLine()) != null){
+ content = line;
+
+ //cleaning XML markups
+ if(content.contains(getTag1())){
+ content = content.replace(getTag1(), "");
+ allLines.add(content);
+ }
+ if(content.contains(getTag2())){
+ content = content.replace(getTag2(), "");
+ allLines.add(content);
+ }
+ if(content.contains(getTag3())){
+ content = content.replace(getTag3(), "");
+ allLines.add(content);
+ }
+
+ //cleaning previous corpus tags
+ if(content.contains(getOpenFile())){
+ content = content.replace(getOpenFile(), "");
+ allLines.add(content);
+ }
+ if(content.contains(getendFile())){
+ content = content.replace(getendFile(), "");
+ allLines.add(content);
+ }
+
+ allLines.add(content);
+ }
+
+ PrintWriter writer = new PrintWriter(xml.getPath());
+
+ for (String l : allLines){
+ writer.println(l);
+ }
+ reader.close();
+ writer.close();
+
+ }catch (FileNotFoundException e) {
+ e.printStackTrace();
+ }
+
+ }
+
+ }catch (FileNotFoundException e) {
+ e.printStackTrace();
+ }
+ catch(Exception e){
+ throw new RuntimeException(e);
+ }
+
+ System.out.println("... Files cleaned and saved.");
+ System.out.println("Ready for concatenation.");
+ System.out.println("\n========================\n");
+ }
+
+ /**
+ * Concatenates all XMLs in one folder or between two folders.
+ * @param sourceDir main directory with XML files.
+ * @param duplicDir second directory with duplicated XML files
+ * @param concatFile path name to saved concatenated corpus
+ */
+
+ public void concatenateXML(String sourceDir, String duplicDir, String concatFile){
+
+ final int BUFFER = 1024 << 8;
+ byte[] buffer = new byte[BUFFER];
+
+ //listing files on corpus dir
+ File srcDir = new File(sourceDir);
+ File[] srcXMLs = srcDir.listFiles(new FilenameFilter(){
+ @Override
+ public boolean accept(File dir, String name){
+ return name.endsWith(".xml");
+ }
+ });
+
+ File dupDir = new File(duplicDir);
+ File[] dupXMLs = dupDir.listFiles(new FilenameFilter(){
+ @Override
+ public boolean accept(File dir, String name) {
+ return name.endsWith(".xml");
+ }
+ });
+
+ System.out.println("... Files list loaded.");
+
+ //defining the output file (concatenated)
+ File newCorpus = new File(concatFile);
+
+ try{
+ OutputStream output = new BufferedOutputStream(new FileOutputStream(newCorpus));
+
+
+ //for each file on the corpus dir
+ for (File xmls : srcXMLs){
+ InputStream input = new FileInputStream(xmls);
+ int count;
+
+ //if the file is not empty/finished
+ try{
+ while((count = input.read(buffer)) >= 0){
+
+ //write it on the concatenated final file
+ output.write(buffer, 0, count);
+ }
+ }finally{
+ input.close();
+ }
+ }
+
+ if(dupXMLs != null){
+ for(File xmld : dupXMLs){
+ InputStream input = new FileInputStream(xmld);
+ int count;
+
+ //if the file is not empty/finished
+ try{
+ while((count = input.read(buffer)) >= 0){
+
+ //write it on the concatenated final file
+ output.write(buffer, 0, count);
+ }
+ }finally{
+ input.close();
+ }
+ }
+ }
+ output.flush();
+ output.close();
+
+ }catch (FileNotFoundException e) {
+ e.printStackTrace();
+ }
+ catch(Exception e){
+ throw new RuntimeException(e);
+ }
+
+ System.out.println("... File concatenated and saved.");
+ System.out.println("Ready for corpus tagging.");
+ System.out.println("\n========================\n");
+ }
+
+ /**
+ * Inserts corpus tag on XML file
+ *
+ * @param pathToCorpus path to
+ * concatenated corpus
+ */
+
+ public void tagCorpus(String pathToCorpus){
+
+ //tagging as corpus
+ try{
+ BufferedReader reader = new BufferedReader(new FileReader(pathToCorpus));
+
+ String line = null;
+ List<String> allLines = new ArrayList<String>();
+
+ //adds tag at beginning of corpus
+ allLines.add(getOpenFile());
+
+ while((line = reader.readLine()) != null){
+
+ allLines.add(line);
+ }
+ //adds tag at the end of corpus
+ allLines.add(getendFile());
+
+ System.out.println("... Corpus loaded and tagged.");
+ //re-writing the file
+ PrintWriter writer = new PrintWriter(pathToCorpus);
+
+ for (String l : allLines){
+ writer.println(l);
+ }
+ reader.close();
+ writer.close();
+
+ System.out.println("... File saved as tagged corpus.");
+ }
+ catch (FileNotFoundException e) {
+ e.printStackTrace();
+ }
+ catch(IOException e){
+ e.printStackTrace();
+ }
+ }
+
+ public String getTag1() {
+ return tag1;
+ }
+
+ public void setTag1(String tag1) {
+ this.tag1 = tag1;
+ }
+
+ public String getTag2() {
+ return tag2;
+ }
+
+ public void setTag2(String tag2) {
+ this.tag2 = tag2;
+ }
+
+ public String getTag3() {
+ return tag3;
+ }
+
+ public void setTag3(String tag3) {
+ this.tag3 = tag3;
+ }
+
+
+}
+
+
diff --git a/src/analyse/Extractor.java b/src/analyse/Extractor.java
new file mode 100644
index 0000000..c97cfa7
--- /dev/null
+++ b/src/analyse/Extractor.java
@@ -0,0 +1,442 @@
+/*
+ * The MIT License (MIT)
+
+Copyright (c) 2014
+
+Hayda Almeida
+Marie-Jean Meurs
+
+Concordia University
+Tsang Lab
+
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package analyse;
+
+import java.io.BufferedWriter;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+/**
+ * Implements common tools to FeatureExtractor
+ * and NgramExtractor classes that are used to
+ * extract features from doc instances
+ *
+ * @author halmeida
+ *
+ */
+public class Extractor {
+
+ //String pathFile;
+ String id;
+ String endId;
+ String openFile;
+ String endFile;
+ String openAbst;
+ String closeAbst;
+ String abstractLabel;
+ String openEC;
+ String closeEC;
+ String classTag;
+ String openTitle;
+ String closeTitle;
+ String openJournal;
+ String closeJournal;
+ String copyR;
+ String closeCopyR;
+
+ /**
+ * Replaces special characters to clean
+ * text for tokenizing.
+ *
+ * @param str text to be cleaned
+ * @return string with cleaned text
+ */
+ public String removeSpecialChar(String str){
+ str = str.replace("}", "");
+ str = str.replace("{", "");
+ str = str.replace("]", "");
+ str = str.replace("[", "");
+ str = str.replace("#", "");
+ str = str.replace("*", "");
+ str = str.replace(">", "");
+ str = str.replace("&apos", "");
+ str = str.replace("%", "");
+ str = str.replace(""", "");
+ str = str.replace("&", "");
+ str = str.replace("=", "");
+ str = str.replace("?", "");
+ str = str.replace(";", "");
+ str = str.replace(":", "");
+ str = str.replace(",", "");
+ str = str.replace(".", "");
+ str = str.replace(")", "");
+ str = str.replace("(", "");
+ str = str.replace("\t\t", "\t");
+ str = str.replace("-", "");
+ str = str.replace(" ", "");
+
+ return str;
+ }
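+
+ //Illustrative example of the cleaning above (assuming the
+ //double-space collapse), e.g.:
+ // removeSpecialChar("beta-galactosidase (EC 3.2.1.23); pH 4.5")
+ //returns "betagalactosidase EC 32123 pH 45"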
+
+ /**
+ * Handles external tags (and multiple abstract
+ * text tags) present in a single paper
+ * @param str abstract content
+ * @return string without external tags
+ */
+
+ public String processAbstract(String str){
+ //NOTE: the markup literals originally matched here were lost from
+ //this listing; tags are skipped generically, as in removeTags() below.
+ String[] remove = str.split("");
+ StringBuilder sb = new StringBuilder();
+ String temp = "";
+ String abstrac = "";
+
+ for(int i = 0; i < remove.length; i++){
+ temp = temp + remove[i];
+
+ //skip markup content between angle brackets
+ if(remove[i].equalsIgnoreCase("<")){
+ do{
+ i++;
+ }while(!(remove[i].equalsIgnoreCase(">")));
+ }
+ else if(temp.contains("Copyright ")){
+ temp = "";
+ do{
+ i++;
+ //an exception here can mean that a copyright
+ //statement did not end with a period
+ }while(!(remove[i]).equalsIgnoreCase("."));
+ }
+ else sb.append(remove[i]);
+ }
+
+ abstrac = sb.toString();
+ abstrac = removeAbstractTags(abstrac);
+
+ return abstrac;
+ }
+
+
+ /**
+ * Removes specific tags encountered on Abstract texts.
+ * This is used to clean the abstract text before
+ * processing the feature count on the model.
+ * @param str abstract text possibly containing markup tags
+ * @return cleaned abstract text
+ */
+
+ public String removeAbstractTags(String str){
+ //this order of removing tags matters to
+ //exclude the first tag from the abstracts.
+ //NOTE: the tag literals below were lost from this listing;
+ //each empty string stood for one abstract markup tag.
+
+ str = str.replace("", "");
+ str = str.replace("", "");
+ str = str.replace("", "");
+ str = str.replace("Copyright", "");
+ str = str.replace("", "");
+ str = str.replace("", "");
+ str = str.replace("", "");
+ str = str.replace("", "");
+
+ return str;
+ }
+
+
+ /**
+ * Removes the markup annotations of a
+ * text field, and keeps its content
+ *
+ * @param str text containing markups
+ * @return string with cleaned text
+ */
+ public String removeTags(String str) {
+ String[] remove = str.split("");
+ StringBuilder sb = new StringBuilder();
+
+ for(int i = 0; i < remove.length; i++){
+
+ if(remove[i].equalsIgnoreCase("<")){
+ do{
+ i++;
+ }
+ while(!(remove[i].equalsIgnoreCase(">")));
+ }
+ else sb.append(remove[i]);
+ }
+
+ return sb.toString();
+ }
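+
+ //Illustrative example: removeTags("<Title>Laccase assay</Title>")
+ //returns "Laccase assay"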
+
+
+ /**
+ * Displays the keys and values of the
+ * maps created.
+ *
+ * @param hash HashMap containing list,
+ * values, counts
+ */
+ public void displayList(HashMap<String,Integer> hash){
+ //generic types and the iteration below are assumed: the original
+ //declarations were lost when angle-bracket spans were stripped
+ Iterator<Map.Entry<String,Integer>> iterator = hash.entrySet().iterator();
+ while(iterator.hasNext()){
+ Map.Entry<String,Integer> entry = iterator.next();
+ System.out.println(entry.getKey() + "\t" + entry.getValue());
+ }
+ }
+
+ //the assignments that followed set the abstract tag fields; their
+ //string literals were also lost from this listing:
+ //  this.openAbst = "...";
+ //  this.closeAbst = "...";
+ //  this.abstractLabel = "...";
+ //  HashMap<String,Integer> abstract_count = new HashMap