mycoSORT first commit

TsangLab · Jul 30, 2014 · fcf52e1 · fcf52e1
commit fcf52e1
Show file tree

Hide file tree

Showing 21 changed files with 4,085 additions and 0 deletions.
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,27 @@
+The MIT License (MIT)
+
+Copyright (c) 2014 
+
+Hayda Almeida
+Marie-Jean Meurs
+
+Concordia University
+Tsang Lab
+
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,12 @@
+# mycoSORT
+
+A machine learning system for supporting the triage of biological literature.
+
+
+
+
+
+
+
+
+
diff --git a/config-sample.cfg b/config-sample.cfg
@@ -0,0 +1,118 @@
+#################################################
+#
+#
+#  Configuration file for mycoSORT
+#
+#
+##################################################
+########################### DIRECTORIES ##########
+# project home
+HOME_DIR=/.
+#
+# corpus directory
+CORPUS_DIR=corpus/
+#
+# train directory
+TRAIN_DIR=train/
+#
+# test directory
+TEST_DIR=test/
+#
+# feature directory
+FEATURE_DIR=features/
+#
+# output directory for arff files
+OUTPUT_MODEL=arff/
+#
+#################################################
+########################## INPUT FILES ##########
+# training file
+TRAINING_FILE=/triagecorpus_train.xml
+#
+# test file
+TEST_FILE=/triagecorpus_test.xml
+#
+# arff training file
+ARFF_TRAIN=triage0.arff
+#
+# arff testing file
+ARFF_TEST=triage1.arff
+#
+# stopwords list
+STOP_LIST=stopList.txt
+#
+##################################################
+########################## OUTPUT FILES ##########
+# EC numbers feature list
+ECNUM_FEATURES=ecnumbers.txt
+#
+# Journal title feature list
+JOURNAL_TITLE_FEATURES=journaltitles.txt
+#
+# Abstract annotations feature list
+ANNOTATION_FEATURES=annotations.txt
+#
+# Paper title annotations feature list
+TITLE_FEATURES=titleAnnotations.txt
+#
+# Abstract ngrams feature list
+NGRAM_FEATURES=ngrams_features.txt
+#
+# Paper title n-grams feature list
+TITLE_NGRAMS=titleGrams.txt
+#
+###################################################
+########################## FEATURE SETUP ##########
+# Extract size of abstract and title 
+USE_TEXT_SIZE=false
+#
+# Extract Journal of publication 
+USE_JOURNAL_TITLE_FEATURE=false
+#
+# Extract EC Numbers
+USE_ECNUM_FEATURE=true
+#
+# minimum frequency to consider a feature
+FEATURE_MIN_FREQ=2
+#
+# minimum length (in chars) to consider a feature
+FEATURE_MIN_LENGTH=3
+#
+#############################
+######### ANNOTATIONS #######
+# Extract annotation content
+USE_ANNOTATION_FEATURE=true
+#
+# Extract annotation entities
+USE_ANNOTATION_TYPE=true
+#
+# Extract annotations from title separately
+USE_TITLE_FEATURE=false
+#
+#############################
+########## N-GRAMS ##########
+# Extract ngrams 
+USE_NGRAM_FEATURE=false
+#
+# Extract ngrams from title separately
+USE_TITLE_NGRAMS=false
+#
+# Use stopwords list on n-grams
+NGRAM_STOP=true
+#
+# Define size of extracted n-grams
+NGRAM_SIZE=1
+#
+# Apply weights to ngrams
+#USE_WEIGHTED_NGRAM=false
+#
+# Define weight of features
+#WEIGHT=3
+#
+#################################################
+########################### TASK SETUP ##########
+# experiment type : train = 0 / test = 1
+EXP_TYPE=0
+#
+# limit the number of parameters - quantity (top N), or -1 for the whole file
+NB_PARAMS=-1
diff --git a/entities.txt b/entities.txt
@@ -0,0 +1,23 @@
+annotation_type annotation_level
+AccessionNumber entity
+ActivityAssayConditions sentence
+Assay entity
+Buffer entity
+Characterization entity
+Enzyme entity
+Expression sentence
+Family entity
+Fungus entity
+Gene entity
+Glycoside_Hydrolase entity
+Glycosylation sentence
+Kinetics sentence
+Laccase entity
+Lipase entity
+Peroxidase entity
+pH sentence
+ProductAnalysis sentence
+Temperature sentence
+SpecificActivity sentence
+Substrate entity
+SubstrateSpecificity sentence
diff --git a/jar/README b/jar/README
@@ -0,0 +1,7 @@
+Please add the following libraries to this folder:
+commons-lang3-3.2.1.jar
+jsoup-1.7.3.jar
+weka.jar
+LibSVM.jar
+LibSVM/libsvm.jar
+
diff --git a/jar/README~ b/jar/README~
@@ -0,0 +1,7 @@
+Please add the following libraries to this folder:
+commons-lang3-3.2.1.jar
+jsoup-1.7.3.jar
+weka.jar
+LibSVM.jar
+libsvm.jar
+
diff --git a/src/analyse/.gitignore b/src/analyse/.gitignore
@@ -0,0 +1 @@
+*.class
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,12 @@
		#mycoSORT

		A machine learning system for supporting the triage of biological literature.