Skip to content

Commit

Permalink
mycoSORT src update
Browse files Browse the repository at this point in the history
  • Loading branch information
Hayda Almeida committed Apr 10, 2015
1 parent f83b439 commit e99b908
Show file tree
Hide file tree
Showing 15 changed files with 2,026 additions and 683 deletions.
32 changes: 32 additions & 0 deletions config-sample.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,18 @@ HOME_DIR=/.
# corpus directory
CORPUS_DIR=corpus/
#
# source documents directory
SOURCE_DIR=src/
#
# duplicate documents directory
DUP_DIR=test/
#
# positive instances directory
POS_DIR=positives/
#
# negative instances directory
NEG_DIR=negatives/
#
# train directory
TRAIN_DIR=train/
#
Expand Down Expand Up @@ -61,6 +73,9 @@ NGRAM_FEATURES=ngrams_features.txt
# Paper title n-grams feature list
TITLE_NGRAMS=titleGrams.txt
#
# Paper ID and class
DOC_IDS=docIDs.txt
#
###################################################
########################## FEATURE SETUP ##########
# Extract size of abstract and title
Expand All @@ -78,6 +93,9 @@ FEATURE_MIN_FREQ=2
# minimum length (in chars) to consider a feature
FEATURE_MIN_LENGTH=3
#
# extract document IDs
USE_DOC_ID=true
#
#############################
######### ANNOTATIONS #######
# Extract annotation content
Expand Down Expand Up @@ -109,6 +127,20 @@ NGRAM_SIZE=1
# Define weight of features
#WEIGHT=3
#
###################################################
########################## FEATURE SELECTION SETUP ##########
# Enable Odds Ratio (OR) filtering
USE_ODDS_RATIO=false
#
# Define minimum OR threshold to keep attribute
OR_THRESHOLD=1
#
# Enable inverse document frequency (idf) filtering
USE_IDF=false
#
# Define minimum IDF threshold to keep attribute
IDF_THRESHOLD=1
#
#################################################
########################### TASK SETUP ##########
# experiment type : train = 0 / test = 1
Expand Down
3 changes: 3 additions & 0 deletions src/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
*.classpath
*.project
*.*~
23 changes: 12 additions & 11 deletions src/analyse/Extractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ public class Extractor {

//String pathFile;
String id;
String endId;
protected String endId;
String openFile;
String endFile;
String openAbst;
Expand Down Expand Up @@ -95,7 +95,8 @@ public String removeSpecialChar(String str){
str = str.replace(")", "");
str = str.replace("(", "");
str = str.replace("\t\t", "\t");
str = str.replace("-", "");
//losing ngrams because of hyphen between names
str = str.replace("-", " ");
str = str.replace(" ", "");

return str;
Expand Down Expand Up @@ -154,15 +155,15 @@ public String removeAbstractTags(String str){
//this order of removing tags matters to
//exclude the first tag from the abstracts.

str = str.replace("<AbstractText>", "");
str = str.replace("<AbstractText", "");
str = str.replace("<CopyrightInformation>", "");
str = str.replace("</CopyrightInformation>", "");
str = str.replace("Copyright", "");
str = str.replace("</AbstractText>", "");
str = str.replace("<Abstract>", "");
str = str.replace("</Abstract>", "");
str = str.replace("<AbstractText.*?>", "");
str = str.replace("<abstracttext>", "");
str = str.replace("<abstracttext", "");
str = str.replace("<copyrightinformation>", "");
str = str.replace("</copyrightinformation>", "");
str = str.replace("copyright", "");
str = str.replace("</abstractText>", "");
str = str.replace("<abstract>", "");
str = str.replace("</abstract>", "");
str = str.replace("<abstracttext.*?>", "");

return str;
}
Expand Down
Loading

0 comments on commit e99b908

Please sign in to comment.