Skip to content

Commit 4f62086

Browse files
committed
known nonref positive examples should be positive
1 parent b985ec6 commit 4f62086

File tree

1 file changed

+10 -2 lines changed

1 file changed

+10 -2 lines changed

scripts/filter_isoforest.py

+10-2
Original file line number | Diff line number | Diff line change
@@ -140,6 +140,7 @@ def max_gt_depth(t):
140140
def main(args):
141141
data = pd.read_csv(args.table, sep='\t', header=0, index_col=0)
142142
orig = pd.read_csv(args.table, sep='\t', header=0, index_col=0, keep_default_na=False, na_values=['_'])
143+
143144
orig['pred'] = 0
144145

145146
# modify and generate additional columns
@@ -163,8 +164,9 @@ def main(args):
163164
subsets = list(set(data['Superfamily']))
164165
X_train = data.loc[data['NonRef'] > 1]
165166

167+
166168
for subset in subsets:
167-
logger.info("Training %s" % subset)
169+
168170
X_train_subset = X_train[X_train['Superfamily'] == subset]
169171
X_test_subset = data[data['Superfamily'] == subset]
170172

@@ -173,14 +175,20 @@ def main(args):
173175
X_train_subset = X_train_subset[model_cols]
174176
X_test_subset = X_test_subset[model_cols]
175177

176-
clf = IsolationForest(behaviour='new', contamination='auto', random_state=42)
178+
logger.info("Training %s" % subset)
179+
180+
clf = IsolationForest(behaviour='new', contamination='auto', random_state=42, max_samples='auto')
177181
clf.fit(X_train_subset)
178182

179183
y_pred_train = clf.predict(X_train_subset)
180184
y_pred_test = clf.predict(X_test_subset)
181185

182186
orig['pred'].loc[orig_subset.index] = y_pred_test
183187

188+
# positive examples should be positive
189+
#orig['pred'].loc[orig['NonRef'] != 'NA'] = 1
190+
orig['pred'].loc[data['NonRef'] > 1] = 1
191+
184192

185193
orig.to_csv('%s.isoforest.txt' % args.table, sep='\t')
186194

0 commit comments

Comments
 (0)