@@ -140,6 +140,7 @@ def max_gt_depth(t):
140
140
def main (args ):
141
141
data = pd .read_csv (args .table , sep = '\t ' , header = 0 , index_col = 0 )
142
142
orig = pd .read_csv (args .table , sep = '\t ' , header = 0 , index_col = 0 , keep_default_na = False , na_values = ['_' ])
143
+
143
144
orig ['pred' ] = 0
144
145
145
146
# modify and generate additional columns
@@ -163,8 +164,9 @@ def main(args):
163
164
subsets = list (set (data ['Superfamily' ]))
164
165
X_train = data .loc [data ['NonRef' ] > 1 ]
165
166
167
+
166
168
for subset in subsets :
167
- logger . info ( "Training %s" % subset )
169
+
168
170
X_train_subset = X_train [X_train ['Superfamily' ] == subset ]
169
171
X_test_subset = data [data ['Superfamily' ] == subset ]
170
172
@@ -173,14 +175,20 @@ def main(args):
173
175
X_train_subset = X_train_subset [model_cols ]
174
176
X_test_subset = X_test_subset [model_cols ]
175
177
176
- clf = IsolationForest (behaviour = 'new' , contamination = 'auto' , random_state = 42 )
178
+ logger .info ("Training %s" % subset )
179
+
180
+ clf = IsolationForest (behaviour = 'new' , contamination = 'auto' , random_state = 42 , max_samples = 'auto' )
177
181
clf .fit (X_train_subset )
178
182
179
183
y_pred_train = clf .predict (X_train_subset )
180
184
y_pred_test = clf .predict (X_test_subset )
181
185
182
186
orig ['pred' ].loc [orig_subset .index ] = y_pred_test
183
187
188
+ # rows used for training (NonRef > 1) are known positives: force their 'pred' to 1 regardless of the model output
189
+ #orig['pred'].loc[orig['NonRef'] != 'NA'] = 1
190
+ orig ['pred' ].loc [data ['NonRef' ] > 1 ] = 1
191
+
184
192
185
193
orig .to_csv ('%s.isoforest.txt' % args .table , sep = '\t ' )
186
194
0 commit comments