Skip to content

Commit 05f9039

Browse files
authored
Merge pull request #45 from outbrain/3mr-fixes
3mr fixes
2 parents ee41b67 + 4fe538c commit 05f9039

File tree

5 files changed

+37
-6
lines changed

5 files changed

+37
-6
lines changed

outrank/__main__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,13 @@ def main():
183183
help="Which ';'-separated features should be one-hot encoded into n new features (coverage analysis)",
184184
)
185185

186+
parser.add_argument(
187+
'--silent',
188+
type=str,
189+
default='False',
190+
help='Suppress the logo and tips.',
191+
)
192+
186193
parser.add_argument(
187194
'--subfeature_mapping',
188195
type=str,

outrank/algorithms/importance_estimator.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# A module for pairwise computation of importances -- entrypoint for the core ranking algorithm(s)
22
from __future__ import annotations
33

4+
import logging
45
import operator
56
import traceback
67
from typing import Any
@@ -16,6 +17,9 @@
1617
from sklearn.preprocessing import OneHotEncoder
1718
from sklearn.svm import SVC
1819

20+
logger = logging.getLogger('syn-logger')
21+
logger.setLevel(logging.DEBUG)
22+
1923
try:
2024
from outrank.algorithms.feature_ranking import ranking_mi_numba
2125

@@ -99,6 +103,13 @@ def get_importances_estimate_pairwise(combination, args, tmp_df):
99103
feature_one = combination[0]
100104
feature_two = combination[1]
101105

106+
if feature_one not in tmp_df.columns:
107+
logging.info(f'{feature_one} not found in the constructed data frame - consider increasing --combination_number_upper_bound for better coverage.')
108+
return [feature_one, feature_two, 0]
109+
elif feature_two not in tmp_df.columns:
110+
logging.info(f'{feature_two} not found in the constructed data frame - consider increasing --combination_number_upper_bound for better coverage.')
111+
return [feature_one, feature_two, 0]
112+
102113
vector_first = tmp_df[[feature_one]].values.ravel()
103114
vector_second = tmp_df[[feature_two]].values.ravel()
104115

@@ -156,10 +167,18 @@ def rank_features_3MR(
156167
def calc_higher_order(feature, is_redundancy=True):
157168
values = []
158169
for feat in ranked_features:
170+
interaction_tuple = (feat, feature)
159171
if is_redundancy:
160-
values.append(redundancy_dict[(feat, feature)])
172+
if interaction_tuple in redundancy_dict:
173+
values.append(redundancy_dict[interaction_tuple])
174+
else:
175+
logging.info('Not accounting for redundancy tuple {} - please increase the --combination_number_upper_bound for beter coverage of interactions/redundancies.')
161176
else:
162-
values.append(relational_dict[(feat, feature)])
177+
if interaction_tuple in relational_dict:
178+
values.append(relational_dict[interaction_tuple])
179+
else:
180+
logging.info('Not accounting for interaction tuple {} - please increase the --combination_number_upper_bound for beter coverage of interactions/redundancies.')
181+
163182
if strategy == 'sum':
164183
return sum(values)
165184
if strategy == 'mean':

outrank/core_utils.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -182,11 +182,14 @@ def parse_ob_line_vw(
182182

183183
# Hash multi-value tuples and store name-val mappings
184184
for remaining_part in remainder:
185-
core_parts = remaining_part.split(' ')
185+
core_parts = remaining_part.strip().split(' ')
186186
namespace_part = core_parts[0]
187187
other_parts = '-'.join(x for x in core_parts[1:] if x != '')
188+
188189
if namespace_part in fw_col_mapping:
189190
remainder_hash[fw_col_mapping[namespace_part]] = other_parts
191+
else:
192+
logging.error(f"Didn't find namespace {namespace_part}")
190193

191194
# Construct the consistently-mapped instance based on the remainder mapping
192195
the_real_instance = [
@@ -200,6 +203,7 @@ def parse_ob_line_vw(
200203
]
201204

202205
parts = [label] + the_real_instance
206+
203207
return parts
204208

205209

outrank/task_ranking.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,9 @@ def outrank_task_conduct_ranking(args: Any):
3838
if args.task in ['identify_rare_values', 'feature_summary_transformers']:
3939
args.heuristic = 'Constant'
4040

41-
display_tool_name()
42-
display_random_tip()
41+
if args.silent != 'True':
42+
display_tool_name()
43+
display_random_tip()
4344

4445
dataset_info = get_dataset_info(args)
4546

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def _read_description():
2323
packages = [x for x in setuptools.find_packages() if x != 'test']
2424
setuptools.setup(
2525
name='outrank',
26-
version='0.94.1',
26+
version='0.94.2',
2727
description='OutRank: Feature ranking for massive sparse data sets.',
2828
long_description=_read_description(),
2929
long_description_content_type='text/markdown',

0 commit comments

Comments
 (0)