Merge pull request #45 from outbrain/3mr-fixes

SkBlaz · web-flow · commit 05f9039cc3f9 · 2023-10-09T12:11:24.000+02:00
3mr fixes
diff --git a/outrank/__main__.py b/outrank/__main__.py
@@ -183,6 +183,13 @@ def main():
         help="Which ';'-separated features should be one-hot encoded into n new features (coverage analysis)",
     )
 
+    parser.add_argument(
+        '--silent',
+        type=str,
+        default='False',
+        help='Suppress the logo and tips.',
+    )
+
     parser.add_argument(
         '--subfeature_mapping',
         type=str,
diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py
@@ -1,6 +1,7 @@
 # A module for pairwise computation of importances -- entrypoint for the core ranking algorithm(s)
 from __future__ import annotations
 
+import logging
 import operator
 import traceback
 from typing import Any
@@ -16,6 +17,9 @@
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.svm import SVC
 
+logger = logging.getLogger('syn-logger')
+logger.setLevel(logging.DEBUG)
+
 try:
     from outrank.algorithms.feature_ranking import ranking_mi_numba
 
@@ -99,6 +103,13 @@ def get_importances_estimate_pairwise(combination, args, tmp_df):
     feature_one = combination[0]
     feature_two = combination[1]
 
+    if feature_one not in tmp_df.columns:
+        logging.info(f'{feature_one} not found in the constructed data frame - consider increasing --combination_number_upper_bound for better coverage.')
+        return [feature_one, feature_two, 0]
+    elif feature_two not in tmp_df.columns:
+        logging.info(f'{feature_two} not found in the constructed data frame - consider increasing --combination_number_upper_bound for better coverage.')
+        return [feature_one, feature_two, 0]
+
     vector_first = tmp_df[[feature_one]].values.ravel()
     vector_second = tmp_df[[feature_two]].values.ravel()
 
@@ -156,10 +167,18 @@ def rank_features_3MR(
     def calc_higher_order(feature, is_redundancy=True):
         values = []
         for feat in ranked_features:
+            interaction_tuple = (feat, feature)
             if is_redundancy:
-                values.append(redundancy_dict[(feat, feature)])
+                if interaction_tuple in redundancy_dict:
+                    values.append(redundancy_dict[interaction_tuple])
+                else:
+                    logging.info('Not accounting for redundancy tuple {} - please increase the --combination_number_upper_bound for beter coverage of interactions/redundancies.')
             else:
-                values.append(relational_dict[(feat, feature)])
+                if interaction_tuple in relational_dict:
+                    values.append(relational_dict[interaction_tuple])
+                else:
+                    logging.info('Not accounting for interaction tuple {} - please increase the --combination_number_upper_bound for beter coverage of interactions/redundancies.')
+
         if strategy == 'sum':
             return sum(values)
         if strategy == 'mean':
diff --git a/outrank/core_utils.py b/outrank/core_utils.py
@@ -182,11 +182,14 @@ def parse_ob_line_vw(
 
     # Hash multi-value tuples and store name-val mappings
     for remaining_part in remainder:
-        core_parts = remaining_part.split(' ')
+        core_parts = remaining_part.strip().split(' ')
         namespace_part = core_parts[0]
         other_parts = '-'.join(x for x in core_parts[1:] if x != '')
+
         if namespace_part in fw_col_mapping:
             remainder_hash[fw_col_mapping[namespace_part]] = other_parts
+        else:
+            logging.error(f"Didn't find namespace {namespace_part}")
 
     # Construct the consistently-mapped instance based on the remainder mapping
     the_real_instance = [
@@ -200,6 +203,7 @@ def parse_ob_line_vw(
         ]
 
     parts = [label] + the_real_instance
+
     return parts
 
 
diff --git a/outrank/task_ranking.py b/outrank/task_ranking.py
@@ -38,8 +38,9 @@ def outrank_task_conduct_ranking(args: Any):
     if args.task in ['identify_rare_values', 'feature_summary_transformers']:
         args.heuristic = 'Constant'
 
-    display_tool_name()
-    display_random_tip()
+    if args.silent != 'True':
+        display_tool_name()
+        display_random_tip()
 
     dataset_info = get_dataset_info(args)
 
diff --git a/setup.py b/setup.py
@@ -23,7 +23,7 @@ def _read_description():
 packages = [x for x in setuptools.find_packages() if x != 'test']
 setuptools.setup(
     name='outrank',
-    version='0.94.1',
+    version='0.94.2',
     description='OutRank: Feature ranking for massive sparse data sets.',
     long_description=_read_description(),
     long_description_content_type='text/markdown',