From bdebf62e780c411720cddf119d39a7bb589e2cfc Mon Sep 17 00:00:00 2001 From: Tomoki Ohtsuki Date: Mon, 25 Jan 2021 08:52:40 +0900 Subject: [PATCH] Optimizer api change (#35) * Added tolerance parameter. * Added some optimization * Remove * fix the convergence criterion. * Correct the doc * API change. Target metric should be specified by evaluators. * Modify the doc and examples. * Remove Random Walk with Restart * Fix setup.py accordingly * Fix tests accordingly. * Add tests, optimizer class * fix doc. * Thanks mypy * optimizer default_tune_range change. Remove alpha from default_tune_range of * P3alphaOptimizer * RP3betaOptimizer * Optimizer doc change. * Pre-compute X_train_all.csc() for user similarity. --- cpp_source/rws.cpp | 142 --------------- cpp_source/util.hpp | 7 +- docs/source/api_reference.rst | 8 +- examples/hyperparameter-optimization.ipynb | 134 ++++++++------- examples/movielens/movielens_1m.py | 8 +- examples/movielens/movielens_1m_cold.py | 6 +- examples/movielens/movielens_20m_cold.py | 1 - irspack/evaluator.py | 63 +++++-- irspack/optimizers/__init__.py | 6 +- irspack/optimizers/_optimizers.py | 50 +++--- irspack/optimizers/base_optimizer.py | 13 +- irspack/recommenders/__init__.py | 5 +- irspack/recommenders/base.py | 59 +++++-- irspack/recommenders/base_earlystop.py | 8 +- irspack/recommenders/rwr.py | 34 ---- irspack/recommenders/slim.py | 13 +- irspack/recommenders/user_knn.py | 190 +++++++++++++++++++++ irspack/user_cold_start/cb2cf.py | 1 - setup.py | 11 -- tests/recommenders/test_learn_all.py | 6 +- tests/recommenders/test_user_knn.py | 76 +++++++++ 21 files changed, 485 insertions(+), 356 deletions(-) delete mode 100644 cpp_source/rws.cpp delete mode 100644 irspack/recommenders/rwr.py create mode 100644 irspack/recommenders/user_knn.py create mode 100644 tests/recommenders/test_user_knn.py diff --git a/cpp_source/rws.cpp b/cpp_source/rws.cpp deleted file mode 100644 index 8c7a09c..0000000 --- a/cpp_source/rws.cpp +++ /dev/null @@ 
-1,142 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace std; - -using MatrixEntry = float; -using CSRMatrix = Eigen::SparseMatrix; -using ReturnValue = Eigen::SparseMatrix; - -static uniform_real_distribution udist_(0, 1); -namespace py = pybind11; - -namespace irspack { - -struct RandomWalkGenerator { - inline RandomWalkGenerator(CSRMatrix X) - : user_item(X), item_user(X.transpose()), n_item(X.cols()), - n_user(X.rows()) { - user_item.makeCompressed(); - item_user.makeCompressed(); - } - -protected: - inline size_t step_i_to_u(size_t start_ind, std::mt19937 &rns) const { - size_t start_index = item_user.outerIndexPtr()[start_ind]; - size_t ep = item_user.outerIndexPtr()[start_ind + 1]; - size_t nnz = ep - start_index; - size_t indptr = start_index + floor(nnz * udist_(rns)); - return item_user.innerIndexPtr()[indptr]; - } - - inline size_t step_u_to_i(size_t start_ind, std::mt19937 &rns) const { - size_t start_index = user_item.outerIndexPtr()[start_ind]; - size_t ep = user_item.outerIndexPtr()[start_ind + 1]; - size_t nnz = ep - start_index; - size_t indptr = start_index + floor(nnz * udist_(rns)); - return user_item.innerIndexPtr()[indptr]; - } - -public: - ReturnValue run_with_restart(float decay, size_t cutoff, size_t n_count, - size_t n_worker, int random_seed) const { - using Triplet = Eigen::Triplet; - std::vector>> futures; - for (size_t thread_id = 0; thread_id < n_worker; thread_id++) { - futures.push_back(std::async( - [this, decay, cutoff, thread_id, n_count, n_worker, random_seed]() { - std::mt19937 rns(random_seed + thread_id); - vector d; - for (size_t i = thread_id; i < this->n_item; i += n_worker) { - auto counts = - this->_run_item_walk_restart(decay, cutoff, i, n_count, rns); - for (auto &iter : counts) { - d.emplace_back(i, iter.first, iter.second); - } - } - return d; - })); - } - vector d; - for (size_t thread_id = 0; thread_id < n_worker; thread_id++) { - vector _ = 
futures[thread_id].get(); - d.insert(d.end(), _.begin(), _.end()); - } - - ReturnValue result(n_item, n_item); - result.setFromTriplets(d.begin(), d.end()); - result.makeCompressed(); - return result; - } - -protected: - inline map _run_item_walk_fixed_step(size_t item_start_index, - size_t n_step, - size_t n_count, - std::mt19937 &rns) const { - - map count; - auto current_loc = item_start_index; - size_t start_index = item_user.outerIndexPtr()[current_loc]; - size_t ep = item_user.outerIndexPtr()[current_loc + 1]; - size_t nnz = ep - start_index; - if (nnz == 0) { - return count; - } - for (size_t m = 0; m < n_count; m++) { - for (size_t n = 0; n < n_step; n++) { - current_loc = step_i_to_u(current_loc, rns); - current_loc = step_u_to_i(current_loc, rns); - } - count[current_loc] += 1; - } - return count; - }; - - inline map _run_item_walk_restart(float decay, size_t cutoff, - size_t item_start_index, - size_t n_count, - std::mt19937 &rns) const { - map count; - auto current_loc = item_start_index; - size_t start_index = item_user.outerIndexPtr()[current_loc]; - size_t ep = item_user.outerIndexPtr()[current_loc + 1]; - size_t nnz = ep - start_index; - if (nnz == 0) { - return count; - } - for (size_t m = 0; m < n_count; m++) { - auto current_loc = item_start_index; - for (size_t n = 0; n < cutoff; n++) { - current_loc = step_i_to_u(current_loc, rns); - current_loc = step_u_to_i(current_loc, rns); - if (udist_(rns) < decay) - break; - } - count[current_loc] += 1; - } - return count; - }; - -private: - CSRMatrix user_item, item_user; - size_t n_item, n_user; - // mt19937 random_state_; -}; - -} // namespace irspack - -PYBIND11_MODULE(_rwr, m) { - using namespace irspack; - m.doc() = "Backend C++ inplementation for Random walk with restart."; - py::class_(m, "RandomWalkGenerator") - .def(py::init()) - .def("run_with_restart", &RandomWalkGenerator::run_with_restart); -} diff --git a/cpp_source/util.hpp b/cpp_source/util.hpp index 38de3c3..43bd039 100644 --- 
a/cpp_source/util.hpp +++ b/cpp_source/util.hpp @@ -195,7 +195,6 @@ inline CSCMatrix SLIM(const CSRMatrix &X, size_t n_threads, check_arg(n_iter > 0, "n_iter must be > 0."); check_arg(l2_coeff >= 0, "l2_coeff must be > 0."); check_arg(l1_coeff >= 0, "l1_coeff must be > 0."); - const Real tol_all = tol * block_size; using MatrixType = Eigen::Matrix; using VectorType = Eigen::Matrix; @@ -210,7 +209,7 @@ inline CSCMatrix SLIM(const CSRMatrix &X, size_t n_threads, for (size_t th = 0; th < n_threads; th++) { workers.emplace_back(std::async(std::launch::async, [&cursor, &X_csc, l2_coeff, l1_coeff, - n_iter, tol_all] { + n_iter, tol] { const int64_t F = X_csc.cols(); std::mt19937 gen(0); std::vector indices(F); @@ -328,10 +327,10 @@ inline CSCMatrix SLIM(const CSRMatrix &X, size_t n_threads, const int64_t row = nnz_iter.row(); remnants.col(row).noalias() += nnz_iter.valueRef() * coeff_temp; } - delta += coeff_temp.squaredNorm(); + delta = std::max(delta, coeff_temp.cwiseAbs().array().maxCoeff()); } } - if (delta < tol_all) { + if (delta < tol) { break; } } diff --git a/docs/source/api_reference.rst b/docs/source/api_reference.rst index 1805234..5f5a5fd 100644 --- a/docs/source/api_reference.rst +++ b/docs/source/api_reference.rst @@ -27,7 +27,6 @@ Recommenders BaseRecommender TopPopRecommender IALSRecommender - DenseSLIMRecommender P3alphaRecommender RP3betaRecommender TruncatedSVDRecommender @@ -35,7 +34,10 @@ Recommenders AsymmetricCosineKNNRecommender JaccardKNNRecommender TverskyIndexKNNRecommender + CosineUserKNNRecommender + AsymmetricCosineUserKNNRecommender SLIMRecommender + DenseSLIMRecommender A LightFM wrapper for BPR matrix factorization (requires a separate installation of `lightFM `_). 
@@ -66,7 +68,6 @@ Optimizers BaseOptimizer TopPopOptimizer IALSOptimizer - DenseSLIMOptimizer P3alphaOptimizer RP3betaOptimizer TruncatedSVDOptimizer @@ -74,7 +75,10 @@ Optimizers AsymmetricCosineKNNOptimizer JaccardKNNOptimizer TverskyIndexKNNOptimizer + CosineUserKNNOptimizer + AsymmetricCosineUserKNNOptimizer SLIMOptimizer + DenseSLIMOptimizer MultVAEOptimizer .. currentmodule:: irspack.split diff --git a/examples/hyperparameter-optimization.ipynb b/examples/hyperparameter-optimization.ipynb index 8b8c10e..3e96d08 100644 --- a/examples/hyperparameter-optimization.ipynb +++ b/examples/hyperparameter-optimization.ipynb @@ -140,7 +140,7 @@ "outputs": [], "source": [ "X_train_val_learn = sps.vstack([X_train_user, X_valid_learn])\n", - "evaluator = Evaluator(X_valid_predict, offset=X_train_user.shape[0], cutoff=20)" + "evaluator = Evaluator(X_valid_predict, offset=X_train_user.shape[0], target_metric='ndcg', cutoff=20)" ] }, { @@ -166,7 +166,7 @@ " disable_default_handler()\n", " optuna.logging.disable_default_handler() \n", " \n", - "optimizer = P3alphaOptimizer(X_train_val_learn, evaluator, metric=\"ndcg\")\n", + "optimizer = P3alphaOptimizer(X_train_val_learn, evaluator)\n", "best_params, validation_results = optimizer.optimize(random_seed=0, n_trials=20)" ] }, @@ -447,9 +447,9 @@ { "data": { "text/plain": [ - "{'train': ,\n", - " 'val': ,\n", - " 'test': }" + "{'train': ,\n", + " 'val': ,\n", + " 'test': }" ] }, "execution_count": 12, @@ -613,49 +613,49 @@ "name": "stderr", "output_type": "stream", "text": [ - "valid_score=0.4286159863842687: 11%|█ | 55/512 [00:05<00:47, 9.62it/s] \n", - "valid_score=0.4009655288183455: 14%|█▎ | 70/512 [00:08<00:54, 8.10it/s] \n", - "valid_score=0.5333319224223335: 14%|█▎ | 70/512 [00:02<00:17, 25.73it/s]\n", - "valid_score=0.456712516536987: 7%|▋ | 35/512 [00:02<00:39, 11.97it/s] \n", - "valid_score=0.48545257282307125: 8%|▊ | 40/512 [00:02<00:31, 15.02it/s]\n", - "valid_score=0.5328125561238588: 10%|▉ | 50/512 [00:01<00:17, 
26.43it/s]\n", - "valid_score=0.47685704542170176: 7%|▋ | 35/512 [00:02<00:38, 12.33it/s]\n", - "valid_score=0.4077320184372636: 1%| | 5/512 [00:00<00:30, 16.41it/s]\n", - "valid_score=0.4373383293981874: 1%| | 5/512 [00:00<00:47, 10.74it/s]\n", - "valid_score=0.4216314800784488: 1%| | 5/512 [00:00<00:44, 11.41it/s]\n", - "valid_score=0.44462285171408994: 1%| | 5/512 [00:00<00:14, 36.18it/s]\n", - "valid_score=0.5299900148610059: 9%|▉ | 45/512 [00:01<00:16, 27.86it/s]\n", - "valid_score=0.5338119523100937: 14%|█▎ | 70/512 [00:03<00:19, 23.19it/s]\n", - "valid_score=0.5273020510070392: 7%|▋ | 35/512 [00:01<00:21, 22.22it/s]\n", - "valid_score=0.5287146848004601: 7%|▋ | 35/512 [00:01<00:22, 21.06it/s]\n", - "valid_score=0.5264932903644511: 7%|▋ | 35/512 [00:01<00:23, 20.12it/s]\n", - "valid_score=0.44400466852940296: 1%| | 5/512 [00:00<00:14, 35.89it/s]\n", - "valid_score=0.5306917608122538: 8%|▊ | 40/512 [00:01<00:17, 27.38it/s]\n", - "valid_score=0.5110284973022633: 7%|▋ | 35/512 [00:02<00:27, 17.39it/s]\n", - "valid_score=0.4517869817772647: 1%| | 5/512 [00:00<00:25, 20.12it/s]\n", - "valid_score=0.533166218883192: 12%|█▏ | 60/512 [00:02<00:18, 24.18it/s] \n", - "valid_score=0.5329383829075185: 10%|▉ | 50/512 [00:02<00:17, 26.29it/s]\n", - "valid_score=0.5018632681107333: 1%| | 5/512 [00:00<00:15, 33.62it/s]\n", - "valid_score=0.5030423878939329: 1%| | 5/512 [00:00<00:28, 18.10it/s]\n", - "valid_score=0.5290278187228771: 2%|▏ | 10/512 [00:00<00:21, 23.40it/s]\n", - "valid_score=0.4995433642284334: 1%| | 5/512 [00:00<00:17, 29.62it/s]\n", - "valid_score=0.512148574263727: 1%| | 5/512 [00:00<00:25, 19.85it/s]\n", - "valid_score=0.5345408384892154: 12%|█▏ | 60/512 [00:02<00:16, 26.76it/s]\n", - "valid_score=0.4953207181791401: 1%| | 5/512 [00:00<00:30, 16.42it/s]\n", - "valid_score=0.5034172163942912: 1%| | 5/512 [00:00<00:17, 29.71it/s]\n", - "valid_score=0.3799588055771834: 1%| | 5/512 [00:00<00:28, 17.79it/s]\n", - "valid_score=0.5342803993871786: 14%|█▎ | 70/512 
[00:02<00:19, 22.87it/s]/home/tomoki/.pyenv/versions/3.7.4/envs/main/lib/python3.7/site-packages/numpy/lib/nanfunctions.py:1366: RuntimeWarning: Mean of empty slice\n", + "valid_score=0.42824183005121735: 11%|█ | 55/512 [00:05<00:45, 9.96it/s]\n", + "valid_score=0.40060602861013855: 15%|█▍ | 75/512 [00:09<00:52, 8.37it/s]\n", + "valid_score=0.5333345981232733: 14%|█▎ | 70/512 [00:02<00:17, 25.95it/s]\n", + "valid_score=0.4565719212930136: 7%|▋ | 35/512 [00:02<00:40, 11.79it/s] \n", + "valid_score=0.48549781083236443: 8%|▊ | 40/512 [00:02<00:31, 14.81it/s]\n", + "valid_score=0.5327950253455304: 10%|▉ | 50/512 [00:01<00:17, 25.96it/s]\n", + "valid_score=0.4767790510450989: 7%|▋ | 35/512 [00:02<00:38, 12.30it/s] \n", + "valid_score=0.4078509330631947: 1%| | 5/512 [00:00<00:34, 14.61it/s]\n", + "valid_score=0.4372874728149235: 1%| | 5/512 [00:00<00:53, 9.40it/s]\n", + "valid_score=0.4217175292017891: 1%| | 5/512 [00:00<00:46, 10.97it/s]\n", + "valid_score=0.44462285171408994: 1%| | 5/512 [00:00<00:14, 35.51it/s]\n", + "valid_score=0.5299876125954996: 9%|▉ | 45/512 [00:01<00:17, 27.08it/s]\n", + "valid_score=0.5338013123443506: 14%|█▎ | 70/512 [00:03<00:19, 22.57it/s]\n", + "valid_score=0.5273331792612119: 7%|▋ | 35/512 [00:01<00:21, 22.01it/s]\n", + "valid_score=0.5286246823761697: 7%|▋ | 35/512 [00:01<00:22, 21.25it/s]\n", + "valid_score=0.5264078529929245: 7%|▋ | 35/512 [00:01<00:22, 21.03it/s]\n", + "valid_score=0.44400466852940296: 1%| | 5/512 [00:00<00:14, 36.19it/s]\n", + "valid_score=0.5306979586892263: 8%|▊ | 40/512 [00:01<00:16, 27.96it/s]\n", + "valid_score=0.511043342238914: 7%|▋ | 35/512 [00:02<00:27, 17.24it/s] \n", + "valid_score=0.4518078505781681: 1%| | 5/512 [00:00<00:27, 18.14it/s]\n", + "valid_score=0.533190381872124: 12%|█▏ | 60/512 [00:02<00:19, 23.58it/s] \n", + "valid_score=0.5330299634914348: 10%|▉ | 50/512 [00:02<00:17, 25.86it/s]\n", + "valid_score=0.5018637442329386: 1%| | 5/512 [00:00<00:15, 33.50it/s]\n", + "valid_score=0.5030662041498615: 
1%| | 5/512 [00:00<00:28, 17.81it/s]\n", + "valid_score=0.5289883633577348: 2%|▏ | 10/512 [00:00<00:18, 27.63it/s]\n", + "valid_score=0.49958073757480104: 1%| | 5/512 [00:00<00:18, 27.99it/s]\n", + "valid_score=0.5123500965698433: 1%| | 5/512 [00:00<00:25, 19.92it/s]\n", + "valid_score=0.5345384958248065: 12%|█▏ | 60/512 [00:02<00:19, 23.70it/s]\n", + "valid_score=0.49529533153445293: 1%| | 5/512 [00:00<00:32, 15.41it/s]\n", + "valid_score=0.5034195494138344: 1%| | 5/512 [00:00<00:16, 29.90it/s]\n", + "valid_score=0.37982336414616824: 1%| | 5/512 [00:00<00:29, 17.36it/s]\n", + "valid_score=0.5341290557392615: 15%|█▍ | 75/512 [00:03<00:18, 23.60it/s]\n", + "valid_score=0.5330484337350391: 8%|▊ | 40/512 [00:01<00:19, 24.80it/s]\n", + "valid_score=0.5169060070657032: 1%| | 5/512 [00:00<00:25, 20.08it/s]\n", + "valid_score=0.535784217748701: 11%|█ | 55/512 [00:02<00:19, 23.77it/s] \n", + "valid_score=0.5350593666909814: 9%|▉ | 45/512 [00:01<00:17, 27.38it/s]\n", + "valid_score=0.5164001692167451: 1%| | 5/512 [00:00<00:16, 31.61it/s]\n", + "valid_score=0.5346230124782899: 15%|█▍ | 75/512 [00:02<00:15, 27.92it/s]/home/tomoki/.pyenv/versions/3.7.4/envs/main/lib/python3.7/site-packages/numpy/lib/nanfunctions.py:1366: RuntimeWarning: Mean of empty slice\n", " return np.nanmean(a, axis, out=out, keepdims=keepdims)\n", - "valid_score=0.5341363142121404: 15%|█▍ | 75/512 [00:03<00:18, 23.75it/s]\n", - "valid_score=0.5330473218261131: 8%|▊ | 40/512 [00:01<00:19, 24.21it/s]\n", - "valid_score=0.5168612160799183: 1%| | 5/512 [00:00<00:25, 19.90it/s]\n", - "valid_score=0.5357843352992894: 11%|█ | 55/512 [00:02<00:19, 23.73it/s]\n", - "valid_score=0.535091714366856: 9%|▉ | 45/512 [00:01<00:17, 26.98it/s] \n", - "valid_score=0.5164347629255008: 1%| | 5/512 [00:00<00:16, 30.16it/s]\n", - "valid_score=0.5348001341636642: 18%|█▊ | 90/512 [00:03<00:15, 28.08it/s]\n", - "valid_score=0.5305585910357176: 2%|▏ | 10/512 [00:00<00:17, 28.92it/s]\n", - "valid_score=0.46877926572020423: 1%| | 
5/512 [00:00<00:14, 33.90it/s]\n", - "100%|██████████| 20/20 [00:00<00:00, 35.04it/s]\n" + "valid_score=0.5348182715646398: 18%|█▊ | 90/512 [00:03<00:15, 27.65it/s]\n", + "valid_score=0.5305485694376337: 2%|▏ | 10/512 [00:00<00:17, 28.12it/s]\n", + "valid_score=0.46877847228072783: 1%| | 5/512 [00:00<00:15, 33.77it/s]\n", + "100%|██████████| 20/20 [00:00<00:00, 32.61it/s]\n" ] }, { @@ -672,12 +672,11 @@ "val_evaluator = Evaluator(\n", " val_users.X_test,\n", " offset=train_users.n_users,\n", - " cutoff=20\n", + " cutoff=20, target_metric=\"ndcg\"\n", ")\n", "test_evaluator = Evaluator(\n", " test_users.X_test,\n", - " offset=train_and_val_users.n_users,\n", - " cutoff=20\n", + " offset=train_and_val_users.n_users\n", ")\n", "test_results = []\n", "for optimizer_class in [IALSOptimizer, RP3betaOptimizer, P3alphaOptimizer, DenseSLIMOptimizer]:\n", @@ -738,6 +737,7 @@ " recall@20\n", " ndcg@20\n", " map@20\n", + " precision@20\n", " gini_index@20\n", " entropy@20\n", " appeared_item@20\n", @@ -748,11 +748,12 @@ " 0\n", " IALSOptimizer\n", " 0.993929\n", - " 0.201007\n", - " 0.549479\n", - " 0.128701\n", - " 0.914691\n", - " 5.994783\n", + " 0.201032\n", + " 0.549493\n", + " 0.128710\n", + " 0.498317\n", + " 0.914696\n", + " 5.994704\n", " 994.0\n", " \n", " \n", @@ -762,6 +763,7 @@ " 0.193559\n", " 0.537777\n", " 0.123390\n", + " 0.484078\n", " 0.949540\n", " 5.409107\n", " 982.0\n", @@ -773,6 +775,7 @@ " 0.186035\n", " 0.522004\n", " 0.116442\n", + " 0.469812\n", " 0.962653\n", " 5.146738\n", " 667.0\n", @@ -784,6 +787,7 @@ " 0.210865\n", " 0.574984\n", " 0.139570\n", + " 0.520006\n", " 0.928136\n", " 5.807884\n", " 988.0\n", @@ -793,17 +797,17 @@ "" ], "text/plain": [ - " algorithm hit@20 recall@20 ndcg@20 map@20 gini_index@20 \\\n", - "0 IALSOptimizer 0.993929 0.201007 0.549479 0.128701 0.914691 \n", - "1 RP3betaOptimizer 0.995033 0.193559 0.537777 0.123390 0.949540 \n", - "2 P3alphaOptimizer 0.990066 0.186035 0.522004 0.116442 0.962653 \n", - "3 DenseSLIMOptimizer 
0.993929 0.210865 0.574984 0.139570 0.928136 \n", + " algorithm hit@20 recall@20 ndcg@20 map@20 precision@20 \\\n", + "0 IALSOptimizer 0.993929 0.201032 0.549493 0.128710 0.498317 \n", + "1 RP3betaOptimizer 0.995033 0.193559 0.537777 0.123390 0.484078 \n", + "2 P3alphaOptimizer 0.990066 0.186035 0.522004 0.116442 0.469812 \n", + "3 DenseSLIMOptimizer 0.993929 0.210865 0.574984 0.139570 0.520006 \n", "\n", - " entropy@20 appeared_item@20 \n", - "0 5.994783 994.0 \n", - "1 5.409107 982.0 \n", - "2 5.146738 667.0 \n", - "3 5.807884 988.0 " + " gini_index@20 entropy@20 appeared_item@20 \n", + "0 0.914696 5.994704 994.0 \n", + "1 0.949540 5.409107 982.0 \n", + "2 0.962653 5.146738 667.0 \n", + "3 0.928136 5.807884 988.0 " ] }, "execution_count": 20, diff --git a/examples/movielens/movielens_1m.py b/examples/movielens/movielens_1m.py index 6c24588..2ba592d 100644 --- a/examples/movielens/movielens_1m.py +++ b/examples/movielens/movielens_1m.py @@ -14,7 +14,6 @@ DenseSLIMOptimizer, IALSOptimizer, P3alphaOptimizer, - RandomWalkWithRestartOptimizer, RP3betaOptimizer, TopPopOptimizer, TverskyIndexKNNOptimizer, @@ -69,7 +68,6 @@ (CosineKNNOptimizer, 40), (AsymmetricCosineKNNOptimizer, 40), (TverskyIndexKNNOptimizer, 40), - (RandomWalkWithRestartOptimizer, 20), (DenseSLIMOptimizer, 20), (P3alphaOptimizer, 40), (RP3betaOptimizer, 40), @@ -80,11 +78,7 @@ ] for optimizer_class, n_trials in test_configs: name = optimizer_class.__name__ - optimizer: BaseOptimizer = optimizer_class( - X_train_all, - valid_evaluator, - metric="ndcg", - ) + optimizer: BaseOptimizer = optimizer_class(X_train_all, valid_evaluator) (best_param, validation_results) = optimizer.optimize( timeout=14400, n_trials=n_trials ) diff --git a/examples/movielens/movielens_1m_cold.py b/examples/movielens/movielens_1m_cold.py index 30f66b1..3f42728 100644 --- a/examples/movielens/movielens_1m_cold.py +++ b/examples/movielens/movielens_1m_cold.py @@ -76,11 +76,7 @@ ] for optimizer_class, n_trials in test_configs: 
recommender_name = optimizer_class.recommender_class.__name__ - optimizer: BaseOptimizer = optimizer_class( - data_train.X_all, - valid_evaluator, - metric="ndcg", - ) + optimizer: BaseOptimizer = optimizer_class(data_train.X_all, valid_evaluator) (best_param, validation_result_df) = optimizer.optimize( timeout=14400, n_trials=n_trials ) diff --git a/examples/movielens/movielens_20m_cold.py b/examples/movielens/movielens_20m_cold.py index 4c42e52..a12a8b7 100644 --- a/examples/movielens/movielens_20m_cold.py +++ b/examples/movielens/movielens_20m_cold.py @@ -95,7 +95,6 @@ optimizer: BaseOptimizer = optimizer_class( data_train.X_all, valid_evaluator, - metric="ndcg", fixed_params=config, ) (best_param, validation_result_df) = optimizer.optimize(n_trials=n_trials) diff --git a/irspack/evaluator.py b/irspack/evaluator.py index 5480877..8d9da80 100644 --- a/irspack/evaluator.py +++ b/irspack/evaluator.py @@ -6,7 +6,7 @@ import numpy as np from irspack._evaluator import EvaluatorCore, Metrics -from irspack.definitions import DenseScoreArray, InteractionMatrix +from irspack.definitions import InteractionMatrix from irspack.utils import get_n_threads if TYPE_CHECKING: @@ -17,6 +17,8 @@ class TargetMetric(Enum): NDCG = "ndcg" RECALL = "recall" HIT = "hit" + MAP = "map" + PRECISION = "precision" METRIC_NAMES = [ @@ -24,6 +26,7 @@ class TargetMetric(Enum): "recall", "ndcg", "map", + "precision", "gini_index", "entropy", "appeared_item", @@ -34,25 +37,34 @@ class Evaluator: """Evaluates recommenders' performance against validation set. Args: - ground_truth (Union[scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]): The held-out ground-truth. - offset (int): Where the validation target user block begins. + ground_truth (Union[scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]): + The held-out ground-truth. + offset (int): + Where the validation target user block begins. Often the validation set is defined for a subset of users. 
When offset is not 0, we assume that the users with validation ground truth corresponds to X_train[offset:] where X_train is the matrix feeded into the recommender class. - cutoff (int, optional): Controls the number of recommendation. + cutoff (int, optional): + Controls the default number of recommendations. + When the evaluator is used for parameter tuning, this cutoff value will be used. Defaults to 10. - target_metric (str, optional): Optimization target metric. - Defaults to "ndcg". - recommendable_items (Optional[List[int]], optional): Global recommendable items. Defaults to None. + target_metric (str, optional): + Specifies the target metric when this evaluator is used for + parameter tuning. Defaults to "ndcg". + recommendable_items (Optional[List[int]], optional): + Global recommendable items. Defaults to None. If this parameter is not None, evaluator will be concentrating on the recommender's score output for these recommendable_items, and compute the ranking performance within this subset. per_user_recommendable_items (Optional[List[List[int]]], optional): Similar to `recommendable_items`, but this time the recommendable items can vary among users. Defaults to None. - n_threads (int, optional): Number of threads to sort the score and compute the - evaluation metrics. Defaults to 1. - mb_size (int, optional): The rows of chunked user score. Defaults to 1024. + n_threads (int, optional): + Specifies the number of threads to sort scores and compute the evaluation metrics. + If ``None``, the environment variable ``"IRSPACK_NUM_THREADS_DEFAULT"`` will be looked up, + and if there is no such environment variable, it will be set to 1. Defaults to None. + mb_size (int, optional): + The rows of chunked user score. Defaults to 1024. 
""" n_users: int @@ -89,8 +101,19 @@ def __init__( self.n_threads = get_n_threads(n_threads) self.mb_size = mb_size + def get_target_score(self, model: "base_recommender.BaseRecommender") -> float: + """Compute the optimization target score (self.target_metric) with the cutoff being ``self.cutoff``. + + Args: + model: The evaluated model. + + Returns: + The metric value. + """ + return self.get_score(model)[self.target_metric.value] + def get_score(self, model: "base_recommender.BaseRecommender") -> Dict[str, float]: - """Compute the score with the cutoff being `self.cutoff`. + """Compute the score with the cutoff being ``self.cutoff``. Args: model : The evaluated recommender. @@ -168,19 +191,25 @@ class EvaluatorWithColdUser(Evaluator): When offset is not 0, we assume that the users with validation ground truth corresponds to X_train[offset:] where X_train is the matrix feeded into the recommender class. - cutoff (int, optional): Controls the number of recommendation. + cutoff (int, optional): + Controls the number of recommendation. Defaults to 10. - target_metric (str, optional): Optimization target metric. + target_metric (str, optional): + Optimization target metric. Defaults to "ndcg". - recommendable_items (Optional[List[int]], optional): Global recommendable items. Defaults to None. + recommendable_items (Optional[List[int]], optional): + Global recommendable items. Defaults to None. If this parameter is not None, evaluator will be concentrating on the recommender's score output for these recommendable_items, and compute the ranking performance within this subset. per_user_recommendable_items (Optional[List[List[int]]], optional): Similar to `recommendable_items`, but this time the recommendable items can vary among users. Defaults to None. - n_threads (int, optional): Number of threads to sort the score and compute the - evaluation metrics. Defaults to 1. - mb_size (int, optional): The rows of chunked user score. Defaults to 1024. 
+ n_threads (int, optional): + Specifies the number of threads to sort scores and compute the evaluation metrics. + If ``None``, the environment variable ``"IRSPACK_NUM_THREADS_DEFAULT"`` will be looked up, + and if there is no such environment variable, it will be set to 1. Defaults to None. + mb_size (int, optional): + The rows of chunked user score. Defaults to 1024. """ def __init__( diff --git a/irspack/optimizers/__init__.py index 983a7ee..0e657c9 100644 --- a/irspack/optimizers/__init__.py +++ b/irspack/optimizers/__init__.py @@ -1,14 +1,15 @@ from ._optimizers import ( AsymmetricCosineKNNOptimizer, + AsymmetricCosineUserKNNOptimizer, BaseOptimizer, BaseOptimizerWithEarlyStopping, CosineKNNOptimizer, + CosineUserKNNOptimizer, DenseSLIMOptimizer, IALSOptimizer, JaccardKNNOptimizer, NMFOptimizer, P3alphaOptimizer, - RandomWalkWithRestartOptimizer, RP3betaOptimizer, SLIMOptimizer, TopPopOptimizer, @@ -25,9 +26,10 @@ "SLIMOptimizer", "P3alphaOptimizer", "RP3betaOptimizer", - "RandomWalkWithRestartOptimizer", "CosineKNNOptimizer", "AsymmetricCosineKNNOptimizer", + "CosineUserKNNOptimizer", + "AsymmetricCosineUserKNNOptimizer", "JaccardKNNOptimizer", "TverskyIndexKNNOptimizer", "NMFOptimizer", diff --git a/irspack/optimizers/_optimizers.py index 8cf2137..777a510 100644 --- a/irspack/optimizers/_optimizers.py +++ b/irspack/optimizers/_optimizers.py @@ -11,13 +11,14 @@ ) from ..recommenders import ( AsymmetricCosineKNNRecommender, + AsymmetricCosineUserKNNRecommender, CosineKNNRecommender, + CosineUserKNNRecommender, DenseSLIMRecommender, IALSRecommender, JaccardKNNRecommender, NMFRecommender, P3alphaRecommender, - RandomWalkWithRestartRecommender, RP3betaRecommender, SLIMRecommender, TopPopRecommender, @@ -42,8 +43,6 @@ The train data. val_evaluator (Evaluator): The validation evaluator which measures the performance of the recommenders. - metric (str, optional) : - Target metric. 
Defaults to "ndcg". logger (Optional[logging.Logger], optional) : The logger used during the optimization steps. Defaults to None. If ``None``, the default logger of irspack will be used. @@ -62,7 +61,6 @@ The train data. val_evaluator (Evaluator): The validation evaluator which measures the performance of the recommenders. - metric (str, optional): Target metric. Defaults to "ndcg". logger (Optional[logging.Logger], optional): The logger used during the optimization steps. Defaults to None. If ``None``, the default logger of irspack will be used. @@ -131,7 +129,6 @@ class IALSOptimizer(BaseOptimizerWithEarlyStopping): class P3alphaOptimizer(BaseOptimizer): default_tune_range = [ - LogUniformSuggestion("alpha", low=1e-10, high=2), IntegerSuggestion("top_k", low=10, high=1000), CategoricalSuggestion("normalize_weight", [True, False]), ] @@ -151,7 +148,6 @@ class DenseSLIMOptimizer(BaseOptimizer): class RP3betaOptimizer(BaseOptimizer): default_tune_range = [ - LogUniformSuggestion("alpha", 1e-5, 10), IntegerSuggestion("top_k", 2, 1000), LogUniformSuggestion("beta", 1e-5, 5e-1), CategoricalSuggestion("normalize_weight", [True, False]), @@ -170,19 +166,6 @@ class TruncatedSVDOptimizer(BaseOptimizer): _add_docstring(TruncatedSVDOptimizer) -class RandomWalkWithRestartOptimizer(BaseOptimizer): - default_tune_range = [ - UniformSuggestion("decay", 1e-2, 9.9e-1), - IntegerSuggestion("n_samples", 100, 2000, step=100), - IntegerSuggestion("cutoff", 100, 2000, step=100), - ] - - recommender_class = RandomWalkWithRestartRecommender - - -_add_docstring(RandomWalkWithRestartOptimizer) - - class SLIMOptimizer(BaseOptimizer): default_tune_range = [ LogUniformSuggestion("alpha", 1e-5, 1), @@ -218,6 +201,17 @@ class CosineKNNOptimizer(BaseOptimizer): _add_docstring(CosineKNNOptimizer) +class AsymmetricCosineKNNOptimizer(BaseOptimizer): + default_tune_range = default_tune_range_knn_with_weighting + [ + UniformSuggestion("alpha", 0, 1) + ] + + recommender_class = 
AsymmetricCosineKNNRecommender + + +_add_docstring(AsymmetricCosineKNNOptimizer) + + class JaccardKNNOptimizer(BaseOptimizer): default_tune_range = default_tune_range_knn.copy() @@ -239,15 +233,27 @@ class TverskyIndexKNNOptimizer(BaseOptimizer): _add_docstring(TverskyIndexKNNOptimizer) -class AsymmetricCosineKNNOptimizer(BaseOptimizer): +class CosineUserKNNOptimizer(BaseOptimizer): + default_tune_range = default_tune_range_knn_with_weighting.copy() + [ + CategoricalSuggestion("normalize", [False, True]) + ] + + recommender_class = CosineUserKNNRecommender + + +_add_docstring(CosineUserKNNOptimizer) + + +class AsymmetricCosineUserKNNOptimizer(BaseOptimizer): default_tune_range = default_tune_range_knn_with_weighting + [ UniformSuggestion("alpha", 0, 1) ] - recommender_class = AsymmetricCosineKNNRecommender + recommender_class = AsymmetricCosineUserKNNRecommender -_add_docstring(AsymmetricCosineKNNOptimizer) +_add_docstring(AsymmetricCosineUserKNNOptimizer) + try: from ..recommenders.bpr import BPRFMRecommender diff --git a/irspack/optimizers/base_optimizer.py b/irspack/optimizers/base_optimizer.py index 4fbe019..138b005 100644 --- a/irspack/optimizers/base_optimizer.py +++ b/irspack/optimizers/base_optimizer.py @@ -28,7 +28,6 @@ class BaseOptimizer(object, metaclass=ABCMeta): The train data. val_evaluator (Evaluator): The validation evaluator which measures the performance of the recommenders. - metric (str, optional): Target metric. Defaults to "ndcg". logger (Optional[logging.Logger], optional): The logger used during the optimization steps. Defaults to None. If ``None``, the default logger of irspack will be used. 
@@ -50,7 +49,6 @@ def __init__( self, data: InteractionMatrix, val_evaluator: Evaluator, - metric: str = "ndcg", logger: Optional[logging.Logger] = None, suggest_overwrite: List[Suggestion] = list(), fixed_params: Dict[str, Any] = dict(), @@ -62,7 +60,6 @@ def __init__( self.logger = logger self._data = data self.val_evaluator = val_evaluator - self.metric = metric self.current_trial: int = 0 self.best_trial_index: Optional[int] = None @@ -148,13 +145,16 @@ def objective_func(trial: optuna.Trial) -> float: score, time_spent, ) - val_score = score[self.metric] + val_score = score[self.val_evaluator.target_metric.value] if (-val_score) < self.best_val: self.best_val = -val_score self.best_time = time_spent self.best_params = parameters self.learnt_config_best = dict(**recommender.learnt_config) - self.logger.info("Found best %s using this config.", self.metric) + self.logger.info( + "Found best %s using this config.", + self.val_evaluator.target_metric.value, + ) self.best_trial_index = self.current_trial return -val_score @@ -230,7 +230,6 @@ class BaseOptimizerWithEarlyStopping(BaseOptimizer): The train data. val_evaluator (Evaluator): The validation evaluator which measures the performance of the recommenders. - metric (str, optional): Target metric. Defaults to "ndcg". logger (Optional[logging.Logger], optional): The logger used during the optimization steps. Defaults to None. If ``None``, the default logger of irspack will be used. 
@@ -256,7 +255,6 @@ def __init__( self, data: InteractionMatrix, val_evaluator: Evaluator, - metric: str = "ndcg", logger: Optional[logging.Logger] = None, suggest_overwrite: List[Suggestion] = list(), fixed_params: Dict[str, Any] = dict(), @@ -269,7 +267,6 @@ def __init__( super().__init__( data, val_evaluator, - metric, logger=logger, suggest_overwrite=suggest_overwrite, fixed_params=fixed_params, diff --git a/irspack/recommenders/__init__.py b/irspack/recommenders/__init__.py index 822828a..2702780 100644 --- a/irspack/recommenders/__init__.py +++ b/irspack/recommenders/__init__.py @@ -13,10 +13,10 @@ from .nmf import NMFRecommender from .p3 import P3alphaRecommender from .rp3 import RP3betaRecommender -from .rwr import RandomWalkWithRestartRecommender from .slim import SLIMRecommender from .toppop import TopPopRecommender from .truncsvd import TruncatedSVDRecommender +from .user_knn import AsymmetricCosineUserKNNRecommender, CosineUserKNNRecommender __all__ = [ "BaseRecommender", @@ -27,7 +27,6 @@ "RP3betaRecommender", "DenseSLIMRecommender", "NMFRecommender", - "RandomWalkWithRestartRecommender", "SLIMRecommender", "TruncatedSVDRecommender", "IALSRecommender", @@ -35,6 +34,8 @@ "JaccardKNNRecommender", "TverskyIndexKNNRecommender", "AsymmetricCosineKNNRecommender", + "CosineUserKNNRecommender", + "AsymmetricCosineUserKNNRecommender", ] try: diff --git a/irspack/recommenders/base.py b/irspack/recommenders/base.py index 991feec..92cfc2a 100644 --- a/irspack/recommenders/base.py +++ b/irspack/recommenders/base.py @@ -5,8 +5,6 @@ from optuna.trial import Trial from scipy import sparse as sps -from irspack.utils import get_n_threads - if TYPE_CHECKING: from .. 
import evaluator @@ -18,6 +16,13 @@ ) +def _sparse_to_array(U: Any) -> np.ndarray: + if sps.issparse(U): + return U.toarray() + else: + return U + + class CallBeforeFitError(Exception): pass @@ -117,9 +122,7 @@ def get_score_remove_seen_block(self, begin: int, end: int) -> DenseScoreArray: Returns: The masked item scores. Its shape will be (end - begin, self.n_items) """ - scores = self.get_score_block(begin, end) - if sps.issparse(scores): - scores = scores.toarray() + scores = _sparse_to_array(self.get_score_block(begin, end)) m = self.X_train_all[begin:end] scores[m.nonzero()] = -np.inf if scores.dtype != np.float64: @@ -159,7 +162,7 @@ class BaseSimilarityRecommender(BaseRecommender): W_: Optional[Union[sps.csr_matrix, sps.csc_matrix, np.ndarray]] def __init__(self, *args: Any, **kwargs: Any) -> None: - super(BaseSimilarityRecommender, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) self.W_ = None @property @@ -177,22 +180,42 @@ def W(self) -> Union[sps.csr_matrix, sps.csc_matrix, np.ndarray]: return self.W_ def get_score(self, user_indices: UserIndexArray) -> DenseScoreArray: - if sps.issparse(self.W): - return self.X_train_all[user_indices].dot(self.W).toarray() - else: - return self.X_train_all[user_indices].dot(self.W) + return _sparse_to_array(self.X_train_all[user_indices].dot(self.W)) def get_score_cold_user(self, X: InteractionMatrix) -> DenseScoreArray: - if sps.issparse(self.W): - return X.dot(self.W).toarray() - else: - return X.dot(self.W) + return _sparse_to_array(X.dot(self.W)) + + def get_score_block(self, begin: int, end: int) -> DenseScoreArray: + return _sparse_to_array(self.X_train_all[begin:end].dot(self.W)) + + +class BaseUserSimilarityRecommender(BaseRecommender): + U_: Optional[Union[sps.csr_matrix, sps.csc_matrix, np.ndarray]] + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self._X_csc: sps.csc_matrix = self.X_train_all.tocsc() + self.U_ = None + + @property + def 
U(self) -> Union[sps.csr_matrix, sps.csc_matrix, np.ndarray]: + """Computed similarity weight matrix. + + Raises: + RuntimeError: Raises When there is not W_ attributes (e.g., method call before the fit). + + Returns: + The similarity matrix. Score will be computed by e.g. self.X.dot(self.W) + """ + if self.U_ is None: + raise RuntimeError("W fetched before fit.") + return self.U_ + + def get_score(self, user_indices: UserIndexArray) -> DenseScoreArray: + return _sparse_to_array(self.U[user_indices].dot(self._X_csc).toarray()) def get_score_block(self, begin: int, end: int) -> DenseScoreArray: - if sps.issparse(self.W): - return self.X_train_all[begin:end].dot(self.W).toarray() - else: - return self.X_train_all[begin:end].dot(self.W) + return _sparse_to_array(self.U[begin:end].dot(self._X_csc)) class BaseRecommenderWithUserEmbedding(BaseRecommender): diff --git a/irspack/recommenders/base_earlystop.py b/irspack/recommenders/base_earlystop.py index 943e7d8..790fb42 100644 --- a/irspack/recommenders/base_earlystop.py +++ b/irspack/recommenders/base_earlystop.py @@ -118,12 +118,10 @@ def learn_with_optimizer( if evaluator is None: continue - valid_score = evaluator.get_score(self) + target_score = evaluator.get_target_score(self) - progress_bar.set_description( - f"valid_score={valid_score[evaluator.target_metric.value]}" - ) - relevant_score = valid_score[evaluator.target_metric.value] + progress_bar.set_description(f"valid_score={target_score}") + relevant_score = target_score if relevant_score > best_score: best_score = relevant_score self.save_state() diff --git a/irspack/recommenders/rwr.py b/irspack/recommenders/rwr.py deleted file mode 100644 index 9695d14..0000000 --- a/irspack/recommenders/rwr.py +++ /dev/null @@ -1,34 +0,0 @@ -from irspack.utils import get_n_threads - -from ..definitions import InteractionMatrix -from ._rwr import RandomWalkGenerator -from .base import BaseSimilarityRecommender - - -class 
RandomWalkWithRestartRecommender(BaseSimilarityRecommender): - def __init__( - self, - X_train_all: InteractionMatrix, - decay: float = 0.3, - cutoff: int = 1000, - n_samples: int = 1000, - random_seed: int = 42, - n_threads: int = 4, - ): - super().__init__(X_train_all) - self.decay = decay - self.n_samples = n_samples - self.cutoff = cutoff - self.random_seed = random_seed - self.n_threads = get_n_threads(n_threads) - - def _learn(self) -> None: - rwg = RandomWalkGenerator(self.X_train_all.tocsr()) - self.W_ = rwg.run_with_restart( - self.decay, - self.cutoff, - self.n_samples, - self.n_threads, - self.random_seed, - ) - self.W_ = self.W_.tocsc() / self.n_samples diff --git a/irspack/recommenders/slim.py b/irspack/recommenders/slim.py index ab71f3d..630b762 100644 --- a/irspack/recommenders/slim.py +++ b/irspack/recommenders/slim.py @@ -18,11 +18,6 @@ class SLIMRecommender(BaseSimilarityRecommender): The implementation relies on a simple (parallelized) cyclic-coordinate descent method. - Currently, this does not support: - - - shuffling of item indices - - elaborate convergence check - Args: X_train_all: Input interaction matrix. @@ -33,9 +28,11 @@ class SLIMRecommender(BaseSimilarityRecommender): positive_only: Whether we constrain the weight matrix to be non-negative. Defaults to True. n_iter: - The number of coordinate-descent iterations. Defaults to 10. + The number of coordinate-descent iterations. Defaults to 100. tol: - Tolerance parameter for cd iteration. Defaults to 1e-5. + Tolerance parameter for cd iteration, i.e., if the maximal parameter change + of the coordinate-descent single iteration is smaller than this value, + the iteration will terminate. Defaults to 1e-4. n_threads: Specifies the number of threads to use for the computation. 
If ``None``, the environment variable ``"IRSPACK_NUM_THREADS_DEFAULT"`` will be looked up, @@ -49,7 +46,7 @@ def __init__( l1_ratio: float = 0.01, positive_only: bool = True, n_iter: int = 100, - tol: float = 1e-6, + tol: float = 1e-4, n_threads: Optional[int] = None, ): super().__init__(X_train_all) diff --git a/irspack/recommenders/user_knn.py b/irspack/recommenders/user_knn.py new file mode 100644 index 0000000..db8eb6c --- /dev/null +++ b/irspack/recommenders/user_knn.py @@ -0,0 +1,190 @@ +from abc import abstractmethod +from typing import Optional, Union + +from irspack.definitions import InteractionMatrix +from irspack.recommenders._knn import ( + AsymmetricSimilarityComputer, + CosineSimilarityComputer, + JaccardSimilarityComputer, + TverskyIndexComputer, +) +from irspack.recommenders.base import BaseUserSimilarityRecommender +from irspack.recommenders.knn import FeatureWeightingScheme +from irspack.utils import ( + get_n_threads, + okapi_BM_25_weight, + remove_diagonal, + tf_idf_weight, +) + + +class BaseUserKNNRecommender(BaseUserSimilarityRecommender): + def __init__( + self, + X_train_all: InteractionMatrix, + shrinkage: float = 0.0, + top_k: int = 100, + n_threads: Optional[int] = None, + feature_weighting: str = "NONE", + bm25_k1: float = 1.2, + bm25_b: float = 0.75, + ): + super().__init__(X_train_all) + self.shrinkage = shrinkage + self.top_k = top_k + self.feature_weighting = FeatureWeightingScheme(feature_weighting) + self.bm25_k1 = bm25_k1 + self.bm25_b = bm25_b + self.n_threads = get_n_threads(n_threads) + + @abstractmethod + def _create_computer( + self, X: InteractionMatrix + ) -> Union[ + CosineSimilarityComputer, + AsymmetricSimilarityComputer, + JaccardSimilarityComputer, + TverskyIndexComputer, + ]: + raise NotImplementedError("") + + def _learn(self) -> None: + if self.feature_weighting == FeatureWeightingScheme.NONE: + X_weighted = self.X_train_all + elif self.feature_weighting == FeatureWeightingScheme.TF_IDF: + X_weighted = 
tf_idf_weight(self.X_train_all) + elif self.feature_weighting == FeatureWeightingScheme.BM_25: + X_weighted = okapi_BM_25_weight(self.X_train_all, self.bm25_k1, self.bm25_b) + else: + raise RuntimeError("Unknown weighting scheme.") + + computer = self._create_computer(X_weighted) + self.U_ = remove_diagonal( + computer.compute_similarity(self.X_train_all, self.top_k) + ) + + +class CosineUserKNNRecommender(BaseUserKNNRecommender): + r"""K-nearest neighbor recommender system based on cosine similarity. That is, the similarity matrix ``U`` is given by (row-wise top-k restricted) + + .. math:: + + \mathrm{U}_{u,v} = \begin{cases} + \frac{\sum_{i} X_{ui} X_{vi}}{||X_{u*}||_2 ||X_{v*}||_2 + \mathrm{shrinkage}} & (\text{if normalize = True}) \\ + \sum_{i} X_{ui} X_{vi} & (\text{if normalize = False}) + \end{cases} + + + Args: + X_train_all (Union[scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]): + Input interaction matrix. + shrinkage (float, optional): + The shrinkage parameter for regularization. Defaults to 0.0. + normalize (bool, optional): + Whether to normalize the similarity. Defaults to False. + top_k (int, optional): + Specifies the maximal number of allowed neighbors. Defaults to 100. + feature_weighting (str, optional): + Specifies how to weight the feature. Must be one of: + + - "NONE" : no feature weighting + - "TF_IDF" : TF-IDF weighting + - "BM_25" : `Okapi BM-25 weighting `_ + + Defaults to "NONE". + bm25_k1 (float, optional): + The k1 parameter for BM25. Ignored if ``feature_weighting`` is not "BM_25". Defaults to 1.2. + bm25_b (float, optional): + The b parameter for BM25. Ignored if ``feature_weighting`` is not "BM_25". Defaults to 0.75. + n_threads (Optional[int], optional): Specifies the number of threads to use for the computation. + If ``None``, the environment variable ``"IRSPACK_NUM_THREADS_DEFAULT"`` will be looked up, + and if there is no such an environment variable, it will be set to 1. Defaults to None. 
+ """ + + def __init__( + self, + X_train_all: InteractionMatrix, + shrinkage: float = 0.0, + normalize: bool = True, + top_k: int = 100, + feature_weighting: str = "NONE", + bm25_k1: float = 1.2, + bm25_b: float = 0.75, + n_threads: Optional[int] = None, + ): + super().__init__( + X_train_all, + shrinkage, + top_k, + n_threads, + feature_weighting=feature_weighting, + bm25_k1=bm25_k1, + bm25_b=bm25_b, + ) + self.normalize = normalize + + def _create_computer(self, X: InteractionMatrix) -> CosineSimilarityComputer: + return CosineSimilarityComputer( + X, self.shrinkage, self.normalize, self.n_threads + ) + + +class AsymmetricCosineUserKNNRecommender(BaseUserKNNRecommender): + r"""K-nearest neighbor recommender system based on asymmetric cosine similarity. That is, the similarity matrix ``U`` is given by (row-wise top-k restricted) + + .. math:: + + \mathrm{U}_{u,v} = \frac{\sum_{i} X_{ui} X_{vi}}{||X_{u*}||^{2\alpha}_2 ||X_{v*}||^{2(1-\alpha)}_2 + \mathrm{shrinkage}} + + Args: + X_train_all (Union[scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]): + Input interaction matrix. + shrinkage (float, optional): + The shrinkage parameter for regularization. Defaults to 0.0. + alpha (bool, optional): + Specifies :math:`\\alpha`. Defaults to 0.5. + top_k (int, optional): + Specifies the maximal number of allowed neighbors. Defaults to 100. + feature_weighting (str, optional): + Specifies how to weight the feature. Must be one of: + + - "NONE" : no feature weighting + - "TF_IDF" : TF-IDF weighting + - "BM_25" : `Okapi BM-25 weighting `_ + + Defaults to "NONE". + bm25_k1 (float, optional): + The k1 parameter for BM25. Ignored if ``feature_weighting`` is not "BM_25". Defaults to 1.2. + bm25_b (float, optional): + The b parameter for BM25. Ignored if ``feature_weighting`` is not "BM_25". Defaults to 0.75. + n_threads (Optional[int], optional): Specifies the number of threads to use for the computation. 
+ If ``None``, the environment variable ``"IRSPACK_NUM_THREADS_DEFAULT"`` will be looked up, + and if there is no such an environment variable, it will be set to 1. Defaults to None. + """ + + def __init__( + self, + X_train_all: InteractionMatrix, + shrinkage: float = 0.0, + alpha: float = 0.5, + top_k: int = 100, + feature_weighting: str = "NONE", + bm25_k1: float = 1.2, + bm25_b: float = 0.75, + n_threads: Optional[int] = None, + ): + super().__init__( + X_train_all, + shrinkage, + top_k, + n_threads, + feature_weighting=feature_weighting, + bm25_k1=bm25_k1, + bm25_b=bm25_b, + ) + self.alpha = alpha + + def _create_computer(self, X: InteractionMatrix) -> AsymmetricSimilarityComputer: + return AsymmetricSimilarityComputer( + X, self.shrinkage, self.alpha, self.n_threads + ) diff --git a/irspack/user_cold_start/cb2cf.py b/irspack/user_cold_start/cb2cf.py index 252353e..666be9b 100644 --- a/irspack/user_cold_start/cb2cf.py +++ b/irspack/user_cold_start/cb2cf.py @@ -159,7 +159,6 @@ def search_embedding( searcher = self.cf_optimizer_class( self.X_cf_train_all, self.hot_evaluator, - metric="ndcg", logger=logger, suggest_overwrite=suggest_overwrite, fixed_params=fixed_params, diff --git a/setup.py b/setup.py index 6e24804..8ff5e8c 100644 --- a/setup.py +++ b/setup.py @@ -84,17 +84,6 @@ def __str__(self) -> Any: ], language="c++", ), - Extension( - "irspack.recommenders._rwr", - ["cpp_source/rws.cpp"], - include_dirs=[ - # Path to pybind11 headers - get_pybind_include(), - get_pybind_include(user=True), - get_eigen_include(), - ], - language="c++", - ), Extension( "irspack.recommenders._ials", ["cpp_source/als/wrapper.cpp"], diff --git a/tests/recommenders/test_learn_all.py b/tests/recommenders/test_learn_all.py index f365a24..92d6d35 100644 --- a/tests/recommenders/test_learn_all.py +++ b/tests/recommenders/test_learn_all.py @@ -8,14 +8,15 @@ from irspack.evaluator import Evaluator from irspack.recommenders import ( AsymmetricCosineKNNRecommender, + 
AsymmetricCosineUserKNNRecommender, BaseRecommender, CosineKNNRecommender, + CosineUserKNNRecommender, DenseSLIMRecommender, IALSRecommender, JaccardKNNRecommender, NMFRecommender, P3alphaRecommender, - RandomWalkWithRestartRecommender, RP3betaRecommender, SLIMRecommender, TopPopRecommender, @@ -37,11 +38,12 @@ AsymmetricCosineKNNRecommender, TverskyIndexKNNRecommender, JaccardKNNRecommender, + CosineUserKNNRecommender, + AsymmetricCosineUserKNNRecommender, P3alphaRecommender, RP3betaRecommender, TruncatedSVDRecommender, NMFRecommender, - RandomWalkWithRestartRecommender, IALSRecommender, DenseSLIMRecommender, SLIMRecommender, diff --git a/tests/recommenders/test_user_knn.py b/tests/recommenders/test_user_knn.py new file mode 100644 index 0000000..2c3d64e --- /dev/null +++ b/tests/recommenders/test_user_knn.py @@ -0,0 +1,76 @@ +import numpy as np +import pytest +import scipy.sparse as sps + +from irspack.recommenders.user_knn import ( + AsymmetricCosineUserKNNRecommender, + CosineUserKNNRecommender, +) + +X_small = sps.csr_matrix( + np.asfarray([[1, 1, 2, 3, 4], [0, 1, 0, 1, 0], [0, 0, 1, 0, 0], [0, 0, 0, 0, 0]]) +) +X_many = np.random.rand(888, 512) +X_many[X_many <= 0.9] = 0 +X_many[X_many > 0.9] = 1 +X_many = sps.csr_matrix(X_many) +X_many.sort_indices() + +X_many_dense = sps.csr_matrix(np.random.rand(133, 245)) + + +@pytest.mark.parametrize( + "X, normalize", [(X_many, True), (X_small, False), (X_many_dense, True)] +) +def test_cosine(X: sps.csr_matrix, normalize: bool) -> None: + rec = CosineUserKNNRecommender( + X, shrinkage=0, n_threads=5, top_k=X.shape[0], normalize=normalize + ) + with pytest.raises(RuntimeError): + U = rec.U + rec.learn() + sim = rec.U.toarray() + manual = X.toarray() # U x I + norm = (manual ** 2).sum(axis=1) ** 0.5 + manual = manual.dot(manual.T) + if normalize: + denom = norm[:, None] * norm[None, :] + 1e-6 + manual /= denom + np.fill_diagonal(manual, 0) + np.testing.assert_allclose( + sim, + manual, + ) + + +@pytest.mark.parametrize( 
+ "X, alpha, shrinkage", + [(X_many, 0.5, 0.0), (X_small, 0.7, 1.0), (X_many_dense, 0.01, 3)], +) +def test_asymmetric_cosine(X: sps.csr_matrix, alpha: float, shrinkage: float) -> None: + rec = AsymmetricCosineUserKNNRecommender( + X, shrinkage=shrinkage, alpha=alpha, n_threads=1, top_k=X.shape[0] + ) + rec.learn() + sim = rec.U.toarray() + + manual = X.toarray() + norm = (manual ** 2).sum(axis=1) + norm_alpha = np.power(norm, alpha) + norm_1malpha = np.power(norm, 1 - alpha) + manual_sim = manual.dot(manual.T) + denom = norm_alpha[:, None] * norm_1malpha[None, :] + 1e-6 + shrinkage + manual_sim /= denom + np.fill_diagonal(manual_sim, 0) + np.testing.assert_allclose( + sim, + manual_sim, + ) + + +@pytest.mark.parametrize("X", [X_many, X_small]) +def test_topk(X: sps.csr_matrix) -> None: + rec = AsymmetricCosineUserKNNRecommender(X, shrinkage=0, top_k=30, n_threads=5) + rec.learn() + sim = rec.U.toarray() + assert np.all((sim > 0).sum(axis=1) <= 30)