|
| 1 | +(ns cmr.indexer.services.autocomplete |
| 2 | + "Provide functions to index concept" |
| 3 | + (:require |
| 4 | + [camel-snake-kebab.core :as camel-snake-kebab] |
| 5 | + [clj-time.core :refer [now]] |
| 6 | + [clojure.string :as string] |
| 7 | + [cmr.common.config :refer [defconfig]] |
| 8 | + [cmr.common.log :as log :refer [debug info warn error]] |
| 9 | + [cmr.common.util :as util :refer [defn-timed]] |
| 10 | + [cmr.indexer.data.concept-parser :as cp] |
| 11 | + [cmr.indexer.data.concepts.collection.collection-util :as collection-util] |
| 12 | + [cmr.indexer.data.concepts.collection.humanizer :as humanizer] |
| 13 | + [cmr.indexer.data.elasticsearch :as es] |
| 14 | + [cmr.indexer.data.index-set :as idx-set] |
| 15 | + [cmr.indexer.services.index-service :as service] |
| 16 | + [cmr.transmit.metadata-db :as meta-db] |
| 17 | + [cmr.transmit.search :as search])) |
| 18 | + |
;; Retention window, in hours, for autocomplete suggestion documents. It is read by
;; prune-stale-autocomplete-suggestions to build the elasticsearch date-math cutoff.
(defconfig autocomplete-suggestion-age-limit
  "Age in hours that we allow autocomplete suggestions to persist to avoid stale data."
  {:default 24
   :type Long})
| 23 | + |
(defn- keywords->elastic-docs
  "Builds one autocomplete elastic doc from a hierarchical keyword map.

  The values of keywords are looked up in keyword-hierarchy order, trailing nils are dropped
  with util/remove-nil-tail, and the remaining levels are joined with \":\" to form :fields.
  The last remaining level becomes :value. The :_id is the hash of the lower-cased joined
  string suffixed with \"_\" and type.

  Returns nil when keywords is not a map or is empty."
  [index type keywords keyword-hierarchy public-collection? permitted-group-ids modified-date]
  (when (and (map? keywords) (seq keywords))
    (let [levels (util/remove-nil-tail (map (partial get keywords) keyword-hierarchy))
          joined (string/join ":" levels)
          doc-id (hash (str (string/lower-case joined) "_" type))]
      {:_index index
       :_id doc-id
       :type type
       :value (last levels)
       :fields joined
       :contains-public-collections public-collection?
       :permitted-group-ids permitted-group-ids
       :modified modified-date})))
| 47 | + |
(defn- science-keywords->elastic-docs
  "Builds the autocomplete elastic doc for a science-keywords map by delegating to
  keywords->elastic-docs with type \"science_keywords\" and the hierarchy
  topic > term > variable-level-1 > variable-level-2 > variable-level-3 > detailed-variable.
  Levels missing from the map are looked up as nil."
  [index science-keywords public-collection? permitted-group-ids modified-date]
  (keywords->elastic-docs
   index
   "science_keywords"
   science-keywords
   [:topic :term :variable-level-1 :variable-level-2 :variable-level-3 :detailed-variable]
   public-collection?
   permitted-group-ids
   modified-date))
| 66 | + |
(defn- platform-keywords->elastic-docs
  "Builds the autocomplete elastic doc for a platform keyword map by delegating to
  keywords->elastic-docs with type \"platforms\" and the hierarchy
  basis > category > sub-category > short-name. Levels missing from the map are looked up
  as nil."
  [index platform-keywords public-collection? permitted-group-ids modified-date]
  (keywords->elastic-docs
   index
   "platforms"
   platform-keywords
   [:basis :category :sub-category :short-name]
   public-collection?
   permitted-group-ids
   modified-date))
| 81 | + |
(defn- suggestion-doc
  "Builds the autocomplete elastic doc(s) for a single humanized value map.

  The shape of the result depends on key-name:
  - a name matching \"science-keywords\" yields one hierarchical science keyword doc (or nil),
  - a name matching \"platforms2-humanized\" yields one hierarchical platform doc (or nil),
  - any other name yields a sequence with one flat doc per entry of value-map whose key does
    not contain \"-lowercase\".

  permissions is a collection of permitted ids; the presence of \"guest\" marks the suggestion
  as public, and the remaining ids are stored comma-joined (nil when there are none)."
  [index permissions key-name value-map]
  (let [guest? #(= "guest" %)
        public-collection? (boolean (some guest? permissions))
        permitted-group-ids (not-empty (string/join "," (remove guest? permissions)))
        modified-date (str (now))
        lowercase-entry? (fn [entry]
                           (string/includes? (name (key entry)) "-lowercase"))
        ;; Flat (non-hierarchical) doc for one [field value] entry of value-map.
        flat-doc (fn [entry]
                   (let [v (val entry)
                         type (string/replace
                               (camel-snake-kebab/->snake_case_keyword key-name)
                               #"_humanized|:"
                               "")
                         id (hash (str (string/lower-case v) "_" type))]
                     {:_index index
                      :_id id
                      :type type
                      :value v
                      :fields v
                      :contains-public-collections public-collection?
                      :permitted-group-ids permitted-group-ids
                      :modified modified-date}))]
    (cond
      (re-find #"science-keywords" key-name)
      (science-keywords->elastic-docs
       index value-map public-collection? permitted-group-ids modified-date)

      (re-find #"platforms2-humanized" key-name)
      (platform-keywords->elastic-docs
       index value-map public-collection? permitted-group-ids modified-date)

      :else
      (map flat-doc (remove lowercase-entry? (seq value-map))))))
| 131 | + |
(defn- get-suggestion-docs
  "Turns the humanized fields of one collection into autocomplete elastic docs.

  humanized-fields is a map that carries :id and :permissions alongside the humanized field
  entries. For every other entry the key name is normalized (\".humanized\", \".humanized2\",
  \".humanized_2\", \"-sn\" and \"-id\" fragments are stripped), each value map has its nil
  keys and :priority removed, and suggestion-doc is applied to it.

  Returns a lazy sequence with one element per humanized field; each element is the sequence
  of non-nil suggestion-doc results for that field."
  [index humanized-fields]
  (let [permissions (:permissions humanized-fields)
        facet-fields (dissoc humanized-fields :id :permissions)]
    (for [[field-key field-values] facet-fields]
      (let [key-name (string/replace (name field-key)
                                     #"(\.humanized(_?2)?|-sn|-id)"
                                     "")
            cleaned-values (map #(dissoc (util/remove-nil-keys %) :priority)
                                field-values)]
        (->> cleaned-values
             (map #(suggestion-doc index permissions key-name %))
             (remove nil?))))))
| 151 | + |
(defn- anti-value?
  "Returns true when term is an anti-value: nil, blank, or containing one of the phrases
  \"none\", \"not provided\" or \"not applicable\" as whole words. The comparison is
  case-insensitive.

  The phrases are matched on word boundaries so that legitimate terms which merely contain
  the letters n-o-n-e (e.g. \"Hydroquinone\", \"Nonequilibrium\") are not treated as
  anti-values."
  [term]
  (or (string/blank? term)
      ;; blank? short-circuits for nil, so lower-case never sees a nil term.
      (some? (re-find #"\b(none|not (provided|applicable))\b"
                      (string/lower-case term)))))
| 159 | + |
(defn anti-value-suggestion?
  "Returns whether the :value of the given autocomplete suggestion is an anti-value.
  See also [[anti-value?]]"
  [suggestion]
  (anti-value? (:value suggestion)))
| 166 | + |
(defn- parse-collection
  "Parses the collection concept with cp/parse-concept. When parsing throws an Exception the
  failure is logged with the collection's concept-id and the exception message, and the result
  of the logging call is returned instead of a parsed concept (expected to be nil)."
  [context collection]
  (try
    (cp/parse-concept context collection)
    (catch Exception e
      (let [message (format "An error occurred while parsing collection for autocomplete with concept-id [%s]: %s"
                            (:concept-id collection)
                            (.getMessage e))]
        (error message)))))
| 176 | + |
(defn- get-humanized-collections
  "Returns the humanized elastic fields for the parsed collection with the legacy flat
  :platform-sn-humanized entry removed, since that facet is no longer supported for
  autocomplete."
  [context collection]
  (-> (humanizer/collection-humanizers-elastic context collection)
      (dissoc :platform-sn-humanized)))
| 182 | + |
(defn- collections->suggestion-docs
  "Converts a batch of collection concepts into autocomplete suggestion docs.

  Each collection that is not marked :deleted and that parses successfully is humanized and
  merged with its own :id (concept-id) and :permissions (permitted group ids for provider-id).
  The merged maps are expanded into suggestion docs for the autocomplete index, and docs whose
  :value is an anti-value are removed.

  Permissions and humanized fields are built together, per collection, in a single pass.
  Building them as two separate sequences and zipping them positionally misaligns the pairs
  whenever a deleted or unparseable collection is filtered out of only one of the sequences,
  which attaches one collection's permissions to another collection's suggestions."
  [context collections provider-id]
  (let [{:keys [index-names]} (idx-set/get-concept-type-index-names context)
        index (get-in index-names [:autocomplete :autocomplete])
        humanized-fields-with-permissions
        (keep (fn [collection]
                (when-not (:deleted collection)
                  (when-let [parsed-concept (parse-collection context collection)]
                    ;; Humanized fields are merged last, matching the original precedence.
                    (merge {:id (:concept-id collection)
                            :permissions (collection-util/get-coll-permitted-group-ids
                                          context provider-id collection)}
                           (get-humanized-collections context parsed-concept)))))
              collections)]
    (->> humanized-fields-with-permissions
         (map #(get-suggestion-docs index %))
         flatten
         (remove anti-value-suggestion?))))
| 205 | + |
(defn-timed reindex-autocomplete-suggestions-for-provider
  "Reindexes the autocomplete suggestions of one provider. The provider's latest collection
  revisions are fetched from metadata-db in batches of service/REINDEX_BATCH_SIZE; each batch
  is converted to suggestion docs and bulk indexed. Returns the total number of collections
  processed (not the number of suggestion docs)."
  [context provider-id]
  (info "Reindexing autocomplete suggestions for provider" provider-id)
  (let [query {:provider-id provider-id :latest true}
        batches (meta-db/find-in-batches context :collection service/REINDEX_BATCH_SIZE query)
        ;; Indexes one batch and adds its collection count to the running total.
        index-batch! (fn [total colls]
                       (->> (collections->suggestion-docs context colls provider-id)
                            (es/bulk-index-autocomplete-suggestions context))
                       (+ total (count colls)))]
    (reduce index-batch! 0 batches)))
| 221 | + |
(defn prune-stale-autocomplete-suggestions
  "Deletes, by query, every document in the autocomplete index whose modified field is older
  than the configured autocomplete-suggestion-age-limit (in hours). The cutoff is expressed as
  elasticsearch date math rounded down to the hour."
  [context]
  (info (format "Pruning autocomplete suggestions older than %d hours."
                (autocomplete-suggestion-age-limit)))
  (let [index (-> (idx-set/get-concept-type-index-names context)
                  :index-names
                  (get-in [:autocomplete :autocomplete]))
        mapping-type ((idx-set/get-concept-mapping-types context) :collection)
        cutoff (format "now-%dh/h" (autocomplete-suggestion-age-limit))
        modified-field (service/query-field->elastic-field :modified :suggestion)]
    (es/delete-by-query context
                        index
                        mapping-type
                        {:range {modified-field {:lt cutoff}}})))
0 commit comments