diff --git a/.env.test b/.env.test
index 5f571800..ba806822 100644
--- a/.env.test
+++ b/.env.test
@@ -3,3 +3,6 @@ LINKRESOLVER_BASEURL=https://mit.primo.exlibrisgroup.com/discovery/openurl?insti
 TACOS_EMAIL=tacos-help@mit.edu
 LIBKEY_KEY=FAKE_LIBKEY_KEY
 LIBKEY_ID=FAKE_LIBKEY_ID
+DETECTOR_LAMBDA_URL=http://localhost:3000
+DETECTOR_LAMBDA_PATH=/foo
+DETECTOR_LAMBDA_CHALLENGE_SECRET=secret_phrase
diff --git a/app/models/detector/citation.rb b/app/models/detector/citation.rb
index 4f4132b3..ef3c5b36 100644
--- a/app/models/detector/citation.rb
+++ b/app/models/detector/citation.rb
@@ -10,7 +10,7 @@ class Detector
   # hallmarks of being a citation.
   # Phrases whose score is higher than the REQUIRED_SCORE value can be registered as a Detection.
   class Citation
-    attr_reader :score, :subpatterns, :summary
+    attr_reader :features, :score, :subpatterns, :summary
 
     # shared singleton methods
     extend Detector::BulkChecker
@@ -67,10 +67,13 @@ def detection?
     # @return Nothing intentional. Data is written to Hashes `@subpatterns`, `@summary`,
     #   and `@score` during processing.
     def initialize(phrase)
+      @features = {}
       @subpatterns = {}
       @summary = {}
       pattern_checker(phrase)
       summarize(phrase)
+      @features = @subpatterns.deep_dup.transform_values(&:length).merge(summary)
+      @subpatterns.delete_if { |_, v| v == [] }
       @score = calculate_score
     end
 
@@ -141,7 +144,7 @@ def commas(phrase)
     # @return hash
     def pattern_checker(phrase)
       CITATION_PATTERNS.each_pair do |type, pattern|
-        @subpatterns[type.to_sym] = scan(pattern, phrase) if scan(pattern, phrase).present?
+        @subpatterns[type.to_sym] = scan(pattern, phrase)
       end
     end
 
diff --git a/app/models/lookup_citation.rb b/app/models/lookup_citation.rb
new file mode 100644
index 00000000..b879fc36
--- /dev/null
+++ b/app/models/lookup_citation.rb
@@ -0,0 +1,107 @@
+# frozen_string_literal: true
+
+# LookupCitation asks an external detector lambda whether a search phrase looks like a citation.
+class LookupCitation
+  # The info method is the way to return information about whether a given phrase is a citation. It consults an
+  # external lambda service (address in env) and returns the lambda's prediction as true or false. If anything goes
+  # wrong (missing configuration, a failed request, or an error response from the lambda) nil is returned instead.
+  #
+  # @param phrase [String] the search phrase to evaluate
+  # @return Boolean or nil
+  def info(phrase)
+    return unless expected_env?
+
+    external_data = fetch(phrase)
+    return if external_data == 'Error'
+
+    external_data
+  end
+
+  private
+
+  def lambda_path
+    ENV.fetch('DETECTOR_LAMBDA_PATH', nil)
+  end
+
+  def lambda_secret
+    ENV.fetch('DETECTOR_LAMBDA_CHALLENGE_SECRET', nil)
+  end
+
+  def lambda_url
+    ENV.fetch('DETECTOR_LAMBDA_URL', nil)
+  end
+
+  # define_lambda connects to the detector lambda.
+  #
+  # @return Faraday connection
+  def define_lambda
+    Faraday.new(
+      url: lambda_url,
+      params: {}
+    )
+  end
+
+  # define_payload defines the Hash that will be sent to the lambda.
+  #
+  # @return Hash
+  def define_payload(phrase)
+    {
+      action: 'predict',
+      features: extract_features(phrase),
+      challenge_secret: lambda_secret
+    }
+  end
+
+  # expected_env? confirms that all three required environment variables are defined and not blank. Each missing
+  # value is logged so that a misconfigured environment can be diagnosed from the log.
+  #
+  # @return Boolean
+  def expected_env?
+    Rails.logger.error('No lambda URL defined') if lambda_url.blank?
+
+    Rails.logger.error('No lambda path defined') if lambda_path.blank?
+
+    Rails.logger.error('No lambda secret defined') if lambda_secret.blank?
+
+    [lambda_url, lambda_path, lambda_secret].all?(&:present?)
+  end
+
+  # extract_features passes the search phrase through the citation detector, and massages the resulting features object
+  # to correspond with what the lambda expects.
+  #
+  # @return Hash
+  def extract_features(phrase)
+    features = Detector::Citation.new(phrase).features
+    features[:apa] = features.delete :apa_volume_issue
+    features[:year] = features.delete :year_parens
+    features.delete :characters
+    features
+  end
+
+  # Fetch handles the communication with the detector lambda: defining the connection, building the payload, and any
+  # error handling with the response. Failed requests and unparseable responses are logged and reported as 'Error'
+  # rather than raised, so that callers only ever see a prediction or the error marker.
+  #
+  # @return Boolean or 'Error'
+  def fetch(phrase)
+    response = define_lambda.post(lambda_path, define_payload(phrase).to_json)
+    return JSON.parse(response.body)['response'] == 'true' if response.status == 200
+
+    log_failure(response)
+    'Error'
+  rescue Faraday::Error, JSON::ParserError => e
+    Rails.logger.error("Detector lambda request failed: #{e.message}")
+    'Error'
+  end
+
+  # log_failure records a non-200 response from the lambda: the raw body first, then the lambda's own error message
+  # when the body is a JSON object that carries one.
+  def log_failure(response)
+    Rails.logger.error(response.body)
+    details = JSON.parse(response.body.to_s)
+    Rails.logger.error(details['error']) if details.is_a?(Hash) && details['error']
+  rescue JSON::ParserError
+    # The raw body has already been logged; there is no structured message to add.
+    nil
+  end
+end
diff --git a/test/models/detector/citation_test.rb b/test/models/detector/citation_test.rb
index 008dbf3f..262640f0 100644
--- a/test/models/detector/citation_test.rb
+++ b/test/models/detector/citation_test.rb
@@ -4,10 +4,12 @@
 
 class Detector
   class CitationTest < ActiveSupport::TestCase
-    test 'detector::citation exposes three instance variables' do
+    test 'detector::citation exposes four instance variables' do
       t = terms('citation')
       result = Detector::Citation.new(t.phrase)
 
+      assert_predicate result.features, :present?
+
       assert_predicate result.score, :present?
 
       assert_predicate result.summary, :present?
@@ -196,6 +198,29 @@ class CitationTest < ActiveSupport::TestCase
       assert_operator 0, :<, result.score
     end
 
+    test 'features instance method is a hash of integers' do
+      result = Detector::Citation.new('simple search phrase')
+
+      assert_instance_of(Hash, result.features)
+
+      assert(result.features.all? { |_, v| v.integer? })
+    end
+
+    test 'features instance method includes all elements of citation detector regardless of search string' do
+      result_simple = Detector::Citation.new('simple')
+      result_complex = Detector::Citation.new('Science Education and Cultural Diversity: Mapping the Field. Studies in Science Education, 24(1), 49–73.')
+
+      assert_equal result_simple.features.length, result_complex.features.length
+    end
+
+    test 'features instance method should include all elements of citation patterns and summary thresholds' do
+      patterns = Detector::Citation.const_get :CITATION_PATTERNS
+      summary = Detector::Citation.const_get :SUMMARY_THRESHOLDS
+      result = Detector::Citation.new('simple')
+
+      assert_equal (patterns.length + summary.length), result.features.length
+    end
+
     test 'detection? convenience method returns true for obvious citations' do
       result = Detector::Citation.new(terms('citation').phrase)
 
diff --git a/test/models/lookup_citation_test.rb b/test/models/lookup_citation_test.rb
new file mode 100644
index 00000000..2b3f6ba7
--- /dev/null
+++ b/test/models/lookup_citation_test.rb
@@ -0,0 +1,43 @@
+# frozen_string_literal: true
+
+require 'test_helper'
+
+class LookupCitationTest < ActiveSupport::TestCase
+  test 'DETECTOR_LAMBDA_CHALLENGE_SECRET is required' do
+    ClimateControl.modify DETECTOR_LAMBDA_CHALLENGE_SECRET: nil do
+      assert_nil(LookupCitation.new.info('ping'))
+    end
+  end
+
+  test 'DETECTOR_LAMBDA_PATH is required' do
+    ClimateControl.modify DETECTOR_LAMBDA_PATH: nil do
+      assert_nil(LookupCitation.new.info('ping'))
+    end
+  end
+
+  test 'DETECTOR_LAMBDA_URL is required' do
+    ClimateControl.modify DETECTOR_LAMBDA_URL: nil do
+      assert_nil(LookupCitation.new.info('ping'))
+    end
+  end
+
+  test 'lookup returns true when lambda running' do
+    # These cassettes should be regenerated once the lambda is running in AWS. For now it will need to be running
+    # on localhost should the cassettes need to be regenerated.
+    VCR.use_cassette('lambda running') do
+      prediction = LookupCitation.new.info('ping')
+
+      assert(prediction)
+    end
+  end
+
+  test 'lookup returns nil when challenge_secret is wrong' do
+    ClimateControl.modify DETECTOR_LAMBDA_CHALLENGE_SECRET: 'something wrong' do
+      VCR.use_cassette('lambda with wrong secret') do
+        prediction = LookupCitation.new.info('oops')
+
+        assert_nil(prediction)
+      end
+    end
+  end
+end
diff --git a/test/test_helper.rb b/test/test_helper.rb
index 474e9a5f..6964d1be 100644
--- a/test/test_helper.rb
+++ b/test/test_helper.rb
@@ -15,6 +15,7 @@
 require 'rails/test_help'
 
 VCR.configure do |config|
+  config.ignore_localhost = false
   config.cassette_library_dir = 'test/vcr_cassettes'
   config.hook_into :webmock
 
diff --git a/test/vcr_cassettes/lambda_running.yml b/test/vcr_cassettes/lambda_running.yml
new file mode 100644
index 00000000..0c11987b
--- /dev/null
+++ b/test/vcr_cassettes/lambda_running.yml
@@ -0,0 +1,37 @@
+---
+http_interactions:
+- request:
+    method: post
+    uri: http://localhost:3000/foo
+    body:
+      encoding: UTF-8
+      string: '{"action":"predict","features":{"no":0,"pages":0,"pp":0,"vol":0,"brackets":0,"lastnames":0,"quotes":0,"colons":0,"commas":0,"periods":0,"semicolons":0,"words":1,"apa":0,"year":0},"challenge_secret":"secret_phrase"}'
+    headers:
+      User-Agent:
+      - Faraday v2.12.2
+      Content-Type:
+      - application/x-www-form-urlencoded
+      Accept-Encoding:
+      - gzip;q=1.0,deflate;q=0.6,identity;q=0.3
+      Accept:
+      - "*/*"
+  response:
+    status:
+      code: 200
+      message: OK
+    headers:
+      Server:
+      - Werkzeug/3.0.6 Python/3.11.10
+      Date:
+      - Thu, 29 May 2025 13:54:02 GMT
+      Content-Type:
+      - application/json
+      Content-Length:
+      - '20'
+      Connection:
+      - close
+    body:
+      encoding: UTF-8
+      string: '{"response": "true"}'
+  recorded_at: Thu, 29 May 2025 13:54:02 GMT
+recorded_with: VCR 6.3.1
diff --git a/test/vcr_cassettes/lambda_with_wrong_secret.yml b/test/vcr_cassettes/lambda_with_wrong_secret.yml
new file mode 100644
index 00000000..c5cc30ff
--- /dev/null
+++ b/test/vcr_cassettes/lambda_with_wrong_secret.yml
@@ -0,0 +1,39 @@
+---
+http_interactions:
+- request:
+    method: post
+    uri: http://localhost:3000/foo
+    body:
+      encoding: UTF-8
+      string: '{"action":"predict","features":{"no":0,"pages":0,"pp":0,"vol":0,"brackets":0,"lastnames":0,"quotes":0,"colons":0,"commas":0,"periods":0,"semicolons":0,"words":1,"apa":0,"year":0},"challenge_secret":"something
+        wrong"}'
+    headers:
+      User-Agent:
+      - Faraday v2.12.2
+      Content-Type:
+      - application/x-www-form-urlencoded
+      Accept-Encoding:
+      - gzip;q=1.0,deflate;q=0.6,identity;q=0.3
+      Accept:
+      - "*/*"
+  response:
+    status:
+      code: 401
+      message: UNAUTHORIZED
+    headers:
+      Server:
+      - Werkzeug/3.0.6 Python/3.11.10
+      Date:
+      - Thu, 29 May 2025 21:31:37 GMT
+      Content-Type:
+      - application/json
+      Content-Length:
+      - '72'
+      Connection:
+      - close
+    body:
+      encoding: UTF-8
+      string: '{"error": "Challenge secret missing or mismatch", "error_details":
+        null}'
+  recorded_at: Thu, 29 May 2025 21:31:37 GMT
+recorded_with: VCR 6.3.1