Skip to content

Commit 15e7948

Browse files
authored
Merge pull request #363 from arXiv/ARXIVCE-4110-preserve-quotes
ARXIVCE-4110: keep quotes until es query is built
2 parents (466edc0 + 9202cc6), commit 15e7948

File tree

6 files changed

+28 / -14 lines changed (28 additions, 14 deletions)

search/domain/classic_api/classic_search_query.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ def fix_author_underscores(tokens):
155155
if (
156156
i-1 >= 0 and
157157
tokens[i-1][0] == PREFIX and
158+
tokens[i-1][1] == "au:" and
158159
tokens[i][0] == TEXT
159160
):
160161
s = tokens[1][1]

search/domain/classic_api/query_parser.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -55,13 +55,16 @@ def field(self, tokens: List[Token]) -> Field:
5555
return Field(str(f))
5656

5757
def search_string(self, tokens: List[Token]) -> str:
58-
"""Un-quote a search string and strips it of whitespace.
58+
"""Un-quote a search string and strips it of whitespace, then re-quote.
5959
6060
This is the actual search string entered after the Field qualifier.
6161
"""
6262
(s,) = tokens
6363
if s.startswith('"') and s.endswith('"'):
6464
s = s[1:-1]
65+
s = s.strip()
66+
if len(s) > 0:
67+
s = f'"{s}"'
6568
return s.strip() or ""
6669

6770
def term(self, tokens: List[Token]) -> Term:
@@ -152,11 +155,8 @@ def parse_classic_query(query: str) -> Optional[Phrase]:
152155
def phrase_to_query_string(phrase: Phrase, depth: int = 0) -> Optional[str]:
153156
"""Convert a Phrase to a query string."""
154157
if isinstance(phrase, Term):
155-
return (
156-
f"{phrase.field}:{phrase.value}"
157-
if re.search(r"\s", phrase.value) is None
158-
else f'{phrase.field}:"{phrase.value}"'
159-
)
158+
result = f"{phrase.field}:{phrase.value}"
159+
return result
160160
elif len(phrase) == 2:
161161
unary_op, exp = phrase[:2]
162162
value = f"{unary_op.value} {phrase_to_query_string(exp, depth+1)}"

search/domain/classic_api/tests/test_classic_parser.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,17 +54,17 @@ class Case:
5454
Case(
5555
message="Simple query with quotations.",
5656
query='ti:"dark matter"',
57-
phrase=Term(Field.Title, "dark matter"),
57+
phrase=Term(Field.Title, '"dark matter"'),
5858
),
5959
Case(
6060
message="Simple query with quotations and extra spacing.",
6161
query='ti:" dark matter "',
62-
phrase=Term(Field.Title, "dark matter"),
62+
phrase=Term(Field.Title, '"dark matter"'),
6363
),
6464
Case(
6565
message="Search date ranges.",
6666
query='submittedDate:"202301010600 TO 202401010600"',
67-
phrase=Term(Field.SubmittedDate, '202301010600 TO 202401010600'),
67+
phrase=Term(Field.SubmittedDate, '"202301010600 TO 202401010600"'),
6868
),
6969
Case(
7070
message="Simple conjunct query.",
@@ -81,7 +81,7 @@ class Case:
8181
phrase=(
8282
Operator.AND,
8383
Term(Field.Author, "del_maestro"),
84-
Term(Field.Title, "dark matter"),
84+
Term(Field.Title, '"dark matter"'),
8585
),
8686
),
8787
Case(
@@ -90,7 +90,7 @@ class Case:
9090
phrase=(
9191
Operator.AND,
9292
Term(Field.Author, "del_maestro"),
93-
Term(Field.Title, "dark matter"),
93+
Term(Field.Title, '"dark matter"'),
9494
),
9595
),
9696
Case(
@@ -214,7 +214,7 @@ class Case:
214214
Case(
215215
message="Simple query with quotations.",
216216
query='ti:"dark matter"',
217-
phrase=Term(Field.Title, "dark matter"),
217+
phrase=Term(Field.Title, '"dark matter"'),
218218
),
219219
Case(
220220
message="Simple conjunct query.",

search/services/index/authors.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,8 @@ def part_query(term: str, path: str = "authors") -> Q:
141141

142142
def string_query(term: str, path: str = "authors", operator: str = "AND") -> Q:
143143
"""Build a query that handles query strings within a single author."""
144+
if term.startswith('"') and term.endswith('"'):
145+
term = term[1:-1]
144146
q = Q(
145147
"query_string",
146148
fields=[f"{path}.full_name"],
@@ -192,7 +194,7 @@ def author_query(term: str, operator: str = "and") -> Q:
192194
logger.debug(f"Contains literal: {term}")
193195

194196
# Apply literal parts of the query separately.
195-
return reduce(
197+
res = reduce(
196198
iand if operator.upper() == "AND" else ior,
197199
[
198200
(
@@ -203,6 +205,7 @@ def author_query(term: str, operator: str = "and") -> Q:
203205
if part.strip()
204206
],
205207
)
208+
return res
206209

207210
term = term.replace('"', "") # Just ignore unbalanced quotes.
208211

search/services/index/classic_api/query_builder.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@ def term_to_query(term: Term) -> Q:
3434
:module:`.api`
3535
"""
3636

37-
return Q() if term.is_empty else FIELD_TERM_MAPPING[term.field](term.value)
37+
res = Q() if term.is_empty else FIELD_TERM_MAPPING[term.field](term.value)
38+
return res
3839

3940

4041
def query_builder(phrase: Phrase) -> Q:

search/services/index/prepare.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,8 @@ def _query_msc_class(term: str, operator: str = "and") -> Q:
121121

122122

123123
def _query_doi(term: str, operator: str = "and") -> Q:
124+
if term.startswith('"') and term.endswith('"'):
125+
term = term[1:-1]
124126
value, wildcard = wildcard_escape(term)
125127
if wildcard:
126128
return Q("wildcard", doi={"value": term.lower()})
@@ -138,6 +140,8 @@ def _query_submittedDate(term: str, operator: str = "and") -> Q:
138140
'''
139141
_range = None
140142
try:
143+
if term.startswith('"') and term.endswith('"'):
144+
term = term[1:-1]
141145
start_date_str, end_date_str = map(str.strip, term.split("TO"))
142146
start_date = datetime.strptime(start_date_str,
143147
"%Y%m%d%H%M").strftime("%Y-%m-%dT%H:%M:%SZ")
@@ -170,6 +174,8 @@ def _query_primary(term: str, operator: str = "and") -> Q:
170174
# This now uses the "primary_classification.combined" field, which is
171175
# isomorphic to the document-level "combined" field. So we get
172176
# straightforward hit highlighting and a more consistent behavior.
177+
if term.startswith('"') and term.endswith('"'):
178+
term = term[1:-1]
173179
return Q(
174180
"match",
175181
**{
@@ -226,6 +232,9 @@ def query_secondary_exact(classification: Classification) -> Q:
226232

227233
def query_legacy_cat(term: str) -> Q:
228234
#if has_wildcard(term):
235+
236+
if term.startswith('"') and term.endswith('"'):
237+
term = term[1:-1]
229238
return Q("wildcard", primary_classification__category__id=term) | Q(
230239
"nested",
231240
path="secondary_classification",

0 commit comments

Comments
 (0)