Skip to content

Commit 24e767f

Browse files
authored
Merge pull request #191 from cul-it/develop
Pre-release merge for search-0.3
2 parents 3e48813 + 693c0ff commit 24e767f

29 files changed

+744
-92
lines changed

README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,19 @@ To check for missing records, use ``audit.py``:
6161
ELASTICSEARCH_HOST=127.0.0.1 ELASTICSEARCH_INDEX=arxiv pipenv run python audit.py -l list_of_papers.txt -o missing.txt
6262
```
6363

64+
### Reindexing
65+
66+
Elasticsearch can perform reindexing by copying documents from one index to
67+
another index with a different mapping. ``reindex.py`` will initiate the
68+
reindexing process, and poll for completion until all of the documents are
69+
processed. If the destination index does not already exist, it will be created
70+
using the current configured mapping.
71+
72+
```bash
73+
FLASK_APP=app.py ELASTICSEARCH_HOST=127.0.0.1 pipenv run python reindex.py OLD_INDEX NEW_INDEX
74+
```
75+
76+
6477
### Flask dev server
6578

6679
You can spin up the search app directly.

bulk_index.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
from search.process import transform
1616

1717
app = create_ui_web_app()
18-
app.app_context().push()
1918

2019

2120
@app.cli.command()
@@ -53,14 +52,18 @@ def populate(print_indexable: bool, paper_id: str, id_list: str,
5352
label='Papers indexed') as index_bar:
5453
last = len(TO_INDEX) - 1
5554
for i, paper_id in enumerate(TO_INDEX):
55+
this_meta = []
5656
if load_cache:
5757
try:
58-
meta += from_cache(cache_dir, paper_id)
59-
continue
58+
this_meta = from_cache(cache_dir, paper_id)
6059
except RuntimeError as e: # No document.
6160
pass
6261

63-
chunk.append(paper_id)
62+
if this_meta:
63+
meta += this_meta
64+
else:
65+
chunk.append(paper_id)
66+
6467
if len(chunk) == retrieve_chunk_size or i == last:
6568
try:
6669
new_meta = metadata.bulk_retrieve(chunk)

mappings/DocumentMapping.json

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,17 @@
5555
"english_stop"
5656
]
5757
},
58+
"author_simple": {
59+
"type": "custom",
60+
"tokenizer": "whitespace",
61+
"char_filter": [
62+
"strip_dots_commas"
63+
],
64+
"filter": [
65+
"icu_folding",
66+
"lowercase"
67+
]
68+
},
5869
"author_folding": {
5970
"type": "custom",
6071
"tokenizer": "whitespace",
@@ -94,6 +105,12 @@
94105
"lowercase"
95106
]
96107
},
108+
"author_simple": {
109+
"filter": [
110+
"icu_folding",
111+
"lowercase"
112+
]
113+
},
97114
"author_folding": {
98115
"filter": [
99116
"icu_folding",
@@ -162,12 +179,12 @@
162179
}
163180
},
164181
"initials": {
165-
"type": "keyword",
166-
"normalizer": "author_folding",
182+
"type": "text",
183+
"analyzer": "author_simple",
167184
"fields": {
168185
"folded": {
169186
"type": "keyword",
170-
"normalizer": "author_folding"
187+
"normalizer": "author_simple"
171188
}
172189
}
173190
},

reindex.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
"""Helper script to reindex all arXiv papers."""
2+
3+
import os
4+
import tempfile
5+
import click
6+
import time
7+
8+
from search.factory import create_ui_web_app
9+
from search.services import index
10+
11+
app = create_ui_web_app()
12+
13+
14+
@app.cli.command()
@click.argument('old_index', nargs=1)
@click.argument('new_index', nargs=1)
def reindex(old_index: str, new_index: str) -> None:
    """
    Reindex the documents in `old_index` to `new_index`.

    This will create `new_index` with the current configured mappings if it
    does not already exist.

    The reindexing task is started via ``index.reindex`` and then polled
    every two seconds with ``index.get_task_status`` until it reports
    completion, while a percent-complete progress bar is displayed.

    Parameters
    ----------
    old_index : str
        Name of the source index.
    new_index : str
        Name of the destination index.

    Raises
    ------
    click.ClickException
        If `old_index` does not exist, or if ``index.reindex`` returns a
        falsy response (the new index could not be obtained or created).

    """
    click.echo(f"Reindex papers in `{old_index}` to `{new_index}`")
    if not index.index_exists(old_index):
        # There is nothing to copy from; abort instead of submitting a
        # reindexing task against a missing source.
        raise click.ClickException(
            f"Source index `{old_index}` does not exist."
        )

    r = index.reindex(old_index, new_index)
    if not r:
        raise click.ClickException("Failed to get or create new index")

    click.echo("Started reindexing task")
    task_id = r['task']
    with click.progressbar(length=100, label='percent complete') as progress:
        # ``progress.update`` advances the bar by a number of steps rather
        # than setting an absolute position, so remember how far the bar has
        # already been moved and only ever pass the difference.
        shown = 0
        while True:
            status = index.get_task_status(task_id)
            counts = status['task']['status']
            total = float(counts['total'])
            if status['completed'] or total == 0:
                remaining = 100 - shown
                if remaining > 0:
                    progress.update(remaining)
                break

            processed = (
                counts['updated'] + counts['created'] + counts['deleted']
            )
            percent = min(int(processed / total * 100), 100)
            advance = percent - shown
            # Skip zero-step updates: they change nothing, and some click
            # releases divide by the current position when estimating ETA.
            if advance > 0:
                progress.update(advance)
                shown = percent
            if processed >= total:
                break
            time.sleep(2)
50+
51+
52+
if __name__ == '__main__':
53+
reindex()

search/agent/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ def process_stream(duration: Optional[int] = None) -> None:
5959
verify=app.config.get('KINESIS_VERIFY', 'true') == 'true',
6060
duration=duration,
6161
start_type=start_type,
62-
start_at=start_at
62+
start_at=start_at,
63+
sleep=float(app.config['KINESIS_SLEEP'])
6364
)
6465
processor.go()

search/agent/consumer.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ class MetadataRecordProcessor(BaseConsumer):
3131

3232
def __init__(self, *args: Any, **kwargs: Any) -> None:
3333
"""Initialize exception counter."""
34+
self.sleep: float = kwargs.get('sleep', 0.1)
3435
super(MetadataRecordProcessor, self).__init__(*args, **kwargs) # type: ignore
3536
self._error_count = 0
3637

@@ -296,7 +297,7 @@ def process_record(self, record: dict) -> None:
296297
documents failed.
297298
298299
"""
299-
time.sleep(0.1)
300+
time.sleep(self.sleep)
300301
logger.info(f'Processing record {record["SequenceNumber"]}')
301302
if self._error_count > self.MAX_ERRORS:
302303
raise IndexingFailed('Too many errors')

search/config.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,9 @@
208208
KINESIS_START_TYPE = os.environ.get('KINESIS_START_TYPE', 'AT_TIMESTAMP')
209209
KINESIS_START_AT = os.environ.get('KINESIS_START_AT')
210210

211+
KINESIS_SLEEP = os.environ.get('KINESIS_SLEEP', '0.1')
212+
"""Amount of time to wait before moving on to the next record."""
213+
211214

212215
"""
213216
Flask-S3 plugin settings.
@@ -221,8 +224,8 @@
221224
FLASKS3_ACTIVE = os.environ.get('FLASKS3_ACTIVE', 0)
222225

223226
# Settings for display of release information
224-
RELEASE_NOTES_URL = 'https://confluence.cornell.edu/x/fjs2FQ'
225-
RELEASE_NOTES_TEXT = 'Search v0.2 released 2018-05-04'
227+
RELEASE_NOTES_URL = 'https://confluence.cornell.edu/x/mBtOFQ'
228+
RELEASE_NOTES_TEXT = 'Search v0.3 released 2018-05-14'
226229

227230

228231
# TODO: one place to set the version, update release notes text, JIRA issue

search/controllers/advanced/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ def search(request_params: MultiDict) -> Response:
133133
else:
134134
logger.debug('form is invalid: %s', str(form.errors))
135135
if 'order' in form.errors or 'size' in form.errors:
136+
print(form.errors, form.data)
136137
# It's likely that the user tried to set these parameters
137138
# manually, or that the search originated from somewhere else
138139
# (and was configured incorrectly).
@@ -293,7 +294,7 @@ def group_search(args: MultiDict, groups_or_archives: str) -> Response:
293294
# group on the form.
294295
fld = dict(forms.ClassificationForm.PHYSICS_ARCHIVES).get(archive)
295296
if fld is None:
296-
logger.warn('Invalid archive shortcut: {fld}')
297+
logger.warn(f'Invalid archive shortcut: {fld}')
297298
continue
298299
args['classification-physics'] = True
299300
# If there is more than one physics archives, only the last one

search/controllers/advanced/forms.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
from wtforms.fields import HiddenField
1111
from wtforms import widgets
1212

13+
from arxiv import taxonomy
14+
1315
from search.controllers.util import doesNotStartWithWildcard, stripWhiteSpace
1416

1517

@@ -91,13 +93,10 @@ class ClassificationForm(Form):
9193
('q-fin', 'q_finance'),
9294
('stat', 'statistics')
9395
]
94-
PHYSICS_ARCHIVES = [
95-
('all', 'all'), ('astro-ph', 'astro-ph'), ('cond-mat', 'cond-mat'),
96-
('gr-qc', 'gr-qc'), ('hep-ex', 'hep-ex'), ('hep-lat', 'hep-lat'),
97-
('hep-ph', 'hep-ph'), ('hep-th', 'hep-th'), ('math-ph', 'math-ph'),
98-
('nlin', 'nlin'), ('nucl-ex', 'nucl-ex'), ('nucl-th', 'nucl-th'),
99-
('physics', 'physics'), ('quant-ph', 'quant-ph')
100-
]
96+
PHYSICS_ARCHIVES = [('all', 'all')] + \
97+
[(archive, archive) for archive, description
98+
in taxonomy.ARCHIVES_ACTIVE.items()
99+
if description['in_group'] == 'grp_physics']
101100

102101
computer_science = BooleanField('Computer Science (cs)')
103102
economics = BooleanField('Economics (econ)')
@@ -186,5 +185,5 @@ class AdvancedSearchForm(Form):
186185
('-submitted_date', 'Submission date (newest first)'),
187186
('submitted_date', 'Submission date (oldest first)'),
188187
('', 'Relevance')
189-
], validators=[validators.Optional()], default='')
188+
], validators=[validators.Optional()], default='-announced_date_first')
190189
include_older_versions = BooleanField('Include older versions of papers')

search/controllers/simple/forms.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ class SimpleSearchForm(Form):
4242
('-submitted_date', 'Submission date (newest first)'),
4343
('submitted_date', 'Submission date (oldest first)'),
4444
('', 'Relevance')
45-
], validators=[validators.Optional()], default='')
45+
], validators=[validators.Optional()], default='-announced_date_first')
4646

4747
def validate_query(form: Form, field: StringField) -> None:
4848
"""Validate the length of the querystring, if searchtype is set."""

0 commit comments

Comments
 (0)