arXiv
diff --git a/‎README.md
Lines changed: 13 additions & 0 deletions b/‎README.md
Lines changed: 13 additions & 0 deletions
diff --git a/‎bulk_index.py
Lines changed: 7 additions & 4 deletions b/‎bulk_index.py
Lines changed: 7 additions & 4 deletions
diff --git a/‎mappings/DocumentMapping.json
Lines changed: 20 additions & 3 deletions b/‎mappings/DocumentMapping.json
Lines changed: 20 additions & 3 deletions
diff --git a/‎reindex.py
Lines changed: 53 additions & 0 deletions b/‎reindex.py
Lines changed: 53 additions & 0 deletions
diff --git a/‎search/agent/__init__.py
Lines changed: 2 additions & 1 deletion b/‎search/agent/__init__.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎search/agent/consumer.py
Lines changed: 2 additions & 1 deletion b/‎search/agent/consumer.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎search/config.py
Lines changed: 5 additions & 2 deletions b/‎search/config.py
Lines changed: 5 additions & 2 deletions
diff --git a/‎search/controllers/advanced/__init__.py
Lines changed: 2 additions & 1 deletion b/‎search/controllers/advanced/__init__.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎search/controllers/advanced/forms.py
Lines changed: 7 additions & 8 deletions b/‎search/controllers/advanced/forms.py
Lines changed: 7 additions & 8 deletions
diff --git a/‎search/controllers/simple/forms.py
Lines changed: 1 addition & 1 deletion b/‎search/controllers/simple/forms.py
Lines changed: 1 addition & 1 deletion
@@ -61,6 +61,19 @@ To check for missing records, use ``audit.py``:
 ELASTICSEARCH_HOST=127.0.0.1 ELASTICSEARCH_INDEX=arxiv pipenv run python audit.py -l list_of_papers.txt -o missing.txt
 ```
 
+### Reindexing
+
+Elasticsearch can perform reindexing by copying documents from one index to
+another index with a different mapping. ``reindex.py`` will initiate the
+reindexing process, and poll for completion until all of the documents are
+processed. If the destination index does not already exist, it will be created
+using the currently configured mapping.
+
+```bash
+FLASK_APP=app.py ELASTICSEARCH_HOST=127.0.0.1 pipenv run python reindex.py OLD_INDEX NEW_INDEX
+```
+
+
 ### Flask dev server
 
 You can spin up the search app directly.
 
@@ -15,7 +15,6 @@
 from search.process import transform
 
 app = create_ui_web_app()
-app.app_context().push()
 
 
 @app.cli.command()
@@ -53,14 +52,18 @@ def populate(print_indexable: bool, paper_id: str, id_list: str,
                                label='Papers indexed') as index_bar:
             last = len(TO_INDEX) - 1
             for i, paper_id in enumerate(TO_INDEX):
+                this_meta = []
                 if load_cache:
                     try:
-                        meta += from_cache(cache_dir, paper_id)
-                        continue
+                        this_meta = from_cache(cache_dir, paper_id)
                     except RuntimeError as e:    # No document.
                         pass
 
-                chunk.append(paper_id)
+                if this_meta:
+                    meta += this_meta
+                else:
+                    chunk.append(paper_id)
+
                 if len(chunk) == retrieve_chunk_size or i == last:
                     try:
                         new_meta = metadata.bulk_retrieve(chunk)
 
@@ -55,6 +55,17 @@
             "english_stop"
           ]
         },
+        "author_simple": {
+          "type": "custom",
+          "tokenizer": "whitespace",
+          "char_filter": [
+            "strip_dots_commas"
+          ],
+          "filter": [
+            "icu_folding",
+            "lowercase"
+          ]
+        },
         "author_folding": {
           "type": "custom",
           "tokenizer": "whitespace",
@@ -94,6 +105,12 @@
             "lowercase"
           ]
         },
+        "author_simple": {
+          "filter": [
+            "icu_folding",
+            "lowercase"
+          ]
+        },
         "author_folding": {
           "filter": [
             "icu_folding",
@@ -162,12 +179,12 @@
               }
             },
             "initials": {
-              "type": "keyword",
-              "normalizer": "author_folding",
+              "type": "text",
+              "analyzer": "author_simple",
               "fields": {
                 "folded": {
                   "type": "keyword",
-                  "normalizer": "author_folding"
+                  "normalizer": "author_simple"
                 }
               }
             },
 
@@ -0,0 +1,53 @@
+"""Helper script to reindex all arXiv papers."""
+
+import os
+import tempfile
+import click
+import time
+
+from search.factory import create_ui_web_app
+from search.services import index
+
+app = create_ui_web_app()
+
+
+@app.cli.command()
+@click.argument('old_index', nargs=1)
+@click.argument('new_index', nargs=1)
+def reindex(old_index: str, new_index: str):
+    """
+    Reindex the documents in `old_index` to `new_index`.
+
+    This will create `new_index` with the current configured mappings if it
+    does not already exist.
+    """
+    click.echo(f"Reindex papers in `{old_index}` to `{new_index}`")
+    if not index.index_exists(old_index):
+        click.echo(f"Source index `{old_index}` does not exist.")
+
+    r = index.reindex(old_index, new_index)
+    if not r:
+        raise click.ClickException("Failed to get or create new index")
+
+    click.echo(f"Started reindexing task")
+    task_id = r['task']
+    with click.progressbar(length=100, label='percent complete') as progress:
+        while True:
+            status = index.get_task_status(task_id)
+            total = float(status['task']['status']['total'])
+            if status['completed'] or total == 0:
+                progress.update(100)
+                break
+
+            updated = status['task']['status']['updated']
+            created = status['task']['status']['created']
+            deleted = status['task']['status']['deleted']
+            complete = (updated + created + deleted)/total
+            progress.update(complete * 100)
+            if complete == 1:
+                break
+            time.sleep(2)
+
+
+if __name__ == '__main__':
+    reindex()
@@ -59,6 +59,7 @@ def process_stream(duration: Optional[int] = None) -> None:
             verify=app.config.get('KINESIS_VERIFY', 'true') == 'true',
             duration=duration,
             start_type=start_type,
-            start_at=start_at
+            start_at=start_at,
+            sleep=float(app.config['KINESIS_SLEEP'])
         )
         processor.go()
@@ -31,6 +31,7 @@ class MetadataRecordProcessor(BaseConsumer):
 
     def __init__(self, *args: Any, **kwargs: Any) -> None:
         """Initialize exception counter."""
+        self.sleep: float = kwargs.get('sleep', 0.1)
         super(MetadataRecordProcessor, self).__init__(*args, **kwargs)  # type: ignore
         self._error_count = 0
 
@@ -296,7 +297,7 @@ def process_record(self, record: dict) -> None:
             documents failed.
 
         """
-        time.sleep(0.1)
+        time.sleep(self.sleep)
         logger.info(f'Processing record {record["SequenceNumber"]}')
         if self._error_count > self.MAX_ERRORS:
             raise IndexingFailed('Too many errors')
 
@@ -208,6 +208,9 @@
 KINESIS_START_TYPE = os.environ.get('KINESIS_START_TYPE', 'AT_TIMESTAMP')
 KINESIS_START_AT = os.environ.get('KINESIS_START_AT')
 
+KINESIS_SLEEP = os.environ.get('KINESIS_SLEEP', '0.1')
+"""Amount of time to wait before moving on to the next record."""
+
 
 """
 Flask-S3 plugin settings.
@@ -221,8 +224,8 @@
 FLASKS3_ACTIVE = os.environ.get('FLASKS3_ACTIVE', 0)
 
 # Settings for display of release information
-RELEASE_NOTES_URL = 'https://confluence.cornell.edu/x/fjs2FQ'
-RELEASE_NOTES_TEXT = 'Search v0.2 released 2018-05-04'
+RELEASE_NOTES_URL = 'https://confluence.cornell.edu/x/mBtOFQ'
+RELEASE_NOTES_TEXT = 'Search v0.3 released 2018-05-14'
 
 
 # TODO: one place to set the version, update release notes text, JIRA issue
 
@@ -133,6 +133,7 @@ def search(request_params: MultiDict) -> Response:
         else:
             logger.debug('form is invalid: %s', str(form.errors))
             if 'order' in form.errors or 'size' in form.errors:
+                print(form.errors, form.data)
                 # It's likely that the user tried to set these parameters
                 # manually, or that the search originated from somewhere else
                 # (and was configured incorrectly).
@@ -293,7 +294,7 @@ def group_search(args: MultiDict, groups_or_archives: str) -> Response:
             # group on the form.
             fld = dict(forms.ClassificationForm.PHYSICS_ARCHIVES).get(archive)
             if fld is None:
-                logger.warn('Invalid archive shortcut: {fld}')
+                logger.warn(f'Invalid archive shortcut: {fld}')
                 continue
             args['classification-physics'] = True
             # If there is more than one physics archives, only the last one
 
@@ -10,6 +10,8 @@
 from wtforms.fields import HiddenField
 from wtforms import widgets
 
+from arxiv import taxonomy
+
 from search.controllers.util import doesNotStartWithWildcard, stripWhiteSpace
 
 
@@ -91,13 +93,10 @@ class ClassificationForm(Form):
         ('q-fin', 'q_finance'),
         ('stat', 'statistics')
     ]
-    PHYSICS_ARCHIVES = [
-        ('all', 'all'), ('astro-ph', 'astro-ph'), ('cond-mat', 'cond-mat'),
-        ('gr-qc', 'gr-qc'), ('hep-ex', 'hep-ex'), ('hep-lat', 'hep-lat'),
-        ('hep-ph', 'hep-ph'), ('hep-th', 'hep-th'), ('math-ph', 'math-ph'),
-        ('nlin', 'nlin'), ('nucl-ex', 'nucl-ex'), ('nucl-th', 'nucl-th'),
-        ('physics', 'physics'), ('quant-ph', 'quant-ph')
-    ]
+    PHYSICS_ARCHIVES = [('all', 'all')] + \
+        [(archive, archive) for archive, description
+         in taxonomy.ARCHIVES_ACTIVE.items()
+         if description['in_group'] == 'grp_physics']
 
     computer_science = BooleanField('Computer Science (cs)')
     economics = BooleanField('Economics (econ)')
@@ -186,5 +185,5 @@ class AdvancedSearchForm(Form):
         ('-submitted_date', 'Submission date (newest first)'),
         ('submitted_date', 'Submission date (oldest first)'),
         ('', 'Relevance')
-    ], validators=[validators.Optional()], default='')
+    ], validators=[validators.Optional()], default='-announced_date_first')
     include_older_versions = BooleanField('Include older versions of papers')
@@ -42,7 +42,7 @@ class SimpleSearchForm(Form):
         ('-submitted_date', 'Submission date (newest first)'),
         ('submitted_date', 'Submission date (oldest first)'),
         ('', 'Relevance')
-    ], validators=[validators.Optional()], default='')
+    ], validators=[validators.Optional()], default='-announced_date_first')
 
     def validate_query(form: Form, field: StringField) -> None:
         """Validate the length of the querystring, if searchtype is set."""
Original file line number	Diff line number	Diff line change
`@@ -59,6 +59,7 @@ def process_stream(duration: Optional[int] = None) -> None:`
`59`	`59`	`verify=app.config.get('KINESIS_VERIFY', 'true') == 'true',`
`60`	`60`	`duration=duration,`
`61`	`61`	`start_type=start_type,`
`62`		`- start_at=start_at`
	`62`	`+ start_at=start_at,`
	`63`	`+ sleep=float(app.config['KINESIS_SLEEP'])`
`63`	`64`	`)`
`64`	`65`	`processor.go()`