diff --git a/src/core/admin.py b/src/core/admin.py
index 23b21831a0..531fdcc58b 100755
--- a/src/core/admin.py
+++ b/src/core/admin.py
@@ -398,6 +398,50 @@ class AccessRequestAdmin(admin.ModelAdmin):
     date_hierarchy = ('requested')


+class OrganizationAdmin(admin.ModelAdmin):
+    list_display = ('pk', 'ror', '_ror_display', '_custom_label',
+                    '_locations', 'ror_status')
+    list_display_links = ('pk', 'ror')
+    list_filter = ('ror_status', 'locations__country')
+    search_fields = ('pk', 'ror_display__value', 'custom_label__value', 'labels__value',
+                     'aliases__value', 'acronyms__value')
+    raw_id_fields = ('locations', )
+
+    def _ror_display(self, obj):
+        return obj.ror_display if obj and obj.ror_display else ''
+
+    def _locations(self, obj):
+        return '; '.join([str(l) for l in obj.locations.all()]) if obj else ''
+
+    def _custom_label(self, obj):
+        return obj.custom_label if obj and obj.custom_label else ''
+
+
+class OrganizationNameAdmin(admin.ModelAdmin):
+    list_display = ('pk', 'value', 'language')
+    list_display_links = ('pk', 'value')
+    search_fields = ('pk', 'value')
+    raw_id_fields = ('ror_display_for', 'custom_label_for',
+                     'label_for', 'alias_for', 'acronym_for')
+
+    def _ror_display(self, obj):
+        return obj.ror_display if obj and obj.ror_display else ''
+
+    def _locations(self, obj):
+        return '; '.join([str(l) for l in obj.locations.all()]) if obj else ''
+
+    def _custom_label(self, obj):
+        return obj.custom_label if obj and obj.custom_label else ''
+
+
+class LocationAdmin(admin.ModelAdmin):
+    list_display = ('pk', 'name', 'country', 'geonames_id')
+    list_display_links = ('pk', 'name')
+    list_filter = ('country',)
+    search_fields = ('pk', 'name', 'country__code', 'country__name',
+                     'geonames_id')
+
+
 admin_list = [
     (models.AccountRole, AccountRoleAdmin),
     (models.Account, AccountAdmin),
@@ -427,6 +471,9 @@ class AccessRequestAdmin(admin.ModelAdmin):
     (models.Contacts, ContactsAdmin),
     (models.Contact, ContactAdmin),
     (models.AccessRequest, AccessRequestAdmin),
+    (models.Organization, OrganizationAdmin),
+    (models.OrganizationName, OrganizationNameAdmin),
+    (models.Location, LocationAdmin),
 ]

 [admin.site.register(*t) for t in admin_list]
diff --git a/src/core/models.py b/src/core/models.py
index cd411989c6..c142ea6c12 100644
--- a/src/core/models.py
+++ b/src/core/models.py
@@ -3,6 +3,7 @@
 __license__ = "AGPL v3"
 __maintainer__ = "Birkbeck Centre for Technology and Publishing"

+from decimal import Decimal
 import os
 import re
 import uuid
@@ -13,6 +14,8 @@
 import pytz
 from hijack.signals import hijack_started, hijack_ended
 import warnings
+import tqdm
+import zipfile

 from bs4 import BeautifulSoup
 from django.conf import settings
@@ -51,6 +54,7 @@
 from review import models as review_models
 from copyediting import models as copyediting_models
 from repository import models as repository_models
+from utils.models import RORImportError
 from submission import models as submission_models
 from utils.logger import get_logger
 from utils import logic as utils_logic
@@ -2080,6 +2084,82 @@ def naive_get_or_create(

         return organization, created

+    @classmethod
+    def create_from_ror_record(cls, record):
+        """
+        Creates one organization object in Janeway from a ROR JSON record,
+        using version 2 of the ROR Schema.
+        See https://ror.readme.io/v2/docs/data-structure
+        """
+        organization, created = cls.objects.get_or_create(
+            ror=record.get('id', ''),
+        )
+        if record.get('status'):
+            organization.ror_status = record.get('status')
+            organization.save()
+        for name in record.get('names'):
+            kwargs = {}
+            kwargs['value'] = name.get('value', '')
+            if name.get('lang'):
+                kwargs['language'] = name.get('lang', '')
+            if 'ror_display' in name.get('types'):
+                kwargs['ror_display_for'] = organization
+            if 'label' in name.get('types'):
+                kwargs['label_for'] = organization
+            if 'alias' in name.get('types'):
+                kwargs['alias_for'] = organization
+            if 'acronym' in name.get('types'):
+                kwargs['acronym_for'] = organization
+            OrganizationName.objects.get_or_create(**kwargs)
+        for location in record.get('locations'):
+            details = location.get('geonames_details', {})
+            country, created = Country.objects.get_or_create(
+                code=details.get('country_code', ''),
+            )
+            lat = Decimal(details.get('lat'))
+            lng = Decimal(details.get('lng'))
+            location, created = Location.objects.get_or_create(
+                name=details.get('name', ''),
+                country=country,
+                latitude=lat,
+                longitude=lng,
+                geonames_id=location.get('geonames_id'),
+            )
+            organization.locations.add(location)
+
+
+    @classmethod
+    def import_ror_batch(cls, ror_import, test_full_import=False):
+        """
+        Opens a previously downloaded data dump from
+        ROR's Zenodo endpoint, processes the records,
+        and records errors for exceptions raised during creation.
+        https://ror.readme.io/v2/docs/data-dump
+        """
+        num_errors_before = RORImportError.objects.count()
+        with zipfile.ZipFile(ror_import.zip_path, mode='r') as zip_ref:
+            for file_info in zip_ref.infolist():
+                if file_info.filename.endswith('v2.json'):
+                    json_string = zip_ref.read(file_info).decode(encoding="utf-8")
+                    data = json.loads(json_string)
+                    if settings.DEBUG and not test_full_import:
+                        # Limit the import run during development by default
+                        data = data[:100]
+                    for item in tqdm.tqdm(data):
+                        try:
+                            cls.create_from_ror_record(item)
+                        except Exception as error:
+                            message = f'{error}\n{json.dumps(item)}'
+                            RORImportError.objects.create(
+                                ror_import=ror_import,
+                                message=message,
+                            )
+        num_errors_after = RORImportError.objects.count()
+        if num_errors_after > num_errors_before:
+            logger.warning(
+                f'ROR import errors logged: { num_errors_after - num_errors_before }'
+            )
+

 class Affiliation(models.Model):
     account = models.ForeignKey(
diff --git a/src/utils/admin.py b/src/utils/admin.py
index 4c3f682485..4e42fdb734 100755
--- a/src/utils/admin.py
+++ b/src/utils/admin.py
@@ -53,11 +53,34 @@ class VersionAdmin(admin.ModelAdmin):
     date_hierarchy = ('date')


+class RORImportAdmin(admin.ModelAdmin):
+    list_display = ('pk', 'status', 'started', 'stopped')
+    list_filter = ('status', 'started', 'stopped')
+    search_fields = ('rorimporterror__message', 'records',)
+    date_hierarchy = ('started')
+    readonly_fields = ('started', 'stopped', 'status', 'records')
+    inlines = [
+        admin_utils.RORImportErrorInline,
+    ]
+
+
+class RORImportErrorAdmin(admin.ModelAdmin):
+    list_display = ('pk', '_first_line')
+    search_fields = ('message',)
+    date_hierarchy = ('ror_import__started')
+    raw_id_fields = ('ror_import', )
+
+    def _first_line(self, obj):
+        return obj.message.split('\n')[0] if obj and obj.message else ''
+
+
 admin_list = [
     (models.LogEntry, LogAdmin),
     (models.Plugin, PluginAdmin),
     (models.ImportCacheEntry, ImportCacheAdmin),
-    (models.Version, VersionAdmin)
+    (models.Version, VersionAdmin),
+    (models.RORImport, RORImportAdmin),
+    (models.RORImportError, RORImportErrorAdmin),
 ]

 [admin.site.register(*t) for t in admin_list]
diff --git a/src/utils/admin_utils.py b/src/utils/admin_utils.py
index e7541d5416..19cf4ca4ff 100644
--- a/src/utils/admin_utils.py
+++ b/src/utils/admin_utils.py
@@ -297,6 +297,12 @@ class NewsItemInline(admin.TabularInline):
     raw_id_fields = ('newsitem',)


+class RORImportErrorInline(admin.TabularInline):
+    model = core_models.RORImportError
+    extra = 0
+    readonly_fields = ('message',)
+
+
 class JournalFilterBase(admin.SimpleListFilter):
     """
     A base class for other journal filters
diff --git a/src/utils/management/commands/import_ror_data.py b/src/utils/management/commands/import_ror_data.py
new file mode 100644
index 0000000000..dae269d0bf
--- /dev/null
+++ b/src/utils/management/commands/import_ror_data.py
@@ -0,0 +1,53 @@
+from django.conf import settings
+from django.core.management.base import BaseCommand
+
+from utils.models import RORImport
+from core.models import Organization
+from utils.logger import get_logger
+
+
+logger = get_logger(__name__)
+
+
+class Command(BaseCommand):
+    """
+    Fetches ROR data and generates Organization records.
+    """
+
+    help = "Fetches ROR data and generates Organization records."
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            '--test_full_import',
+            help='By default, the command only runs 100 records when DEBUG=True. '
+                 'Pass --test_full_import to import the entire dump in development.',
+            action='store_true',
+        )
+        return super().add_arguments(parser)
+
+    def handle(self, *args, **options):
+        ror_import = RORImport.objects.create()
+        ror_import.get_records()
+
+        # The import is necessary.
+        # Check we have the right copy of the data dump.
+        if ror_import.ongoing or settings.DEBUG:
+            if not ror_import.previous_import:
+                ror_import.download_data()
+            elif ror_import.previous_import.zip_path != ror_import.zip_path:
+                ror_import.download_data()
+
+        # The data is all downloaded and ready to import.
+        if ror_import.ongoing or settings.DEBUG:
+            test_full_import = options.get('test_full_import', False)
+            Organization.import_ror_batch(
+                ror_import,
+                test_full_import=test_full_import,
+            )
+
+        # The process did not error out, so it can be considered a success.
+        if ror_import.ongoing:
+            ror_import.status = ror_import.RORImportStatus.SUCCESSFUL
+            ror_import.save()
+
+        logger.info(ror_import.status)
diff --git a/src/utils/migrations/0035_rorimport_rorimporterror.py b/src/utils/migrations/0035_rorimport_rorimporterror.py
new file mode 100644
index 0000000000..a22c56b89c
--- /dev/null
+++ b/src/utils/migrations/0035_rorimport_rorimporterror.py
@@ -0,0 +1,37 @@
+# Generated by Django 4.2.14 on 2024-07-26 20:51
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('utils', '0034_rename_toaddress_addressee'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='RORImport',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('started', models.DateTimeField(auto_now_add=True)),
+                ('stopped', models.DateTimeField(blank=True, null=True)),
+                ('status', models.CharField(choices=[('ongoing', 'Ongoing'), ('unnecessary', 'Unnecessary'), ('successful', 'Successful'), ('failed', 'Failed')], default='ongoing')),
+                ('records', models.JSONField(default=dict)),
+            ],
+            options={
+                'verbose_name': 'ROR import',
+                'verbose_name_plural': 'ROR imports',
+                'get_latest_by': 'started',
+            },
+        ),
+        migrations.CreateModel(
+            name='RORImportError',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('message', models.TextField(blank=True)),
+                ('ror_import', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='utils.rorimport')),
+            ],
+        ),
+    ]
diff --git a/src/utils/models.py b/src/utils/models.py
index 0ad6918794..d4481c30d6 100755
--- a/src/utils/models.py
+++ b/src/utils/models.py
@@ -3,7 +3,7 @@
 __license__ = "AGPL v3"
 __maintainer__ = "Birkbeck Centre for Technology and Publishing"

-import json as jason
+import json
 import os
 from uuid import uuid4
 import requests
@@ -16,10 +16,14 @@
 from django.conf import settings
 from django.utils.text import slugify

-from utils.shared import get_ip_address, join_lists
+from utils.logger import get_logger
+from utils.shared import get_ip_address
 from utils.importers.up import get_input_value_by_name


+logger = get_logger(__name__)
+
+
 LOG_TYPES = [
     ('Email', 'Email'),
     ('PageView', 'PageView'),
@@ -295,7 +299,7 @@ def fetch(url, up_auth_file='', up_base_url='', ojs_auth_file=''):
         # first, check whether there's an auth file
         if up_auth_file != '':
             with open(up_auth_file, 'r', encoding="utf-8") as auth_in:
-                auth_dict = jason.loads(auth_in.read())
+                auth_dict = json.loads(auth_in.read())
                 do_auth = True
                 username = auth_dict['username']
                 password = auth_dict['password']
@@ -348,3 +352,157 @@ def fetch(url, up_auth_file='', up_base_url='', ojs_auth_file=''):

     def __str__(self):
         return self.url
+
+
+class RORImport(models.Model):
+    """
+    A record of an import of ROR organization data into Janeway.
+    """
+    class RORImportStatus(models.TextChoices):
+        ONGOING = 'ongoing', 'Ongoing'
+        UNNECESSARY = 'unnecessary', 'Unnecessary'
+        SUCCESSFUL = 'successful', 'Successful'
+        FAILED = 'failed', 'Failed'
+
+    started = models.DateTimeField(
+        auto_now_add=True,
+    )
+    stopped = models.DateTimeField(
+        blank=True,
+        null=True,
+    )
+    status = models.CharField(
+        choices=RORImportStatus.choices,
+        default=RORImportStatus.ONGOING,
+    )
+    records = models.JSONField(
+        default=dict,
+    )
+
+    class Meta:
+        get_latest_by = 'started'
+        verbose_name = 'ROR import'
+        verbose_name_plural = 'ROR imports'
+
+    def __str__(self):
+        return f'{self.status} RORImport started { self.started }'
+
+    @property
+    def previous_import(self):
+        try:
+            return RORImport.objects.exclude(pk=self.pk).latest()
+        except RORImport.DoesNotExist:
+            return None
+
+    @property
+    def new_download_needed(self):
+        if not self.previous_import:
+            return True
+        elif self.previous_import.status == self.RORImportStatus.FAILED:
+            return True
+        elif not self.source_data_created or self.source_data_created > self.previous_import.started:
+            return True
+        else:
+            return False
+
+    @property
+    def zip_path(self):
+        temp_dir = os.path.join(settings.BASE_DIR, 'files', 'temp')
+        if not os.path.exists(temp_dir):
+            os.makedirs(temp_dir)
+        try:
+            file_id = self.records['hits']['hits'][0]['files'][-1]['id']
+        except (KeyError, IndexError, TypeError, AttributeError) as error:
+            self.fail(error)
+            return ''
+        zip_name = f'ror-download-{file_id}.zip'
+        return os.path.join(temp_dir, zip_name)
+
+    @property
+    def download_link(self):
+        try:
+            return self.records['hits']['hits'][0]['files'][-1]['links']['self']
+        except (KeyError, IndexError, TypeError, AttributeError) as error:
+            self.fail(error)
+            return ''
+
+    @property
+    def source_data_created(self):
+        try:
+            timestamp = self.records['hits']['hits'][0]['created']
+            return timezone.datetime.fromisoformat(timestamp)
+        except (KeyError, IndexError, TypeError, ValueError, AttributeError) as error:
+            self.fail(error)
+            return None
+
+    def fail(self, error):
+        """
+        Marks this import as failed, records when it stopped, and logs
+        the error both to the logger and as a RORImportError row.
+        """
+        self.stopped = timezone.now()
+        self.status = self.RORImportStatus.FAILED
+        self.save()
+        logger.error(error)
+        RORImportError.objects.create(ror_import=self, message=str(error))
+
+    @property
+    def ongoing(self):
+        return self.status == self.RORImportStatus.ONGOING
+
+    def get_records(self):
+        """
+        Gets the manifest of available data and checks if it contains
+        anything new. If there is no new data and the previous import did
+        not fail, the import is marked as unnecessary.
+        """
+        records_url = 'https://zenodo.org/api/communities/ror-data/records?sort=newest'
+        try:
+            response = requests.get(records_url, timeout=settings.HTTP_TIMEOUT_SECONDS)
+            response.raise_for_status()
+            self.records = response.json()
+            self.save()
+            if not self.new_download_needed:
+                self.status = self.RORImportStatus.UNNECESSARY
+                self.save()
+        except requests.RequestException as error:
+            self.fail(error)
+
+    def delete_previous_download(self):
+        if not self.previous_import:
+            logger.info('No previous import to remove.')
+            return
+        try:
+            os.unlink(self.previous_import.zip_path)
+        except FileNotFoundError:
+            logger.info('Previous import had no zip file.')
+
+    def download_data(self):
+        """
+        Downloads the current data dump from Zenodo.
+        Then removes previous files to save space.
+        """
+        try:
+            response = requests.get(
+                self.download_link,
+                timeout=settings.HTTP_TIMEOUT_SECONDS,
+                stream=True,
+            )
+            response.raise_for_status()
+            with open(self.zip_path, 'wb') as zip_ref:
+                for chunk in response.iter_content(chunk_size=8192):
+                    zip_ref.write(chunk)
+            if os.path.exists(self.zip_path):
+                self.delete_previous_download()
+        except requests.RequestException as error:
+            self.fail(error)
+
+
+class RORImportError(models.Model):
+    ror_import = models.ForeignKey(
+        RORImport,
+        on_delete=models.CASCADE,
+    )
+    message = models.TextField(
+        blank=True,
+    )