diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 0f72bb6de..8ffc22ac9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -116,7 +116,7 @@ jobs:
           path: dist
       - name: Install Dependencies
         run: |
-          sudo apt-get update && sudo apt-get install --yes pandoc texlive-xetex librsvg2-bin
+          sudo apt-get update && sudo apt-get install --yes pandoc texlive-latex-base texlive-latex-extra texlive-luatex librsvg2-bin
           pandoc --version
           python -m pip install --upgrade pip
           python -m pip --version
@@ -172,7 +172,7 @@ jobs:
           path: dist
       - name: Install Dependencies
         run: |
-          sudo apt-get update && sudo apt install --yes pandoc texlive-xetex librsvg2-bin
+          sudo apt-get update && sudo apt install --yes pandoc texlive-latex-base texlive-latex-extra texlive-luatex librsvg2-bin
           python -m pip install --upgrade pip
       - name: Install rdmo[postgres] from wheel and start postgresql
         run: |
diff --git a/rdmo/core/pandoc.py b/rdmo/core/pandoc.py
new file mode 100644
index 000000000..2882fdb4a
--- /dev/null
+++ b/rdmo/core/pandoc.py
@@ -0,0 +1,204 @@
+import json
+import logging
+import os
+import re
+from pathlib import Path
+from tempfile import mkstemp
+
+from django.apps import apps
+from django.conf import settings
+
+import pypandoc
+from packaging.version import Version
+from packaging.version import parse as parse_version
+
+log = logging.getLogger(__name__)
+
+
+def get_pandoc_version():
+    """Return the installed pandoc version as a comparable ``Version``.
+
+    The string reported by ``pypandoc.get_pandoc_version()`` is parsed with
+    ``packaging``. No exception is handled here, an error raised by pypandoc
+    is passed on to the caller.
+    """
+    return parse_version(pypandoc.get_pandoc_version())
+
+
+def get_pandoc_content(html, metadata, export_format, context):
+    """Convert HTML to ``export_format`` using pandoc and return the result.
+
+    :param html: the HTML document to convert
+    :param metadata: metadata passed to pandoc as JSON ``--metadata-file``,
+        skipped if empty or if the installed pandoc is older than 2.3
+    :param export_format: the pandoc output format, also used as file suffix
+    :param context: the template context, passed on to ``get_pandoc_args``
+    :return: the content of the file created by pandoc as ``bytes``
+    """
+    pandoc_args = get_pandoc_args(export_format, context)
+
+    # every temporary file is registered here and removed in the finally
+    # block, so that nothing is left behind if pandoc fails
+    tmp_file_names = []
+
+    try:
+        # the --metadata-file option is only available since pandoc 2.3
+        if metadata and get_pandoc_version() >= Version('2.3'):
+            # create a temporary file for the metadata
+            (metadata_tmp_fd, metadata_tmp_file_name) = mkstemp(suffix='.json')
+            tmp_file_names.append(metadata_tmp_file_name)
+
+            # save metadata, os.fdopen takes over the descriptor opened by
+            # mkstemp and closes it together with the file object
+            log.info('Save metadata file %s %s', metadata_tmp_file_name, str(metadata))
+            with os.fdopen(metadata_tmp_fd, 'w') as fp:
+                json.dump(metadata, fp)
+
+            # add metadata file to pandoc args
+            pandoc_args.append('--metadata-file=' + metadata_tmp_file_name)
+
+        # create a temporary file, pandoc writes to it by name, so the
+        # descriptor opened by mkstemp is closed right away
+        (tmp_fd, tmp_file_name) = mkstemp(f'.{export_format}')
+        tmp_file_names.append(tmp_file_name)
+        os.close(tmp_fd)
+
+        # point images served from STATIC_URL to the files in STATIC_ROOT
+        # NOTE(review): the search pattern was reconstructed, please confirm
+        html = re.sub(
+            r'(<img.+src=["\'])' + settings.STATIC_URL + r'([\w\-\@?^=%&/~\+#]+)',
+            r'\g<1>' + str(Path(settings.STATIC_ROOT)) + r'/\g<2>',
+            html
+        )
+
+        # convert the file using pandoc
+        log.info('Export %s document using args %s.', export_format, pandoc_args)
+        pypandoc.convert_text(html, export_format, format='html',
+                              outputfile=tmp_file_name, extra_args=pandoc_args)
+
+        # read the created temporary file
+        with open(tmp_file_name, 'rb') as fp:
+            return fp.read()
+    finally:
+        # delete temporary files
+        for file_name in tmp_file_names:
+            os.remove(file_name)
+
+
+def get_pandoc_content_disposition(export_format, title):
+    """Return the ``Content-Disposition`` header value for an export.
+
+    PDF files get no ``attachment`` directive so that the browser can display
+    them, every other format is offered as a download.
+    """
+    if export_format == 'pdf':
+        # display pdf in browser
+        return f'filename="{title}.{export_format}"'
+    else:
+        return f'attachment; filename="{title}.{export_format}"'
+
+
+def get_pandoc_args(export_format, context):
+    """Assemble the command line arguments for pandoc.
+
+    Starts with a copy of ``settings.EXPORT_PANDOC_ARGS`` for ``export_format``
+    and adapts it to the installed pandoc version.
+
+    :param export_format: the pandoc output format
+    :param context: the template context, ``view`` is used to find a
+        reference document, ``resource_path`` (relative to ``MEDIA_ROOT``)
+        adds an additional resource path
+    :return: a new list of arguments, the settings are not modified
+    """
+    pandoc_version = get_pandoc_version()
+    pandoc_args = list(settings.EXPORT_PANDOC_ARGS.get(export_format, []))  # without list(), settings would be changed
+
+    if export_format == 'pdf':
+        # we used xelatex before pandoc 3
+        # NOTE(review): the previous code in rdmo/core/utils.py switched to
+        # --latex-engine for pandoc 1, this is gone here, confirm it is intended
+        if pandoc_version < Version('3'):
+            pandoc_args = [
+                arg.replace('--pdf-engine=lualatex', '--pdf-engine=xelatex')
+                for arg in pandoc_args
+            ]
+
+    elif export_format in ['docx', 'odt']:
+        # find and add a possible reference document
+        reference_document = get_pandoc_reference_document(export_format, context)
+        if reference_document:
+            if pandoc_version >= Version('2'):
+                pandoc_args.append(f'--reference-doc={reference_document}')
+            else:
+                pandoc_args.append(f'--reference-{export_format}={reference_document}')
+
+    # add STATIC_ROOT and possible additional resource paths
+    if pandoc_version >= Version('2'):
+        pandoc_args.append(f'--resource-path={settings.STATIC_ROOT}')
+        if 'resource_path' in context:
+            resource_path = Path(settings.MEDIA_ROOT) / context['resource_path']
+            pandoc_args.append(f'--resource-path={resource_path}')
+
+    return pandoc_args
+
+
+def get_pandoc_reference_document(export_format, context):
+    """Return the first configured reference document that exists on disk.
+
+    :return: a ``Path``, or ``None`` if none of the candidates is a file
+    """
+    # collect all configured reference documents
+    reference_documents = get_pandoc_reference_documents(export_format, context)
+
+    # return the first reference document that actually exists, the entries
+    # from the settings are wrapped in Path, since they might be plain strings
+    for reference_document in reference_documents:
+        if reference_document:
+            reference_path = Path(reference_document)
+            if reference_path.is_file():
+                return reference_path
+
+    return None
+
+
+def get_pandoc_reference_documents(export_format, context):
+    """Return all candidate reference documents, most specific first.
+
+    The order is: the document configured for the view in the context, the
+    generic document from the settings, the default in the rdmo share folder.
+    The list is empty for formats other than ``odt`` and ``docx``.
+    """
+    # try to get the view and its uri from the context, if it is not set, the current url should be project_answers
+    try:
+        view = context['view']
+        view_uri = view.uri
+    except (AttributeError, KeyError, TypeError):
+        view_uri = None
+
+    reference_documents = []
+
+    if export_format == 'odt':
+        # append view specific custom reference document
+        if view_uri and view_uri in settings.EXPORT_REFERENCE_ODT_VIEWS:
+            reference_documents.append(settings.EXPORT_REFERENCE_ODT_VIEWS[view_uri])
+
+        # append generic custom reference document
+        if settings.EXPORT_REFERENCE_ODT:
+            reference_documents.append(settings.EXPORT_REFERENCE_ODT)
+
+        # append the default reference document
+        reference_documents.append(Path(apps.get_app_config('rdmo').path) / 'share' / 'reference.odt')
+
+    elif export_format == 'docx':
+        # append view specific custom reference document
+        if view_uri and view_uri in settings.EXPORT_REFERENCE_DOCX_VIEWS:
+            reference_documents.append(settings.EXPORT_REFERENCE_DOCX_VIEWS[view_uri])
+
+        # append generic custom reference document
+        if settings.EXPORT_REFERENCE_DOCX:
+            reference_documents.append(settings.EXPORT_REFERENCE_DOCX)
+
+        # append the default reference document
+        reference_documents.append(Path(apps.get_app_config('rdmo').path) / 'share' / 'reference.docx')
+
+    return reference_documents
diff --git a/rdmo/core/settings.py b/rdmo/core/settings.py
index 7f803d29a..5b59dbfac 100644
--- a/rdmo/core/settings.py
+++ b/rdmo/core/settings.py
@@
-289,7 +289,7 @@ EXPORT_REFERENCE_DOCX = None
 
 
 EXPORT_PANDOC_ARGS = {
-    'pdf': ['-V', 'geometry:a4paper, margin=1in', '--pdf-engine=xelatex'],
+    'pdf': ['-V', 'geometry:a4paper, margin=1in', '--pdf-engine=lualatex'],
     'rtf': ['--standalone']
 }
 
diff --git a/rdmo/core/tests/test_pandoc.py b/rdmo/core/tests/test_pandoc.py
new file mode 100644
index 000000000..924520937
--- /dev/null
+++ b/rdmo/core/tests/test_pandoc.py
@@ -0,0 +1,152 @@
+from pathlib import Path
+
+import pytest
+
+from django.apps import apps
+
+from packaging.version import Version
+
+from ..pandoc import get_pandoc_args, get_pandoc_reference_document, get_pandoc_reference_documents, get_pandoc_version
+
+rdmo_path = Path(apps.get_app_config('rdmo').path)
+testing_path = rdmo_path.parent / 'testing'
+
+pandoc_versions = [
+    '1.9.0',
+    '2.0.0',
+    '3.0.0',
+    '3.5.0'
+]
+
+export_formats = [
+    'rtf',
+    'odt',
+    'docx',
+    'html',
+    'markdown',
+    'tex',
+    'pdf'
+]
+
+
+def get_expected_args(pdf_engine, legacy):
+    # build the expected pandoc arguments for one pandoc version: pandoc 1 (legacy)
+    # has format specific reference options and gets no resource path
+    extra_args = [] if legacy else [f'--resource-path={testing_path}/static_root']
+    expected_args = {
+        'pdf': ['-V', 'geometry:a4paper, margin=1in', f'--pdf-engine={pdf_engine}', *extra_args],
+        'rtf': ['--standalone', *extra_args],
+        'other': [*extra_args]
+    }
+
+    for export_format in ['docx', 'odt']:
+        option = f'--reference-{export_format}' if legacy else '--reference-doc'
+        expected_args[export_format] = [f'{option}={rdmo_path}/share/reference.{export_format}', *extra_args]
+
+    return expected_args
+
+
+def get_expected_documents(export_format, *custom_documents):
+    # reference documents are only expected for docx and odt, the custom documents
+    # come first and the default document of rdmo is always last
+    if export_format not in ['docx', 'odt']:
+        return []
+
+    return [*custom_documents, rdmo_path / 'share' / f'reference.{export_format}']
+
+
+pandoc_args_map = {
+    '1.9.0': get_expected_args('xelatex', legacy=True),
+    '2.0.0': get_expected_args('xelatex', legacy=False),
+    '3.0.0': get_expected_args('lualatex', legacy=False),
+    '3.5.0': get_expected_args('lualatex', legacy=False)
+}
+
+
+class MockedView:
+    # stand-in for a view, only the uri attribute is read
+    uri = 'http://example.com/terms/views/view'
+
+
+@pytest.mark.parametrize('pandoc_version', pandoc_versions)
+def test_get_pandoc_version(mocker, pandoc_version):
+    # the version string reported by pypandoc is parsed into a Version
+    mocker.patch('pypandoc.get_pandoc_version', return_value=pandoc_version)
+
+    expected_version = Version(pandoc_version)
+    assert get_pandoc_version() == expected_version
+
+
+@pytest.mark.parametrize('pandoc_version', pandoc_versions)
+@pytest.mark.parametrize('export_format', export_formats)
+def test_get_pandoc_args(settings, mocker, pandoc_version, export_format):
+    # formats without an entry in the map fall back to the 'other' arguments
+    mocker.patch('pypandoc.get_pandoc_version', return_value=pandoc_version)
+
+    expected_args = pandoc_args_map[pandoc_version]
+    assert get_pandoc_args(export_format, {}) == expected_args.get(export_format, expected_args['other'])
+
+
+def test_get_pandoc_reference_document(mocker):
+    share_path = rdmo_path / 'share'
+    mocker.patch('rdmo.core.pandoc.get_pandoc_reference_documents', return_value=[
+        share_path / file_name for file_name in ['missing.docx', 'reference.docx', 'reference.odt']
+    ])
+
+    # return the first existing file
+    assert get_pandoc_reference_document('other', {}) == share_path / 'reference.docx'
+
+
+def test_get_pandoc_reference_document_missing(mocker):
+    share_path = rdmo_path / 'share'
+    mocker.patch('rdmo.core.pandoc.get_pandoc_reference_documents', return_value=[
+        share_path / file_name for file_name in ['missing.docx', 'missing.odt']
+    ])
+
+    # none of the files exists
+    assert get_pandoc_reference_document('other', {}) is None
+
+
+@pytest.mark.parametrize('export_format', export_formats)
+def test_get_pandoc_reference_documents(export_format):
+    # without view or settings, only the default document is collected
+    reference_documents = get_pandoc_reference_documents(export_format, {})
+
+    assert reference_documents == get_expected_documents(export_format)
+
+
+@pytest.mark.parametrize('export_format', export_formats)
+def test_get_pandoc_reference_documents_view(export_format):
+    # a view without a configured reference document does not change the result
+    reference_documents = get_pandoc_reference_documents(export_format, {'view': MockedView()})
+
+    assert reference_documents == get_expected_documents(export_format)
+
+
+@pytest.mark.parametrize('export_format', export_formats)
+def test_get_pandoc_reference_documents_view_settings(settings, export_format):
+    mock_file = rdmo_path / 'share' / f'mock.{export_format}'
+    view_documents = {MockedView.uri: mock_file}
+
+    if export_format == 'docx':
+        settings.EXPORT_REFERENCE_DOCX_VIEWS = view_documents
+    elif export_format == 'odt':
+        settings.EXPORT_REFERENCE_ODT_VIEWS = view_documents
+
+    reference_documents = get_pandoc_reference_documents(export_format, {'view': MockedView()})
+
+    assert reference_documents == get_expected_documents(export_format, mock_file)
+
+
+@pytest.mark.parametrize('export_format', export_formats)
+def test_get_pandoc_reference_documents_settings(settings, export_format):
+    mock_file = rdmo_path / 'share' / f'mock.{export_format}'
+
+    if export_format == 'docx':
+        settings.EXPORT_REFERENCE_DOCX = mock_file
+    elif export_format == 'odt':
+        settings.EXPORT_REFERENCE_ODT = mock_file
+
+    reference_documents = get_pandoc_reference_documents(export_format, {})
+
+    assert reference_documents == get_expected_documents(export_format, mock_file)
diff --git a/rdmo/core/utils.py b/rdmo/core/utils.py
index 0813b05c7..a1ea1b319 100644
--- a/rdmo/core/utils.py
+++ b/rdmo/core/utils.py
@@ -4,21 +4,19 @@
 import os
 import re
 from pathlib import Path
-from tempfile import mkstemp
 from urllib.parse import urlparse
 
-from django.apps import apps
 from django.conf import settings
 from django.http import Http404, HttpResponse, HttpResponseBadRequest
 from django.template.loader import get_template
 from django.utils.encoding import force_str
 from django.utils.translation import gettext_lazy as _
 
-import pypandoc
 from defusedcsv import csv
 from markdown import markdown
 
 from .constants import HUMAN2BYTES_MAPPER
+from .pandoc import get_pandoc_content, get_pandoc_content_disposition
 
 log = logging.getLogger(__name__)
 
@@ -58,29 +56,6 @@ def get_uri_prefix(obj):
     return r
 
 
-def get_pandoc_main_version():
-    try:
-        return int(pypandoc.get_pandoc_version().split('.')[0])
-    except OSError:
-        return None
-
-
-def pandoc_version_at_least(required_version):
-    required = [int(x) for x in required_version.split('.')]
-    installed = [int(x) for x in pypandoc.get_pandoc_version().split('.')]
-    for idx, digit in enumerate(installed):
-        try:
-            req = required[idx]
-        except IndexError:
-            return True
-        else:
-            if digit < req:
-                return False
-            if digit > req:
-                return True
-    return True
-
-
 def join_url(base, *args):
     url = base
     for arg in args:
@@ -157,52 +132,6 @@ def get_language_warning(obj, field):
     return False
 
 
-def set_export_reference_document(format, context):
-    # try to get the view uri from the context
-    try:
-        view = context['view']
-        view_uri = view.uri
-    except (AttributeError, KeyError, TypeError):
-        view_uri = None
-
-    refdocs = []
-
-    if
format == 'odt': - # append view specific custom refdoc - try: - refdocs.append(settings.EXPORT_REFERENCE_ODT_VIEWS[view_uri]) - except KeyError: - pass - - # append custom refdoc - if settings.EXPORT_REFERENCE_ODT: - refdocs.append(settings.EXPORT_REFERENCE_ODT) - - elif format == 'docx': - # append view specific custom refdoc - try: - refdocs.append(settings.EXPORT_REFERENCE_DOCX_VIEWS[view_uri]) - except KeyError: - pass - - # append custom refdoc - if settings.EXPORT_REFERENCE_DOCX: - refdocs.append(settings.EXPORT_REFERENCE_DOCX) - - # append the default reference docs - refdocs.append( - os.path.join( - apps.get_app_config('rdmo').path, - 'share', 'reference' + '.' + format - ) - ) - - # return the first file in refdocs that actually exists - for refdoc in refdocs: - if os.path.isfile(refdoc): - return refdoc - - def render_to_format(request, export_format, title, template_src, context): if export_format not in dict(settings.EXPORT_FORMATS): return HttpResponseBadRequest(_('This format is not supported.')) @@ -221,68 +150,11 @@ def render_to_format(request, export_format, title, template_src, context): response['Content-Disposition'] = f'filename="{title}.{export_format}"' else: - pandoc_args = settings.EXPORT_PANDOC_ARGS.get(export_format, []) - content_disposition = f'attachment; filename="{title}.{export_format}"' - - if export_format == 'pdf': - # check pandoc version (the pdf arg changed to version 2) - if get_pandoc_main_version() == 1: - pandoc_args = [arg.replace( - '--pdf-engine=xelatex', '--latex-engine=xelatex' - ) for arg in pandoc_args] - - # display pdf in browser - content_disposition = f'filename="{title}.{export_format}"' - - # use reference document for certain file formats - refdoc = set_export_reference_document(export_format, context) - if refdoc is not None and export_format in ['docx', 'odt']: - # check pandoc version (the args changed to version 2) - if get_pandoc_main_version() == 1: - 
pandoc_args.append(f'--reference-{export_format}={refdoc}') - else: - pandoc_args.append(f'--reference-doc={refdoc}') - - # add the possible resource-path - if pandoc_version_at_least("2") is True: - pandoc_args.append(f'--resource-path={settings.STATIC_ROOT}') - if 'resource_path' in context: - resource_path = Path(settings.MEDIA_ROOT).joinpath(context['resource_path']) - pandoc_args.append(f'--resource-path={resource_path}') - - # create a temporary file - (tmp_fd, tmp_filename) = mkstemp('.' + export_format) - - # add metadata - tmp_metadata_file = None - if metadata is not None and pandoc_version_at_least("2.3") is True: - tmp_metadata_file = save_metadata(metadata) - pandoc_args.append('--metadata-file=' + tmp_metadata_file) - - # convert the file using pandoc - log.info('Export %s document using args %s.', export_format, pandoc_args) - html = re.sub( - r'(' + - str(Path(settings.STATIC_ROOT)) + r'/\g<2>', html - ) - pypandoc.convert_text( - html, export_format, format='html', - outputfile=tmp_filename, extra_args=pandoc_args - ) - - # read the temporary file - file_handler = os.fdopen(tmp_fd, 'rb') - file_content = file_handler.read() - file_handler.close() + pandoc_content = get_pandoc_content(html, metadata, export_format, context) + pandoc_content_disposition = get_pandoc_content_disposition(export_format, title) - # delete temporary files - if tmp_metadata_file is not None: - os.remove(tmp_metadata_file) - os.remove(tmp_filename) - - # create the response object - response = HttpResponse(file_content, content_type=f'application/{export_format}') - response['Content-Disposition'] = content_disposition.encode('utf-8') + response = HttpResponse(pandoc_content, content_type=f'application/{export_format}') + response['Content-Disposition'] = pandoc_content_disposition.encode('utf-8') return response @@ -411,14 +283,5 @@ def parse_metadata(html): return metadata, html -def save_metadata(metadata): - _, tmp_metadata_file = mkstemp(suffix='.json') - with 
open(tmp_metadata_file, 'w') as f: - json.dump(metadata, f) - f = open(tmp_metadata_file) - log.info('Save metadata file %s %s', tmp_metadata_file, str(metadata)) - return tmp_metadata_file - - def remove_double_newlines(string): return re.sub(r'[\n]{2,}', '\n\n', string) diff --git a/rdmo/views/models.py b/rdmo/views/models.py index e7aec7e85..c3f28980d 100644 --- a/rdmo/views/models.py +++ b/rdmo/views/models.py @@ -7,7 +7,8 @@ from rdmo import __version__ from rdmo.core.models import TranslationMixin -from rdmo.core.utils import get_pandoc_main_version, join_url +from rdmo.core.pandoc import get_pandoc_version +from rdmo.core.utils import join_url from rdmo.questions.models import Catalog from .managers import ViewManager @@ -168,7 +169,7 @@ def render(self, project, snapshot=None, export_format=None): 'name': site.name, 'domain': site.domain }, - 'pandoc_version': get_pandoc_main_version() + 'pandoc_version': get_pandoc_version().major })) @classmethod