Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor pandoc #1245

Merged
merged 6 commits into from
Feb 20, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ jobs:
path: dist
- name: Install Dependencies
run: |
sudo apt-get update && sudo apt-get install --yes pandoc texlive-xetex librsvg2-bin
sudo apt-get update && sudo apt-get install --yes pandoc texlive-latex-base texlive-latex-extra texlive-luatex librsvg2-bin
pandoc --version
python -m pip install --upgrade pip
python -m pip --version
Expand Down Expand Up @@ -172,7 +172,7 @@ jobs:
path: dist
- name: Install Dependencies
run: |
sudo apt-get update && sudo apt install --yes pandoc texlive-xetex librsvg2-bin
sudo apt-get update && sudo apt install --yes pandoc texlive-latex-base texlive-latex-extra texlive-luatex librsvg2-bin
python -m pip install --upgrade pip
- name: Install rdmo[postgres] from wheel and start postgresql
run: |
Expand Down
143 changes: 143 additions & 0 deletions rdmo/core/pandoc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
import json
import logging
import os
import re
from pathlib import Path
from tempfile import mkstemp

from django.apps import apps
from django.conf import settings

import pypandoc
from packaging.version import Version
from packaging.version import parse as parse_version

log = logging.getLogger(__name__)


def get_pandoc_version():
    """Return the pandoc version reported by pypandoc as a parsed version.

    The version string from ``pypandoc.get_pandoc_version()`` is passed through
    ``packaging.version.parse`` so that callers can compare the result against
    ``Version`` objects.
    """
    version_string = pypandoc.get_pandoc_version()
    return parse_version(version_string)


def get_pandoc_content(html, metadata, export_format, context):
    """Convert an HTML string with pandoc and return the resulting document.

    Args:
        html: The HTML source to convert.
        metadata: Optional JSON-serializable metadata. If truthy, it is written
            to a temporary JSON file and passed to pandoc using
            ``--metadata-file``.
        export_format: The pandoc output format, also used as suffix of the
            temporary output file.
        context: The context which is handed to ``get_pandoc_args``.

    Returns:
        The content of the file written by pandoc as ``bytes``.

    Any exception raised while writing the metadata or running pandoc is
    propagated, the temporary files are removed in every case.
    """
    pandoc_args = get_pandoc_args(export_format, context)

    # every temporary file is registered here right after its creation, so that
    # the finally clause below removes it even if a later step raises
    tmp_file_names = []

    try:
        if metadata:
            # create a temporary file for the metadata
            metadata_tmp_fd, metadata_tmp_file_name = mkstemp(suffix='.json')
            tmp_file_names.append(metadata_tmp_file_name)

            # save metadata, os.fdopen takes over the descriptor returned by
            # mkstemp and closes it together with the file object
            log.info('Save metadata file %s %s', metadata_tmp_file_name, str(metadata))
            with os.fdopen(metadata_tmp_fd, 'w') as fp:
                json.dump(metadata, fp)

            # add metadata file to pandoc args
            pandoc_args.append('--metadata-file=' + metadata_tmp_file_name)

        # create a temporary file for the output, pandoc writes to it by name,
        # so the descriptor returned by mkstemp is closed right away
        tmp_fd, tmp_file_name = mkstemp(f'.{export_format}')
        tmp_file_names.append(tmp_file_name)
        os.close(tmp_fd)

        # let <img> sources below STATIC_URL point to the files in STATIC_ROOT,
        # STATIC_URL is escaped to be matched literally and the replacement is
        # a function, so that backslashes in STATIC_ROOT are not treated as
        # escape sequences of a replacement template
        static_root = str(Path(settings.STATIC_ROOT))
        html = re.sub(
            r'(<img.+src=["\'])' + re.escape(settings.STATIC_URL) + r'([\w\-\@?^=%&/~\+#]+)',
            lambda match: f'{match.group(1)}{static_root}/{match.group(2)}',
            html
        )

        # convert the file using pandoc
        log.info('Export %s document using args %s.', export_format, pandoc_args)
        pypandoc.convert_text(html, export_format, format='html', outputfile=tmp_file_name, extra_args=pandoc_args)

        # read the created temporary file
        with open(tmp_file_name, 'rb') as fp:
            return fp.read()
    finally:
        # delete temporary files
        for tmp_name in tmp_file_names:
            try:
                os.remove(tmp_name)
            except FileNotFoundError:
                # the file is already gone, there is nothing left to clean up
                pass


def get_pandoc_content_disposition(export_format, title):
    """Build the value of the Content-Disposition header for an export.

    Args:
        export_format: The export format, used as the file extension.
        title: The file name without extension.

    Returns:
        The header value. For ``pdf`` it only carries the file name (so that
        the pdf is displayed in the browser), for every other format it is
        prefixed with ``attachment; ``.
    """
    file_name = f'filename="{title}.{export_format}"'

    if export_format != 'pdf':
        return f'attachment; {file_name}'

    # display pdf in browser
    return file_name


def get_pandoc_args(export_format, context):
    """Assemble the extra command line arguments for a pandoc export.

    The arguments start with a copy of ``settings.EXPORT_PANDOC_ARGS`` for the
    given format and are then adjusted to the installed pandoc version.

    Args:
        export_format: The pandoc output format, e.g. ``pdf`` or ``docx``.
        context: A mapping which may contain ``view`` (used to find a reference
            document) and ``resource_path`` (a path relative to
            ``settings.MEDIA_ROOT``).

    Returns:
        A new list of pandoc arguments.
    """
    installed_version = get_pandoc_version()
    is_pandoc_2_or_later = installed_version >= Version('2')

    # work on a copy, otherwise the list in the settings would be changed
    args = [*settings.EXPORT_PANDOC_ARGS.get(export_format, [])]

    if export_format == 'pdf':
        if installed_version < Version('3'):
            # we used xelatex before pandoc 3
            args = [arg.replace('--pdf-engine=lualatex', '--pdf-engine=xelatex') for arg in args]

    elif export_format in ['docx', 'odt']:
        # find and add a possible reference document
        reference_document = get_pandoc_reference_document(export_format, context)
        if reference_document:
            # before pandoc 2 the option name contained the export format
            if is_pandoc_2_or_later:
                option = '--reference-doc'
            else:
                option = f'--reference-{export_format}'
            args.append(f'{option}={reference_document}')

    # add STATIC_ROOT and possible additional resource paths
    if is_pandoc_2_or_later:
        args.append(f'--resource-path={settings.STATIC_ROOT}')
        if 'resource_path' in context:
            media_path = Path(settings.MEDIA_ROOT) / context['resource_path']
            args.append(f'--resource-path={media_path}')

    return args


def get_pandoc_reference_document(export_format, context):
    """Return the first configured reference document that actually exists.

    Args:
        export_format: The export format, handed to
            ``get_pandoc_reference_documents``.
        context: The context, handed to ``get_pandoc_reference_documents``.

    Returns:
        The ``Path`` of the first existing reference document, or ``None`` if
        none of the candidates exists.
    """
    # collect all configured reference documents
    for reference_document in get_pandoc_reference_documents(export_format, context):
        # skip entries which are not set
        if not reference_document:
            continue

        # an entry may be a plain string instead of a Path, so it is converted
        # before the file system is checked
        reference_path = Path(reference_document)
        if reference_path.exists():
            return reference_path

    return None


def get_pandoc_reference_documents(export_format, context):
    """Collect the candidate reference documents for an export, best match first.

    For ``odt`` and ``docx`` the candidates are, in this order: the document
    configured for the uri of the view in the context, the generic document
    from the settings and the default document in the ``share`` directory of
    the ``rdmo`` app. Entries are not checked for existence here.

    Args:
        export_format: The export format.
        context: A mapping which may contain a ``view`` with a ``uri``.

    Returns:
        A list of candidates, empty for every format but ``odt`` and ``docx``.
    """
    # try to get the uri of the view from the context, if it is not set, the current url should be project_answers
    try:
        view_uri = context['view'].uri
    except (KeyError, AttributeError):
        view_uri = None

    # names of the view specific setting, the generic setting and the default file for each format
    lookup = {
        'odt': ('EXPORT_REFERENCE_ODT_VIEWS', 'EXPORT_REFERENCE_ODT', 'reference.odt'),
        'docx': ('EXPORT_REFERENCE_DOCX_VIEWS', 'EXPORT_REFERENCE_DOCX', 'reference.docx'),
    }
    if export_format not in lookup:
        return []

    views_setting, generic_setting, default_file_name = lookup[export_format]
    candidates = []

    # append view specific custom reference document, the setting is only read if there is a view uri
    if view_uri:
        documents_by_view = getattr(settings, views_setting)
        if view_uri in documents_by_view:
            candidates.append(documents_by_view[view_uri])

    # append generic custom reference document
    generic_document = getattr(settings, generic_setting)
    if generic_document:
        candidates.append(generic_document)

    # append the default reference document
    candidates.append(Path(apps.get_app_config('rdmo').path) / 'share' / default_file_name)

    return candidates
2 changes: 1 addition & 1 deletion rdmo/core/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@
EXPORT_REFERENCE_DOCX = None

EXPORT_PANDOC_ARGS = {
'pdf': ['-V', 'geometry:a4paper, margin=1in', '--pdf-engine=xelatex'],
'pdf': ['-V', 'geometry:a4paper, margin=1in', '--pdf-engine=lualatex'],
'rtf': ['--standalone']
}

Expand Down
156 changes: 156 additions & 0 deletions rdmo/core/tests/test_pandoc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
from pathlib import Path

import pytest

from django.apps import apps

from packaging.version import Version

from ..pandoc import get_pandoc_args, get_pandoc_reference_document, get_pandoc_reference_documents, get_pandoc_version

# locations the expected arguments refer to
rdmo_path = Path(apps.get_app_config('rdmo').path)
testing_path = rdmo_path.parent / 'testing'

# mocked pandoc versions, covering both sides of the 2.x and 3.x switches
pandoc_versions = ['1.9.0', '2.0.0', '3.0.0', '3.5.0']

# formats without an entry in pandoc_args_map are looked up as 'other'
export_formats = ['rtf', 'odt', 'docx', 'html', 'markdown', 'tex', 'pdf']

# building blocks which are repeated in the expected arguments below
geometry_args = ['-V', 'geometry:a4paper, margin=1in']
resource_path_arg = f'--resource-path={testing_path}/static_root'
reference_docx = f'{rdmo_path}/share/reference.docx'
reference_odt = f'{rdmo_path}/share/reference.odt'

# expected result of get_pandoc_args for each pandoc version and export format
pandoc_args_map = {
    '1.9.0': {
        'pdf': [*geometry_args, '--pdf-engine=xelatex'],
        'rtf': ['--standalone'],
        'docx': [f'--reference-docx={reference_docx}'],
        'odt': [f'--reference-odt={reference_odt}'],
        'other': []
    },
    '2.0.0': {
        'pdf': [*geometry_args, '--pdf-engine=xelatex', resource_path_arg],
        'rtf': ['--standalone', resource_path_arg],
        'docx': [f'--reference-doc={reference_docx}', resource_path_arg],
        'odt': [f'--reference-doc={reference_odt}', resource_path_arg],
        'other': [resource_path_arg]
    },
    '3.0.0': {
        'pdf': [*geometry_args, '--pdf-engine=lualatex', resource_path_arg],
        'rtf': ['--standalone', resource_path_arg],
        'docx': [f'--reference-doc={reference_docx}', resource_path_arg],
        'odt': [f'--reference-doc={reference_odt}', resource_path_arg],
        'other': [resource_path_arg]
    },
    '3.5.0': {
        'pdf': [*geometry_args, '--pdf-engine=lualatex', resource_path_arg],
        'rtf': ['--standalone', resource_path_arg],
        'docx': [f'--reference-doc={reference_docx}', resource_path_arg],
        'odt': [f'--reference-doc={reference_odt}', resource_path_arg],
        'other': [resource_path_arg]
    }
}

class MockedView:
    """Stand-in for a view in the tests, it only provides a ``uri``."""

    uri = 'http://example.com/terms/views/view'

@pytest.mark.parametrize('pandoc_version', pandoc_versions)
def test_get_pandoc_version(mocker, pandoc_version):
    """The version string reported by pypandoc is returned as a parsed ``Version``."""
    mocker.patch('pypandoc.get_pandoc_version', return_value=pandoc_version)

    expected_version = Version(pandoc_version)

    assert get_pandoc_version() == expected_version


@pytest.mark.parametrize('pandoc_version', pandoc_versions)
@pytest.mark.parametrize('export_format', export_formats)
def test_get_pandoc_args(settings, mocker, pandoc_version, export_format):
    """The arguments match ``pandoc_args_map`` for every version and format."""
    mocker.patch('pypandoc.get_pandoc_version', return_value=pandoc_version)

    # formats without an own entry share the expectation stored as 'other'
    expected_by_format = pandoc_args_map[pandoc_version]
    expected_args = expected_by_format.get(export_format, expected_by_format['other'])

    assert get_pandoc_args(export_format, {}) == expected_args


def test_get_pandoc_reference_document(mocker):
    """The first candidate which exists on disk is returned."""
    share_path = rdmo_path / 'share'
    candidates = [
        share_path / 'missing.docx',
        share_path / 'reference.docx',
        share_path / 'reference.odt'
    ]
    mocker.patch('rdmo.core.pandoc.get_pandoc_reference_documents', return_value=candidates)

    # return the first existing file
    assert get_pandoc_reference_document('other', {}) == share_path / 'reference.docx'


def test_get_pandoc_reference_document_missing(mocker):
    """``None`` is returned if none of the candidates exists on disk."""
    share_path = rdmo_path / 'share'
    candidates = [
        share_path / 'missing.docx',
        share_path / 'missing.odt'
    ]
    mocker.patch('rdmo.core.pandoc.get_pandoc_reference_documents', return_value=candidates)

    assert get_pandoc_reference_document('other', {}) is None


@pytest.mark.parametrize('export_format', export_formats)
def test_get_pandoc_reference_documents(export_format):
    """Without view and custom settings only the default document is a candidate."""
    share_path = Path(apps.get_app_config('rdmo').path) / 'share'

    # only docx and odt exports use a reference document
    if export_format in ['docx', 'odt']:
        expected_documents = [share_path / f'reference.{export_format}']
    else:
        expected_documents = []

    assert get_pandoc_reference_documents(export_format, {}) == expected_documents


@pytest.mark.parametrize('export_format', export_formats)
def test_get_pandoc_reference_documents_view(export_format):
    """A view without a configured reference document does not add a candidate."""
    context = {'view': MockedView()}

    # only docx and odt exports use a reference document
    if export_format in ['docx', 'odt']:
        expected_documents = [rdmo_path / 'share' / f'reference.{export_format}']
    else:
        expected_documents = []

    assert get_pandoc_reference_documents(export_format, context) == expected_documents


@pytest.mark.parametrize('export_format', export_formats)
def test_get_pandoc_reference_documents_view_settings(settings, export_format):
    """A document configured for the uri of the view comes before the default one."""
    mock_file = rdmo_path / 'share' / f'mock.{export_format}'
    documents_by_view = {'http://example.com/terms/views/view': mock_file}

    # only the setting matching the export format is overridden
    if export_format == 'docx':
        settings.EXPORT_REFERENCE_DOCX_VIEWS = documents_by_view
    elif export_format == 'odt':
        settings.EXPORT_REFERENCE_ODT_VIEWS = documents_by_view

    if export_format in ['docx', 'odt']:
        expected_documents = [mock_file, rdmo_path / 'share' / f'reference.{export_format}']
    else:
        expected_documents = []

    context = {'view': MockedView()}
    assert get_pandoc_reference_documents(export_format, context) == expected_documents


@pytest.mark.parametrize('export_format', export_formats)
def test_get_pandoc_reference_documents_settings(settings, export_format):
    """A generic document from the settings comes before the default one."""
    mock_file = rdmo_path / 'share' / f'mock.{export_format}'

    # only the setting matching the export format is overridden
    if export_format == 'docx':
        settings.EXPORT_REFERENCE_DOCX = mock_file
    elif export_format == 'odt':
        settings.EXPORT_REFERENCE_ODT = mock_file

    if export_format in ['docx', 'odt']:
        expected_documents = [mock_file, rdmo_path / 'share' / f'reference.{export_format}']
    else:
        expected_documents = []

    assert get_pandoc_reference_documents(export_format, {}) == expected_documents
Loading
Loading