Skip to content

Commit 41f36fd

Browse files
committed
feat: option to fix canonical URLs of a multidocumenter build
1 parent 5da2ba5 commit 41f36fd

24 files changed

+607
-4
lines changed

src/MultiDocumenter.jl

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,12 @@ import Gumbo, AbstractTrees
44
using HypertextLiteral
55
import Git: git
66

7+
module DocumenterTools
8+
import Gumbo, AbstractTrees
9+
include("documentertools/walkdocs.jl")
10+
include("documentertools/canonical_urls.jl")
11+
end
12+
713
"""
814
SearchConfig(index_versions = ["stable"], engine = MultiDocumenter.FlexSearch, lowfi = false)
915
@@ -25,13 +31,22 @@ struct MultiDocRef
2531
path::String
2632
name::String
2733

34+
fix_canonical_url::Bool
35+
2836
# these are not actually used internally
2937
giturl::String
3038
branch::String
3139
end
3240

33-
function MultiDocRef(; upstream, name, path, giturl = "", branch = "gh-pages")
34-
MultiDocRef(upstream, path, name, giturl, branch)
41+
function MultiDocRef(;
42+
upstream,
43+
name,
44+
path,
45+
giturl = "",
46+
branch = "gh-pages",
47+
fix_canonical_url = true,
48+
)
49+
MultiDocRef(upstream, path, name, fix_canonical_url, giturl, branch)
3550
end
3651

3752
struct DropdownNav
@@ -76,6 +91,7 @@ end
7691
include("renderers.jl")
7792
include("search/flexsearch.jl")
7893
include("search/stork.jl")
94+
include("canonical.jl")
7995

8096
const DEFAULT_ENGINE = SearchConfig(index_versions = ["stable", "dev"], engine = FlexSearch)
8197

@@ -91,6 +107,7 @@ const DEFAULT_ENGINE = SearchConfig(index_versions = ["stable", "dev"], engine =
91107
prettyurls = true,
92108
rootpath = "/",
93109
hide_previews = true,
110+
canonical = nothing,
94111
)
95112
96113
Aggregates multiple Documenter.jl-based documentation pages `docs` into `outdir`.
@@ -105,6 +122,9 @@ Aggregates multiple Documenter.jl-based documentation pages `docs` into `outdir`
105122
- `prettyurls` removes all `index.html` suffixes from links in the global navigation.
106123
- `rootpath` is the path your site ends up being deployed at, e.g. `/foo/` if it's hosted at `https://bar.com/foo`
107124
- `hide_previews` removes preview builds from the aggregated documentation.
125+
- `canonical`: if set to the root URL of the MultiDocumenter site, will check and, if necessary, update the
126+
canonical URL tags for each package site to point to the directory. Similar to the `canonical` argument of
127+
`Documenter.HTML` constructor.
108128
"""
109129
function make(
110130
outdir,
@@ -117,10 +137,19 @@ function make(
117137
prettyurls = true,
118138
rootpath = "/",
119139
hide_previews = true,
140+
canonical::Union{AbstractString,Nothing} = nothing,
120141
)
121142
maybe_clone(flatten_multidocrefs(docs))
122143

123-
dir = make_output_structure(flatten_multidocrefs(docs), prettyurls, hide_previews)
144+
if !isnothing(canonical)
145+
canonical = rstrip(canonical, '/')
146+
end
147+
dir = make_output_structure(
148+
flatten_multidocrefs(docs),
149+
prettyurls,
150+
hide_previews;
151+
canonical,
152+
)
124153
out_assets = joinpath(dir, "assets")
125154
if assets_dir !== nothing && isdir(assets_dir)
126155
cp(assets_dir, out_assets)
@@ -192,7 +221,12 @@ function maybe_clone(docs::Vector{MultiDocRef})
192221
end
193222
end
194223

195-
function make_output_structure(docs::Vector{MultiDocRef}, prettyurls, hide_previews)
224+
function make_output_structure(
225+
docs::Vector{MultiDocRef},
226+
prettyurls,
227+
hide_previews;
228+
canonical::Union{AbstractString,Nothing},
229+
)
196230
dir = mktempdir()
197231

198232
for doc in docs
@@ -210,6 +244,8 @@ function make_output_structure(docs::Vector{MultiDocRef}, prettyurls, hide_previ
210244
if hide_previews && isdir(previewpath)
211245
rm(previewpath, recursive = true)
212246
end
247+
248+
fix_canonical_url!(doc; canonical, root_dir = dir)
213249
end
214250

215251
open(joinpath(dir, "index.html"), "w") do io

src/canonical.jl

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# This file contains the functions used to implement the canonical URL
2+
# update functionality.
3+
"""
    fix_canonical_url!(doc::MultiDocRef; canonical, root_dir)

Rewrite the canonical URLs of the package documentation described by `doc`, which is
expected to live in `joinpath(root_dir, doc.path)`.

Nothing is done when `canonical` is `nothing` or when `doc.fix_canonical_url` is `false`.
Any exception raised while updating the files is logged with `@error` and not rethrown.
"""
function fix_canonical_url!(
    doc::MultiDocRef;
    canonical::Union{AbstractString,Nothing},
    root_dir::AbstractString,
)
    # Both the global switch (`canonical`) and the per-package switch must be on.
    if canonical === nothing || !doc.fix_canonical_url
        return nothing
    end
    package_root = joinpath(root_dir, doc.path)
    try
        # The package's canonical root is the site root followed by the package path.
        package_canonical = join((canonical, doc.path), '/')
        DocumenterTools.update_canonical_links(package_root; canonical = package_canonical)
    catch err
        @error "Unable to update canonical URLs for this package" doc exception = (err, catch_backtrace())
    end
end

src/documentertools/canonical_urls.jl

Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,247 @@
1+
# This is a vendored version of code that should eventually be moved into DocumenterTools.jl
2+
# once the generic interface has crystallized, and then DocumenterTools should be added
3+
# as a dependency here.
4+
#
5+
# WIP upstream PR: https://github.com/JuliaDocs/DocumenterTools.jl/pull/78
6+
#
7+
# Note: these functions are not part of MultiDocumenter public API.
8+
9+
"""
10+
DocumenterTools.update_canonical_links_for_version(
11+
docs_directory::AbstractString;
12+
canonical::AbstractString,
13+
)
14+
15+
- **`canonical`**: corresponds to the `canonical` attribute of `Documenter.HTML`,
16+
specifying the root of the canonical URL.
17+
"""
18+
function update_canonical_links_for_version(
    docs_directory::AbstractString;
    canonical::AbstractString,
)
    canonical = rstrip(canonical, '/')

    walkdocs(docs_directory, isdochtml) do fileinfo
        @debug "update_canonical_links: checking $(fileinfo.relpath)"
        # Build the canonical URL of this page from its relative path. A trailing
        # index.html is dropped in favour of a trailing slash.
        segments = splitpath(fileinfo.relpath)
        target_href = if last(segments) == "index.html"
            joinurl(canonical, segments[1:end-1]...) * '/'
        else
            joinurl(canonical, segments...)
        end

        document = Gumbo.parsehtml(read(fileinfo.fullpath, String))
        tag_count = 0
        changed = false
        # First pass: fix up every canonical tag that is already in the page.
        for node in AbstractTrees.PreOrderDFS(document.root)
            is_canonical_element(node) || continue
            tag_count += 1
            old_href = Gumbo.getattr(node, "href", nothing)
            old_href == target_href && continue
            Gumbo.setattr!(node, "href", target_href)
            @debug "update_canonical_links_for_version: canonical_href updated" canonical_href = old_href new_canonical_href = target_href fileinfo.relpath
            changed = true
        end
        # Second pass (only if the page had no canonical tag): append one to the first
        # <head> element encountered.
        if tag_count == 0
            for node in AbstractTrees.PreOrderDFS(document.root)
                (node isa Gumbo.HTMLElement && Gumbo.tag(node) == :head) || continue
                link = Gumbo.HTMLElement{:link}(
                    [],
                    node,
                    Dict("rel" => "canonical", "href" => target_href),
                )
                push!(node.children, link)
                @debug "update_canonical_links_for_version: added new canonical_href" new_canonical_href = target_href fileinfo.relpath
                changed = true
                break
            end
        end
        # Only touch the file on disk when the DOM was actually modified.
        if changed
            open(fileinfo.fullpath, "w") do io
                print(io, document)
            end
        end
        if tag_count > 1
            @error "Multiple canonical tags!" file = fileinfo.relpath
        end
    end
end
70+
71+
# Predicate: `true` only for `<link>` elements whose `rel` attribute is "canonical".
function is_canonical_element(e)
    e isa Gumbo.HTMLElement || return false
    Gumbo.tag(e) == :link || return false
    return Gumbo.getattr(e, "rel", nothing) == "canonical"
end
75+
# Join URL segments by inserting a `/` between consecutive segments.
function joinurl(ps::AbstractString...)
    return join(ps, '/')
end
76+
77+
"""
78+
Takes the multi-versioned Documenter site in `docs_directory` and updates the HTML canonical URLs
79+
to point to `canonical`.
80+
"""
81+
function update_canonical_links(docs_directory::AbstractString; canonical::AbstractString)
    canonical = rstrip(canonical, '/')
    docs_directory = abspath(docs_directory)
    isdir(docs_directory) || throw(ArgumentError("No such directory: $(docs_directory)"))

    # Work out which subdirectory holds the canonical version. The preferred source is
    # the top-level redirect index.html; if that file is missing, or no redirect URL can
    # be extracted from it, fall back to versions.js.
    redirect_index_html_path = joinpath(docs_directory, "index.html")
    redirect_url = if isfile(redirect_index_html_path)
        get_meta_redirect_url(redirect_index_html_path)
    else
        nothing
    end
    canonical_path = if isnothing(redirect_url)
        # canonical_version_from_versions_js returns a single string. It is wrapped in a
        # vector so that splatting it below yields one path segment, not its characters.
        String[canonical_version_from_versions_js(docs_directory)]
    else
        splitpath(normpath(redirect_url))
    end
    canonical_full_root = joinurl(canonical, canonical_path...)
    # If we have determined which version should be the canonical version, we can actually
    # go and run update_canonical_links_for_version on each directory. First, we'll gather
    # up the list of Documenter (or other) directories we actually want to run over.
    docs_subdirectory_queue, docs_subdirectories = readdir(docs_directory), String[]
    while !isempty(docs_subdirectory_queue)
        docs_subdirectory = popfirst!(docs_subdirectory_queue)
        path = joinpath(docs_directory, docs_subdirectory)
        # We'll skip all files. This includes files such as index.html, which in this
        # directory will likely be the redirect. Also, links should be pointing to other
        # versions, so we'll skip them too.
        if !isdir(path) || islink(path)
            continue
        end
        # The previews directory should contain other Documenter directories, so we just
        # add its subdirectories to the queue and ignore the parent directory itself.
        if docs_subdirectory == "previews"
            append!(docs_subdirectory_queue, joinpath.(docs_subdirectory, readdir(path)))
            continue
        end
        # For other directories, we check for the presence of siteinfo.js, and warn if that
        # is missing (but we still try to go and update the canonical URLs).
        if !isfile(joinpath(path, "siteinfo.js"))
            @warn "update_canonical_links: missing siteinfo.js file" path
        end
        push!(docs_subdirectories, path)
    end
    # Finally, we can run update_canonical_links_for_version on each directory. Every
    # version is pointed at the same canonical root.
    for path in docs_subdirectories
        @debug "Updating canonical URLs for a version" path canonical_full_root
        update_canonical_links_for_version(path; canonical = canonical_full_root)
    end
    return nothing
end
126+
127+
"""
    canonical_directory_from_redirect_index_html(docs_directory::AbstractString)

Look at `index.html` in `docs_directory` and return the target of its
`<meta http-equiv="refresh" ...>` redirect, normalized and split into path components.

Returns `nothing` if there is no `index.html`, or if no redirect URL could be extracted
from it.
"""
function canonical_directory_from_redirect_index_html(docs_directory::AbstractString)
    redirect_index_html_path = joinpath(docs_directory, "index.html")
    isfile(redirect_index_html_path) || return nothing
    redirect_url = get_meta_redirect_url(redirect_index_html_path)
    # get_meta_redirect_url returns `nothing` when the page has no usable redirect tag;
    # passing that on to normpath would raise a MethodError.
    isnothing(redirect_url) && return nothing
    return splitpath(normpath(redirect_url))
end
133+
134+
"""
135+
Parses the HTML file at `indexhtml_path` and tries to extract the `url=...` value
136+
of the redirect `<meta http-equiv="refresh" ...>` tag.
137+
"""
138+
function get_meta_redirect_url(indexhtml_path::AbstractString)
    html = Gumbo.parsehtml(read(indexhtml_path, String))
    for e in AbstractTrees.PreOrderDFS(html.root)
        e isa Gumbo.HTMLElement || continue
        Gumbo.tag(e) == :meta || continue
        # The http-equiv keyword is ASCII case-insensitive in HTML, so "Refresh" and
        # "REFRESH" must be accepted as well.
        http_equiv = Gumbo.getattr(e, "http-equiv", nothing)
        (!isnothing(http_equiv) && lowercase(http_equiv) == "refresh") || continue
        content = Gumbo.getattr(e, "content", nothing)
        if isnothing(content)
            @warn "<meta http-equiv=\"refresh\" ...> with no content attribute" path =
                indexhtml_path
            continue
        end
        # Expected shape: "<seconds>; url=<target>". The `url` token is matched
        # case-insensitively, since `URL=` is also common in the wild.
        m = match(r"[0-9]+;\s*url=(.*)"i, content)
        if isnothing(m)
            @warn "Unable to parse content value of <meta http-equiv=\"refresh\" ...>" content path =
                indexhtml_path
            continue
        end
        # The first tag that parses successfully wins.
        return m.captures[1]
    end
    # No usable redirect tag was found anywhere in the document.
    return nothing
end
160+
161+
"""
    canonical_version_from_versions_js(docs_directory)

Determine which version directory of the Documenter site in `docs_directory` should be
treated as the canonical one, based on the `DOC_VERSIONS` list in `versions.js`.

Entries whose path does not exist in `docs_directory` are dropped with a warning. Among
the remaining entries, the single symlink whose name is not a version number is chosen
if there is one. If there is none, the `vN` entry with the highest `N` is used.

Returns the chosen entry as it is spelled in `versions.js`.

Throws an `ArgumentError` if `docs_directory` is not a directory or has no `versions.js`,
and an `ErrorException` if the canonical version cannot be determined.
"""
function canonical_version_from_versions_js(docs_directory)
    isdir(docs_directory) || throw(ArgumentError("Not a directory: $(docs_directory)"))
    # Try to extract the list of versions from versions.js
    versions_js = joinpath(docs_directory, "versions.js")
    isfile(versions_js) ||
        throw(ArgumentError("versions.js is missing in $(docs_directory)"))
    versions = map(extract_versions_list(versions_js)) do version_str
        isversion, version_number = if occursin(Base.VERSION_REGEX, version_str)
            true, VersionNumber(version_str)
        else
            false, nothing
        end
        fullpath = joinpath(docs_directory, version_str)
        return (;
            path = version_str,
            path_exists = isdir(fullpath) || islink(fullpath),
            symlink = islink(fullpath),
            isversion,
            version_number,
            fullpath,
        )
    end
    # We'll filter out a couple of potential bad cases and issue warnings. This must be
    # the in-place `filter!`: with plain `filter` the result would be thrown away and the
    # missing entries would still be considered below.
    filter!(versions) do vi
        if !vi.path_exists
            @warn "update_canonical_links: path does not exists or is not a directory" docs_directory vi
            return false
        end
        return true
    end
    # We need to determine the canonical path. This would usually be something like the
    # stable/ directory, but it can have a different name, including being a version
    # number. So first we try to find a non-version directory _that is a symlink_ (so that
    # it wouldn't get confused with previews/ or dev builds). If that fails, we try to find
    # the directory matching `v[0-9]+` with the highest version number. This does not cover
    # all possible cases, but should be good enough for now.
    if isempty(versions)
        error("Unable to determine the canonical path. Found no version directories")
    end

    non_version_symlinks = filter(vi -> !vi.isversion && vi.symlink, versions)
    canonical_version = if isempty(non_version_symlinks)
        # We didn't find any non-version symlinks, so we'll try to find the vN directory
        # now as a fallback. Each match becomes an `N => entry` pair.
        major_versions = map(versions) do vi
            m = match(r"^v([0-9]+)$", vi.path)
            isnothing(m) && return nothing
            parse(Int, m[1]) => vi
        end
        filter!(!isnothing, major_versions)
        if isempty(major_versions)
            error("Unable to determine the canonical path. Found no version directories")
        end
        # Note: findmax(first, major_versions) would be nicer, but is not supported
        # on Julia 1.6
        _, idx = findmax(first.(major_versions))
        major_versions[idx][2]
    elseif length(non_version_symlinks) > 1
        error(
            "Unable to determine the canonical path. Found multiple non-version symlinks.\n$(non_version_symlinks)",
        )
    else
        only(non_version_symlinks)
    end

    return canonical_version.path
end
228+
229+
"""
    extract_versions_list(versions_js::AbstractString)

Read the file at `versions_js` and return the entries of its `var DOC_VERSIONS = [...]`
array as a vector of strings, with surrounding whitespace and double quotes removed.

Throws an `ArgumentError` if the file does not exist, if no `DOC_VERSIONS` array can be
found in it, or if the array turns out to have no non-empty entries.
"""
function extract_versions_list(versions_js::AbstractString)
    js_path = abspath(versions_js)
    if !isfile(js_path)
        throw(ArgumentError("No such file: $(js_path)"))
    end
    js_source = read(js_path, String)
    found = match(r"var\s+DOC_VERSIONS\s*=\s*\[([0-9A-Za-z\"\s.,+-]+)\]", js_source)
    if found === nothing
        throw(ArgumentError("Could not find DOC_VERSIONS in $(js_path):\n$(js_source)"))
    end
    # Each comma-separated item still carries its quotes and layout whitespace.
    is_padding(c) = isspace(c) || (c == '"')
    entries = [strip(is_padding, piece) for piece in split(found[1], ",")]
    # A trailing comma in the JS array leaves an empty item behind; drop those.
    filter!(!isempty, entries)
    if isempty(entries)
        throw(ArgumentError("DOC_VERSIONS empty in $(js_path):\n$(js_source)"))
    end
    return entries
end

0 commit comments

Comments
 (0)