Skip to content

Commit e804549

Browse files
committed
feat(exporter): sanitize filename
1 parent cd02491 commit e804549

File tree

8 files changed

+107
-10
lines changed

8 files changed

+107
-10
lines changed

internal/exporter/epub/export.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818

1919
"github.com/ncarlier/readflow/pkg/downloader"
2020
"github.com/ncarlier/readflow/pkg/mediatype"
21+
"github.com/ncarlier/readflow/pkg/utils"
2122
)
2223

2324
var errSkippedURL = errors.New("skip processing url")
@@ -59,7 +60,7 @@ func (exp *EpubExporter) Export(ctx context.Context, article *model.Article) (*d
5960
return &downloader.WebAsset{
6061
Data: buf.Bytes(),
6162
ContentType: mediatype.Epub,
62-
Name: strings.TrimRight(article.Title, ". ") + ".epub",
63+
Name: utils.SanitizeFilename(article.Title) + ".epub",
6364
}, nil
6465
}
6566

internal/exporter/html/html.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,14 @@ package html
33
import (
44
"bytes"
55
"context"
6-
"strings"
76
"text/template"
87

98
"github.com/ncarlier/readflow/internal/exporter"
109
"github.com/ncarlier/readflow/internal/model"
1110

1211
"github.com/ncarlier/readflow/pkg/downloader"
1312
"github.com/ncarlier/readflow/pkg/mediatype"
13+
"github.com/ncarlier/readflow/pkg/utils"
1414
)
1515

1616
var articleAsHTMLTpl = template.Must(template.New("article-as-html").Parse(`
@@ -22,7 +22,7 @@ var articleAsHTMLTpl = template.Must(template.New("article-as-html").Parse(`
2222
<meta name="og:title" content="{{ .Title }}"/>
2323
<meta name="og:url" content="{{ .URL }}"/>
2424
<meta name="og:image" content="{{ .Image }}"/>
25-
<meta name="og:revised" content="{{ .PublishedAt }}"/>
25+
<meta name="og:revised" content="{{if .PublishedAt}}{{ .PublishedAt }}{{else}}{{ .CreatedAt }}{{end}}"/>
2626
</head>
2727
<body>{{ .HTML }}</body>
2828
</html>
@@ -44,7 +44,7 @@ func (exp *HTMLExporter) Export(ctx context.Context, article *model.Article) (*d
4444
return &downloader.WebAsset{
4545
Data: buffer.Bytes(),
4646
ContentType: mediatype.HTML,
47-
Name: strings.TrimRight(article.Title, ". ") + ".html",
47+
Name: utils.SanitizeFilename(article.Title) + ".html",
4848
}, nil
4949
}
5050

internal/exporter/html/single.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414

1515
"github.com/ncarlier/readflow/pkg/downloader"
1616
"github.com/ncarlier/readflow/pkg/mediatype"
17+
"github.com/ncarlier/readflow/pkg/utils"
1718

1819
"github.com/go-shiori/dom"
1920
"golang.org/x/net/html"
@@ -48,7 +49,7 @@ func (exp *SingleHTMLExporter) Export(ctx context.Context, article *model.Articl
4849
return &downloader.WebAsset{
4950
Data: data,
5051
ContentType: mediatype.HTML,
51-
Name: strings.TrimRight(article.Title, ". ") + ".html",
52+
Name: utils.SanitizeFilename(article.Title) + ".html",
5253
}, nil
5354
}
5455

internal/exporter/html/zip.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"github.com/ncarlier/readflow/internal/model"
1818

1919
"github.com/ncarlier/readflow/pkg/downloader"
20+
"github.com/ncarlier/readflow/pkg/utils"
2021
)
2122

2223
// ZIPExporter convert an article to a ZIP archive
@@ -53,7 +54,7 @@ func (exp *ZIPExporter) Export(ctx context.Context, article *model.Article) (*do
5354
return &downloader.WebAsset{
5455
Data: buf.Bytes(),
5556
ContentType: "application/zip",
56-
Name: strings.TrimRight(article.Title, ". ") + ".zip",
57+
Name: utils.SanitizeFilename(article.Title) + ".zip",
5758
}, nil
5859
}
5960

internal/exporter/md/md.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,14 @@ package md
22

33
import (
44
"context"
5-
"strings"
65

76
"github.com/ncarlier/readflow/internal/exporter"
87
"github.com/ncarlier/readflow/internal/exporter/html"
98
"github.com/ncarlier/readflow/internal/model"
109

1110
"github.com/ncarlier/readflow/pkg/downloader"
1211
"github.com/ncarlier/readflow/pkg/mediatype"
12+
"github.com/ncarlier/readflow/pkg/utils"
1313

1414
md "github.com/JohannesKaufmann/html-to-markdown"
1515
)
@@ -42,7 +42,7 @@ func (exp *MarkdownExporter) Export(ctx context.Context, article *model.Article)
4242
return &downloader.WebAsset{
4343
Data: data,
4444
ContentType: mediatype.HTML,
45-
Name: strings.TrimRight(article.Title, ". ") + ".md",
45+
Name: utils.SanitizeFilename(article.Title) + ".md",
4646
}, nil
4747
}
4848

internal/exporter/pdf/pdf.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,13 @@ import (
77
"io"
88
"mime/multipart"
99
"net/http"
10-
"strings"
1110

1211
"github.com/ncarlier/readflow/internal/exporter"
1312
"github.com/ncarlier/readflow/internal/exporter/html"
1413
"github.com/ncarlier/readflow/internal/model"
1514

1615
"github.com/ncarlier/readflow/pkg/downloader"
16+
"github.com/ncarlier/readflow/pkg/utils"
1717
)
1818

1919
// PDFExporter convert an article to PDF file
@@ -78,6 +78,6 @@ func (exp *PDFExporter) Export(ctx context.Context, article *model.Article) (*do
7878
return &downloader.WebAsset{
7979
Data: body,
8080
ContentType: res.Header.Get("Content-Type"),
81-
Name: strings.TrimRight(article.Title, ". ") + ".pdf",
81+
Name: utils.SanitizeFilename(article.Title) + ".pdf",
8282
}, nil
8383
}

pkg/utils/filename.go

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
package utils
2+
3+
import (
4+
"regexp"
5+
"strings"
6+
"unicode"
7+
8+
"golang.org/x/text/runes"
9+
"golang.org/x/text/transform"
10+
"golang.org/x/text/unicode/norm"
11+
)
12+
13+
// MAX_FILENAME_SIZE is the upper bound, in bytes, on the length of the string
// returned by SanitizeFilename. Callers append the file extension afterwards,
// so the final file name can be longer than this value.
const MAX_FILENAME_SIZE = 255
14+
15+
// reg matches any run of two or more consecutive hyphens.
var reg = regexp.MustCompile("-{2,}")
16+
17+
func removeIllFormed(input string) (output string) {
18+
output, _, _ = transform.String(runes.ReplaceIllFormed(), input)
19+
return output
20+
}
21+
22+
// toLower returns input with its letters mapped to lower case, as done by
// strings.ToLower.
func toLower(input string) (output string) {
	return strings.ToLower(input)
}
26+
27+
// replaceNonAlphaNum returns input with every rune that is neither in the
// Latin script nor a Unicode decimal digit replaced by '-'. Latin letters
// outside ASCII (for example 'é' or 'ß') and non-ASCII digits are kept as-is.
//
// The substitution is done with strings.Map, which cannot fail, so no error
// has to be discarded and no local variable shadows the function name.
func replaceNonAlphaNum(input string) (output string) {
	return strings.Map(func(r rune) rune {
		if unicode.Is(unicode.Latin, r) || unicode.IsDigit(r) {
			return r
		}
		return '-'
	}, input)
}
37+
38+
func removeAccents(input string) (output string) {
39+
t := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
40+
s, _, _ := transform.String(t, input)
41+
r := strings.NewReplacer("ł", "l", "Ł", "L")
42+
output = r.Replace(s)
43+
return output
44+
}
45+
46+
func dedupHyp(input string) (output string) {
47+
output = reg.ReplaceAllString(input, "-")
48+
return output
49+
}
50+
51+
// trimEnds strips, from both ends of input, every rune that is neither a
// letter nor a digit. Runes in the middle of the string are left untouched.
func trimEnds(input string) (output string) {
	isEdgeJunk := func(r rune) bool {
		return !(unicode.IsLetter(r) || unicode.IsDigit(r))
	}
	return strings.TrimFunc(input, isEdgeJunk)
}
57+
58+
// SanitizeFilename sanitize filename
59+
func SanitizeFilename(input string) (output string) {
60+
output = trimEnds(dedupHyp(replaceNonAlphaNum(removeAccents(toLower(removeIllFormed(input))))))
61+
nc := len(output)
62+
if nc < MAX_FILENAME_SIZE {
63+
return output
64+
}
65+
return output[0:MAX_FILENAME_SIZE]
66+
}

pkg/utils/tests/filename_test.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
package test
2+
3+
import (
4+
"testing"
5+
6+
"github.com/ncarlier/readflow/pkg/utils"
7+
"github.com/stretchr/testify/require"
8+
)
9+
10+
// testCases pairs raw inputs with the file name utils.SanitizeFilename is
// expected to produce for them.
var testCases = []struct {
	input    string
	expected string
}{
	{input: "a.b.html", expected: "a-b-html"},
	{input: "whatēverwëirduserînput", expected: "whateverweirduserinput"},
}
23+
24+
func TestSanitizeFilename(t *testing.T) {
25+
for _, testCase := range testCases {
26+
require.Equal(t, testCase.expected, utils.SanitizeFilename(testCase.input))
27+
}
28+
}

0 commit comments

Comments
 (0)