Add RSS import/export (#750)

* Add RSS export/import * Update README, man and help * Add tests for RSS import/export
jarun · Jul 8, 2024 · 6394fb2 · 6394fb2
1 parent 704018a
commit 6394fb2
Show file tree

Hide file tree

Showing 4 changed files with 114 additions and 6 deletions.
diff --git a/README.md b/README.md
@@ -69,7 +69,7 @@ To get started right away, jump to the [Quickstart](#quickstart) section. `buku`
 - Powerful search options (regex, substring...)
 - Continuous search with on the fly mode switch
 - Portable, merge-able database to sync between systems
-- Import/export bookmarks from/to HTML, XBEL, Markdown or Orgfile
+- Import/export bookmarks from/to HTML, XBEL, Markdown, RSS or Orgfile
 - Smart tag management using redirection (>>, >, <<)
 - Multi-threaded full DB refresh
 - Manual encryption support
@@ -221,10 +221,11 @@ POWER TOYS:
                            format: [title](url) <!-- TAGS -->
                            export Orgfile, if file ends with '.org'
                            format: *[[url][title]] :tags:
+                           export rss feed if file ends with '.rss'
                            export buku DB, if file ends with '.db'
                            combines with search results, if opted
       -i, --import file    import bookmarks from file
-                           supports .html .xbel .json .md .org .db
+                           supports .html .xbel .json .md .org .rss .db
       -p, --print [...]    show record details by indices, ranges
                            print all bookmarks, if no arguments
                            -n shows the last n results (like tail)

diff --git a/buku b/buku
@@ -48,6 +48,7 @@ from subprocess import DEVNULL, PIPE, Popen
 from typing import Any, Dict, List, Optional, Tuple, NamedTuple
 from collections.abc import Sequence, Set, Callable
 from warnings import warn
+import xml.etree.ElementTree as ET
 
 import urllib3
 from bs4 import BeautifulSoup
@@ -2529,6 +2530,10 @@ class BukuDb:
                 res = convert_bookmark_set(resultset, 'xbel', old)
                 count += res['count']
                 outfp.write(res['data'])
+            elif filepath.endswith('.rss'):
+                res = convert_bookmark_set(resultset, 'rss', old)
+                count += res['count']
+                outfp.write(res['data'])
             else:
                 res = convert_bookmark_set(resultset, 'html', old)
                 count += res['count']
@@ -2881,6 +2886,8 @@ class BukuDb:
             items = import_md(filepath=filepath, newtag=newtag)
         elif filepath.endswith('org'):
             items = import_org(filepath=filepath, newtag=newtag)
+        elif filepath.endswith('rss'):
+            items = import_rss(filepath=filepath, newtag=newtag)
         elif filepath.endswith('json'):
             if not tacit:
                 resp = input('Add parent folder names as tags? (y/n): ')
@@ -3330,7 +3337,7 @@ def convert_bookmark_set(
         converted data and count of converted bookmark set
     """
     import html
-    assert export_type in ['markdown', 'html', 'org', 'xbel']
+    assert export_type in ['markdown', 'html', 'org', 'xbel', 'rss']
     #  compatibility
     resultset = bookmark_vars(bookmark_set)
     old = old or {}
@@ -3376,6 +3383,29 @@ def convert_bookmark_set(
             count += 1
 
         out += '</xbel>'
+    elif export_type == 'rss':
+        out = (
+            '<feed xmlns="http://www.w3.org/2005/Atom">\n'
+            '    <title>Bookmarks</title>\n'
+            '    <generator uri="https://github.com/jarun/buku">buku</generator>\n'
+        )
+
+        for row in resultset:
+            out += '    <entry>\n'
+            out += '        <title>' + title(row) + '</title>\n'
+            _url = html.escape(row.url).encode('ascii', 'xmlcharrefreplace').decode('utf-8')
+            out += '        <link href="%s" rel="alternate" type="text/html"/>\n' % _url
+            out += '        <id>%s</id>\n' % row.id
+            for tag in (t for t in row.tags.split(',') if t):
+                _tag = html.escape(tag).encode('ascii', 'xmlcharrefreplace').decode('utf-8')
+                out += '        <category term="%s"/>\n' % _tag
+            if row.desc:
+                _desc = html.escape(row.desc).encode('ascii', 'xmlcharrefreplace').decode('utf-8')
+                out += '        <content type="html"> <![CDATA[ <p>%s</p> ]]> </content>\n' % _desc
+            out += '    </entry>\n'
+            count += 1
+
+        out += '</feed>'
     elif export_type == 'html':
         timestamp = str(int(time.time()))
         out = (
@@ -3519,6 +3549,34 @@ def import_md(filepath: str, newtag: Optional[str]):
 
                     yield (url, title, delim_wrap(tags), None, 0, True, False)
 
+def import_rss(filepath: str, newtag: Optional[str]):
+    """Parse bookmarks from an Atom feed file (the format written for '.rss' exports).
+
+    Parameters
+    ----------
+    filepath : str
+        Path to RSS file.
+    newtag : str, optional
+        New tag for bookmarks in RSS file; prepended to each entry's tags.
+
+    Yields
+    ------
+    tuple
+        (url, title, tags, desc, 0, True, False) for every <entry> that has
+        a <link> with a non-empty href. title and desc are None when the
+        corresponding element is absent or has no text. tags is the
+        comma-joined category terms passed through delim_wrap().
+
+    Raises
+    ------
+    xml.etree.ElementTree.ParseError
+        If the file is not well-formed XML.
+    """
+
+    ns = {'atom': 'http://www.w3.org/2005/Atom'}
+    # Parse eagerly so the file handle is released before the first yield
+    # instead of staying open for as long as the caller keeps the generator.
+    with open(filepath, mode='r', encoding='utf-8') as infp:
+        root = ET.fromstring(infp.read())
+
+    for entry in root.findall('atom:entry', ns):
+        # Compare against None explicitly: an Element without children is falsy.
+        link = entry.find('atom:link', ns)
+        url = link.get('href') if link is not None else None
+        if not url:
+            # Nothing to bookmark without a URL; skip rather than abort the import.
+            continue
+        title = entry.find('atom:title', ns)
+        title = title.text if title is not None else None
+        # Categories lacking the 'term' attribute carry no tag name.
+        tags = ','.join(
+            cat.attrib['term'] for cat in entry.findall('atom:category', ns) if 'term' in cat.attrib
+        )
+        if newtag is not None:
+            tags = newtag + ',' + tags
+        desc = entry.find('atom:content', ns)
+        desc = desc.text if desc is not None else None
+        yield (url, title, delim_wrap(tags), desc, 0, True, False)
+
 def import_org(filepath: str, newtag: Optional[str]):
     """Parse bookmark org file.
 
@@ -5750,10 +5808,11 @@ POSITIONAL ARGUMENTS:
                          format: [title](url) <!-- TAGS -->
                          export Orgfile, if file ends with '.org'
                          format: *[[url][title]] :tags:
+                         export rss feed if file ends with '.rss'
                          export buku DB, if file ends with '.db'
                          combines with search results, if opted
     -i, --import file    import bookmarks from file
-                         supports .html .xbel .json .md .org .db
+                         supports .html .xbel .json .md .org .rss .db
     -p, --print [...]    show record details by indices, ranges
                          print all bookmarks, if no arguments
                          -n shows the last n results (like tail)

diff --git a/buku.1 b/buku.1
@@ -19,7 +19,7 @@ is a command-line utility to store, tag, search and organize bookmarks.
   * Powerful search options (regex, substring...)
   * Continuous search with on the fly mode switch
   * Portable, merge-able database to sync between systems
-  * Import/export bookmarks from/to HTML, XBEL, Markdown or Orgfile
+  * Import/export bookmarks from/to HTML, XBEL, Markdown, RSS or Orgfile
   * Smart tag management using redirection (>>, >, <<)
   * Multithreaded full DB refresh
   * Manual encryption support
@@ -211,14 +211,18 @@ Orgfile is used if
 .I file
 has extension '.org' Orgfile format: * [[url][title]], 1 entry per line.
 .br
+RSS is used if
+.I file
+has extension '.rss'. RSS format (Atom feed): one <entry> per bookmark with <title>, <link>, <id>, <category> and <content> elements.
+.br
 A buku database is generated if
 .I file
 has extension '.db'.
 .TP
 .BI \-i " " \--import " file"
 Import bookmarks from Firefox bookmarks formatted HTML.
 .I file
-is considered Firefox-exported JSON if it has '.json' extension, XBEL if it is '.xbel', Markdown (compliant with --export format) if it is '.md', Orgfile if the extension is '.org' or another buku database if the extension is '.db'.
+is considered Firefox-exported JSON if it has '.json' extension, XBEL if it is '.xbel', Markdown (compliant with --export format) if it is '.md', Orgfile if the extension is '.org', RSS if the extension is '.rss' or another buku database if the extension is '.db'.
 .TP
 .BI \-p " " \--print " [...]"
 Show details (DB index, URL, title, tags and comment) of bookmark record by DB index. If no arguments, all records with actual index from DB are shown. Accepts hyphenated ranges and space-separated indices. A negative value (introduced for convenience) behaves like the tail utility, e.g., -n shows the details of the last n bookmarks.

diff --git a/tests/test_buku.py b/tests/test_buku.py
@@ -562,6 +562,28 @@ def test_import_md(tmpdir, newtag, exp_res):
     res = list(import_md(p.strpath, newtag))
     assert res[0] == exp_res
 
+@pytest.mark.parametrize(
+    "newtag, exp_res",
+    [
+        (None, ("http://example.com", "text1", ",", None, 0, True, False)),
+        ("tag1", ("http://example.com", "text1", ",tag1,", None, 0, True, False)),
+    ],
+)
+def test_import_rss(tmpdir, newtag, exp_res):
+    """Import a single-entry Atom feed and compare the first parsed record."""
+    from buku import import_rss
+
+    # Minimal feed: one entry with only a title and a link (no tags, no content).
+    feed_xml = (
+        '<feed xmlns="http://www.w3.org/2005/Atom">\n'
+        '    <title>Bookmarks</title>\n'
+        '    <generator uri="https://github.com/jarun/buku">buku</generator>\n'
+        '    <entry>\n'
+        '        <title>text1</title>\n'
+        '        <link href="http://example.com"/>\n'
+        '    </entry>\n'
+        '</feed>\n'
+    )
+    feed_file = tmpdir.mkdir("importrss").join("test.rss")
+    feed_file.write(feed_xml)
+
+    records = list(import_rss(feed_file.strpath, newtag))
+    assert records[0] == exp_res
 
 @pytest.mark.parametrize(
     "newtag, exp_res",
@@ -862,6 +884,28 @@ def test_copy_to_clipboard(platform, params):
             "markdown",
             "- [Untitled](http://example.com)\n- [Untitled](http://example.org)\n- [Google](http://google.com)\n",
         ],
+        [
+            "rss",
+            '<feed xmlns="http://www.w3.org/2005/Atom">\n'
+            '    <title>Bookmarks</title>\n'
+            '    <generator uri="https://github.com/jarun/buku">buku</generator>\n'
+            '    <entry>\n'
+            '        <title></title>\n'
+            '        <link href="http://example.com" rel="alternate" type="text/html"/>\n'
+            '        <id>1</id>\n'
+            '    </entry>\n'
+            '    <entry>\n'
+            '        <title></title>\n'
+            '        <link href="http://example.org" rel="alternate" type="text/html"/>\n'
+            '        <id>1</id>\n'
+            '    </entry>\n'
+            '    <entry>\n'
+            '        <title>Google</title>\n'
+            '        <link href="http://google.com" rel="alternate" type="text/html"/>\n'
+            '        <id>2</id>\n'
+            '    </entry>\n'
+            '</feed>',
+        ],
         ["random", None],
         [
             "xbel",