From 470c31d603c79f9ec2171be7cd93bd8aff93d379 Mon Sep 17 00:00:00 2001
From: Thawann Malfatti <malfatti@disroot.org>
Date: Mon, 27 Apr 2020 21:06:47 -0300
Subject: [PATCH 01/34] Implement nauthor, ntitle and nameformat user settings

---
 papers/bib.py    | 228 +++++++++++++++++++++++++++++------------------
 papers/config.py |  12 ++-
 2 files changed, 152 insertions(+), 88 deletions(-)

diff --git a/papers/bib.py b/papers/bib.py
index 19aa11e..9e69d4f 100644
--- a/papers/bib.py
+++ b/papers/bib.py
@@ -33,9 +33,6 @@
 
 # KEY GENERATION
 # ==============
-NAUTHOR = 2
-NTITLE = 0
-
 
 def append_abc(key, keys=[]):
     """
@@ -64,11 +61,11 @@ def append_abc(key, keys=[]):
     return Key
 
 
-def generate_key(entry, nauthor=NAUTHOR, ntitle=NTITLE, minwordlen=3, mintitlen=4, keys=None):
+def generate_key(entry, nauthor=config.nauthor, ntitle=config.ntitle, minwordlen=3, mintitlen=4, keys=None, authorsep='_'):
     # names = bibtexparser.customization.getnames(entry.get('author','unknown').lower().split(' and '))
     names = family_names(entry.get('author','unknown').lower())
-    authortag = '_'.join([nm for nm in names[:nauthor]]) 
-    yeartag = entry.get('year','0000') 
+    authortag = authorsep.join([nm for nm in names[:nauthor]])
+    yeartag = entry.get('year','0000')
     if not ntitle or not entry.get('title',''):
         titletag = ''
     else:
@@ -80,7 +77,7 @@ def generate_key(entry, nauthor=NAUTHOR, ntitle=NTITLE, minwordlen=3, mintitlen=
     if keys and key in keys: # and not isinstance(keys, set):
         key = append_abc(key, keys)
     return key
- 
+
 
 # DUPLICATE DEFINITION
 # ====================
@@ -140,7 +137,7 @@ def compare_entries(e1, e2, fuzzy=False):
 
     id1 = entry_id(e1)
     id2 = entry_id(e2)
-    
+
     logger.debug('{} ?= {}'.format(id1, id2))
 
     if id1 == id2:
@@ -171,7 +168,7 @@ def are_duplicates(e1, e2, similarity=DEFAULT_SIMILARITY, fuzzy_ratio=FUZZY_RATI
         PARTIAL = PARTIAL_DUPLICATES,
         FUZZY = FUZZY_DUPLICATES,
         )
-    try: 
+    try:
         target = level[similarity]
     except KeyError:
         raise ValueError('similarity must be one of EXACT, GOOD, FAIR, PARTIAL, FUZZY')
@@ -220,7 +217,7 @@ class DuplicateKeyError(ValueError):
 class Biblio(object):
     """main config
     """
-    def __init__(self, db=None, filesdir=None, key_field='ID', nauthor=NAUTHOR, ntitle=NTITLE, similarity=DEFAULT_SIMILARITY):
+    def __init__(self, db=None, filesdir=None, key_field='ID', nauthor=config.nauthor, ntitle=config.ntitle, nameformat=config.nameformat, similarity=DEFAULT_SIMILARITY):
         self.filesdir = filesdir
         # assume an already sorted list
         self.key_field = key_field
@@ -232,6 +229,7 @@ def __init__(self, db=None, filesdir=None, key_field='ID', nauthor=NAUTHOR, ntit
         self.sort()
         self.nauthor = nauthor
         self.ntitle = ntitle
+        self.nameformat = nameformat
         self.similarity = similarity
 
     @property
@@ -297,7 +295,7 @@ def insert_entry(self, entry, update_key=False, check_duplicate=False, **checkop
 
         if i < len(self.entries) and self.key(self.entries[i]) == self.key(entry):
             logger.info('key duplicate: '+self.key(self.entries[i]))
-            
+
             if update_key:
                 newkey = self.append_abc_to_key(entry)  # add abc
                 logger.info('update key: {} => {}'.format(entry['ID'], newkey))
@@ -305,15 +303,15 @@ def insert_entry(self, entry, update_key=False, check_duplicate=False, **checkop
 
             else:
                 raise DuplicateKeyError('this error can be avoided if update_key is True')
-       
-        else: 
+
+        else:
             logger.info('new entry: '+self.key(entry))
-        
+
         self.entries.insert(i, entry)
 
 
     def insert_entry_check(self, entry, update_key=False, mergefiles=True, on_conflict='i'):
-        
+
         duplicates = [e for e in self.entries if self.eq(e, entry)]
 
         if not duplicates:
@@ -383,7 +381,7 @@ def fetch_doi(self, doi, **kw):
 
 
     def add_pdf(self, pdf, attachments=None, rename=False, copy=False, search_doi=True, search_fulltext=True, scholar=False, **kw):
-        
+
         bibtex = extract_pdf_metadata(pdf, search_doi, search_fulltext, scholar=scholar)
 
         bib = bibtexparser.loads(bibtex)
@@ -402,32 +400,32 @@ def add_pdf(self, pdf, attachments=None, rename=False, copy=False, search_doi=Tr
         self.insert_entry(entry, update_key=True, **kw)
 
         if rename:
-            self.rename_entry_files(entry, copy=copy)
-            
+            self.rename_entry_files(entry, copy=copy, nameformat=self.nameformat)
+
 
     def scan_dir(self, direc, search_doi=True, search_fulltext=True, **kw):
-        
+
         for root, direcs, files in os.walk(direc):
             dirname = os.path.basename(root)
             if dirname.startswith('.'): continue
             if dirname.startswith('_'): continue
-        
+
             # maybe a special entry directory?
             if os.path.exists(hidden_bibtex(root)):
                 logger.debug('read from hidden bibtex')
                 try:
                     entry = read_entry_dir(root)
                     self.insert_entry(entry, **kw)
-                except Exception:  
+                except Exception:
                     logger.warn(root+'::'+str(error))
-                continue 
+                continue
 
             for file in files:
                 if file.startswith('.'):
                     continue
                 path = os.path.join(root, file)
                 try:
-                    if file.endswith('.pdf'): 
+                    if file.endswith('.pdf'):
                         self.add_pdf(path, search_doi=search_doi, search_fulltext=search_fulltext, **kw)
                     elif file.endswith('.bib'):
                         self.add_bibtex_file(path, **kw)
@@ -453,26 +451,69 @@ def check_duplicates(self, key=None, eq=None, mode='i'):
         self.sort() # keep sorted
 
 
-    def rename_entry_files(self, e, copy=False):
+    def rename_entry_files(self, e, copy=False, nameformat='year,/,ID'):
+        """ Rename files according to 'nameformat'
+            'nameformat' is a comma-separated string, and every field that is in
+            e.keys() will be replaced by the corresponding value. Fields not in
+            e.keys() will remain untouched.
+
+            To rename esd-4-11-2013.pdf as perrette_2013.pdf, nameformat should be 'author,_,year'.
+            If that happens to be the entry ID, 'ID' also works.
+
+            To rename esd-4-11-2013.pdf as
+            2013/Perrette2013-AScalingApproachToProjectRegionalSeaLevelRiseAndItsUncertainties.pdf,
+            nameformat should be 'year,/,Author,year,-,Title' (note the case and the literal '-' field).
+
+            Field names in 'nameformat' are case-sensitive, so that:
+                'author' generates 'perrette'
+                'Author' generates 'Perrette'
+                'AUTHOR' generates 'PERRETTE'
+            any other case, like 'AuTHoR', will retrieve the field from 'e' with unaltered case.
+        """
 
         if self.filesdir is None:
             raise ValueError('filesdir is None, cannot rename entries')
 
         files = parse_file(e.get('file',''))
         # newname = entrydir(e, root)
-        direc = os.path.join(self.filesdir, e.get('year','0000'))
+        direc = self.filesdir
+
+        autoname = lambda s: s.replace(':','-').replace(';','-') # ':' and ';' are forbidden in file name
+
+        Fields = nameformat.split(',')
+        for F,Field in enumerate(Fields):
+            if Field == 'ID':
+                Fields[F] = autoname(e['ID'])
+
+            elif Field == 'year':
+                Fields[F] = e.get('year','0000')
+
+            elif Field.lower() in e.keys():
+                if Field.lower() == 'author':
+                    Names = family_names(e['author'])
+                    eField = Names[0]
+                    if len(Names) > 3: eField += ' et al'
+                else:
+                    eField = e[Field.lower()]
+
+                if Field.istitle():
+                    Fields[F] = autoname(eField).title().replace(' ','')
+                elif Field.islower():
+                    Fields[F] = autoname(eField).lower().replace(' ','')
+                elif Field.isupper():
+                    Fields[F] = autoname(eField).upper().replace(' ','')
+                else:
+                    Fields[F] = autoname(eField).replace(' ','')
 
         if not files:
             logger.info('no files to rename')
             return
 
-        autoname = lambda e: e['ID'].replace(':','-').replace(';','-') # ':' and ';' are forbidden in file name
-
         count = 0
         if len(files) == 1:
             file = files[0]
             base, ext = os.path.splitext(file)
-            newfile = os.path.join(direc, autoname(e)+ext)
+            newfile = os.path.join(direc, ''.join(Fields)+ext)
             if not os.path.exists(file):
                 raise ValueError(file+': original file link is broken')
             elif file != newfile:
@@ -521,14 +562,14 @@ def rename_entry_files(self, e, copy=False):
     def rename_entries_files(self, copy=False):
         for e in self.db.entries:
             try:
-                self.rename_entry_files(e, copy)
+                self.rename_entry_files(e, copy, nameformat=self.nameformat)
             except Exception as error:
                 logger.error(str(error))
                 continue
 
 
-    def fix_entry(self, e, fix_doi=True, fetch=False, fetch_all=False, 
-        fix_key=False, auto_key=False, key_ascii=False, encoding=None, 
+    def fix_entry(self, e, fix_doi=True, fetch=False, fetch_all=False,
+        fix_key=False, auto_key=False, key_ascii=False, encoding=None,
         format_name=True, interactive=False):
 
         e_old = e.copy()
@@ -592,10 +633,10 @@ def fix_entry(self, e, fix_doi=True, fetch=False, fetch_all=False,
                 except Exception as error:
                     logger.warn('...failed to fetch/update bibtex (all): '+str(error))
 
-            if bibtex:                        
+            if bibtex:
                 db = bibtexparser.loads(bibtex)
                 e2 = db.entries[0]
-                self.fix_entry(e2, encoding=encoding, format_name=True) 
+                self.fix_entry(e2, encoding=encoding, format_name=True)
                 strip_e = lambda e_: {k:e_[k] for k in e_ if k not in ['ID', 'file'] and k in e2}
                 if strip_e(e) != strip_e(e2):
                     logger.info('...fetch-update entry')
@@ -658,12 +699,12 @@ def entry_filecheck_metadata(e, file, image=False):
         raise ValueError(e['ID']+': doi: entry <=> pdf : {} <=> {}'.format(e['doi'].lower(), doi.lower()))
 
 
-def entry_filecheck(e, delete_broken=False, fix_mendeley=False, 
+def entry_filecheck(e, delete_broken=False, fix_mendeley=False,
     check_hash=False, check_metadata=False, interactive=True, image=False):
 
     if 'file' not in e:
         return
-    
+
     if check_hash:
         import hashlib
 
@@ -679,8 +720,8 @@ def entry_filecheck(e, delete_broken=False, fix_mendeley=False,
             logger.info(e['ID']+': remove duplicate path: "{}"'.format(fixed.get(file, file)))
             continue
         realpaths.add(realpath) # put here so that for identical
-                                   # files that are checked and finally not 
-                                   # included, the work is done only once 
+                                   # files that are checked and finally not
+                                   # included, the work is done only once
 
         if fix_mendeley and not os.path.exists(file):
             old = file
@@ -695,8 +736,8 @@ def entry_filecheck(e, delete_broken=False, fix_mendeley=False,
             dirname = os.path.dirname(file)
             candidate = os.path.sep + file
             if (not file.startswith(os.path.sep) and dirname # only apply when some directory name is specified
-                and not os.path.exists(dirname) 
-                and os.path.exists(os.path.dirname(candidate))): # simply requires that '/'+directory exists 
+                and not os.path.exists(dirname)
+                and os.path.exists(os.path.dirname(candidate))): # simply requires that '/'+directory exists
                 # and os.path.exists(newfile)):
                     # logger.info('prepend "/" to file name: "{}"'.format(file))
                     file = candidate
@@ -766,16 +807,22 @@ def main():
 
     cfg = argparse.ArgumentParser(add_help=False, parents=[loggingp])
     grp = cfg.add_argument_group('config')
-    grp.add_argument('--filesdir', default=config.filesdir, 
+    grp.add_argument('--filesdir', default=config.filesdir,
         help='files directory (default: %(default)s)')
     grp.add_argument('--bibtex', default=config.bibtex,
         help='bibtex database (default: %(default)s)')
-    grp.add_argument('--dry-run', action='store_true', 
+    grp.add_argument('--dry-run', action='store_true',
         help='no PDF renaming/copying, no bibtex writing on disk (for testing)')
+    grp.add_argument('--nauthor', type=int, default=config.nauthor,
+        help='number of authors to include in key (default:%(default)s)')
+    grp.add_argument('--ntitle', type=int, default=config.ntitle,
+        help='number of title words to include in key (default:%(default)s)')
+    grp.add_argument('--nameformat', default=config.nameformat,
+        help='comma-separated fields for renaming files (default:%(default)s)')
 
     # status
     # ======
-    statusp = subparsers.add_parser('status', 
+    statusp = subparsers.add_parser('status',
         description='view install status',
         parents=[cfg])
     statusp.add_argument('--no-check-files', action='store_true', help='faster, less info')
@@ -785,32 +832,32 @@ def statuscmd(o):
         if o.bibtex:
             config.bibtex = o.bibtex
         if o.filesdir is not None:
-            config.filesdir = o.filesdir        
+            config.filesdir = o.filesdir
         print(config.status(check_files=not o.no_check_files, verbose=o.verbose))
-        
+
 
     # install
     # =======
 
     installp = subparsers.add_parser('install', description='setup or update papers install',
         parents=[cfg])
-    installp.add_argument('--reset-paths', action='store_true') 
+    installp.add_argument('--reset-paths', action='store_true')
     # egrp = installp.add_mutually_exclusive_group()
-    installp.add_argument('--local', action='store_true', 
-        help="""save config file in current directory (global install by default). 
-        This file will be loaded instead of the global configuration file everytime 
-        papers is executed from this directory. This will affect the default bibtex file, 
+    installp.add_argument('--local', action='store_true',
+        help="""save config file in current directory (global install by default).
+        This file will be loaded instead of the global configuration file everytime
+        papers is executed from this directory. This will affect the default bibtex file,
         the files directory, as well as the git-tracking option. Note this option does
         not imply anything about the actual location of bibtex file and files directory.
         """)
-    installp.add_argument('--git', action='store_true', 
-        help="""Track bibtex files with git. 
+    installp.add_argument('--git', action='store_true',
+        help="""Track bibtex files with git.
         Each time the bibtex is modified, a copy of the file is saved in a git-tracked
-        global directory (see papers status), and committed. Note the original bibtex name is 
+        global directory (see papers status), and committed. Note the original bibtex name is
         kept, so that different files can be tracked simultaneously, as long as the names do
         not conflict. This option is mainly useful for backup purposes (local or remote).
         Use in combination with `papers git`'
-        """) 
+        """)
     installp.add_argument('--gitdir', default=config.gitdir, help='default: %(default)s')
 
     grp = installp.add_argument_group('status')
@@ -838,6 +885,10 @@ def installcmd(o):
         if o.filesdir is not None:
             config.filesdir = o.filesdir
 
+        config.nauthor = o.nauthor
+        config.ntitle = o.ntitle
+        config.nameformat = o.nameformat
+
         if o.reset_paths:
             config.reset()
 
@@ -906,23 +957,23 @@ def savebib(my, o):
     # addp.add_argument('-f','--force', action='store_true', help='disable interactive')
 
     grp = addp.add_argument_group('duplicate check')
-    grp.add_argument('--no-check-duplicate', action='store_true', 
+    grp.add_argument('--no-check-duplicate', action='store_true',
         help='disable duplicate check (faster, create duplicates)')
-    grp.add_argument('--no-merge-files', action='store_true', 
+    grp.add_argument('--no-merge-files', action='store_true',
         help='distinct "file" field considered a conflict, all other things being equal')
-    grp.add_argument('-u', '--update-key', action='store_true', 
+    grp.add_argument('-u', '--update-key', action='store_true',
         help='update added key according to any existing duplicate (otherwise an error might be raised on identical insert key)')
     # grp.add_argument('-f', '--force', action='store_true', help='no interactive')
     grp.add_argument('-m', '--mode', default='i', choices=['u', 'U', 'o', 's', 'r', 'i','a'],
-        help='''if duplicates are found, the default is to start an (i)nteractive dialogue, 
+        help='''if duplicates are found, the default is to start an (i)nteractive dialogue,
         unless "mode" is set to (r)aise, (s)skip new, (u)pdate missing, (U)pdate with new, (o)verwrite completely.
         ''')
 
     grp = addp.add_argument_group('directory scan')
-    grp.add_argument('--recursive', action='store_true', 
+    grp.add_argument('--recursive', action='store_true',
         help='accept directory as argument, for recursive scan \
         of .pdf files (bibtex files are ignored in this mode')
-    grp.add_argument('--ignore-errors', action='store_true', 
+    grp.add_argument('--ignore-errors', action='store_true',
         help='ignore errors when adding multiple files')
 
     grp = addp.add_argument_group('pdf metadata')
@@ -932,32 +983,35 @@ def savebib(my, o):
 
     grp = addp.add_argument_group('attached files')
     grp.add_argument('-a','--attachment', nargs='+', help=argparse.SUPPRESS) #'supplementary material')
-    grp.add_argument('-r','--rename', action='store_true', 
+    grp.add_argument('-r','--rename', action='store_true',
         help='rename PDFs according to key')
-    grp.add_argument('-c','--copy', action='store_true', 
+    grp.add_argument('-c','--copy', action='store_true',
         help='copy file instead of moving them')
 
 
 
     def addcmd(o):
-
         if os.path.exists(o.bibtex):
             my = Biblio.load(o.bibtex, o.filesdir)
         else:
             my = Biblio.newbib(o.bibtex, o.filesdir)
 
+        my.nauthor = o.nauthor
+        my.ntitle = o.ntitle
+        my.nameformat = o.nameformat
+
         if len(o.file) > 1 and o.attachment:
             logger.error('--attachment is only valid for one added file')
             addp.exit(1)
 
-        kw = {'on_conflict':o.mode, 'check_duplicate':not o.no_check_duplicate, 
+        kw = {'on_conflict':o.mode, 'check_duplicate':not o.no_check_duplicate,
             'mergefiles':not o.no_merge_files, 'update_key':o.update_key}
 
         for file in o.file:
             try:
                 if os.path.isdir(file):
                     if o.recursive:
-                        my.scan_dir(file, rename=o.rename, copy=o.copy, 
+                        my.scan_dir(file, rename=o.rename, copy=o.copy,
                             search_doi=not o.no_query_doi,
                             search_fulltext=not o.no_query_fulltext,
                               **kw)
@@ -965,22 +1019,22 @@ def addcmd(o):
                         raise ValueError(file+' is a directory, requires --recursive to explore')
 
                 elif file.endswith('.pdf'):
-                    my.add_pdf(file, attachments=o.attachment, rename=o.rename, copy=o.copy, 
+                    my.add_pdf(file, attachments=o.attachment, rename=o.rename, copy=o.copy,
                             search_doi=not o.no_query_doi,
-                            search_fulltext=not o.no_query_fulltext, 
-                            scholar=o.scholar, 
+                            search_fulltext=not o.no_query_fulltext,
+                            scholar=o.scholar,
                             **kw)
 
                 else: # file.endswith('.bib'):
                     my.add_bibtex_file(file, **kw)
 
             except Exception as error:
-                # print(error) 
+                # print(error)
                 # addp.error(str(error))
                 raise
                 logger.error(str(error))
                 if not o.ignore_errors:
-                    if len(o.file) or (os.isdir(file) and o.recursive)> 1: 
+                    if len(o.file) or (os.isdir(file) and o.recursive)> 1:
                         logger.error('use --ignore to add other files anyway')
                     addp.exit(1)
 
@@ -989,7 +1043,7 @@ def addcmd(o):
 
     # check
     # =====
-    checkp = subparsers.add_parser('check', description='check and fix entries', 
+    checkp = subparsers.add_parser('check', description='check and fix entries',
         parents=[cfg])
     checkp.add_argument('-k', '--keys', nargs='+', help='apply check on this key subset')
     checkp.add_argument('-f','--force', action='store_true', help='do not ask')
@@ -998,8 +1052,8 @@ def addcmd(o):
     grp.add_argument('--fix-key', action='store_true', help='fix key based on author name and date (in case misssing or digit)')
     grp.add_argument('--key-ascii', action='store_true', help='replace keys unicode character with ascii')
     grp.add_argument('--auto-key', action='store_true', help='new, auto-generated key for all entries')
-    grp.add_argument('--nauthor', type=int, default=NAUTHOR, help='number of authors to include in key (default:%(default)s)')
-    grp.add_argument('--ntitle', type=int, default=NTITLE, help='number of title words to include in key (default:%(default)s)')
+#     grp.add_argument('--nauthor', type=int, default=config.nauthor, help='number of authors to include in key (default:%(default)s)')
+#     grp.add_argument('--ntitle', type=int, default=config.ntitle, help='number of title words to include in key (default:%(default)s)')
     # grp.add_argument('--ascii-key', action='store_true', help='replace unicode characters with closest ascii')
 
     grp = checkp.add_argument_group('crossref fetch and fix')
@@ -1029,8 +1083,8 @@ def checkcmd(o):
         for e in my.entries:
             if o.keys and e.get('ID','') not in o.keys:
                 continue
-            my.fix_entry(e, fix_doi=o.fix_doi, fetch=o.fetch, fetch_all=o.fetch_all, fix_key=o.fix_key, 
-                auto_key=o.auto_key, format_name=o.format_name, encoding=o.encoding, 
+            my.fix_entry(e, fix_doi=o.fix_doi, fetch=o.fetch, fetch_all=o.fetch_all, fix_key=o.fix_key,
+                auto_key=o.auto_key, format_name=o.format_name, encoding=o.encoding,
                 key_ascii=o.key_ascii, interactive=not o.force)
 
 
@@ -1044,34 +1098,34 @@ def checkcmd(o):
     # =====
     filecheckp = subparsers.add_parser('filecheck', description='check attached file(s)',
         parents=[cfg])
-    # filecheckp.add_argument('-f','--force', action='store_true', 
+    # filecheckp.add_argument('-f','--force', action='store_true',
     #     help='do not ask before performing actions')
 
     # action on files
-    filecheckp.add_argument('-r','--rename', action='store_true', 
+    filecheckp.add_argument('-r','--rename', action='store_true',
         help='rename files')
-    filecheckp.add_argument('-c','--copy', action='store_true', 
+    filecheckp.add_argument('-c','--copy', action='store_true',
         help='in combination with --rename, keep a copy of the file in its original location')
 
     # various metadata and duplicate checks
-    filecheckp.add_argument('--metadata-check', action='store_true', 
+    filecheckp.add_argument('--metadata-check', action='store_true',
         help='parse pdf metadata and check against metadata (currently doi only)')
 
-    filecheckp.add_argument('--hash-check', action='store_true', 
+    filecheckp.add_argument('--hash-check', action='store_true',
         help='check file hash sum to remove any duplicates')
 
-    filecheckp.add_argument('-d', '--delete-broken', action='store_true', 
+    filecheckp.add_argument('-d', '--delete-broken', action='store_true',
         help='remove file entry if the file link is broken')
 
-    filecheckp.add_argument('--fix-mendeley', action='store_true', 
+    filecheckp.add_argument('--fix-mendeley', action='store_true',
         help='fix a Mendeley bug where the leading "/" is omitted.')
 
-    filecheckp.add_argument('--force', action='store_true', help='no interactive prompt, strictly follow options') 
+    filecheckp.add_argument('--force', action='store_true', help='no interactive prompt, strictly follow options')
     # filecheckp.add_argument('--search-for-files', action='store_true',
     #     help='search for missing files')
     # filecheckp.add_argument('--searchdir', nargs='+',
     #     help='search missing file link for existing bibtex entries, based on doi')
-    # filecheckp.add_argument('-D', '--delete-free', action='store_true', 
+    # filecheckp.add_argument('-D', '--delete-free', action='store_true',
         # help='delete file which is not associated with any entry')
     # filecheckp.add_argument('-a', '--all', action='store_true', help='--hash and --meta')
 
@@ -1080,7 +1134,7 @@ def filecheckcmd(o):
 
         # fix ':home' entry as saved by Mendeley
         for e in my.entries:
-            entry_filecheck(e, delete_broken=o.delete_broken, fix_mendeley=o.fix_mendeley, 
+            entry_filecheck(e, delete_broken=o.delete_broken, fix_mendeley=o.fix_mendeley,
                 check_hash=o.hash_check, check_metadata=o.metadata_check, interactive=not o.force)
 
         if o.rename:
@@ -1265,12 +1319,12 @@ def nfiles(e):
     doip = subparsers.add_parser('doi', description='parse DOI from PDF')
     doip.add_argument('pdf')
     doip.add_argument('--image', action='store_true', help='convert to image and use tesseract instead of pdftotext')
-    
+
     def doicmd(o):
         print(extract_pdf_doi(o.pdf, image=o.image))
 
     # fetch
-    # =====   
+    # =====
     fetchp = subparsers.add_parser('fetch', description='fetch bibtex from DOI')
     fetchp.add_argument('doi')
 
@@ -1307,7 +1361,7 @@ def undocmd(o):
         shutil.move(tmp, back)
         savebib(None, o)
 
-        
+
 
     # git
     # ===
diff --git a/papers/config.py b/papers/config.py
index da4858b..19da390 100644
--- a/papers/config.py
+++ b/papers/config.py
@@ -52,12 +52,16 @@ class Config(object):
     """configuration class to specify system-wide collections and files-dir
     """
     def __init__(self, file=CONFIG_FILE, data=DATA_DIR, cache=CACHE_DIR,
-        bibtex=None, filesdir=None, gitdir=None, git=False):
+        bibtex=None, filesdir=None, nauthor=2, ntitle=0, nameformat='year,/,ID',
+        gitdir=None, git=False):
         self.file = file
         self.data = data
         self.cache = cache
         self.filesdir = filesdir or os.path.join(data, 'files')
         self.bibtex = bibtex  or os.path.join(data, 'papers.bib')
+        self.nauthor = nauthor
+        self.ntitle = ntitle
+        self.nameformat = nameformat
         self.gitdir = gitdir  or data
         self.git = git
 
@@ -72,6 +76,9 @@ def save(self):
         json.dump({
             "filesdir":self.filesdir,
             "bibtex":self.bibtex,
+            "nauthor":self.nauthor,
+            "ntitle":self.ntitle,
+            "nameformat":self.nameformat,
             "git":self.git,
             "gitdir":self.gitdir,
             }, open(self.file, 'w'), sort_keys=True, indent=2, separators=(',', ': '))
@@ -81,6 +88,9 @@ def load(self):
         js = json.load(open(self.file))
         self.bibtex = js.get('bibtex', self.bibtex)
         self.filesdir = js.get('filesdir', self.filesdir)
+        self.nauthor = js.get('nauthor', self.nauthor)
+        self.ntitle = js.get('ntitle', self.ntitle)
+        self.nameformat = js.get('nameformat', self.nameformat)
         self.git = js.get('git', self.git)
         self.gitdir = js.get('gitdir', self.gitdir)
 

From f48006151aaf25c400a63cb78049e8a931980701 Mon Sep 17 00:00:00 2001
From: Thawann Malfatti <malfatti@disroot.org>
Date: Thu, 30 Apr 2020 10:11:05 -0300
Subject: [PATCH 02/34] Make "et al." smarter

---
 papers/bib.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/papers/bib.py b/papers/bib.py
index 9e69d4f..431abda 100644
--- a/papers/bib.py
+++ b/papers/bib.py
@@ -491,8 +491,11 @@ def rename_entry_files(self, e, copy=False, nameformat='year,/,ID'):
             elif Field.lower() in e.keys():
                 if Field.lower() == 'author':
                     Names = family_names(e['author'])
-                    eField = Names[0]
-                    if len(Names) > 3: eField += ' et al'
+
+                    if len(Names) >= 3: eField = Names[0] + ' et al'
+                    elif len(Names) == 2: eField = ' and '.join(Names)
+                    else: eField = Names[0]
+
                 else:
                     eField = e[Field.lower()]
 

From 11efec2d5db92711980295252282ebcf6b11cb2d Mon Sep 17 00:00:00 2001
From: Thawann Malfatti <malfatti@disroot.org>
Date: Thu, 30 Apr 2020 10:27:01 -0300
Subject: [PATCH 03/34] Add instructions on how to control file name when
 renaming

---
 README.md | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 31108b9..5ef9c9b 100644
--- a/README.md
+++ b/README.md
@@ -73,8 +73,33 @@ For the sake of the example, one of my owns: https://www.earth-syst-dynam.net/4/
 
 (the `--info` argument asks for the above output information to be printed out to the terminal)
 
-In the common case where the bibtex (`--bibtex`) and files directory  (`--filesdir`) do not change,
-it is convenient to *install* `papers`.
+
+- control fields when renaming file
+
+        $> papers add --rename --info --nameformat year,/,Author,year,-,Title --nauthor 1 --ntitle 1 esd-4-11-2013.pdf
+        INFO:papers:found doi:10.5194/esd-4-11-2013
+        INFO:papers:new entry: perrette2013scaling
+        INFO:papers:create directory: files/2013
+        INFO:papers:mv /home/perrette/playground/papers/esd-4-11-2013.pdf files/2013/PerretteEtAl2013-AScalingApproachToProjectRegionalSeaLevelRiseAndItsUncertainties.pdf
+        INFO:papers:renamed file(s): 1
+
+where 'nameformat' is a comma-separated list of fields, with valid fields being any field available in the bibtex. Fields not in the bibtex will remain untouched.
+
+To rename esd-4-11-2013.pdf as perrette_2013.pdf, nameformat should be "author,_,year".
+If that happens to be the entry ID, 'ID' also works.
+
+To rename esd-4-11-2013.pdf as 2013/Perrette2013-AScalingApproachToProjectRegionalSeaLevelRiseAndItsUncertainties.pdf,
+nameformat should be 'year,/,Author,year,Title' (note the case).
+
+Entries are case-sensitive, so that:
+    'author' generates 'perrette'
+    'Author' generates 'Perrette'
+    'AUTHOR' generates 'PERRETTE'
+any other case, like 'AuTHoR', will retrieve the field from 'e' with unaltered case.
+
+
+
+In the common case where the bibtex (`--bibtex`), files directory  (`--filesdir`), number of authors in key (`--nauthor`) or number of title words in key (`--ntitle`) do not change, it is convenient to *install* `papers`.
 Install comes with the option to git-track any change to the bibtex file (`--git`) options.
 
 - setup git-tracked library (optional)

From a649da73b0278cdd435be7db6aec29a13a6cb3a8 Mon Sep 17 00:00:00 2001
From: Thawann Malfatti <malfatti@disroot.org>
Date: Thu, 30 Apr 2020 10:30:01 -0300
Subject: [PATCH 04/34] Fix markdown formatting

---
 README.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 5ef9c9b..061d56c 100644
--- a/README.md
+++ b/README.md
@@ -85,16 +85,16 @@ For the sake of the example, one of my owns: https://www.earth-syst-dynam.net/4/
 
 where 'nameformat' is a comma-separated list of fields, with valid fields being any field available in the bibtex. Fields not in the bibtex will remain untouched.
 
-To rename esd-4-11-2013.pdf as perrette_2013.pdf, nameformat should be "author,_,year".
-If that happens to be the entry ID, 'ID' also works.
+To rename esd-4-11-2013.pdf as perrette_2013.pdf, nameformat should be `author,_,year`.
+If that happens to be the entry ID, `ID` also works.
 
 To rename esd-4-11-2013.pdf as 2013/Perrette2013-AScalingApproachToProjectRegionalSeaLevelRiseAndItsUncertainties.pdf,
-nameformat should be 'year,/,Author,year,Title' (note the case).
+nameformat should be `year,/,Author,year,Title` (note the case).
 
-Entries are case-sensitive, so that:
-    'author' generates 'perrette'
-    'Author' generates 'Perrette'
-    'AUTHOR' generates 'PERRETTE'
+Entries are case-sensitive, so that:  
+    'author' generates 'perrette'  
+    'Author' generates 'Perrette'  
+    'AUTHOR' generates 'PERRETTE'  
 any other case, like 'AuTHoR', will retrieve the field from 'e' with unaltered case.
 
 

From 04374de99c51454d197b9b1dc35367268b95052c Mon Sep 17 00:00:00 2001
From: Thawann Malfatti <malfatti@disroot.org>
Date: Thu, 30 Apr 2020 17:11:17 -0300
Subject: [PATCH 05/34] Improve autoname function

---
 papers/bib.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/papers/bib.py b/papers/bib.py
index 431abda..e748290 100644
--- a/papers/bib.py
+++ b/papers/bib.py
@@ -478,7 +478,20 @@ def rename_entry_files(self, e, copy=False, nameformat='year,/,ID'):
         # newname = entrydir(e, root)
         direc = self.filesdir
 
-        autoname = lambda s: s.replace(':','-').replace(';','-') # ':' and ';' are forbidden in file name
+        def autoname(namestr):
+            # Adapted
+            # from https://gitlab.com/malfatti/SciScripts/-/blob/master/Python3/Files/FixStupidFileNames.py
+
+            New = namestr
+
+            for C in ['"',"'",'!','@','#','$','%','&','*','+','=',';',':','?',',','/','\\']:
+                if C in New: New = New.replace(C,'')
+
+            for C in ['(', ')', '[', ']', '{', '}', '<', '>', '|']:
+                if C in New: New = New.replace(C,'_')
+
+            return(New)
+
 
         Fields = nameformat.split(',')
         for F,Field in enumerate(Fields):

From b235c8abbb04dc45113d1a995fb4a6eb9261c236 Mon Sep 17 00:00:00 2001
From: Thawann Malfatti <malfatti@disroot.org>
Date: Wed, 6 May 2020 07:05:42 -0300
Subject: [PATCH 06/34] Fix new use of autoname function

---
 papers/bib.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/papers/bib.py b/papers/bib.py
index e748290..44e2051 100644
--- a/papers/bib.py
+++ b/papers/bib.py
@@ -541,7 +541,7 @@ def autoname(namestr):
 
         # several files: only rename container
         else:
-            newdir = os.path.join(direc, autoname(e))
+            newdir = os.path.join(direc, autoname(e['ID']))
             newfiles = []
             for file in files:
                 newfile = os.path.join(newdir, os.path.basename(file))

From 5fdc5fda06938f0a493b621409d63f37065464b6 Mon Sep 17 00:00:00 2001
From: perrette <mahe.perrette@pik-potsdam.de>
Date: Wed, 10 Mar 2021 20:11:58 +0100
Subject: [PATCH 07/34] add git-lfs track statement in git-init (draft)

---
 papers/config.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/papers/config.py b/papers/config.py
index da4858b..feeeea6 100644
--- a/papers/config.py
+++ b/papers/config.py
@@ -52,7 +52,7 @@ class Config(object):
     """configuration class to specify system-wide collections and files-dir
     """
     def __init__(self, file=CONFIG_FILE, data=DATA_DIR, cache=CACHE_DIR,
-        bibtex=None, filesdir=None, gitdir=None, git=False):
+        bibtex=None, filesdir=None, gitdir=None, git=False, gitlfs=False):
         self.file = file
         self.data = data
         self.cache = cache
@@ -60,6 +60,7 @@ def __init__(self, file=CONFIG_FILE, data=DATA_DIR, cache=CACHE_DIR,
         self.bibtex = bibtex  or os.path.join(data, 'papers.bib')
         self.gitdir = gitdir  or data
         self.git = git
+        self.gitlfs = gitlfs
 
     def collections(self):
         files = []
@@ -82,6 +83,7 @@ def load(self):
         self.bibtex = js.get('bibtex', self.bibtex)
         self.filesdir = js.get('filesdir', self.filesdir)
         self.git = js.get('git', self.git)
+        self.gitlfs = js.get('gitlfs', self.gitlfs)
         self.gitdir = js.get('gitdir', self.gitdir)
 
 
@@ -106,12 +108,21 @@ def gitinit(self, branch=None):
         if not os.path.exists(self._gitdir):
             # with open(os.devnull, 'w') as shutup:
             sp.check_call(['git','init'], cwd=self.gitdir)
+            if self.gitlfs:
+                try:
+                    sp.check_call('git lfs track "files/**"', cwd=self.gitdir, shell=True) # this does not seem to work
+                    sp.check_call('git lfs track "pdf/*"', cwd=self.gitdir, shell=True) # pdf tracked via git-lfs
+                except Exception as error:
+                    logger.warning("Install git-lfs : https://git-lfs.github.com to track PDF files")
+                    self.gitlfs = False
+
         else:
             raise ValueError('git is already initialized in '+self.gitdir)
 
     def gitcommit(self, branch=None, message=None):
         if os.path.exists(self._gitdir):
             target = os.path.join(self.gitdir, os.path.basename(self.bibtex))
+            target_files = os.path.join(self.gitdir, "files")
             if not os.path.samefile(self.bibtex, target):
                 shutil.copy(self.bibtex, target)
             message = message or 'save '+self.bibtex+' after command:\n\n    papers ' +' '.join(sys.argv[1:])
@@ -119,6 +130,8 @@ def gitcommit(self, branch=None, message=None):
                 if branch is not None:
                     sp.check_call(['git','checkout',branch], stdout=shutup, stderr=shutup, cwd=self.gitdir)
                 sp.check_call(['git','add',target], stdout=shutup, stderr=shutup, cwd=self.gitdir)
+                if self.gitlfs:
+                    sp.check_call(['git','add',target_files], stdout=shutup, stderr=shutup, cwd=self.gitdir)
                 res = sp.call(['git','commit','-m', message], stdout=shutup, stderr=shutup, cwd=self.gitdir)
                 if res == 0:
                     logger.info('git commit')
@@ -134,6 +147,7 @@ def status(self, check_files=False, verbose=False):
             lines.append('* cache directory:    '+self.cache)
             # lines.append('* app data directory: '+self.data)
             lines.append('* git-tracked:        '+str(self.git))
+            lines.append('* git-lfs tracked:        '+str(self.gitlfs))
             if self.git:
                 lines.append('* git directory :     '+self.gitdir)
 

From 23af6e1ecec537e1c531d3055c7216735328f176 Mon Sep 17 00:00:00 2001
From: perrette <mahe.perrette@pik-potsdam.de>
Date: Thu, 11 Mar 2021 10:24:27 +0100
Subject: [PATCH 08/34] fix typos

---
 papers/bib.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/papers/bib.py b/papers/bib.py
index 19789ae..748e48e 100644
--- a/papers/bib.py
+++ b/papers/bib.py
@@ -176,7 +176,7 @@ def are_duplicates(e1, e2, similarity=DEFAULT_SIMILARITY, fuzzy_ratio=FUZZY_RATI
     except KeyError:
         raise ValueError('similarity must be one of EXACT, GOOD, FAIR, PARTIAL, FUZZY')
 
-    score = compare_entries(e1, e2, fuzzy=level==FUZZY_DUPLICATES)
+    score = compare_entries(e1, e2, fuzzy=target==FUZZY_DUPLICATES)
     logger.debug('score: {}, target: {}, similarity: {}'.format(score, target, similarity))
     return score >= target
 
@@ -1205,7 +1205,7 @@ def longmatch(word, target):
         if o.duplicates_tit:
             entries = list_dup(entries, key=title_id)
         if o.duplicates:
-            eq = lambda a, b: a['ID'] == b['ID'] or are_duplicates(a, b, similarity=level, fuzzy_ratio=o.fuzzy_ratio)
+            eq = lambda a, b: a['ID'] == b['ID'] or are_duplicates(a, b, fuzzy_ratio=o.fuzzy_ratio)
             entries = list_dup(entries, eq=eq)
 
         def nfiles(e):
@@ -1247,7 +1247,7 @@ def nfiles(e):
                 print(e['ID'].encode('utf-8'))
         elif o.one_liner:
             for e in entries:
-                tit = e['title'][:60]+ ('...' if len(e['title'])>60 else '')
+                tit = e.get('title', '')[:60]+ ('...' if len(e.get('title', ''))>60 else '')
                 info = []
                 if e.get('doi',''):
                     info.append('doi:'+e['doi'])

From 44d0691bba61c39e7fcb5821673fcd0eb52639bd Mon Sep 17 00:00:00 2001
From: perrette <mahe.perrette@pik-potsdam.de>
Date: Thu, 11 Mar 2021 10:29:29 +0100
Subject: [PATCH 09/34] duplicate search to PARTIAL in listing

---
 papers/bib.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/papers/bib.py b/papers/bib.py
index 748e48e..55f039c 100644
--- a/papers/bib.py
+++ b/papers/bib.py
@@ -1205,7 +1205,7 @@ def longmatch(word, target):
         if o.duplicates_tit:
             entries = list_dup(entries, key=title_id)
         if o.duplicates:
-            eq = lambda a, b: a['ID'] == b['ID'] or are_duplicates(a, b, fuzzy_ratio=o.fuzzy_ratio)
+            eq = lambda a, b: a['ID'] == b['ID'] or are_duplicates(a, b, similarity="PARTIAL", fuzzy_ratio=o.fuzzy_ratio)
             entries = list_dup(entries, eq=eq)
 
         def nfiles(e):

From bc329312fe6d6572d41998058b39d4457ad7a6ba Mon Sep 17 00:00:00 2001
From: perrette <mahe.perrette@pik-potsdam.de>
Date: Thu, 11 Mar 2021 14:57:45 +0100
Subject: [PATCH 10/34] change file naming to include title

(and temporarily remove year folder until proper file naming template pattern is
decided)
---
 papers/bib.py | 35 +++++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/papers/bib.py b/papers/bib.py
index 55f039c..319b20e 100644
--- a/papers/bib.py
+++ b/papers/bib.py
@@ -13,6 +13,7 @@
 import re
 
 import bibtexparser
+from normality import slugify, normalize
 
 import papers
 from papers import logger
@@ -64,18 +65,27 @@ def append_abc(key, keys=[]):
     return Key
 
 
-def generate_key(entry, nauthor=NAUTHOR, ntitle=NTITLE, minwordlen=3, mintitlen=4, keys=None):
+def listtag(words, maxlength=30, minwordlen=3, n=100, sep='-'):
+    # preformat & filter words
+    words = [word for word in words if len(word) >= minwordlen]
+    while True:
+        tag = sep.join(words[:n])
+        n -= 1
+        if len(tag) <= maxlength or n < 2:
+            break
+    return tag
+
+
+def generate_key(entry, nauthor=NAUTHOR, ntitle=NTITLE, minwordlen=3, maxtitlen=4, keys=None):
     # names = bibtexparser.customization.getnames(entry.get('author','unknown').lower().split(' and '))
     names = family_names(entry.get('author','unknown').lower())
     authortag = '_'.join([nm for nm in names[:nauthor]])
-    yeartag = entry.get('year','0000')
+    yeartag = str(entry.get('year','0000'))
     if not ntitle or not entry.get('title',''):
         titletag = ''
     else:
-        words = [word for word in entry['title'].lower().strip().split() if len(word) >= minwordlen]
-        while len(u''.join(words[:ntitle])) < mintitlen and ntitle < len(words):
-            ntitle += 1
-        titletag = '_'.join(words[:ntitle])
+        titlewords = normalize(entry['title']).lower().split()
+        titletag = listtag(titlewords, n=ntitle, minwordlen=minwordlen, maxlength=maxtitlen, sep='-')
     key = authortag + yeartag + titletag
     if keys and key in keys: # and not isinstance(keys, set):
         key = append_abc(key, keys)
@@ -460,13 +470,22 @@ def rename_entry_files(self, e, copy=False):
 
         files = parse_file(e.get('file',''))
         # newname = entrydir(e, root)
-        direc = os.path.join(self.filesdir, e.get('year','0000'))
+        # direc = os.path.join(self.filesdir, e.get('year','0000'))
+        direc = os.path.join(self.filesdir)
 
         if not files:
             logger.info('no files to rename')
             return
 
-        autoname = lambda e: e['ID'].replace(':','-').replace(';','-') # ':' and ';' are forbidden in file name
+        # autoname = lambda e: e['ID'].replace(':','-').replace(';','-') # ':' and ';' are forbidden in file name
+        def autoname(e):
+            key = slugify(e['ID']).lower()
+            if e.get('title', ''):
+                titlewords = normalize(e['title']).lower().split()
+                titletag = listtag(titlewords, n=100, minwordlen=0, maxlength=70, sep='-')
+            else:
+                titletag = 'unknonwn'
+            return key + "_" + titletag
 
         count = 0
         if len(files) == 1:

From 1b661e5277832e4876d03eabda71267e74c1a709 Mon Sep 17 00:00:00 2001
From: perrette <mahe.perrette@pik-potsdam.de>
Date: Thu, 11 Mar 2021 15:47:24 +0100
Subject: [PATCH 11/34] updates to work with newest scholarly

---
 papers/extract.py | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/papers/extract.py b/papers/extract.py
index 9afde81..6f9679e 100644
--- a/papers/extract.py
+++ b/papers/extract.py
@@ -232,7 +232,7 @@ def fetch_json_by_doi(doi):
 
 def _get_page_fast(pagerequest):
     """Return the data for a page on scholar.google.com"""
-    import scholarly.scholarly as scholarly
+    from scholarly import scholarly
     resp = scholarly._SESSION.get(pagerequest, headers=scholarly._HEADERS, cookies=scholarly._COOKIES)
     if resp.status_code == 200:
         return resp.text
@@ -248,10 +248,10 @@ def _scholar_score(txt, bib):
 
 @cached('scholar-bibtex.json', hashed_key=True)
 def fetch_bibtex_by_fulltext_scholar(txt, assess_results=True):
-    import scholarly.scholarly
-    scholarly._get_page = _get_page_fast  # remove waiting time
+    from scholarly import scholarly
+    # scholarly._get_page = _get_page_fast  # remove waiting time
     logger.debug(txt)
-    search_query = scholarly.search_pubs_query(txt)
+    search_query = scholarly.search_pubs(txt)
 
     # get the most likely match of the first results
     results = list(search_query)
@@ -266,13 +266,7 @@ def fetch_bibtex_by_fulltext_scholar(txt, assess_results=True):
     else:
         result = results[0]
 
-    # use url_scholarbib to get bibtex from google
-    if getattr(result, 'url_scholarbib', ''):
-        bibtex = scholarly._get_page(result.url_scholarbib).strip()
-    else:
-        raise NotImplementedError('no bibtex import link. Make crossref request using title?')
-    return bibtex
-
+    return scholarly.bibtex(result)
 
 
 def _crossref_get_author(res, sep=u'; '):

From 33fddc81713fe02c10ac941617bb63fe53bb4280 Mon Sep 17 00:00:00 2001
From: perrette <mahe.perrette@pik-potsdam.de>
Date: Thu, 11 Mar 2021 16:19:59 +0100
Subject: [PATCH 12/34] add --doi option to papers add (when the DOI is known)

---
 papers/bib.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/papers/bib.py b/papers/bib.py
index 319b20e..f9cdf13 100644
--- a/papers/bib.py
+++ b/papers/bib.py
@@ -392,9 +392,12 @@ def fetch_doi(self, doi, **kw):
         self.add_bibtex(bibtex, **kw)
 
 
-    def add_pdf(self, pdf, attachments=None, rename=False, copy=False, search_doi=True, search_fulltext=True, scholar=False, **kw):
+    def add_pdf(self, pdf, attachments=None, rename=False, copy=False, search_doi=True, search_fulltext=True, scholar=False, doi=None, **kw):
 
-        bibtex = extract_pdf_metadata(pdf, search_doi, search_fulltext, scholar=scholar)
+        if doi:
+            bibtex = fetch_bibtex_by_doi(doi)
+        else:
+            bibtex = extract_pdf_metadata(pdf, search_doi, search_fulltext, scholar=scholar)
 
         bib = bibtexparser.loads(bibtex)
         entry = bib.entries[0]
@@ -945,6 +948,7 @@ def savebib(my, o):
         help='ignore errors when adding multiple files')
 
     grp = addp.add_argument_group('pdf metadata')
+    grp.add_argument('--doi', help='provide DOI -- skip parsing PDF')
     grp.add_argument('--no-query-doi', action='store_true', help='do not attempt to parse and query doi')
     grp.add_argument('--no-query-fulltext', action='store_true', help='do not attempt to query fulltext in case doi query fails')
     grp.add_argument('--scholar', action='store_true', help='use google scholar instead of crossref')
@@ -987,7 +991,7 @@ def addcmd(o):
                     my.add_pdf(file, attachments=o.attachment, rename=o.rename, copy=o.copy,
                             search_doi=not o.no_query_doi,
                             search_fulltext=not o.no_query_fulltext,
-                            scholar=o.scholar,
+                            scholar=o.scholar, doi=o.doi,
                             **kw)
 
                 else: # file.endswith('.bib'):

From 5ef6958e52656ad85f7dd36f470fd548f5ebaad3 Mon Sep 17 00:00:00 2001
From: perrette <mahe.perrette@pik-potsdam.de>
Date: Thu, 11 Mar 2021 16:36:39 +0100
Subject: [PATCH 13/34] do file check sum when merging files field, to avoid
 duplicates

---
 papers/duplicate.py | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/papers/duplicate.py b/papers/duplicate.py
index 14ee111..c31a554 100644
--- a/papers/duplicate.py
+++ b/papers/duplicate.py
@@ -16,7 +16,7 @@
 from papers.extract import isvaliddoi, fetch_entry
 from papers.encoding import parse_file, format_file, format_entries
 
-from papers.config import bcolors
+from papers.config import bcolors, checksum
 
 
 # SEARCH DUPLICATES
@@ -50,7 +50,7 @@ def groupby_equal(entries, eq=None):
         else:
             group = groups[k]
         group.append(e)
-    return sorted(six.iteritems(groups)) 
+    return sorted(six.iteritems(groups))
 
 
 def search_duplicates(entries, key=None, eq=None, issorted=False, filter_key=None):
@@ -59,7 +59,7 @@ def search_duplicates(entries, key=None, eq=None, issorted=False, filter_key=Non
     entries: list elements
     key: key to check for equality
     eq: binary operator for equality check (slower)
-    issorted: if True and key is provided, skip sort 
+    issorted: if True and key is provided, skip sort
 
     returns:
     - unique_entries : list (entries for which no duplicates where found)
@@ -150,7 +150,7 @@ def merge_entries(entries, force=False):
 
 def handle_merge_conflict(merged):
     # TODO: boil down this command to the minimum, or build all into a merge class
-    
+
     if not isinstance(merged, MergedEntry):
         return merged  # all GOOD_DUPLICATES !
 
@@ -181,7 +181,7 @@ def _colordiffline(line, sign=None):
     elif sign == '*' or line.startswith('*'):
         return bcolors.BOLD + line + bcolors.ENDC
     # elif sign == '>' or line.startswith('>'):
-        # return bcolors.BOLD + line + bcolors.ENDC    
+        # return bcolors.BOLD + line + bcolors.ENDC
         # return bcolors.BOLD + bcolors.WARNING + line + bcolors.ENDC
     else:
         return line
@@ -289,12 +289,16 @@ def entry_sdiff(entries, color=True, bcolors=bcolors, best=None):
 # ==================
 
 def merge_files(entries):
+    checksums = []
     files = []
     for e in entries:
         for f in parse_file(e.get('file','')):
-            if f not in files:
+            check = checksum(f) if os.path.exists(f) else None
+            if f not in files and (check is None or check not in checksums):
                 files.append(f)
-    return format_file(files)   
+                if check is not None:
+                    checksums.append(check)
+    return format_file(files)
 
 
 def _ask_pick_loop(entries, extra=[], select=False):
@@ -345,7 +349,7 @@ def choose_entry_interactive(entries, extra=[], msg='', select=False, best=None)
 
 
 def edit_entries(entries, diff=False, ndiff=False):
-    '''edit entries and insert result in database 
+    '''edit entries and insert result in database
     '''
     # write the listed entries to temporary file
     import tempfile
@@ -423,7 +427,7 @@ def edit(self, diffview=False, update=False):
 
     def delete(self):
         self.entries = []
-    
+
     def best(self):
         return bestentry(self.entries)
 
@@ -471,7 +475,7 @@ def interactive_loop(self, diffview=False, update=False):
 (d)elete
 (n)ot a duplicate (validate several entries)
 (s)kip (cancel)
-(S)kip all 
+(S)kip all
 (v)iew toggle (diff - split)
 (V)iew toggle for diff mode
 '''
@@ -489,7 +493,7 @@ def interactive_loop(self, diffview=False, update=False):
                 e = ans
 
             if e == 'm':
-                self.merge() 
+                self.merge()
 
             elif e == 'e':
                 self.edit(diffview, update)
@@ -547,7 +551,7 @@ def resolve_duplicates(duplicates, mode='i'):
             raise ValueError('unresolved conflicts')
 
     return conflict.entries
-    
+
 
 def check_duplicates(entries, key=None, eq=None, issorted=False, filter_key=None, mode='i'):
     """check duplicates, given a key or equality function
@@ -562,7 +566,7 @@ def check_duplicates(entries, key=None, eq=None, issorted=False, filter_key=None
         except DuplicateSkip:
             entries.extend(duplicates)
         except DuplicateSkipAll:
-            entries.extend(itertools.chain(duplicates))        
+            entries.extend(itertools.chain(duplicates))
             break
     return entries
 
@@ -586,7 +590,7 @@ def conflict_resolution_on_insert(old, new, mode='i'):
 (s)kip
 (a)ppend anyway
 (r)aise'''.strip()
-.replace('(u)','('+_colordiffline('u','+')+')')  # green lines will be added 
+.replace('(u)','('+_colordiffline('u','+')+')')  # green lines will be added
 .replace('(o)','('+_colordiffline('o','-')+')') + bcolors.ENDC
 )
 # .replace('(s)','('+_colordiffline('s','-')+')'))

From f034661a682ecc47be1618d4124ecdce38bf64db Mon Sep 17 00:00:00 2001
From: perrette <mahe.perrette@pik-potsdam.de>
Date: Thu, 11 Mar 2021 16:51:57 +0100
Subject: [PATCH 14/34] slugify author name for key generation

---
 papers/bib.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/papers/bib.py b/papers/bib.py
index f9cdf13..d577f7a 100644
--- a/papers/bib.py
+++ b/papers/bib.py
@@ -79,7 +79,7 @@ def listtag(words, maxlength=30, minwordlen=3, n=100, sep='-'):
 def generate_key(entry, nauthor=NAUTHOR, ntitle=NTITLE, minwordlen=3, maxtitlen=4, keys=None):
     # names = bibtexparser.customization.getnames(entry.get('author','unknown').lower().split(' and '))
     names = family_names(entry.get('author','unknown').lower())
-    authortag = '_'.join([nm for nm in names[:nauthor]])
+    authortag = '_'.join([slugify(nm) for nm in names[:nauthor]])
     yeartag = str(entry.get('year','0000'))
     if not ntitle or not entry.get('title',''):
         titletag = ''

From 536c604a012a129ddb0d5f039e143f56e3abbb6c Mon Sep 17 00:00:00 2001
From: perrette <mahe.perrette@pik-potsdam.de>
Date: Thu, 11 Mar 2021 20:46:19 +0100
Subject: [PATCH 15/34] convert to unicode on PDF import

---
 papers/bib.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/papers/bib.py b/papers/bib.py
index d577f7a..964caf7 100644
--- a/papers/bib.py
+++ b/papers/bib.py
@@ -402,6 +402,9 @@ def add_pdf(self, pdf, attachments=None, rename=False, copy=False, search_doi=Tr
         bib = bibtexparser.loads(bibtex)
         entry = bib.entries[0]
 
+        # convert curly brackets to unicode
+        bibtexparser.customization.convert_to_unicode(entry)
+
         files = [pdf]
         if attachments:
             files += attachments

From 861e2a83dba8baf26950be23ca8f12b4aac88106 Mon Sep 17 00:00:00 2001
From: T Malfatti <malfatti@disroot.org>
Date: Thu, 5 Aug 2021 10:18:23 +0200
Subject: [PATCH 16/34] Fix variable case

---
 papers/bib.py | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/papers/bib.py b/papers/bib.py
index 44e2051..ac733ed 100644
--- a/papers/bib.py
+++ b/papers/bib.py
@@ -493,33 +493,33 @@ def autoname(namestr):
             return(New)
 
 
-        Fields = nameformat.split(',')
-        for F,Field in enumerate(Fields):
-            if Field == 'ID':
-                Fields[F] = autoname(e['ID'])
+        fields = nameformat.split(',')
+        for ff,field in enumerate(fields):
+            if field == 'ID':
+                fields[ff] = autoname(e['ID'])
 
-            elif Field == 'year':
-                Fields[F] = e.get('year','0000')
+            elif field == 'year':
+                fields[ff] = e.get('year','0000')
 
-            elif Field.lower() in e.keys():
-                if Field.lower() == 'author':
+            elif field.lower() in e.keys():
+                if field.lower() == 'author':
                     Names = family_names(e['author'])
 
-                    if len(Names) >= 3: eField = Names[0] + ' et al'
-                    elif len(Names) == 2: eField = ' and '.join(Names)
-                    else: eField = Names[0]
+                    if len(Names) >= 3: etalfield = Names[0] + ' et al'
+                    elif len(Names) == 2: etalfield = ' and '.join(Names)
+                    else: etalfield = Names[0]
 
                 else:
-                    eField = e[Field.lower()]
-
-                if Field.istitle():
-                    Fields[F] = autoname(eField).title().replace(' ','')
-                elif Field.islower():
-                    Fields[F] = autoname(eField).lower().replace(' ','')
-                elif Field.isupper():
-                    Fields[F] = autoname(eField).upper().replace(' ','')
+                    etalfield = e[field.lower()]
+
+                if field.istitle():
+                    fields[ff] = autoname(etalfield).title().replace(' ','')
+                elif field.islower():
+                    fields[ff] = autoname(etalfield).lower().replace(' ','')
+                elif field.isupper():
+                    fields[ff] = autoname(etalfield).upper().replace(' ','')
                 else:
-                    Fields[F] = autoname(eField).replace(' ','')
+                    fields[ff] = autoname(etalfield).replace(' ','')
 
         if not files:
             logger.info('no files to rename')
@@ -529,7 +529,7 @@ def autoname(namestr):
         if len(files) == 1:
             file = files[0]
             base, ext = os.path.splitext(file)
-            newfile = os.path.join(direc, ''.join(Fields)+ext)
+            newfile = os.path.join(direc, ''.join(fields)+ext)
             if not os.path.exists(file):
                 raise ValueError(file+': original file link is broken')
             elif file != newfile:

From f46cc6fc981817e445520112af92d14f2675e159 Mon Sep 17 00:00:00 2001
From: T Malfatti <malfatti@disroot.org>
Date: Thu, 5 Aug 2021 10:20:56 +0200
Subject: [PATCH 17/34] Add more invalid filename chars to be replaced

---
 papers/bib.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/papers/bib.py b/papers/bib.py
index ac733ed..7134203 100644
--- a/papers/bib.py
+++ b/papers/bib.py
@@ -484,7 +484,8 @@ def autoname(namestr):
 
             New = namestr
 
-            for C in ['"',"'",'!','@','#','$','%','&','*','+','=',';',':','?',',','/','\\']:
+            for C in ['"',"'",'!','@','#','$','%','&','*','+','=',';',':',
+                      '?',',','/','\\','\n', '{\\textquotesingle}']:
                 if C in New: New = New.replace(C,'')
 
             for C in ['(', ')', '[', ']', '{', '}', '<', '>', '|']:
@@ -1275,7 +1276,7 @@ def longmatch(word, target):
         if o.duplicates_tit:
             entries = list_dup(entries, key=title_id)
         if o.duplicates:
-            eq = lambda a, b: a['ID'] == b['ID'] or are_duplicates(a, b, similarity=level, fuzzy_ratio=o.fuzzy_ratio)
+            eq = lambda a, b: a['ID'] == b['ID'] or are_duplicates(a, b, similarity=DEFAULT_SIMILARITY, fuzzy_ratio=o.fuzzy_ratio)
             entries = list_dup(entries, eq=eq)
 
         def nfiles(e):

From cbd3d0ed351a89d0b51cbc03e349a268a8f334de Mon Sep 17 00:00:00 2001
From: T Malfatti <malfatti@disroot.org>
Date: Thu, 5 Aug 2021 10:23:23 +0200
Subject: [PATCH 18/34] Fix name for papers where multiple authors are set as a
 single author

---
 papers/encoding.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/papers/encoding.py b/papers/encoding.py
index 1bec1d9..917d6e9 100644
--- a/papers/encoding.py
+++ b/papers/encoding.py
@@ -7,7 +7,7 @@
 
 # fix bibtexparser issue
 if six.PY2:
-    _bloads = bibtexparser.loads 
+    _bloads = bibtexparser.loads
     _bdumps = bibtexparser.dumps
     bibtexparser.loads = lambda s: (_bloads(s.decode('utf-8') if type(s) is str else s))
     bibtexparser.dumps = lambda db: _bdumps(db).encode('utf-8')
@@ -30,7 +30,7 @@ def _parse_file(file):
     """ parse a single file entry
     """
     sfile = file.split(':')
-    
+
     if len(sfile) == 1:  # no ':'
         path, type = file, ''
 
@@ -110,10 +110,15 @@ def strip_outmost_brackets(family):
 def standard_name(author):
     names = []
     for name in bibtexparser.customization.getnames([strip_outmost_brackets(nm) for nm in author.split(' and ')]):
-        family, given = name.split(',')
-        family = strip_outmost_brackets(family.strip())
-        # given = strip_outmost_brackets(given.strip())
-        names.append(', '.join([family.strip(), given.strip()]))
+        # if 'name' contains more than one author
+        # Example: doi:10.1111/jnc.13687
+        name = name.split(' and ')
+
+        for n in name:
+            family, given = n.split(',')
+            family = strip_outmost_brackets(family.strip())
+            # given = strip_outmost_brackets(given.strip())
+            names.append(', '.join([family.strip(), given.strip()]))
     return ' and '.join(names)
 
 

From f4a79ad43ad75a03758c6628de8864dc8233ff22 Mon Sep 17 00:00:00 2001
From: T Malfatti <malfatti@disroot.org>
Date: Thu, 5 Aug 2021 10:24:02 +0200
Subject: [PATCH 19/34] Remove trailing spaces

---
 papers/extract.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/papers/extract.py b/papers/extract.py
index 63032ab..9912668 100644
--- a/papers/extract.py
+++ b/papers/extract.py
@@ -107,7 +107,7 @@ def parse_doi(txt):
     if doi.lower().endswith('.received'):
         doi = doi[:-len('.received')]
 
-    # quality check 
+    # quality check
     if len(doi) <= 8:
         raise DOIParsingError('failed to extract doi: '+doi)
 
@@ -144,8 +144,8 @@ def extract_pdf_doi(pdf, image=False):
 def query_text(txt, max_query_words=200):
     # list of paragraphs
     paragraphs = re.split(r"\n\n", txt)
- 
-    # remove anything that starts with 'reference'   
+
+    # remove anything that starts with 'reference'
     query = []
     for p in paragraphs:
         if p.lower().startswith('reference'):
@@ -300,7 +300,7 @@ def crossref_to_bibtex(r):
 
     if 'author' in r:
         family = lambda p: p['family'] if len(p['family'].split()) == 1 else u'{'+p['family']+u'}'
-        bib['author'] = ' and '.join([family(p) + ', '+ p.get('given','') 
+        bib['author'] = ' and '.join([family(p) + ', '+ p.get('given','')
             for p in r.get('author',[]) if 'family' in p])
 
     # for k in ['issued','published-print', 'published-online']:

From 50e43806c16b4a075d9f6291268ea613bdb98afe Mon Sep 17 00:00:00 2001
From: perrette <mahe.perrette@pik-potsdam.de>
Date: Wed, 7 Jul 2021 15:36:53 +0200
Subject: [PATCH 20/34] fix error message when no argument is provided

---
 papers/bib.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/papers/bib.py b/papers/bib.py
index 964caf7..8cf7ddc 100644
--- a/papers/bib.py
+++ b/papers/bib.py
@@ -1391,7 +1391,9 @@ def check_install():
     elif o.cmd == 'extract':
         extractcmd(o)
     else:
-        raise ValueError('this is a bug')
+        parser.print_help()
+        parser.exit(1)
+        # raise ValueError('this is a bug')
 
 
 if __name__ == '__main__':

From caa08a1f076e9a6beb436bba89b05c2220d11369 Mon Sep 17 00:00:00 2001
From: aiotter <git@aiotter.com>
Date: Wed, 26 Jan 2022 19:01:13 +0900
Subject: [PATCH 21/34] fix: Does not install dependencies

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 539189b..c36d796 100644
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,4 @@
-from distutils.core import setup
+from setuptools import setup
 import versioneer
 
 version = versioneer.get_version()
@@ -14,5 +14,5 @@
       packages=['papers'],
       scripts=['scripts/papers'],
       license = "MIT",
-      requires = ["bibtexparser","crossrefapi","rapidfuzz", "unidecode", "scholarly", "six"],
+      install_requires = ["bibtexparser","crossrefapi","rapidfuzz", "unidecode", "scholarly", "six"],
       )

From f2dcc2f3fbe4a306013d88d56f82971e6bf4b0f2 Mon Sep 17 00:00:00 2001
From: Hugo van Kemenade <hugovk@users.noreply.github.com>
Date: Mon, 6 Jun 2022 09:27:46 +0300
Subject: [PATCH 22/34] Upgrade Python syntax with pyupgrade --py3-plus

---
 papers/__init__.py   |  2 --
 papers/bib.py        | 16 +++++++---------
 papers/config.py     | 11 ++++-------
 papers/duplicate.py  | 16 +++++++---------
 papers/encoding.py   |  1 -
 papers/extract.py    | 15 +++++++--------
 papers/latexenc.py   | 19 ++++++-------------
 tests/download.py    |  1 -
 tests/test_papers.py |  4 +---
 9 files changed, 32 insertions(+), 53 deletions(-)

diff --git a/papers/__init__.py b/papers/__init__.py
index dae5e2a..51316b8 100644
--- a/papers/__init__.py
+++ b/papers/__init__.py
@@ -1,5 +1,3 @@
-
-
 from ._version import get_versions
 __version__ = get_versions()['version']
 del get_versions
diff --git a/papers/bib.py b/papers/bib.py
index 8cf7ddc..343e8c9 100644
--- a/papers/bib.py
+++ b/papers/bib.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-from __future__ import print_function
 import os, json, sys
 import logging
 # logger.basicConfig(level=logger.INFO)
@@ -227,7 +225,7 @@ def backupfile(bibtex):
 class DuplicateKeyError(ValueError):
     pass
 
-class Biblio(object):
+class Biblio:
     """main config
     """
     def __init__(self, db=None, filesdir=None, key_field='ID', nauthor=NAUTHOR, ntitle=NTITLE, similarity=DEFAULT_SIMILARITY):
@@ -369,11 +367,11 @@ def insert_entry_check(self, entry, update_key=False, mergefiles=True, on_confli
 
     def generate_key(self, entry):
         " generate a unique key not yet present in the record "
-        keys = set(self.key(e) for e in self.db.entries)
+        keys = {self.key(e) for e in self.db.entries}
         return generate_key(entry, keys=keys, nauthor=self.nauthor, ntitle=self.ntitle)
 
     def append_abc_to_key(self, entry):
-        return append_abc(entry['ID'], keys=set(self.key(e) for e in self.entries))
+        return append_abc(entry['ID'], keys={self.key(e) for e in self.entries})
 
 
     def add_bibtex(self, bibtex, **kw):
@@ -530,7 +528,7 @@ def autoname(e):
                 f.write(bibtex)
 
             # remove old direc if empty?
-            direcs = list(set([os.path.dirname(file) for file in files]))
+            direcs = list({os.path.dirname(file) for file in files})
             if len(direcs) == 1:
                 leftovers = os.listdir(direcs[0])
                 if not leftovers or len(leftovers) == 1 and leftovers[0] == os.path.basename(hidden_bibtex(direcs[0])):
@@ -727,7 +725,7 @@ def entry_filecheck(e, delete_broken=False, fix_mendeley=False,
                     file = candidate
 
             if old != file:
-                logger.info(e['ID']+u': file name fixed: "{}" => "{}".'.format(old, file))
+                logger.info(e['ID']+': file name fixed: "{}" => "{}".'.format(old, file))
                 fixed[old] = file # keep record of fixed files
 
         # parse PDF and check for metadata
@@ -911,7 +909,7 @@ def installcmd(o):
 
 
     def savebib(my, o):
-        logger.info(u'save '+o.bibtex)
+        logger.info('save '+o.bibtex)
         if papers.config.DRYRUN:
             return
         if my is not None:
@@ -1212,7 +1210,7 @@ def longmatch(word, target):
             first_author = lambda field : family_names(field)[0]
             entries = [e for e in entries if 'author' in e and match(firstauthor(e['author']), o.author)]
         if o.author:
-            author = lambda field : u' '.join(family_names(field))
+            author = lambda field : ' '.join(family_names(field))
             entries = [e for e in entries if 'author' in e and longmatch(author(e['author']), o.author)]
         if o.title:
             entries = [e for e in entries if 'title' in e and longmatch(e['title'], o.title)]
diff --git a/papers/config.py b/papers/config.py
index feeeea6..f2f8ebe 100644
--- a/papers/config.py
+++ b/papers/config.py
@@ -48,7 +48,7 @@ def check_filesdir(folder):
     return file_count, folder_size
 
 
-class Config(object):
+class Config:
     """configuration class to specify system-wide collections and files-dir
     """
     def __init__(self, file=CONFIG_FILE, data=DATA_DIR, cache=CACHE_DIR,
@@ -216,10 +216,7 @@ def decorator(fun):
             cache = {}
         def decorated(doi):
             if hashed_key: # use hashed parameter as key (for full text query)
-                if six.PY3:
-                    key = hashlib.sha256(doi.encode('utf-8')).hexdigest()[:6]
-                else:
-                    key = hashlib.sha256(doi).hexdigest()[:6]
+                key = hashlib.sha256(doi.encode('utf-8')).hexdigest()[:6]
             else:
                 key = doi
             if key in cache:
@@ -272,12 +269,12 @@ def move(f1, f2, copy=False, interactive=True):
             return
 
     if copy:
-        cmd = u'cp {} {}'.format(f1, f2)
+        cmd = 'cp {} {}'.format(f1, f2)
         logger.info(cmd)
         if not DRYRUN:
             shutil.copy(f1, f2)
     else:
-        cmd = u'mv {} {}'.format(f1, f2)
+        cmd = 'mv {} {}'.format(f1, f2)
         logger.info(cmd)
         if not DRYRUN:
             shutil.move(f1, f2)
diff --git a/papers/duplicate.py b/papers/duplicate.py
index c31a554..8c92921 100644
--- a/papers/duplicate.py
+++ b/papers/duplicate.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-from __future__ import print_function
 import operator
 import os
 import itertools
@@ -50,7 +48,7 @@ def groupby_equal(entries, eq=None):
         else:
             group = groups[k]
         group.append(e)
-    return sorted(six.iteritems(groups))
+    return sorted(groups.items()) 
 
 
 def search_duplicates(entries, key=None, eq=None, issorted=False, filter_key=None):
@@ -110,7 +108,7 @@ def list_uniques(entries, **kw):
 # ==================
 
 
-class ConflictingField(object):
+class ConflictingField:
     def __init__(self, choices=[]):
         self.choices = choices
 
@@ -221,14 +219,14 @@ def entry_ndiff(entries, color=True):
         if matches:
             k = matches[0]
             template = SECRET_STRING.format(k)
-            lines.append(u'\u2304'*3)
+            lines.append('\u2304'*3)
             for c in choices[k]:
-                newline = '  '+line.replace(template, u'{}'.format(c))
+                newline = '  '+line.replace(template, '{}'.format(c))
                 lines.append(_colordiffline(newline, '!') if color else newline)
                 lines.append('---')
             lines.pop() # remove last ---
             # lines.append('^^^')
-            lines.append(u'\u2303'*3)
+            lines.append('\u2303'*3)
         elif any('{} = {{'.format(k) in line for k in somemissing):
             newline = '  '+line
             lines.append(_colordiffline(newline, sign='*') if color else newline)
@@ -372,7 +370,7 @@ def edit_entries(entries, diff=False, ndiff=False):
     with open(filename, 'w') as f:
         f.write(entrystring)
 
-    res = os.system('%s %s' % (os.getenv('EDITOR'), filename))
+    res = os.system('{} {}'.format(os.getenv('EDITOR'), filename))
 
     if res == 0:
         logger.info('sucessfully edited file, insert edited entries')
@@ -400,7 +398,7 @@ class DuplicateSkipAll(Exception):
     pass
 
 
-class DuplicateHandler(object):
+class DuplicateHandler:
 
     def __init__(self, entries):
         self.entries = entries
diff --git a/papers/encoding.py b/papers/encoding.py
index 1bec1d9..5fc048f 100644
--- a/papers/encoding.py
+++ b/papers/encoding.py
@@ -1,4 +1,3 @@
-
 import os
 import six
 import bibtexparser
diff --git a/papers/extract.py b/papers/extract.py
index 6f9679e..8451fff 100644
--- a/papers/extract.py
+++ b/papers/extract.py
@@ -1,4 +1,3 @@
-from __future__ import print_function
 import os
 import json
 import six
@@ -178,7 +177,7 @@ def extract_txt_metadata(txt, search_doi=True, search_fulltext=False, max_query_
             logger.debug('doi query successful')
 
         except DOIParsingError as error:
-            logger.debug(u'doi parsing error: '+str(error))
+            logger.debug('doi parsing error: '+str(error))
 
         except DOIRequestError as error:
             return '''@misc{{{doi},
@@ -237,13 +236,13 @@ def _get_page_fast(pagerequest):
     if resp.status_code == 200:
         return resp.text
     else:
-        raise Exception('Error: {0} {1}'.format(resp.status_code, resp.reason))
+        raise Exception('Error: {} {}'.format(resp.status_code, resp.reason))
 
 
 def _scholar_score(txt, bib):
     # high score means high similarity
     from rapidfuzz.fuzz import token_set_ratio
-    return sum([token_set_ratio(bib[k], txt) for k in ['title', 'author', 'abstract'] if k in bib])
+    return sum(token_set_ratio(bib[k], txt) for k in ['title', 'author', 'abstract'] if k in bib)
 
 
 @cached('scholar-bibtex.json', hashed_key=True)
@@ -269,7 +268,7 @@ def fetch_bibtex_by_fulltext_scholar(txt, assess_results=True):
     return scholarly.bibtex(result)
 
 
-def _crossref_get_author(res, sep=u'; '):
+def _crossref_get_author(res, sep='; '):
     return sep.join([p.get('given','') + p['family'] for p in res.get('author',[]) if 'family' in p])
 
 
@@ -293,7 +292,7 @@ def crossref_to_bibtex(r):
     bib = {}
 
     if 'author' in r:
-        family = lambda p: p['family'] if len(p['family'].split()) == 1 else u'{'+p['family']+u'}'
+        family = lambda p: p['family'] if len(p['family'].split()) == 1 else '{'+p['family']+'}'
         bib['author'] = ' and '.join([family(p) + ', '+ p.get('given','')
             for p in r.get('author',[]) if 'family' in p])
 
@@ -323,7 +322,7 @@ def crossref_to_bibtex(r):
     # bibtex key
     year = str(bib.get('year','0000'))
     if 'author' in r:
-        ID = r['author'][0]['family'] + u'_' + six.u(year)
+        ID = r['author'][0]['family'] + '_' + year
     else:
         ID = year
     # if six.PY2:
@@ -338,7 +337,7 @@ def crossref_to_bibtex(r):
 # @cached('crossref-bibtex-fulltext.json', hashed_key=True)
 def fetch_bibtex_by_fulltext_crossref(txt, **kw):
     work = Works(etiquette=my_etiquette)
-    logger.debug(six.u('crossref fulltext seach:\n')+six.u(txt))
+    logger.debug('crossref fulltext seach:\n'+txt)
 
     # get the most likely match of the first results
     # results = []
diff --git a/papers/latexenc.py b/papers/latexenc.py
index 2adee13..b6281ca 100644
--- a/papers/latexenc.py
+++ b/papers/latexenc.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
 
 # Original source: github.com/okfn/bibserver
 # Authors:
@@ -83,7 +82,7 @@ def latex_to_unicode(string):
     # string, that is always having only compound accentuated character (letter
     # + accent) or single accentuated character (letter with accent). We choose
     # to normalize to the latter.
-    cleaned_string = unicodedata.normalize("NFC", u"".join(cleaned_string))
+    cleaned_string = unicodedata.normalize("NFC", "".join(cleaned_string))
 
     # Remove any left braces
     cleaned_string = cleaned_string.replace("{", "").replace("}", "")
@@ -98,7 +97,7 @@ def protect_uppercase(string):
     :param string: string to convert
     :returns: string
     """
-    string = re.sub('([^{]|^)([A-Z])([^}]|$)', '\g<1>{\g<2>}\g<3>', string)
+    string = re.sub('([^{]|^)([A-Z])([^}]|$)', r'\g<1>{\g<2>}\g<3>', string)
     return string
 
 
@@ -2691,15 +2690,9 @@ def prepare_unicode_to_latex():
         ("\uD7FF", "\\mathtt{9}"),
     )
 
-    if sys.version_info >= (3, 0):
-        unicode_to_latex = to_latex
-        unicode_to_crappy_latex1 = to_crappy1
-        unicode_to_crappy_latex2 = to_crappy2
-        unicode_to_latex_map = dict(unicode_to_latex)
-    else:
-        unicode_to_latex = tuple((k.decode('unicode-escape'), v) for k, v in to_latex)
-        unicode_to_crappy_latex1 = tuple((k.decode('unicode-escape'), v) for k, v in to_crappy1)
-        unicode_to_crappy_latex2 = tuple((k.decode('unicode-escape'), v) for k, v in to_crappy2)
-        unicode_to_latex_map = dict(unicode_to_latex)
+    unicode_to_latex = to_latex
+    unicode_to_crappy_latex1 = to_crappy1
+    unicode_to_crappy_latex2 = to_crappy2
+    unicode_to_latex_map = dict(unicode_to_latex)
 
 prepare_unicode_to_latex()
diff --git a/tests/download.py b/tests/download.py
index 706b54e..1d94251 100644
--- a/tests/download.py
+++ b/tests/download.py
@@ -1,5 +1,4 @@
 #!/bin/env python2.7
-from __future__ import print_function
 import six
 import six.moves.urllib.request
 import os
diff --git a/tests/test_papers.py b/tests/test_papers.py
index 61d11b3..b5c7948 100644
--- a/tests/test_papers.py
+++ b/tests/test_papers.py
@@ -1,5 +1,3 @@
-from __future__ import print_function, absolute_import
-
 import unittest
 import os, subprocess as sp
 import tempfile, shutil
@@ -429,7 +427,7 @@ def test_conflictyear(self):
 
 
 
-class SimilarityBase(object):
+class SimilarityBase:
 
     similarity = None
 

From 81f3094b8368b4d14229ec882ce133da9af9b4c4 Mon Sep 17 00:00:00 2001
From: Hugo van Kemenade <hugovk@users.noreply.github.com>
Date: Mon, 6 Jun 2022 09:42:00 +0300
Subject: [PATCH 23/34] pytest: drop the dot
 https://twitter.com/pytestdotorg/status/753767547866972160

---
 .travis.yml | 2 +-
 setup.cfg   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index ceb6bcf..4be0e17 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -25,7 +25,7 @@ before_install:
 install:
   - python setup.py install
 
-script: py.test -xv --cov papers.bib tests
+script: pytest -xv --cov papers.bib tests
 #script: tox -e py27 -e py34
 
 after-success: coveralls
diff --git a/setup.cfg b/setup.cfg
index dbcbd9f..ced4a8c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -18,7 +18,7 @@ python =
     3.8: py38
 
 [testenv]
-commands = py.test tests -xv
+commands = pytest tests -xv
 deps =
     bibtexparser
     six

From 90fa47e46db70a221669f573a302526b09a42135 Mon Sep 17 00:00:00 2001
From: Hugo van Kemenade <hugovk@users.noreply.github.com>
Date: Mon, 6 Jun 2022 09:43:51 +0300
Subject: [PATCH 24/34] Ignore papers_cli.egg-info

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 8b564c9..0db7fc4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+*.egg-info/
 *.pyc
 .tox
 .cache

From f94c11f7e2b13a67bf9ecedffa2ae1e1df622141 Mon Sep 17 00:00:00 2001
From: Hugo van Kemenade <hugovk@users.noreply.github.com>
Date: Mon, 6 Jun 2022 09:57:29 +0300
Subject: [PATCH 25/34] Remove six library

---
 .travis.yml         |  1 -
 README.md           |  1 -
 papers/bib.py       | 13 +++++--------
 papers/config.py    |  6 ++----
 papers/duplicate.py | 13 +++----------
 papers/encoding.py  |  9 ---------
 papers/extract.py   |  5 -----
 requirements.txt    |  1 -
 setup.cfg           |  1 -
 setup.py            |  2 +-
 tests/download.py   |  8 ++++----
 11 files changed, 15 insertions(+), 45 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 4be0e17..9d795b2 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -13,7 +13,6 @@ notifications:
 before_install:
   - pip install pip --upgrade
   - pip install bibtexparser
-  - pip install six
   - pip install crossrefapi
   - pip install rapidfuzz
   - pip install unidecode
diff --git a/README.md b/README.md
index deeafd9..4a7595d 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,6 @@ Dependencies
 - [scholarly (0.2.2)](https://github.com/OrganicIrradiation/scholarly) : interface for google scholar
 - [rapidfuzz (0.2.0)](https://github.com/rhasspy/rapidfuzz) : calculate score to sort crossref requests
 - [unidecode (0.04.21)](https://github.com/avian2/unidecode) : replace unicode with ascii equivalent
-- [six](http://pythonhosted.org/six): python 2-3 compatibility
 
 Install
 -------
diff --git a/papers/bib.py b/papers/bib.py
index 343e8c9..d5e77b5 100644
--- a/papers/bib.py
+++ b/papers/bib.py
@@ -1,4 +1,4 @@
-import os, json, sys
+import os
 import logging
 # logger.basicConfig(level=logger.INFO)
 import argparse
@@ -6,9 +6,6 @@
 import shutil
 import bisect
 import itertools
-import six
-from six.moves import input as raw_input
-import re
 
 import bibtexparser
 from normality import slugify, normalize
@@ -641,7 +638,7 @@ def fix_entry(self, e, fix_doi=True, fetch=False, fetch_all=False,
             print(bcolors.OKBLUE+'*** UPDATE ***'+bcolors.ENDC)
             print(entry_diff(e_old, e))
 
-            if raw_input('update ? [Y/n] or [Enter] ').lower() not in ('', 'y'):
+            if input('update ? [Y/n] or [Enter] ').lower() not in ('', 'y'):
                 logger.info('cancel changes')
                 e.update(e_old)
                 for k in list(e.keys()):
@@ -742,7 +739,7 @@ def entry_filecheck(e, delete_broken=False, fix_mendeley=False,
                 logger.info('delete file from entry: "{}"'.format(file))
                 continue
             elif interactive:
-                ans = raw_input('delete file from entry ? [Y/n] ')
+                ans = input('delete file from entry ? [Y/n] ')
                 if ans.lower == 'y':
                     continue
 
@@ -849,7 +846,7 @@ def installcmd(o):
         old = o.bibtex
 
         if config.git and not o.git and o.bibtex == config.bibtex:
-            ans = raw_input('stop git tracking (this will not affect actual git directory)? [Y/n] ')
+            ans = input('stop git tracking (this will not affect actual git directory)? [Y/n] ')
             if ans.lower() != 'y':
                 o.git = True
 
@@ -883,7 +880,7 @@ def installcmd(o):
             logger.warn('Cannot make global install if local config file exists.')
             ans = None
             while ans not in ('1','2'):
-                ans = raw_input('(1) remove local config file '+local_config+'\n(2) make local install\nChoice: ')
+                ans = input('(1) remove local config file '+local_config+'\n(2) make local install\nChoice: ')
             if ans == '1':
                 os.remove(local_config)
             else:
diff --git a/papers/config.py b/papers/config.py
index f2f8ebe..0c206c2 100644
--- a/papers/config.py
+++ b/papers/config.py
@@ -1,9 +1,7 @@
-import os, json, shutil
+import os, json
 import subprocess as sp, sys, shutil
 import hashlib
 import bibtexparser
-import six
-from six.moves import input as raw_input
 from papers import logger
 
 # GIT = False
@@ -264,7 +262,7 @@ def move(f1, f2, copy=False, interactive=True):
         logger.info('dest is identical to src: '+f1)
         return
     if os.path.exists(f2):
-        ans = raw_input('dest file already exists: '+f2+'. Replace? (y/n) ')
+        ans = input('dest file already exists: '+f2+'. Replace? (y/n) ')
         if ans != 'y':
             return
 
diff --git a/papers/duplicate.py b/papers/duplicate.py
index 8c92921..ad63abd 100644
--- a/papers/duplicate.py
+++ b/papers/duplicate.py
@@ -1,8 +1,6 @@
 import operator
 import os
 import itertools
-import six
-from six.moves import input as raw_input
 import re
 import difflib
 
@@ -259,8 +257,6 @@ def entry_sdiff(entries, color=True, bcolors=bcolors, best=None):
     for i, entry in enumerate(entries):
         db.entries[0] = entry
         string = bibtexparser.dumps(db)
-        if six.PY2:
-            string = string.decode('utf-8') # decode to avoid failure in replace
         # color the conflicting fields
         lines = []
         for line in string.splitlines():
@@ -327,7 +323,7 @@ def _process_choice(i):
 
     while True:
         print('choices: '+', '.join(choices))
-        i = raw_input('>>> ')
+        i = input('>>> ')
         try:
             return _process_choice(i)
         except:
@@ -364,9 +360,6 @@ def edit_entries(entries, diff=False, ndiff=False):
         db.entries.extend(entries)
         entrystring = bibtexparser.dumps(db)
 
-    if six.PY2:
-        entrystring = entrystring.encode('utf-8')
-
     with open(filename, 'w') as f:
         f.write(entrystring)
 
@@ -487,7 +480,7 @@ def interactive_loop(self, diffview=False, update=False):
                 ans = None
                 while ans not in choices:
                     print('choices: '+', '.join(choices))
-                    ans = raw_input('>>> ')
+                    ans = input('>>> ')
                 e = ans
 
             if e == 'm':
@@ -596,7 +589,7 @@ def conflict_resolution_on_insert(old, new, mode='i'):
         ans = None
         while ans not in choices:
             print('choices: '+', '.join(choices))
-            ans = raw_input('>>> ')
+            ans = input('>>> ')
         mode = ans
 
     # overwrite?
diff --git a/papers/encoding.py b/papers/encoding.py
index 5fc048f..4fc7bb9 100644
--- a/papers/encoding.py
+++ b/papers/encoding.py
@@ -1,17 +1,8 @@
 import os
-import six
 import bibtexparser
 from papers.latexenc import latex_to_unicode, unicode_to_latex
 from unidecode import unidecode as unicode_to_ascii
 
-# fix bibtexparser issue
-if six.PY2:
-    _bloads = bibtexparser.loads 
-    _bdumps = bibtexparser.dumps
-    bibtexparser.loads = lambda s: (_bloads(s.decode('utf-8') if type(s) is str else s))
-    bibtexparser.dumps = lambda db: _bdumps(db).encode('utf-8')
-
-
 # fix bibtexparser call on empty strings
 _bloads_orig = bibtexparser.loads
 def _bloads_fixed(s):
diff --git a/papers/extract.py b/papers/extract.py
index 8451fff..f3bd14d 100644
--- a/papers/extract.py
+++ b/papers/extract.py
@@ -1,10 +1,7 @@
 import os
 import json
-import six
 import subprocess as sp
-# import six.moves.urllib.request
 import re
-import shutil
 import tempfile
 
 from crossref.restful import Works, Etiquette
@@ -325,8 +322,6 @@ def crossref_to_bibtex(r):
         ID = r['author'][0]['family'] + '_' + year
     else:
         ID = year
-    # if six.PY2:
-        # ID = str(''.join([c if ord(c) < 128 else '_' for c in ID]))  # make sure the resulting string is ASCII
     bib['ID'] = ID
 
     db = bibtexparser.bibdatabase.BibDatabase()
diff --git a/requirements.txt b/requirements.txt
index 3e2bb2f..86b732a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,3 @@ crossrefapi
 bibtexparser
 scholarly
 rapidfuzz
-six
diff --git a/setup.cfg b/setup.cfg
index ced4a8c..0612b56 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -21,7 +21,6 @@ python =
 commands = pytest tests -xv
 deps =
     bibtexparser
-    six
     scholarly
     crossrefapi
     rapidfuzz
diff --git a/setup.py b/setup.py
index c36d796..3167686 100644
--- a/setup.py
+++ b/setup.py
@@ -14,5 +14,5 @@
       packages=['papers'],
       scripts=['scripts/papers'],
       license = "MIT",
-      install_requires = ["bibtexparser","crossrefapi","rapidfuzz", "unidecode", "scholarly", "six"],
+      install_requires=["bibtexparser", "crossrefapi", "rapidfuzz", "unidecode", "scholarly"],
       )
diff --git a/tests/download.py b/tests/download.py
index 1d94251..86b60ff 100644
--- a/tests/download.py
+++ b/tests/download.py
@@ -1,8 +1,7 @@
-#!/bin/env python2.7
-import six
-import six.moves.urllib.request
+#!/bin/env python3
 import os
 import logging
+import urllib.request
 
 DOWNDIR = os.path.join(os.path.dirname(__file__), 'downloadedpapers')
 
@@ -12,6 +11,7 @@
     'esd-4-11-2013-supplement.pdf': 'https://www.earth-syst-dynam.net/4/11/2013/esd-4-11-2013-supplement.pdf',
 }
 
+
 def _downloadpdf(url, filename, overwrite=False):
 
     if os.path.exists(filename) and not overwrite:
@@ -25,7 +25,7 @@ def _downloadpdf(url, filename, overwrite=False):
 
     print('download',url,'to',filename)
 
-    response = six.moves.urllib.request.urlopen(url)
+    response = urllib.request.urlopen(url)
     resp = response.read()
 
     with open(filename, 'wb') as f:

From 15e2fef9d69a2abb138a1912132857c93292b07f Mon Sep 17 00:00:00 2001
From: Hugo van Kemenade <hugovk@users.noreply.github.com>
Date: Mon, 6 Jun 2022 10:08:24 +0300
Subject: [PATCH 26/34] Switch from python2.7 to python3

---
 scripts/papers | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/papers b/scripts/papers
index cbfc0a0..b00b800 100644
--- a/scripts/papers
+++ b/scripts/papers
@@ -1,4 +1,4 @@
-#!/bin/env python2.7
+#!/bin/env python3
 import papers.bib
 
 if __name__ == '__main__':

From 36443800f37e6fdad38a95fbf05fc6aa60e28ae0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Leon=20Me=C3=9Fner?= <elon@zwelf.net>
Date: Thu, 16 Dec 2021 12:20:57 +0100
Subject: [PATCH 27/34] Import papers.duplicate.entry_diff for fix_entry()

fix_entry method of Biblio class needs entry_diff in interactive mode
so import it.
---
 papers/bib.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/papers/bib.py b/papers/bib.py
index d5e77b5..e741e63 100644
--- a/papers/bib.py
+++ b/papers/bib.py
@@ -24,6 +24,7 @@
 
 from papers.duplicate import check_duplicates, resolve_duplicates, conflict_resolution_on_insert
 from papers.duplicate import search_duplicates, list_duplicates, list_uniques, merge_files, edit_entries
+from papers.duplicate import entry_diff
 
 # DRYRUN = False
 

From bfa60c03cd969b21edd803559f5e54703ca3cccb Mon Sep 17 00:00:00 2001
From: perrette <mahe.perrette@pik-potsdam.de>
Date: Tue, 14 Jun 2022 13:37:42 +0200
Subject: [PATCH 28/34] remove unused .travis.yml and add github/workflows
 tests for python versions 3.9 and 3.10

---
 .github/workflows/tests.yml |  4 ++--
 .travis.yml                 | 30 ------------------------------
 README.md                   |  5 +++--
 setup.cfg                   |  6 ++++--
 4 files changed, 9 insertions(+), 36 deletions(-)
 delete mode 100644 .travis.yml

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 661fda9..337f9bb 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -10,7 +10,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.8]
+        python-version: [3.8,3.9,3.10]
 
     steps:
     - uses: actions/checkout@v2
@@ -24,4 +24,4 @@ jobs:
         pip install tox tox-gh-actions
         sudo apt install poppler-utils
     - name: Test with tox
-      run: tox
\ No newline at end of file
+      run: tox
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 9d795b2..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-language: python
-dist: xenial
-sudo : false
-addons:
-  apt:
-    packages:
-    - poppler-utils
-python:
-  - "3.5"
-  - "3.8"
-notifications:
-  email: false
-before_install:
-  - pip install pip --upgrade
-  - pip install bibtexparser
-  - pip install crossrefapi
-  - pip install rapidfuzz
-  - pip install unidecode
-  - pip install scholarly
-# test coverage
-  - pip install pytest
-  - pip install pytest-cov
-  - pip install coveralls
-install:
-  - python setup.py install
-
-script: pytest -xv --cov papers.bib tests
-#script: tox -e py27 -e py34
-
-after-success: coveralls
diff --git a/README.md b/README.md
index 4a7595d..7604b0d 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 [![test](https://github.com/perrette/papers/workflows/CI/badge.svg)](https://github.com/perrette/papers/actions)
 [![python](https://img.shields.io/badge/python-3.8-blue.svg)]()
-<!-- [![python](https://img.shields.io/badge/python-3.5%20%7C%203.8-blue.svg)]() -->
+[![python](https://img.shields.io/badge/python-3.9-blue.svg)]()
+[![python](https://img.shields.io/badge/python-3.10-blue.svg)]()
 
 # papers
 
@@ -20,7 +21,7 @@ command-line bibliography managenent tool. Aims:
 
 Dependencies
 ------------
-- python 3
+- python 3.8+
 - [poppler-utils](https://en.wikipedia.org/wiki/Poppler_(software)) (only:`pdftotext`): convert PDF to text for parsing
 - [bibtexparser (1.0.1)](https://bibtexparser.readthedocs.io) : parse bibtex files
 - [crossrefapi (1.2.0)](https://github.com/fabiobatalha/crossrefapi) : make polite requests to crossref API
diff --git a/setup.cfg b/setup.cfg
index 0612b56..c249c7e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -11,11 +11,13 @@ tag_prefix =
 parentdir_prefix = papers-
 
 [tox:tox]
-envlist = py38
+envlist = py38, py39, py310
 
 [gh-actions]
 python =
     3.8: py38
+    3.9: py39
+    3.10: py310
 
 [testenv]
 commands = pytest tests -xv
@@ -25,4 +27,4 @@ deps =
     crossrefapi
     rapidfuzz
     unidecode
-    pytest
\ No newline at end of file
+    pytest

From 366739e20ec622f0738f2ed2a5510b281d05aefe Mon Sep 17 00:00:00 2001
From: perrette <mahe.perrette@pik-potsdam.de>
Date: Tue, 14 Jun 2022 13:45:10 +0200
Subject: [PATCH 29/34] python versions as string (otherwise 3.10 = 3.1)

---
 .github/workflows/tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 337f9bb..9e6dafd 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -10,7 +10,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.8,3.9,3.10]
+        python-version: [ "3.8", "3.9", "3.10" ]
 
     steps:
     - uses: actions/checkout@v2

From 6ac5c6e9d22ba36c65097f6bb935a0bbf48acd20 Mon Sep 17 00:00:00 2001
From: perrette <mahe.perrette@pik-potsdam.de>
Date: Sun, 16 Apr 2023 19:08:01 +0200
Subject: [PATCH 30/34] tests updated

---
 tests/downloadedpapers/bg-8-515-2011.txt | 84 ++++++++++++++++++++++++
 tests/test_papers.py                     | 35 +++++-----
 2 files changed, 104 insertions(+), 15 deletions(-)
 create mode 100644 tests/downloadedpapers/bg-8-515-2011.txt

diff --git a/tests/downloadedpapers/bg-8-515-2011.txt b/tests/downloadedpapers/bg-8-515-2011.txt
new file mode 100644
index 0000000..616f6dc
--- /dev/null
+++ b/tests/downloadedpapers/bg-8-515-2011.txt
@@ -0,0 +1,84 @@
+Biogeosciences, 8, 515–524, 2011
+www.biogeosciences.net/8/515/2011/
+doi:10.5194/bg-8-515-2011
+© Author(s) 2011. CC Attribution 3.0 License.
+
+Biogeosciences
+
+Near-ubiquity of ice-edge blooms in the Arctic
+M. Perrette1,* , A. Yool1 , G. D. Quartly1 , and E. E. Popova1
+1 National
+* now
+
+Oceanography Centre; Univ. of Southampton Waterfront Campus, European Way, Southampton SO14 3ZH, UK
+at: Potsdam Institute for Climate Impact Research (PIK), Telegrafenberg A31, 14412 Potsdam, Germany
+
+Received: 22 September 2010 – Published in Biogeosciences Discuss.: 4 November 2010
+Revised: 10 February 2011 – Accepted: 15 February 2011 – Published: 25 February 2011
+
+Abstract. Ice-edge blooms are significant features of Arctic
+primary production, yet have received relatively little attention. Here we combine satellite ocean colour and sea-ice data
+in a pan-Arctic study. Ice-edge blooms occur in all seasonally ice-covered areas and from spring to late summer, being
+observed in 77–89% of locations for which adequate data exist, and usually peaking within 20 days of ice retreat. They
+sometimes form long belts along the ice-edge (greater than
+100 km), although smaller structures were also found. The
+bloom peak is on average more than 1 mg m−3 , with major
+blooms more than 10 mg m−3 , and is usually located close
+to the ice-edge, though not always. Some propagate behind
+the receding ice-edge over hundreds of kilometres and over
+several months, while others remain stationary. The strong
+connection between ice retreat and productivity suggests that
+the ongoing changes in Arctic sea-ice may have a significant
+impact on higher trophic levels and local fish stocks.
+
+1
+
+Introduction
+
+The classical picture of Arctic ice-edge phytoplankton
+blooms found in the literature – mainly based on cruise transects – is of a long but narrow (20–100 km) band along the
+ice-edge, moving northward as the ice breaks up and melts
+over spring and summer (Sakshaug and Skjoldal, 1989).
+They differ from more traditional open-water blooms with
+respect to the nature of water column stratification, here induced primarily by freshwater input instead of solar heating.
+When sea-ice breaks up and melts, there is an input of freshwater to the surface that induces strong stratification. Another causal factor is increased solar irradiance at the surface
+as ice cover shrinks. Since irradiance is typically sufficient
+Correspondence to: M. Perrette
+(mahe.perrette@pik-potsdam.de)
+
+by the time ice cover recedes, Sverdrup’s (1953) criterion of
+a mixed layer shallower than the critical depth is met, making
+the light regime suitable for phytoplankton growth. Ice-edge
+blooms are generally understood as short-lived phenomena
+that quickly strip out the nutrients of the shallow (15–35 m)
+surface mixed layer characteristic of seasonally ice-covered
+waters (Niebauer, 1991). The area located between the multiyear ice and maximal extent is the seasonal ice cover, and this
+forms the subject of this study, with a particular focus on the
+marginal ice zone (MIZ), which is the region of recent ice
+melt.
+Ice-edge phytoplankton blooms have been detected from
+cruises in many locations including Bering Sea (Alexander
+and Niebauer, 1981; Niebauer et al., 1995), Chukchi and
+Beaufort Seas (Hill et al., 2005; Sukhanova et al., 2009),
+Canadian Archipelago (Klein et al., 2002; Tremblay et al.,
+2006), Greenland Sea (Smith et al., 1997), Barents Sea
+(Luchetta et al., 2000; Hegseth and Sundfjord, 2008), and
+also in the Southern Ocean (Smith and Nelson, 1985). In
+the Barents Sea and on the Bering Shelf they are thought
+to account for 50–65% of annual primary production (Sakshaug, 2004). Indications of ice-edge blooms had been noted
+in ocean colour imagery from the Coastal Zone Color Scanner (e.g. Maynard, 1986; Maynard et al., 1987; Mitchell et
+al., 1991; Kögeler and Rey, 1999) but detailed investigations
+were not possible on account of its poor sampling due to limited onboard storage, and underestimation problems close to
+ice due to a “ringing effect” as the scan line moved from
+bright to dark features (Mitchell et al., 1991). The launch
+of the SeaWiFS in 1997 ushered in a new era of long-term
+continuous ocean colour observations, with the whole globe
+sampled every two days, albeit that in some places cloud frequently obscures the surface. However, the potential of the
+SeaWiFS archive for the investigation of ice-edge blooms
+has only led to a few publications to date (e.g. Arrigo and
+van Dijken, 2004), and thus a primary aim of this study is
+to fill this gap and investigate their existence at the large
+
+Published by Copernicus Publications on behalf of the European Geosciences Union.
+
+
\ No newline at end of file
diff --git a/tests/test_papers.py b/tests/test_papers.py
index b5c7948..3e36d48 100644
--- a/tests/test_papers.py
+++ b/tests/test_papers.py
@@ -2,6 +2,7 @@
 import os, subprocess as sp
 import tempfile, shutil
 import difflib
+from pathlib import Path
 
 from papers.extract import extract_pdf_metadata
 from papers.bib import Biblio, bibtexparser, parse_file, format_file
@@ -31,7 +32,10 @@ def prepare_paper():
     year = 2011,
 }"""
 
-    return pdf, doi, key, newkey, year, bibtex
+    file_rename = "perrette-yool2011_near-ubiquity-of-ice-edge-blooms-in-the-arctic.pdf"
+
+    return pdf, doi, key, newkey, year, bibtex, file_rename
+
 
 def prepare_paper2():
     pdf = downloadpdf('esd-4-11-2013.pdf')
@@ -53,7 +57,8 @@ def prepare_paper2():
     volume = {4},
     year = 2013,
 }"""
-    return pdf, si, doi, key, newkey, year, bibtex
+    file_rename = "perrette-landerer2013_a-scaling-approach-to-project-regional-sea-level-rise-and-its.pdf"
+    return pdf, si, doi, key, newkey, year, bibtex, file_rename
 
 class TestBibtexFileEntry(unittest.TestCase):
 
@@ -89,7 +94,7 @@ def test_format_files(self):
 class TestSimple(unittest.TestCase):
 
     def setUp(self):
-        self.pdf, self.doi, self.key, self.newkey, self.year, self.bibtex = prepare_paper()
+        self.pdf, self.doi, self.key, self.newkey, self.year, self.bibtex, self.file_rename = prepare_paper()
         self.assertTrue(os.path.exists(self.pdf))
 
     def test_doi(self):
@@ -127,7 +132,7 @@ def tearDown(self):
 class TestAdd(unittest.TestCase):
 
     def setUp(self):
-        self.pdf, self.doi, self.key, self.newkey, self.year, self.bibtex = prepare_paper()
+        self.pdf, self.doi, self.key, self.newkey, self.year, self.bibtex, self.file_rename = prepare_paper()
         self.assertTrue(os.path.exists(self.pdf))
         self.mybib = tempfile.mktemp(prefix='papers.bib')
         self.filesdir = tempfile.mktemp(prefix='papers.files')
@@ -144,7 +149,7 @@ def _checkbib(self, doi_only=False, dismiss_key=False):
             self.assertEqual([e['doi'] for e in db1.entries], [e['doi'] for e in db2.entries]) # entry is as expected
             # self.assertEqual([e['title'].lower() for e in db1.entries], [e['title'].lower() for e in db2.entries]) # entry is as expected
         elif dismiss_key:
-            f = lambda e: {k:e[k] for k in e if k!='ID'}
+            f = lambda e: bibtexparser.customization.convert_to_unicode({k:e[k] for k in e if k!='ID'})
             self.assertEqual([f(e) for e in db1.entries], [f(e) for e in db2.entries]) # entry is as expected
         else:
             self.assertEqual(db1.entries, db2.entries) # entry is as expected
@@ -171,6 +176,8 @@ def test_add(self):
 
         file_ = self._checkbib(dismiss_key=True)
         file = self._checkfile(file_)
+        print("file created during test:", file)
+        print("file for check:", self.pdf)
         self.assertEqual(file, self.pdf)
         # self.assertTrue(os.path.exists(self.pdf)) # old pdf still exists
 
@@ -193,7 +200,7 @@ def test_add_rename_copy(self):
 
         file_ = self._checkbib(dismiss_key=True)  # 'file:pdf'
         file = self._checkfile(file_)
-        self.assertEqual(file, os.path.join(self.filesdir, self.year, self.newkey+'.pdf')) # update key since pdf
+        self.assertEqual(file, os.path.join(self.filesdir, self.file_rename)) # update key since pdf
         self.assertTrue(os.path.exists(self.pdf)) # old pdf still exists
 
 
@@ -207,7 +214,7 @@ def test_add_rename(self):
 
         file_ = self._checkbib(dismiss_key=True)  # 'file:pdf'
         file = self._checkfile(file_)
-        self.assertEqual(file, os.path.join(self.filesdir,self.year,self.newkey+'.pdf')) # update key since pdf
+        self.assertEqual(file, os.path.join(self.filesdir,self.file_rename)) # update key since pdf
         self.assertFalse(os.path.exists(pdfcopy))
 
 
@@ -223,7 +230,7 @@ def tearDown(self):
 class TestAdd2(TestAdd):
 
     def setUp(self):
-        self.pdf, self.si, self.doi, self.key, self.newkey, self.year, self.bibtex = prepare_paper2()
+        self.pdf, self.si, self.doi, self.key, self.newkey, self.year, self.bibtex, self.file_rename = prepare_paper2()
         self.assertTrue(os.path.exists(self.pdf))
         self.mybib = tempfile.mktemp(prefix='papers.bib')
         self.filesdir = tempfile.mktemp(prefix='papers.files')
@@ -245,9 +252,7 @@ def test_add_attachment(self):
         dirsi = os.path.dirname(si)
         self.assertEqual(dirmain, dirsi)
         dirmains = dirmain.split(os.path.sep)
-        self.assertEqual(dirmains[-1], self.newkey)  # NEW KEY since loaded as pdf
-        self.assertEqual(dirmains[-2], self.year)
-        self.assertEqual(os.path.sep.join(dirmains[:-2]), self.filesdir)
+        self.assertEqual(Path(dirmain).name, Path(self.file_rename).stem)
         # individual files have not been renamed
         self.assertEqual(os.path.basename(main), os.path.basename(self.pdf))
         self.assertEqual(os.path.basename(si), os.path.basename(self.si))
@@ -261,8 +266,8 @@ class TestAddBib(unittest.TestCase):
     def setUp(self):
         self.mybib = tempfile.mktemp(prefix='papers.bib')
         self.somebib = tempfile.mktemp(prefix='papers.somebib.bib')
-        self.pdf1, self.doi, self.key1, self.newkey1, self.year, self.bibtex1 = prepare_paper()
-        self.pdf2, self.si, self.doi, self.key2, self.newkey2, self.year, self.bibtex2 = prepare_paper2()
+        self.pdf1, self.doi, self.key1, self.newkey1, self.year, self.bibtex1, self.file_rename1 = prepare_paper()
+        self.pdf2, self.si, self.doi, self.key2, self.newkey2, self.year, self.bibtex2, self.file_rename2 = prepare_paper2()
         bib = '\n'.join([self.bibtex1, self.bibtex2])
         open(self.somebib,'w').write(bib)
         self.my = Biblio.newbib(self.mybib, '')
@@ -285,8 +290,8 @@ def tearDown(self):
 class TestAddDir(unittest.TestCase):
 
     def setUp(self):
-        self.pdf1, self.doi, self.key1, self.newkey1, self.year, self.bibtex1 = prepare_paper()
-        self.pdf2, self.si, self.doi, self.key2, self.newkey2, self.year, self.bibtex2 = prepare_paper2()
+        self.pdf1, self.doi, self.key1, self.newkey1, self.year, self.bibtex1, self.file_rename1 = prepare_paper()
+        self.pdf2, self.si, self.doi, self.key2, self.newkey2, self.year, self.bibtex2, self.file_rename2 = prepare_paper2()
         self.somedir = tempfile.mktemp(prefix='papers.somedir')
         self.subdir = os.path.join(self.somedir, 'subdir')
         os.makedirs(self.somedir)

From 480ff8bb61ac711dd20ea224b64e2ef3d754aae2 Mon Sep 17 00:00:00 2001
From: perrette <mahe.perrette@pik-potsdam.de>
Date: Sun, 16 Apr 2023 19:08:49 +0200
Subject: [PATCH 31/34] add normality requirement

---
 requirements.txt | 1 +
 setup.cfg        | 1 +
 2 files changed, 2 insertions(+)

diff --git a/requirements.txt b/requirements.txt
index 86b732a..a8752c9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,4 @@ crossrefapi
 bibtexparser
 scholarly
 rapidfuzz
+normality
diff --git a/setup.cfg b/setup.cfg
index c249c7e..3da46da 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -27,4 +27,5 @@ deps =
     crossrefapi
     rapidfuzz
     unidecode
+    normality
     pytest

From c227f9cc768a9ee6bc972e386478d27cc63fd310 Mon Sep 17 00:00:00 2001
From: perrette <mahe.perrette@pik-potsdam.de>
Date: Mon, 17 Apr 2023 03:07:18 +0200
Subject: [PATCH 32/34] more flexible naming scheme for file names

---
 papers/bib.py        | 169 ++++++++++++++++---------------------------
 papers/config.py     |  21 +++---
 papers/filename.py   |  94 ++++++++++++++++++++++++
 tests/test_papers.py |   5 +-
 4 files changed, 172 insertions(+), 117 deletions(-)
 create mode 100644 papers/filename.py

diff --git a/papers/bib.py b/papers/bib.py
index 5103ea8..93712c2 100644
--- a/papers/bib.py
+++ b/papers/bib.py
@@ -8,7 +8,6 @@
 import itertools
 
 import bibtexparser
-from normality import slugify, normalize
 
 import papers
 from papers import logger
@@ -58,33 +57,6 @@ def append_abc(key, keys=[]):
     return Key
 
 
-def listtag(words, maxlength=30, minwordlen=3, n=100, sep='-'):
-    # preformat & filter words
-    words = [word for word in words if len(word) >= minwordlen]
-    while True:
-        tag = sep.join(words[:n])
-        n -= 1
-        if len(tag) <= maxlength or n < 2:
-            break
-    return tag
-
-
-def generate_key(entry, nauthor=config.nauthor, ntitle=config.ntitle, minwordlen=3, maxtitlen=4, keys=None, authorsep='_'):
-    # names = bibtexparser.customization.getnames(entry.get('author','unknown').lower().split(' and '))
-    names = family_names(entry.get('author','unknown').lower())
-    authortag = authorsep.join([slugify(nm) for nm in names[:nauthor]])
-    yeartag = str(entry.get('year','0000'))
-    if not ntitle or not entry.get('title',''):
-        titletag = ''
-    else:
-        titlewords = normalize(entry['title']).lower().split()
-        titletag = listtag(titlewords, n=ntitle, minwordlen=minwordlen, maxlength=maxtitlen, sep='-')
-    key = authortag + yeartag + titletag
-    if keys and key in keys: # and not isinstance(keys, set):
-        key = append_abc(key, keys)
-    return key
-
-
 # DUPLICATE DEFINITION
 # ====================
 
@@ -121,8 +93,6 @@ def entry_id(e):
     return (e.get('doi','').lower(), authortitle)
 
 
-
-
 FUZZY_RATIO = 80
 
 # should be conservative (used in papers add)
@@ -213,8 +183,6 @@ def read_entry_dir(self, direc, update_files=True):
     return entry
 
 
-
-
 def backupfile(bibtex):
     return os.path.join(os.path.dirname(bibtex), '.'+os.path.basename(bibtex)+'.backup')
 
@@ -224,7 +192,7 @@ class DuplicateKeyError(ValueError):
 class Biblio:
     """main config
     """
-    def __init__(self, db=None, filesdir=None, key_field='ID', nauthor=config.nauthor, ntitle=config.ntitle, nameformat=config.nameformat, similarity=DEFAULT_SIMILARITY):
+    def __init__(self, db=None, filesdir=None, key_field='ID', nameformat=None, keyformat=None, similarity=DEFAULT_SIMILARITY):
         self.filesdir = filesdir
         # assume an already sorted list
         self.key_field = key_field
@@ -233,11 +201,10 @@ def __init__(self, db=None, filesdir=None, key_field='ID', nauthor=config.nautho
         elif not isinstance(db, bibtexparser.bibdatabase.BibDatabase):
             raise TypeError('db must of type BibDatabase')
         self.db = db
-        self.sort()
-        self.nauthor = nauthor
-        self.ntitle = ntitle
-        self.nameformat = nameformat
+        self.nameformat = nameformat or config.nameformat
+        self.keyformat = keyformat or config.keyformat
         self.similarity = similarity
+        self.sort()
 
     @property
     def entries(self):
@@ -365,7 +332,10 @@ def insert_entry_check(self, entry, update_key=False, mergefiles=True, on_confli
     def generate_key(self, entry):
         " generate a unique key not yet present in the record "
         keys = {self.key(e) for e in self.db.entries}
-        return generate_key(entry, keys=keys, nauthor=self.nauthor, ntitle=self.ntitle)
+        key = self.keyformat(entry)
+        if keys and key in keys: # and not isinstance(keys, set):
+            key = append_abc(key, keys)
+        return key
 
     def append_abc_to_key(self, entry):
         return append_abc(entry['ID'], keys={self.key(e) for e in self.entries})
@@ -413,7 +383,7 @@ def add_pdf(self, pdf, attachments=None, rename=False, copy=False, search_doi=Tr
         self.insert_entry(entry, update_key=True, **kw)
 
         if rename:
-            self.rename_entry_files(entry, copy=copy, nameformat=self.nameformat)
+            self.rename_entry_files(entry, copy=copy)
 
 
     def scan_dir(self, direc, search_doi=True, search_fulltext=True, **kw):
@@ -464,18 +434,18 @@ def check_duplicates(self, key=None, eq=None, mode='i'):
         self.sort() # keep sorted
 
 
-    def rename_entry_files(self, e, copy=False, nameformat='year,/,ID'):
-        """ Rename files according to 'nameformat'
-            'nameformat' is a comma-separated string, and every field that is in
-            e.keys() will be replaced by the corresponding value. Fields not in
-            e.keys() will remain untouched.
+    def rename_entry_files(self, e, copy=False, formatter=None):
+        """ Rename files
+
+        See `config.Format` class
 
-            To rename esd-4-11-2013.pdf as perrette_2013.pdf, nameformat should be 'author,_,year'.
+
+            To rename esd-4-11-2013.pdf as perrette_2013.pdf, nameformat should be '{author}_{year}' with --name-author-num 1.
             If that happens to be the entry ID, 'ID' also works.
 
             To rename esd-4-11-2013.pdf as
             2013/Perrette2013-AScalingApproachToProjectRegionalSeaLevelRiseAndItsUncertainties.pdf,
-            nameformat should be 'year,/,Author,year,Title' (note the case).
+            nameformat should be '{year}/{Author}{year}-{Title}' with --name-author-num 1 (note the case).
 
             Entries are case-sensitive, so that:
                 'author' generates 'perrette'
@@ -496,53 +466,7 @@ def rename_entry_files(self, e, copy=False, nameformat='year,/,ID'):
             logger.info('no files to rename')
             return
 
-        def autoname(namestr):
-            # Adapted
-            # from https://gitlab.com/malfatti/SciScripts/-/blob/master/Python3/Files/FixStupidFileNames.py
-
-            New = namestr
-
-            for C in ['"',"'",'!','@','#','$','%','&','*','+','=',';',':','?',',','/','\\']:
-                if C in New: New = New.replace(C,'')
-
-            for C in ['(', ')', '[', ']', '{', '}', '<', '>', '|']:
-                if C in New: New = New.replace(C,'_')
-
-            return(New)
-
-
-        def malfatti_rename(e, nameformat):
-            Fields = nameformat.split(',')
-            for F,Field in enumerate(Fields):
-                if Field == 'ID':
-                    Fields[F] = autoname(e['ID'])
-
-                elif Field == 'year':
-                    Fields[F] = e.get('year','0000')
-
-                elif Field.lower() in e.keys():
-                    if Field.lower() == 'author':
-                        Names = family_names(e['author'])
-
-                        if len(Names) >= 3: eField = Names[0] + ' et al'
-                        elif len(Names) == 2: eField = ' and '.join(Names)
-                        else: eField = Names[0]
-
-                    else:
-                        eField = e[Field.lower()]
-
-                    if Field.istitle():
-                        Fields[F] = autoname(eField).title().replace(' ','')
-                    elif Field.islower():
-                        Fields[F] = autoname(eField).lower().replace(' ','')
-                    elif Field.isupper():
-                        Fields[F] = autoname(eField).upper().replace(' ','')
-                    else:
-                        Fields[F] = autoname(eField).replace(' ','')
-
-            return ''.join(Fields)
-
-        newname = malfatti_rename(e, nameformat)
+        newname = (formatter or self.nameformat)(e)
 
         count = 0
         if len(files) == 1:
@@ -597,7 +521,7 @@ def malfatti_rename(e, nameformat):
     def rename_entries_files(self, copy=False):
         for e in self.db.entries:
             try:
-                self.rename_entry_files(e, copy, nameformat=self.nameformat)
+                self.rename_entry_files(e, copy)
             except Exception as error:
                 logger.error(str(error))
                 continue
@@ -848,12 +772,52 @@ def main():
         help='bibtex database (default: %(default)s)')
     grp.add_argument('--dry-run', action='store_true',
         help='no PDF renaming/copying, no bibtex writing on disk (for testing)')
-    grp.add_argument('--nauthor', type=int, default=config.nauthor,
+
+    grp.add_argument('--key-template', default=config.keyformat.template,
+        help='python template for generating keys (default:%(default)s)')
+    grp.add_argument('--key-author-num', type=int, default=config.keyformat.author_num,
         help='number of authors to include in key (default:%(default)s)')
-    grp.add_argument('--ntitle', type=int, default=config.ntitle,
+    grp.add_argument('--key-author-sep', default=config.keyformat.author_sep,
+        help='separator for authors in key (default:%(default)s)')
+    grp.add_argument('--key-title-word-num', type=int, default=config.keyformat.title_word_num,
         help='number of title words to include in key (default:%(default)s)')
-    grp.add_argument('--nameformat', default=config.nameformat,
-        help='comma-separated fields for renaming files (default:%(default)s)')
+    grp.add_argument('--key-title-word-size', type=int, default=config.keyformat.title_word_size,
+        help='number of title words to include in key (default:%(default)s)')
+    grp.add_argument('--key-title-sep', default=config.keyformat.title_sep,
+        help='separator for title words in key (default:%(default)s)')
+
+    grp.add_argument('--name-template', default=config.nameformat.template,
+        help='python template for renaming files (default:%(default)s)')
+    grp.add_argument('--name-author-num', type=int, default=config.nameformat.author_num,
+        help='number of authors to include in filename (default:%(default)s)')
+    grp.add_argument('--name-author-sep', default=config.nameformat.author_sep,
+        help='separator for authors in filename (default:%(default)s)')
+    grp.add_argument('--name-title-word-num', type=int, default=config.nameformat.title_word_num,
+        help='number of title words to include in filename (default:%(default)s)')
+    grp.add_argument('--name-title-word-size', type=int, default=config.nameformat.title_word_size,
+        help='min size of title words to include in filename (default:%(default)s)')
+    grp.add_argument('--name-title-length', type=int, default=config.nameformat.title_length,
+        help='title length to include in filename (default:%(default)s)')
+    grp.add_argument('--name-title-sep', default=config.nameformat.title_sep,
+        help='separator for title words in filename (default:%(default)s)')
+
+
+    def set_format_config_from_cmd(o):
+        config.keyformat.template = o.key_template
+        config.keyformat.author_num = o.key_author_num
+        config.keyformat.author_sep = o.key_author_sep
+        config.keyformat.title_word_num = o.key_title_word_num
+        config.keyformat.title_word_size = o.key_title_word_size
+        config.keyformat.title_sep = o.key_title_sep
+
+        config.nameformat.template = o.name_template
+        config.nameformat.author_num = o.name_author_num
+        config.nameformat.author_sep = o.name_author_sep
+        config.nameformat.title_length = o.name_title_length
+        config.nameformat.title_word_num = o.name_title_word_num
+        config.nameformat.title_word_size = o.name_title_word_size
+        config.nameformat.title_sep = o.name_title_sep
+
 
     # status
     # ======
@@ -920,10 +884,6 @@ def installcmd(o):
         if o.filesdir is not None:
             config.filesdir = o.filesdir
 
-        config.nauthor = o.nauthor
-        config.ntitle = o.ntitle
-        config.nameformat = o.nameformat
-
         if o.reset_paths:
             config.reset()
 
@@ -1032,10 +992,6 @@ def addcmd(o):
         else:
             my = Biblio.newbib(o.bibtex, o.filesdir)
 
-        my.nauthor = o.nauthor
-        my.ntitle = o.ntitle
-        my.nameformat = o.nameformat
-
         if len(o.file) > 1 and o.attachment:
             logger.error('--attachment is only valid for one added file')
             addp.exit(1)
@@ -1433,6 +1389,7 @@ def gitcmd(o):
         return statuscmd(o)
 
     def check_install():
+        set_format_config_from_cmd(o)
         if not os.path.exists(o.bibtex):
             print('papers: error: no bibtex file found, use `papers install` or `touch {}`'.format(o.bibtex))
             parser.exit(1)
diff --git a/papers/config.py b/papers/config.py
index 975bcef..a4fe3f7 100644
--- a/papers/config.py
+++ b/papers/config.py
@@ -3,6 +3,7 @@
 import hashlib
 import bibtexparser
 from papers import logger
+from papers.filename import Format
 
 # GIT = False
 DRYRUN = False
@@ -45,19 +46,23 @@ def check_filesdir(folder):
             file_count += 1
     return file_count, folder_size
 
+KEYFORMAT = Format(template='{author}{year}', author_num=2, author_sep="_")
+NAMEFORMAT = Format(template='{authorX}_{year}_{title}', author_sep="_", title_sep="-")
 
 class Config:
     """configuration class to specify system-wide collections and files-dir
     """
     def __init__(self, file=CONFIG_FILE, data=DATA_DIR, cache=CACHE_DIR,
-        bibtex=None, filesdir=None, nauthor=2, ntitle=0, nameformat='year,/,ID', gitdir=None, git=False, gitlfs=False):
+        bibtex=None, filesdir=None,
+        keyformat=KEYFORMAT,
+        nameformat=NAMEFORMAT,
+        gitdir=None, git=False, gitlfs=False):
         self.file = file
         self.data = data
         self.cache = cache
         self.filesdir = filesdir or os.path.join(data, 'files')
         self.bibtex = bibtex  or os.path.join(data, 'papers.bib')
-        self.nauthor = nauthor
-        self.ntitle = ntitle
+        self.keyformat = keyformat
         self.nameformat = nameformat
         self.gitdir = gitdir  or data
         self.git = git
@@ -74,9 +79,8 @@ def save(self):
         json.dump({
             "filesdir":self.filesdir,
             "bibtex":self.bibtex,
-            "nauthor":self.nauthor,
-            "ntitle":self.ntitle,
-            "nameformat":self.nameformat,
+            "keyformat":self.keyformat.todict(),
+            "nameformat":self.nameformat.todict(),
             "git":self.git,
             "gitdir":self.gitdir,
             }, open(self.file, 'w'), sort_keys=True, indent=2, separators=(',', ': '))
@@ -86,9 +90,8 @@ def load(self):
         js = json.load(open(self.file))
         self.bibtex = js.get('bibtex', self.bibtex)
         self.filesdir = js.get('filesdir', self.filesdir)
-        self.nauthor = js.get('nauthor', self.nauthor)
-        self.ntitle = js.get('ntitle', self.ntitle)
-        self.nameformat = js.get('nameformat', self.nameformat)
+        self.nameformat = Format(**js["nameformat"]) if "nameformat" in js else self.nameformat
+        self.keyformat = Format(**js["keyformat"]) if "keyformat" in js else self.keyformat
         self.git = js.get('git', self.git)
         self.gitlfs = js.get('gitlfs', self.gitlfs)
         self.gitdir = js.get('gitdir', self.gitdir)
diff --git a/papers/filename.py b/papers/filename.py
new file mode 100644
index 0000000..7f7c138
--- /dev/null
+++ b/papers/filename.py
@@ -0,0 +1,94 @@
+"""Key and file name formatting
+"""
+from normality import slugify, normalize
+from papers.encoding import family_names
+
+
+def listtag(words, maxlength=30, minwordlen=3, n=100, sep='-'):
+    # preformat & filter words
+    words = [word for word in words if len(word) >= minwordlen]
+    while True:
+        tag = sep.join(words[:n])
+        n -= 1
+        if len(tag) <= maxlength or n < 2:
+            break
+    return tag
+
+def _cite_author(names):
+    if len(names) >= 3: return names[0] + ' et al'
+    elif len(names) == 2: return ' and '.join(names)
+    else: return names[0]
+
+
+def make_template_fields(entry, author_num=2, title_word_num=100, title_word_size=1, title_length=100, author_sep="_", title_sep="-"):
+    """
+     Available fields in output
+    - author : slugified author names (lower case) separated by {author_sep} ('_' by default), with max {author_num} authors
+    - Author : same as author but titel case (first letter capitalized)
+    - AUTHOR : same as author but upper case
+    - authorX: first; first and second; first et al
+    - title : normalized title in lower case, separated by {title_sep} ('-' by default) with max {title_word_num} words
+    - Title: same as title by with capitalized words
+    - year
+    - ID : bibtex key
+    """
+    # names = bibtexparser.customization.getnames(entry.get('author','unknown').lower().split(' and '))
+    _names = family_names(entry.get('author','unknown').lower())
+    _names = [slugify(nm) for nm in _names]
+    author = author_sep.join([nm for nm in _names[:author_num]])
+    Author = author_sep.join([nm.capitalize() for nm in _names[:author_num]])
+    AuthorX = _cite_author([nm.capitalize() for nm in _names]).replace(" ", author_sep)
+    authorX = AuthorX.lower()
+
+    year = str(entry.get('year','0000'))
+
+    if not title_word_num or not entry.get('title',''):
+        title = ''
+        Title = ''
+    else:
+        titlewords = normalize(entry['title']).lower().split()
+        _titles = listtag(titlewords, n=title_word_num, minwordlen=title_word_size, maxlength=title_length, sep="*").split('*')
+        title = title_sep.join(_titles)
+        Title = title_sep.join(w.capitalize() for w in _titles)
+
+    return {
+        "author": author,
+        "Author": Author,
+        "AUTHOR": author.upper(),
+        "authorX": authorX,
+        "AuthorX": AuthorX,
+        "year": year,
+        "title": title,
+        "Title": Title,
+        "ID": entry.get("ID"),
+        }
+
+
+def stringify_entry(entry, template, **opt):
+    fields = make_template_fields(entry, **opt)
+    res = template.format(**fields)
+    return res
+
+
+class Format:
+    """
+    Store formmatting info as python template formatted with str.format() method. See make_template_fields for available fields.
+    """
+    # def __init__(self, template="{author}{year}{title}", author_num=2, title_word_num=5, author_sep="_", title_sep="-")
+    def __init__(self, template, author_num=2, title_word_num=100, title_word_size=1, title_length=100, author_sep="_", title_sep="-"):
+        self.template = template
+        self.author_num = author_num
+        self.title_word_num = title_word_num
+        self.title_word_size = title_word_size
+        self.title_length = title_length
+        self.author_sep = author_sep
+        self.title_sep = title_sep
+
+    def todict(self):
+        return vars(self)
+
+    def render(self, **entry):
+        return stringify_entry(entry, **vars(self))
+
+    def __call__(self, entry):
+        return self.render(**entry)
\ No newline at end of file
diff --git a/tests/test_papers.py b/tests/test_papers.py
index fff60b1..4b45364 100644
--- a/tests/test_papers.py
+++ b/tests/test_papers.py
@@ -32,7 +32,7 @@ def prepare_paper():
     year = 2011,
 }"""
 
-    file_rename = "2011/perrette_yool2011.pdf"
+    file_rename = "perrette_et_al_2011_near-ubiquity-of-ice-edge-blooms-in-the-arctic.pdf"
 
     return pdf, doi, key, newkey, year, bibtex, file_rename
 
@@ -57,7 +57,8 @@ def prepare_paper2():
     volume = {4},
     year = 2013,
 }"""
-    file_rename = "2013/perrette_landerer2013.pdf"
+    file_rename = "perrette_et_al_2013_a-scaling-approach-to-project-regional-sea-level-rise-and-its-uncertainties.pdf"
+
     return pdf, si, doi, key, newkey, year, bibtex, file_rename
 
 class TestBibtexFileEntry(unittest.TestCase):

From d3a7bf19432bd1fd4c153930f99cc4b64790b583 Mon Sep 17 00:00:00 2001
From: perrette <mahe.perrette@pik-potsdam.de>
Date: Mon, 17 Apr 2023 03:35:13 +0200
Subject: [PATCH 33/34] update doc

---
 README.md          | 38 ++++++++++++++++++++++++++------------
 papers/config.py   |  5 +----
 papers/filename.py | 12 ++++++++----
 3 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index c8e73dc..5bf28a5 100644
--- a/README.md
+++ b/README.md
@@ -74,30 +74,44 @@ For the sake of the example, one of my owns: https://www.earth-syst-dynam.net/4/
 
 - control fields when renaming file
 
-        $> papers add --rename --info --nameformat Author,year,-,Title --nauthor 1 --ntitle 1 esd-4-11-2013
+        $> papers add --rename --info --name-template "{AuthorX}{year}-{Title}" --name-title-sep '' --name-author-sep '' esd-4-11-2013
         INFO:papers:found doi:10.5194/esd-4-11-2013
         INFO:papers:new entry: perrette2013scaling
         INFO:papers:create directory: files/2013
-        INFO:papers:mv /home/perrette/playground/papers/esd-4-11-2013.pdf files/2013/PerretteEtAl2013-AScalingApproachToProjectRegionalSeaLevelRiseAndItsUncertainties.pdf
+        INFO:papers:mv /home/perrette/playground/papers/esd-4-11-2013.pdf files/PerretteEtAl2013-AScalingApproachToProjectRegionalSeaLevelRiseAndItsUncertainties.pdf
         INFO:papers:renamed file(s): 1
 
-where 'nameformat' is a comma-separated list of fields, with valid fields being any field available in the bibtex. Fields not in the bibtex will remain untouched.
+where '--name-template' is a Python template (formatted via the `.format()` method) with valid fields being any field available in the bibtex. Fields not in the bibtex will remain untouched.
 
-To rename esd-4-11-2013.pdf as perrette_2013.pdf, nameformat should be `author,_,year`.
+To rename `esd-4-11-2013.pdf` as `perrette_2013.pdf`, the template should be `--name-template {author}_{year} --name-author-num 1`
 If that happens to be the entry ID, `ID` also works.
 
-To rename esd-4-11-2013.pdf as 2013/Perrette2013-AScalingApproachToProjectRegionalSeaLevelRiseAndItsUncertainties.pdf,
-nameformat should be `year,/,Author,year,Title` (note the case).
+To rename `esd-4-11-2013.pdf` as `2013/Perrette2013-AScalingApproachToProjectRegionalSeaLevelRiseAndItsUncertainties.pdf`,
+name-template should be `--name-template {year}/{Author}{year}-{Title} --name-title-sep ''` (note the case).
 
-Entries are case-sensitive, so that:  
-    'author' generates 'perrette'  
-    'Author' generates 'Perrette'  
-    'AUTHOR' generates 'PERRETTE'  
-any other case, like 'AuTHoR', will retrieve the field from 'e' with unaltered case.
+Entries are case-sensitive, and a few more fields are added, so that:
+- 'author' generates 'perrette'
+- 'Author' generates 'Perrette'
+- 'AUTHOR' generates 'PERRETTE'
+- 'authorX' generates 'perrette', 'perrette_and_landerer' or 'perrette_et_al' depending on the number of authors
+- 'AuthorX' same as authorX but capitalized
 
+The modifiers are:
 
+- `--name-title-sep` : separator for title words
+- `--name-title-length` : max title length
+- `--name-title-word-size` : min size to be considered a word
+- `--name-title-word-num` : max number of title words
 
-In the common case where the bibtex (`--bibtex`), files directory  (`--filesdir`), number of authors in key (`--nauthor`) or number of title words in key (`--ntitle`) do not change, it is convenient to *install* `papers`.
+and similarly:
+
+- `--name-author-sep` : separator for authors
+- `--name-author-num` : number of authors to include (not relevant for `{authorX}`)
+
+The same template and modifiers system applies to the bibtex key generation by replacing the prefix `--name-` with `--key-`, e.g. `--key-template`
+
+
+In the common case where the bibtex (`--bibtex`), files directory  (`--filesdir`), and name and key formats (e.g. `--name-template`) do not change, it is convenient to *install* `papers`.
 Install comes with the option to git-track any change to the bibtex file (`--git`) options.
 
 - setup git-tracked library (optional)
diff --git a/papers/config.py b/papers/config.py
index a4fe3f7..3d70cb8 100644
--- a/papers/config.py
+++ b/papers/config.py
@@ -3,7 +3,7 @@
 import hashlib
 import bibtexparser
 from papers import logger
-from papers.filename import Format
+from papers.filename import Format, NAMEFORMAT, KEYFORMAT
 
 # GIT = False
 DRYRUN = False
@@ -46,9 +46,6 @@ def check_filesdir(folder):
             file_count += 1
     return file_count, folder_size
 
-KEYFORMAT = Format(template='{author}{year}', author_num=2, author_sep="_")
-NAMEFORMAT = Format(template='{authorX}_{year}_{title}', author_sep="_", title_sep="-")
-
 class Config:
     """configuration class to specify system-wide collections and files-dir
     """
diff --git a/papers/filename.py b/papers/filename.py
index 7f7c138..70f8be2 100644
--- a/papers/filename.py
+++ b/papers/filename.py
@@ -78,11 +78,11 @@ class Format:
     def __init__(self, template, author_num=2, title_word_num=100, title_word_size=1, title_length=100, author_sep="_", title_sep="-"):
         self.template = template
         self.author_num = author_num
-        self.title_word_num = title_word_num
-        self.title_word_size = title_word_size
-        self.title_length = title_length
         self.author_sep = author_sep
+        self.title_length = title_length
         self.title_sep = title_sep
+        self.title_word_num = title_word_num
+        self.title_word_size = title_word_size
 
     def todict(self):
         return vars(self)
@@ -91,4 +91,8 @@ def render(self, **entry):
         return stringify_entry(entry, **vars(self))
 
     def __call__(self, entry):
-        return self.render(**entry)
\ No newline at end of file
+        return self.render(**entry)
+
+
+KEYFORMAT = Format(template='{author}{year}', author_num=2, author_sep="_")
+NAMEFORMAT = Format(template='{authorX}_{year}_{title}', author_sep="_", title_sep="-")
\ No newline at end of file

From 6caf3535687ef708e34ddc154c82bd0d6165bf2c Mon Sep 17 00:00:00 2001
From: perrette <mahe.perrette@pik-potsdam.de>
Date: Mon, 17 Apr 2023 03:42:18 +0200
Subject: [PATCH 34/34] update readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 5bf28a5..7730a3d 100644
--- a/README.md
+++ b/README.md
@@ -66,7 +66,7 @@ For the sake of the example, one of my owns: https://www.earth-syst-dynam.net/4/
     	INFO:papers:found doi:10.5194/esd-4-11-2013
     	INFO:papers:new entry: perrette_2013
     	INFO:papers:create directory: files/2013
-    	INFO:papers:mv /home/perrette/playground/papers/esd-4-11-2013.pdf files/2013/Perrette_2013.pdf
+    	INFO:papers:mv /home/perrette/playground/papers/esd-4-11-2013.pdf files/perrette_et_al_2013_a-scaling-approach-to-project-regional-sea-level-rise-and-its-uncertainties.pdf
     	INFO:papers:renamed file(s): 1
 
 (the `--info` argument asks for the above output information to be printed out to the terminal)