psolin · petri · Apr 18, 2020 · Apr 19, 2020 · Apr 19, 2020 · Apr 19, 2020
diff --git a/.gitignore b/.gitignore
@@ -55,3 +55,7 @@ docs/_build/
 
 # PyBuilder
 target/
+
+# MacOS
+
+.DS_Store
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -1,6 +1,15 @@
 Changelog
 **********
 
+2.0 (2020-04-18)
+----------------
+
+- Major refactoring & cleanup (e.g. #16)
+- Optimizations
+- New APIs
+- Python3 only (#46)
+- Better Unicode matching (#45)
+
 1.3 (9.9. 2015)
 ----------------
 

diff --git a/cleanco.py b/cleanco.py
diff --git a/cleanco/__init__.py b/cleanco/__init__.py
@@ -0,0 +1 @@
+from .cleanco import cleanco
diff --git a/cleanco/classify.py b/cleanco/classify.py
@@ -0,0 +1,61 @@
+"""
+Functions to help classify business names by country or type, based on legal terms.
+
+Examples of use:
+
+>> # check name for its possible business type(s)
+>> classification_sources = typesources()
+>> matches("MyCompany Ltd", classification_sources)
+['Limited']
+>>
+
+>> # check name for its possible jurisdictions, usually countries
+>> classification_sources = countrysources()
+>> matches("MyCompany Ltd", classification_sources)
+['New Zealand', 'United Kingdom', 'United States of America']
+>>
+
+"""
+
+from .termdata import terms_by_country, terms_by_type
+from .clean import strip_tail, normalized
+
+
+def typesources():
+    """Return (business type, legal term) pairs, longest legal term first."""
+    pairs = [
+        (business_type, term)
+        for business_type in terms_by_type
+        for term in terms_by_type[business_type]
+    ]
+    # Stable sort: pairs with equally long terms keep their termdata order.
+    pairs.sort(key=lambda pair: len(pair[1]), reverse=True)
+    return pairs
+
+
+def countrysources():
+    """Return (country, legal term) pairs, longest legal term first."""
+    pairs = [
+        (country, term)
+        for country in terms_by_country
+        for term in terms_by_country[country]
+    ]
+    # Stable sort: pairs with equally long terms keep their termdata order.
+    pairs.sort(key=lambda pair: len(pair[1]), reverse=True)
+    return pairs
+
+
+def matches(name, sources):
+    """Return the classifiers whose legal term occurs as a word in *name*.
+
+    name: business name to classify.
+    sources: iterable of (classifier, term) pairs, e.g. the result of
+        typesources() or countrysources().
+
+    The result lists one classifier per matching pair, in the order of
+    *sources*; a classifier appears again for every one of its terms that
+    matches.
+
+    NOTE: the name is split on whitespace and each term is compared against
+    a single word, so a term that itself contains whitespace can never match.
+    """
+    # Normalize the words of the name once; set membership replaces a
+    # linear list.index() scan per term.
+    name_words = {normalized(word) for word in strip_tail(name).split()}
+    return [
+        classifier
+        for classifier, term in sources
+        if normalized(term) in name_words
+    ]
+
+
diff --git a/cleanco/clean.py b/cleanco/clean.py
@@ -0,0 +1,77 @@
+"""Functions to help clean & normalize business names.
+
+See http://www.unicode.org/reports/tr15/#Normalization_Forms_Table for details
+on Unicode normalization and the NFKD normalization used here.
+
+Basic usage:
+
+>> terms = get_terms()
+>> basename("Daddy & Sons, Ltd.", terms)
+Daddy & Sons
+
+"""
+
+import functools
+import operator
+from collections import OrderedDict
+import re
+import unicodedata
+from .termdata import terms_by_type, terms_by_country
+
+
+# Matches a run of characters at the end of the string that are neither a
+# dot nor a regex word character (letter, digit or underscore); used by
+# strip_tail() to cut trailing punctuation and whitespace.
+tail_removal_rexp = re.compile(r"[^\.\w]+$", flags=re.UNICODE)
+
+
+def get_terms():
+    """Return the set of all distinct legal terms defined in termdata."""
+    unique_terms = set()
+    for table in (terms_by_type, terms_by_country):
+        for term_list in table.values():
+            unique_terms.update(term_list)
+    return unique_terms
+
+
+def strip_tail(name):
+    """Cut off trailing characters that are neither a dot nor a word character.
+
+    Word characters are letters, digits and the underscore; a name without
+    such a tail is returned unchanged.
+    """
+    tail = tail_removal_rexp.search(name)
+    return name if tail is None else name[: tail.start()]
+
+
+def normalized(text):
+    """Return *text* casefolded and then NFKD-normalized, for caseless comparison."""
+    folded = text.casefold()
+    return unicodedata.normalize("NFKD", folded)
+
+
+def basename(name, terms, suffix=True, prefix=False, middle=False, multi=False):
+    """Return *name* with legal terms removed.
+
+    name: business name to clean.
+    terms: iterable of legal terms, e.g. the result of get_terms().
+    suffix: remove a term found as the last word of the name.
+    prefix: remove a term found as the first word of the name.
+    middle: remove the first occurrence of a term anywhere in the name.
+    multi: keep going after the first removal instead of stopping.
+
+    Terms and words are compared in normalized (casefolded, NFKD) form, one
+    whole word at a time. A name with no words left yields an empty string.
+    """
+
+    name = strip_tail(name)
+    parts = name.split()
+    nparts = [normalized(p) for p in parts]
+
+    # return name without suffixed/prefixed/middle type term(s)
+    for term in (normalized(t) for t in terms):
+        # Nothing left to inspect (empty name, or every word was removed):
+        # stop instead of indexing into an empty list.
+        if not nparts:
+            break
+        if suffix and nparts[-1] == term:
+            del nparts[-1]
+            del parts[-1]
+            if not multi:
+                break
+        # The suffix removal above may have consumed the last word.
+        if prefix and nparts and nparts[0] == term:
+            del nparts[0]
+            del parts[0]
+            if not multi:
+                break
+        if middle:
+            try:
+                idx = nparts.index(term)
+            except ValueError:
+                pass
+            else:
+                del nparts[idx]
+                del parts[idx]
+                # Stop only after an actual removal, like the suffix and
+                # prefix branches; otherwise the remaining terms would
+                # never be tried.
+                if not multi:
+                    break
+
+    return strip_tail(" ".join(parts))
+
+
+
diff --git a/cleanco/cleanco.py b/cleanco/cleanco.py
@@ -0,0 +1,20 @@
+from .classify import countrysources, matches, typesources
+from .clean import basename, get_terms
+
+
+class cleanco:
+    """Backwards compatibility wrapper around the function-based API.
+
+    New code should NOT use this class; call basename() and matches()
+    directly instead.
+    """
+
+    def __init__(self):
+        # Build the lookup data once so every method call can reuse it.
+        self._types = typesources()
+        self._countries = countrysources()
+        self._terms = get_terms()
+
+    def clean_name(self, name):
+        """Return *name* with a trailing legal term removed."""
+        return basename(name, self._terms)
+
+    def country(self, name):
+        """Return the countries whose legal terms occur in *name*."""
+        return matches(name, self._countries)
+
+    def type(self, name):
+        """Return the business types whose legal terms occur in *name*."""
+        return matches(name, self._types)
diff --git a/termdata.py → cleanco/termdata.py b/termdata.py → cleanco/termdata.py
diff --git a/setup.py b/setup.py
@@ -6,20 +6,19 @@
 
 setup(name='cleanco',
       description='Python library to process company names',
-      version='1.361',
+      version='2.0',
       license="MIT",
       classifiers = [
          "Topic :: Office/Business",
          "Development Status :: 4 - Beta",
          "Intended Audience :: Developers",
          "License :: OSI Approved :: MIT License",
-         "Programming Language :: Python :: 2.7",
-         "Programming Language :: Python :: 3.5"
+         "Programming Language :: Python :: 3"
       ],
+      # This release is Python 3 only; tell pip so that Python 2
+      # environments keep resolving to the 1.x series.
+      python_requires='>=3',
       url='https://github.com/psolin/cleanco',
       author='Paul Solin',
       author_email='[email protected]',
-      py_modules=['cleanco', 'termdata'],
+      packages=["cleanco"],
       setup_requires=['pytest-runner'],
       tests_require=['pytest', 'tox'],
-      )
+)