diff --git a/.gitignore b/.gitignore index ba74660..d0f89f2 100644 --- a/.gitignore +++ b/.gitignore @@ -55,3 +55,7 @@ docs/_build/ # PyBuilder target/ + +# MacOS + +.DS_Store diff --git a/CHANGES.txt b/CHANGES.txt index 0548c48..c312a72 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,6 +1,15 @@ Changelog ********** +2.0 (2020-04-18) +---------------- + +- Major refactoring & cleanup (e.g. #16) +- Optimizations +- new APIs +- Python3 only (#46) +- Better Unicode matching (#45) + 1.3 (9.9. 2015) ---------------- diff --git a/README.md b/README.md index 1f5c90d..e805011 100644 --- a/README.md +++ b/README.md @@ -17,36 +17,39 @@ countries. ## How do I install it? Just use 'pip install cleanco' if you have pip installed (as most systems do). Or download the zip distribution from this site, unzip it and then: -* Mac: `cd` into it, and enter `sudo python setup.py install` along with your system password. -* Windows: Same thing but without `sudo`. +* Mac: `cd` into it, and enter `sudo python3 setup.py install` along with your system password. +* Windows: `python setup.py install`. ## How does it work? -Let's look at some sample code. 
First, create an instance of the module: +If you only want a clean version of the company name, first pull in the terms: - >>> from cleanco import cleanco + >>> terms = get_terms() -Prepare a string of a company name that you want to process: +Then, run the string and the terms through the "basename" function: - >>> business_name = "Some Big Pharma, LLC" + >>> basename("Daddy & Sons, Ltd.", terms) + Daddy & Sons -Throw it into the instance: +If you want to classify the name by business entity type, first select it as a source: - >>> x = cleanco(business_name) + >>> classification_sources = typesources() -You can now get the company types: +Then, run the string and classification source through the "matches" function: - >>> x.type() - ['Limited Liability Company'] + >>> matches("MyCompany Ltd", classification_sources) + ['Limited'] -...the possible countries... +If you want to classify the name by possible countries, first select it as a source: - >>> x.country() - ['United States of America', 'Philippines'] + >>> classification_sources = countrysources() + +Then, run the string and classification source through the "matches" function: -...and a clean version of the company name. + >>> matches("MyCompany Ltd", classification_sources) + ['United States of America', 'Philippines'] - >>> x.clean_name() - 'Some Big Pharma' +## Compatibility with previous versions +cleanco's API was simplified in version 2.0. While previous functions are still compatible, they are not preferred. ## Are there bugs? See the issue tracker. If you find a bug or have enhancement suggestion or question, please file an issue and provide a PR if you can. For example, some of the company suffixes may be incorrect or there may be suffixes missing. @@ -55,5 +58,5 @@ To run tests, simply install the package and run `python setup.py test`. To run ## Special thanks to: -- Wikipedia's [Types of Business Entity article](http://en.wikipedia.org/wiki/Types_of_business_entity), where I spent hours of research. 
+- Wikipedia's [Types of Business Entity article](http://en.wikipedia.org/wiki/Types_of_business_entity). - Contributors: Petri Savolainen diff --git a/cleanco.py b/cleanco.py deleted file mode 100644 index 9cc813e..0000000 --- a/cleanco.py +++ /dev/null @@ -1,113 +0,0 @@ -# Note that this script is geared towards identifying businesses in terms of the US/UK - -from collections import OrderedDict -import re - -from termdata import terms_by_country as country_dict, terms_by_type as type_dict - - -# Sorted business types / abbreviation by length of business type -sorted_types = [] -for business_type in type_dict: - for item in type_dict[business_type]: - temp_tuple = [business_type, item] - sorted_types.append(temp_tuple) -sorted_types = sorted(sorted_types, key=lambda part: len(part[1]), reverse=True) - -# Sorted business countries / type abbreviations by length of business type abbreviations -sorted_countries = [] -for country in country_dict: - for item in country_dict[country]: - temp_tuple = [country, item] - sorted_countries.append(temp_tuple) -sorted_countries = sorted(sorted_countries, key=lambda part: len(part[1]), reverse=True) - -# All of the suffixes sorted by length -all_sorted = sorted_types + sorted_countries -suffix_sort = [] -for item in all_sorted: - suffix_sort.append(item[1]) -suffix_sort = sorted(suffix_sort, key=lambda part: len(part), reverse=True) - - -class cleanco(object): - - def __init__(self, business_name): - # always do non-visible cleanup, but store the original just in case - self.business_name = ' '.join(business_name.split()) - self._original = business_name - - def string_stripper(self, business_name): - - # Get rid of extra prefix-, suffix- & in-between spaces - business_name = " ".join(business_name.split()) - - # Get rid of all trailing non-letter symbols except '.' 
- match = re.search(r'[^\.\w]+$', business_name, flags=re.UNICODE) - if match is not None: - business_name = business_name[:match.span()[0]] - - return business_name - - def end_strip(self, a_set): - - end_set = [] - business_name = self.business_name - business_name = self.string_stripper(business_name) - - for key, suffix in a_set: - if ((business_name.lower()).endswith(" " + suffix)): - end_set.append(key) - - end_set = list(OrderedDict.fromkeys(end_set)) - - if end_set != []: - return end_set - else: - return None - - - def clean_name(self, suffix=True, prefix=False, middle=False, multi=False): - "return cleared version of the business name" - - name = self.business_name - - # Run it through the string_stripper once more - name = self.string_stripper(name) - loname = name.lower() - - # return name without suffixed/prefixed/middle type term(s) - - for item in suffix_sort: - if suffix: - if loname.endswith(" " + item): - start = loname.find(item) - end = len(item) - name = name[0:-end-1] - name = self.string_stripper(name) - if multi==False: - break - if prefix: - if loname.startswith(item+' '): - name = name[len(item)+1:] - if multi==False: - break - if middle: - term = ' ' + item + ' ' - if term in loname: - start = loname.find(term) - end = start + len(term) - name = name[:start] + " " + name[end:] - if multi==False: - break - - return self.string_stripper(name) - - - def type(self): - self.type = self.end_strip(sorted_types) - return self.type - - def country(self): - self.country = self.end_strip(sorted_countries) - return self.country diff --git a/cleanco/__init__.py b/cleanco/__init__.py new file mode 100644 index 0000000..750f103 --- /dev/null +++ b/cleanco/__init__.py @@ -0,0 +1 @@ +from .cleanco import cleanco diff --git a/cleanco/classify.py b/cleanco/classify.py new file mode 100644 index 0000000..9c53304 --- /dev/null +++ b/cleanco/classify.py @@ -0,0 +1,60 @@ +""" +Functions to help classify business names by country or type, based on legal terms. 
+ +Examples of use: + +>> # check name for its possible business type(s) +>> classification_sources = typesources() +>> matches("MyCompany Ltd", classification_sources) +['Limited'] +>> + +>> # check name for its possible jurisdictions, usually countries +>> classification_sources = countrysources() +>> matches("MyCompany Ltd", classification_sources) +['New Zealand', 'United Kingdom', 'United States of America'] +>> + +""" + +from termdata import terms_by_country, terms_by_type +from clean import strip_tail, normalized + + +def typesources(): + "business types / abbreviations sorted by length of business type" + types = [] + for business_type in terms_by_type: + for item in terms_by_type[business_type]: + types.append((business_type, item)) + + return sorted(types, key=lambda part: len(part[1]), reverse=True) + +def countrysources(): + "business countries / type abbreviations sorted by length of type abbreviations" + countries = [] + for country in terms_by_country: + for item in terms_by_country[country]: + countries.append((country, item)) + + return sorted(countries, key=lambda part: len(part[1]), reverse=True) + +def matches(name, sources): + "get types or countries matching with the legal terms in name" + + name = strip_tail(name) + parts = name.split() + nparts = [normalized(p) for p in parts] + matches = [] + + for classifier, term in sources: + nterm = normalized(term) + try: + idx = nparts.index(nterm) + except ValueError: + pass + else: + matches.append(classifier) + + return matches + diff --git a/cleanco/clean.py b/cleanco/clean.py new file mode 100644 index 0000000..62d5ec1 --- /dev/null +++ b/cleanco/clean.py @@ -0,0 +1,75 @@ +"""Functions to help clean & normalize business names. + +See http://www.unicode.org/reports/tr15/#Normalization_Forms_Table for details +on Unicode normalization and the NFKD normalization used here. 
+ +Basic usage: + +>> terms = get_terms() +>> basename("Daddy & Sons, Ltd.", terms) +Daddy & Sons + +""" + +import functools +import operator +from collections import OrderedDict +import re +import unicodedata +from termdata import terms_by_type, terms_by_country + + +tail_removal_rexp = re.compile(r"[^\.\w]+$", flags=re.UNICODE) + + +def get_terms(): + "retrieve all unique terms from termdata definitions" + ts = functools.reduce(operator.iconcat, terms_by_type.values(), []) + cs = functools.reduce(operator.iconcat, terms_by_country.values(), []) + return set(ts + cs) + + +def strip_tail(name): + "Get rid of all trailing non-letter symbols except the dot" + match = re.search(tail_removal_rexp, name) + if match is not None: + name = name[: match.span()[0]] + return name + + +def normalized(text): + "caseless Unicode normalization" + return unicodedata.normalize("NFKD", text.casefold()) + + +def basename(name, terms, suffix=True, prefix=False, middle=False, multi=False): + "return cleaned base version of the business name" + + name = strip_tail(name) + parts = name.split() + nparts = [normalized(p) for p in parts] + + # return name without suffixed/prefixed/middle type term(s) + for term in (normalized(t) for t in terms): + if suffix and nparts[-1] == term: + del nparts[-1] + del parts[-1] + if multi == False: + break + if prefix and nparts[0] == term: + del nparts[0] + del parts[0] + if multi == False: + break + if middle: + try: + idx = nparts.index(term) + except ValueError: + pass + else: + del nparts[idx] + del parts[idx] + if multi == False: + break + + return strip_tail(" ".join(parts)) diff --git a/cleanco/cleanco.py b/cleanco/cleanco.py new file mode 100644 index 0000000..52c651d --- /dev/null +++ b/cleanco/cleanco.py @@ -0,0 +1,20 @@ +from clean import get_terms, basename +from classify import typesources, countrysources + + +class cleanco: + "silly backwards compatibility wrapper, you should NOT use this" + + def __init__(self): + self._types = 
typesources() + self._countries = countrysources() + self._terms = get_terms() + + def clean_name(self, name): + return basename(name, self._terms) + + def country(self, name): + return matches(name, self._countries) + + def type(self, name): + return matches(name, self._types) diff --git a/termdata.py b/cleanco/termdata.py similarity index 90% rename from termdata.py rename to cleanco/termdata.py index 5f6bfb3..072f3dd 100644 --- a/termdata.py +++ b/cleanco/termdata.py @@ -19,7 +19,7 @@ 'lda.', 'tov', 'pp' ], 'Limited Liability Company': ['pllc', 'llc', 'l.l.c.', 'plc.', 'plc', 'hf.', 'oyj', - 'a.e.', 'nyrt.', 'p.l.c.', 'sh.a.', 's.a.', 's.r.l.', 'srl.', 'aat', '3at', 'd.d.', + 'a.e.', 'nyrt.', 'p.l.c.', 'sh.a.', 's.a.', 's.r.l.', 'srl.', 'srl', 'aat', '3at', 'd.d.', 's.r.o.', 'spol. s r.o.', 's.m.b.a.', 'smba', 'sarl', 'nv', 'sa', 'aps', 'a/s', 'p/s', 'sae', 'sasu', 'eurl', 'ae', 'cpt', 'as', 'ab', 'asa', 'ooo', 'dat', 'vat', 'zat', 'mchj', 'a.d.' @@ -50,10 +50,10 @@ 'Australia': ['nl', 'pty. ltd.', 'pty ltd'], 'Austria': ['e.u.', 'stg', 'gesbr', 'a.g.', 'ag', 'og', 'kg', 'aktiengesellschaft'], 'Belarus': ['aat', '3at'], - 'Belgium': ['esv', 'vzw', 'vof', 'snc', 'comm.v', 'scs', 'bvba', 'sprl', 'cbva', + 'Belgium': ['esv', 'vzw', 'vof', 'snc', 'comm.v', 'scs', 'bvba', 'sprl', 'cvba', 'cvoa', 'sca', 'sep', 'gie' ], - 'Bosnia / Herzegovina': ['d.d.', 'a.d.', 'd.n.o.', 'd.o.o.', 'k.v.', 's.p.'], + 'Bosnia and Herzegovina': ['d.d.', 'a.d.', 'd.n.o.', 'd.o.o.', 'k.v.', 's.p.'], 'Brazil': ['ltda', 's.a.', 'pllc', 'ad', 'adsitz', 'ead', 'et', 'kd', 'kda', 'sd'], 'Bulgaria': ['ad', 'adsitz', 'ead', 'et', 'kd', 'kda', 'sd'], 'Cambodia': ['gp', 'sm pte ltd.', 'pte ltd.', 'plc ltd.', 'peec', 'sp'], @@ -61,13 +61,13 @@ 'Chile': ['eirl', 's.a.', 'sgr', 's.g.r.', 'ltda', 's.p.a.', 'sa', 's. en c.', 'ltda.' ], - 'Columbia': ['s.a.', 'e.u.', 's.a.s.', 'suc. de descendants', 'sca'], + 'Colombia': ['s.a.', 'e.u.', 's.a.s.', 'suc. 
de descendants', 'sca'], 'Croatia': ['d.d.', 'd.o.o.', 'obrt'], - 'Czech Republic': ['a.s.', 'akc. spol.', 's.r.o.', 'spol. s r.o.', 'v.o.s.', u've\xc5\x99. obch. spol.', 'a spol.', 'k.s.', 'kom. spol.', 'kom. spol.'], + 'Czechia': ['a.s.', 'akc. spol.', 's.r.o.', 'spol. s r.o.', 'v.o.s.', u've\xc5\x99. obch. spol.', 'a spol.', 'k.s.', 'kom. spol.', 'kom. spol.'], 'Denmark': ['i/s', 'a/s', 'k/s', 'p/s', 'amba', 'a.m.b.a.', 'fmba', 'f.m.b.a.', 'smba', 's.m.b.a.', 'g/s' ], - 'Dominican Republic': ['c. por a.', 'cxa', 's.a.', 's.a.s.', 'srl.', 'eirl.', 'sa', + 'Dominican Republic': ['c. por a.', 'cxa', 's.a.', 's.a.s.', 'srl.', 'srl', 'eirl.', 'sa', 'sas' ], 'Ecuador': ['s.a.', 'c.a.', 'sa', 'ep'], @@ -98,8 +98,8 @@ 'Latvia': ['as', 'sia', 'ik', 'ps', 'ks'], 'Lebanon': ['sal'], 'Lithuania': ['uab', 'ab', 'ij', 'mb'], - 'Luxemborg': ['s.a.', 's.a.r.l.', 'secs'], - 'Macedonia': ['d.o.o.', 'd.o.o.e.l', 'k.d.a.', 'j.t.d.', 'a.d.', 'k.d.'], + 'Luxembourg': ['s.a.', 's.a.r.l.', 'secs'], + 'North Macedonia': ['d.o.o.', 'd.o.o.e.l', 'k.d.a.', 'j.t.d.', 'a.d.', 'k.d.'], 'Malaysia': ['bhd.', 'sdn. bhd.'], 'Mexico': ['s.a.', 's. de. r.l.', 's. en c.', 's.a.b.', 's.a.p.i.'], 'Mongolia': ['xk', 'xxk'], @@ -118,7 +118,7 @@ 'Poland': ['p.p.', 's.k.a.', 'sp.j.', 'sp.k.', 'sp.p.', 'sp. 
z.o.o.', 's.c.', 's.a.'], 'Portugal': ['lda.', 'crl', 's.a.', 's.f.', 'sgps'], 'Romania': ['s.c.a.', 's.c.s.', 's.n.c.', 's.r.l.', 'o.n.g.', 's.a.'], - 'Russia': ['ooo', 'oao', 'zao', '3ao'], + 'Russian Federation': ['ooo', 'oao', 'zao', '3ao'], 'Serbia': ['d.o.o.', 'a.d.', 'k.d.', 'o.d.'], 'Singapore': ['bhd', 'pte ltd', 'sdn bhd', 'llp', 'l.l.p.', 'ltd.', 'pte'], 'Slovenia': ['d.d.', 'd.o.o.', 'd.n.o.', 'k.d.', 's.p.'], @@ -130,7 +130,7 @@ 'Switzerland': ['ab', 'sa', 'gmbh', 'g.m.b.h.', 'sarl', 'sagl'], 'Turkey': ['koop.'], 'Ukraine': ['dat', 'fop', 'kt', 'pt', 'tdv', 'tov', 'pp', 'vat', 'zat', 'at'], - 'United Kingdom': ['plc.', 'plc', 'cic', 'cio', 'l.l.p.', 'llp', 'l.p.', 'lp', 'ltd.', + 'United Kingdom of Great Britain and Northern Ireland': ['plc.', 'plc', 'cic', 'cio', 'l.l.p.', 'llp', 'l.p.', 'lp', 'ltd.', 'ltd', 'limited' ], 'United States of America': ['llc', 'inc.', 'corporation', 'incorporated', 'company', diff --git a/setup.py b/setup.py index 403e5c5..c1f1141 100755 --- a/setup.py +++ b/setup.py @@ -6,20 +6,19 @@ setup(name='cleanco', description='Python library to process company names', - version='1.361', + version='2.0', license="MIT", classifiers = [ "Topic :: Office/Business", "Development Status :: 4 - Beta", "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 2.7", - "Programming Language :: Python :: 3.5" + "Programming Language :: Python :: 3" ], url='https://github.com/psolin/cleanco', author='Paul Solin', author_email='paul@paulsolin.com', - py_modules=['cleanco', 'termdata'], + packages=["cleanco"], setup_requires=['pytest-runner'], tests_require=['pytest', 'tox'], - ) +) diff --git a/tests/test_cleanname.py b/tests/test_cleanname.py index 4ee2e9a..f997bb0 100644 --- a/tests/test_cleanname.py +++ b/tests/test_cleanname.py @@ -1,7 +1,11 @@ # encoding: utf-8 +import pytest +from cleanco.clean import get_terms, basename -from cleanco import cleanco +@pytest.fixture +def terms(): + 
return get_terms() # Tests that demonstrate stuff is stripped away @@ -14,11 +18,11 @@ "name w/ ws suffix dot ws": " Hello World ltd. ", } -def test_basic_cleanups(): +def test_basic_cleanups(terms): expected = "Hello World" errmsg = "cleanup of %s failed" for testname, variation in basic_cleanup_tests.items(): - assert cleanco(variation).clean_name() == expected, errmsg % testname + assert basename(variation, terms) == expected, errmsg % testname multi_cleanup_tests = { "name + suffix": "Hello World Oy", @@ -29,38 +33,38 @@ def test_basic_cleanups(): "name w/ mid + suffix": "Hello Oy World Ab" } -def test_multi_type_cleanups(): +def test_multi_type_cleanups(terms): expected = "Hello World" errmsg = "cleanup of %s failed" for testname, variation in multi_cleanup_tests.items(): - result = cleanco(variation).clean_name(prefix=True, suffix=True, middle=True, multi=True) + result = basename(variation, terms, prefix=True, suffix=True, middle=True, multi=True) assert result == expected, errmsg % testname # Tests that demonstrate organization name is kept intact preserving_cleanup_tests = { - "name with comma": (u"Hello, World, ltd.", u"Hello, World"), - "name with dot": (u"Hello. World, Oy", u"Hello. World") + "name with comma": ("Hello, World, ltd.", "Hello, World"), + "name with dot": ("Hello. World, Oy", "Hello. 
World") } -def test_preserving_cleanups(): +def test_preserving_cleanups(terms): errmsg = "preserving cleanup of %s failed" for testname, (variation, expected) in preserving_cleanup_tests.items(): - assert cleanco(variation).clean_name() == expected, errmsg % testname + assert basename(variation, terms) == expected, errmsg % testname # Test umlauts unicode_umlaut_tests = { - "name with umlaut in end": (u"Säätämö Oy", u"Säätämö"), - "name with umlauts & comma": (u"Säätämö, Oy", u"Säätämö"), - "name with no ending umlaut": (u"Säätämo Oy", u"Säätämo"), - "name with beginning umlaut": (u"Äätämo Oy", u"Äätämo"), - "name with just umlauts": (u"Äätämö", u"Äätämö") + "name with umlaut in end": ("Säätämö Oy", "Säätämö"), + "name with umlauts & comma": ("Säätämö, Oy", "Säätämö"), + "name with no ending umlaut": ("Säätämo Oy", "Säätämo"), + "name with beginning umlaut": ("Äätämo Oy", "Äätämo"), + "name with just umlauts": ("Äätämö", "Äätämö") } -def test_with_unicode_umlauted_name(): +def test_with_unicode_umlauted_name(terms): errmsg = "preserving cleanup of %s failed" for testname, (variation, expected) in unicode_umlaut_tests.items(): - assert cleanco(variation).clean_name() == expected, errmsg % testname + assert basename(variation, terms) == expected, errmsg % testname diff --git a/tox.ini b/tox.ini index 6677f4e..a814859 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py27,py35 +envlist = py35, py36, py37, py38 [testenv] deps=pytest