Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

version 2.0 #47

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,7 @@ docs/_build/

# PyBuilder
target/

# MacOS

.DS_Store
9 changes: 9 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
Changelog
**********

2.0 (2020-04-18)
----------------

- Major refactoring & cleanup (e.g. #16)
- Optimizations
- New APIs
- Python3 only (#46)
- Better Unicode matching (#45)

1.3 (9.9. 2015)
----------------

Expand Down
113 changes: 0 additions & 113 deletions cleanco.py

This file was deleted.

1 change: 1 addition & 0 deletions cleanco/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .cleanco import cleanco
61 changes: 61 additions & 0 deletions cleanco/classify.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""
Functions to help classify business names by country or type, based on legal terms.

Examples of use:

>> # check name for its possible business type(s)
>> classification_sources = typesources()
>> matches("MyCompany Ltd", classification_sources)
['Limited']
>>

>> # check name for its possible jurisdictions, usually countries
>> classification_sources = countrysources()
>> matches("MyCompany Ltd", classification_sources)
['New Zealand', 'United Kingdom', 'United States of America']
>>

"""

from .termdata import terms_by_country, terms_by_type
from .clean import strip_tail, normalized


def typesources():
    """Return ``(business_type, term)`` pairs, longest term first.

    One pair is produced for every term listed under each key of
    ``terms_by_type``. The pairs are ordered by descending term length;
    pairs whose terms have the same length keep their relative order
    because the sort is stable.
    """
    pairs = [
        (business_type, term)
        for business_type, type_terms in terms_by_type.items()
        for term in type_terms
    ]
    pairs.sort(key=lambda pair: len(pair[1]), reverse=True)
    return pairs


def countrysources():
    """Return ``(country, term)`` pairs, longest term first.

    One pair is produced for every term listed under each key of
    ``terms_by_country``. The pairs are ordered by descending term length;
    pairs whose terms have the same length keep their relative order
    because the sort is stable.
    """
    pairs = [
        (country, term)
        for country, country_terms in terms_by_country.items()
        for term in country_terms
    ]
    pairs.sort(key=lambda pair: len(pair[1]), reverse=True)
    return pairs


def matches(name, sources):
    """Return the classifiers whose legal term occurs as a word in *name*.

    *name* is passed through ``strip_tail``, split on whitespace, and each
    word is normalized. A classifier is reported when its normalized term
    equals one of those words.

    Args:
        name: the business name to classify.
        sources: iterable of ``(classifier, term)`` pairs, e.g. the result
            of ``typesources()`` or ``countrysources()``.

    Returns:
        A list of classifiers in the order they appear in *sources*. A
        classifier appears once per matching term, so it can be repeated
        when several of its terms match.

    Note:
        Terms are compared against single whitespace-separated words only,
        so a term that itself contains whitespace never matches.
    """
    # Build the word set once so each term lookup is a hash probe rather
    # than a linear scan of the name's words.
    name_words = {normalized(word) for word in strip_tail(name).split()}
    return [
        classifier
        for classifier, term in sources
        if normalized(term) in name_words
    ]

77 changes: 77 additions & 0 deletions cleanco/clean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""Functions to help clean & normalize business names.

See http://www.unicode.org/reports/tr15/#Normalization_Forms_Table for details
on Unicode normalization and the NFKD normalization used here.

Basic usage:

>> terms = get_terms()
>> basename("Daddy & Sons, Ltd.", terms)
Daddy & Sons

"""

import functools
import operator
from collections import OrderedDict
import re
import unicodedata
from .termdata import terms_by_type, terms_by_country


# Matches the run of characters at the very end of a string that are neither
# word characters (\w) nor dots; used by strip_tail() to cut that run off.
tail_removal_rexp = re.compile(r"[^\.\w]+$", flags=re.UNICODE)


def get_terms():
    """Return the set of all distinct terms found in the termdata tables.

    Collects every term listed under any key of ``terms_by_type`` and
    ``terms_by_country``; duplicates collapse because the result is a set.
    """
    unique_terms = set()
    for table in (terms_by_type, terms_by_country):
        for term_group in table.values():
            unique_terms.update(term_group)
    return unique_terms


def strip_tail(name):
    """Return *name* without its trailing non-word characters.

    Everything from the start of the final run of characters that are
    neither word characters nor dots is cut off. A trailing dot is kept.
    *name* is returned unchanged when it has no such tail.
    """
    tail = tail_removal_rexp.search(name)
    return name if tail is None else name[: tail.start()]


def normalized(text):
    """Return *text* case-folded and then NFKD-normalized.

    Used to compare words and legal terms without regard to case or to
    compatibility-equivalent Unicode forms.
    """
    folded = text.casefold()
    return unicodedata.normalize("NFKD", folded)


def basename(name, terms, suffix=True, prefix=False, middle=False, multi=False):
    """Return *name* with matching legal terms removed.

    *name* is passed through ``strip_tail`` and split on whitespace. Each
    term is normalized and compared against the normalized words of the
    name. Terms are visited once each, in the iteration order of *terms*.

    Args:
        name: the business name to clean.
        terms: iterable of legal terms, e.g. the result of ``get_terms()``.
        suffix: remove a term when it equals the last word.
        prefix: remove a term when it equals the first word.
        middle: remove the first occurrence of a term at any position.
        multi: when false, stop after the first removal; when true, keep
            checking the remaining terms.

    Returns:
        The remaining words joined by single spaces, passed through
        ``strip_tail`` again. An empty or whitespace-only name, or a name
        whose words are all removed, yields an empty string.
    """
    parts = strip_tail(name).split()
    nparts = [normalized(part) for part in parts]

    for term in (normalized(t) for t in terms):
        if not nparts:
            # Nothing left to compare against. Indexing an empty list here
            # used to raise IndexError for empty names and for names that
            # were fully consumed with multi=True.
            break
        if suffix and nparts[-1] == term:
            del nparts[-1]
            del parts[-1]
            if not multi:
                break
        # The suffix branch may have removed the only word, so re-check
        # that a first word still exists.
        if prefix and nparts and nparts[0] == term:
            del nparts[0]
            del parts[0]
            if not multi:
                break
        if middle and term in nparts:
            idx = nparts.index(term)
            del nparts[idx]
            del parts[idx]
            if not multi:
                break

    return strip_tail(" ".join(parts))


20 changes: 20 additions & 0 deletions cleanco/cleanco.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from .classify import countrysources, matches, typesources
from .clean import basename, get_terms


class cleanco:
    """Backwards-compatibility wrapper around the function-based API.

    Kept so that code written against the old class interface keeps
    working. New code should call ``basename`` and ``matches`` directly.
    """

    def __init__(self):
        # Build the lookup data once per instance so the methods below can
        # reuse it on every call.
        self._types = typesources()
        self._countries = countrysources()
        self._terms = get_terms()

    def clean_name(self, name):
        """Return *name* cleaned by ``basename`` with its default options."""
        return basename(name, self._terms)

    def country(self, name):
        """Return the countries whose legal terms occur in *name*."""
        # `matches` comes from .classify; it must be imported at module
        # level or this call raises NameError.
        return matches(name, self._countries)

    def type(self, name):
        """Return the business types whose legal terms occur in *name*."""
        return matches(name, self._types)
File renamed without changes.
9 changes: 4 additions & 5 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,19 @@

# Package metadata for the 2.0 release: a Python 3 only distribution that
# ships the `cleanco` package directory instead of top-level modules.
setup(name='cleanco',
      description='Python library to process company names',
      version='2.0',
      license="MIT",
      classifiers=[
          "Topic :: Office/Business",
          "Development Status :: 4 - Beta",
          "Intended Audience :: Developers",
          "License :: OSI Approved :: MIT License",
          "Programming Language :: Python :: 3",
      ],
      url='https://github.com/psolin/cleanco',
      author='Paul Solin',
      author_email='[email protected]',
      packages=["cleanco"],
      setup_requires=['pytest-runner'],
      tests_require=['pytest', 'tox'],
      )
Loading