Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Python3 support #9

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,7 @@ dist
*.swn
.noseids
build/*
.idea/
venv.sh
**/__pycache__/*
virtualenvs/
2 changes: 1 addition & 1 deletion name_cleaver/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from cleaver import PoliticianNameCleaver, OrganizationNameCleaver, IndividualNameCleaver
from .cleaver import PoliticianNameCleaver, OrganizationNameCleaver, IndividualNameCleaver
29 changes: 16 additions & 13 deletions name_cleaver/cleaver.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import re
from exception import UnparseableNameException
from names import SUFFIX_RE, DEGREE_RE, PersonName, PoliticianName, RunningMatesNames, \
from .exception import UnparseableNameException
from .names import SUFFIX_RE, DEGREE_RE, PersonName, PoliticianName, RunningMatesNames, \
OrganizationName
from nicknames import NICKNAMES
from .nicknames import NICKNAMES
from builtins import str


class BaseNameCleaver(object):
Expand All @@ -15,7 +16,7 @@ def cannot_parse(self, safe, e=None):
return self.orig_str
else:
# uncomment for debugging
#if e:
# if e:
# print e
raise UnparseableNameException(u"Couldn't parse name: {0}".format(self.name))

Expand All @@ -33,7 +34,7 @@ def parse(self, safe=False):
if not self.orig_str:
return ''

if not ' ' in self.name:
if ' ' not in self.name:
self.name = self.get_object_class().new_from_tokens(self.name)
return self.name.case_name_parts()
else:
Expand All @@ -47,10 +48,10 @@ def parse(self, safe=False):

name = self.reverse_last_first(name)
self.name = self.convert_name_to_obj(name, nick, honorific, suffix)
except Exception, e:
except Exception as e:
return self.cannot_parse(safe, e)
finally:
if (isinstance(self.name, self.object_class) and self.name.last):
if isinstance(self.name, self.object_class) and self.name.last:
return self.name.case_name_parts()
else:
return self.cannot_parse(safe)
Expand Down Expand Up @@ -135,7 +136,8 @@ def reverse_last_first(self, name):
def convert_name_to_obj(self, name, nick, honorific, suffix):
name = ' '.join([x.strip() for x in [name, nick, suffix, honorific] if x])

return self.get_object_class().new_from_tokens(*[x for x in re.split('\s+', name)], **{'allow_quoted_nicknames': True})
return self.get_object_class().new_from_tokens(*[x for x in re.split('\s+', name)],
**{'allow_quoted_nicknames': True})

@classmethod
def name_processing_failed(cls, subject_name):
Expand Down Expand Up @@ -199,17 +201,18 @@ def parse(self, safe=False):
if not self.orig_str:
return ''

if not ' ' in self.name:
if ' ' not in self.name:
self.name = self.get_object_class().new_from_tokens(self.name)
return self.name.case_name_parts()
else:
try:
self.strip_party()
self.name = self.convert_name_to_obj(self.name) # important for "last, first", and also running mates
except Exception, e:
except Exception as e:
return self.cannot_parse(safe, e)
finally:
if ((isinstance(self.name, self.object_class) and self.name.last) or isinstance(self.name, RunningMatesNames)):
if (isinstance(self.name, self.object_class) and self.name.last) or isinstance(self.name,
RunningMatesNames):
return self.name.case_name_parts()
else:
return self.cannot_parse(safe)
Expand All @@ -229,7 +232,7 @@ def convert_regular_name_to_obj(self, name):
return self.get_object_class().new_from_tokens(*[x for x in re.split('\s+', name) if x])

def convert_running_mates_names_to_obj(self, name):
return RunningMatesNames(*[self.convert_name_to_obj(x) for x in re.split(' [&/] ', name)])
return RunningMatesNames(*[self.convert_name_to_obj(x) for x in re.split('[&/]', name)])


class OrganizationNameCleaver(BaseNameCleaver):
Expand All @@ -246,7 +249,7 @@ def parse(self, safe=False):
self.name = self.name.strip()

self.name = self.get_object_class().new(self.name)
except Exception, e:
except Exception as e:
return self.cannot_parse(safe, e)
finally:
if isinstance(self.name, self.object_class):
Expand Down
73 changes: 37 additions & 36 deletions name_cleaver/names.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
import re
from builtins import str
from future.utils import python_2_unicode_compatible

DEGREE_RE = 'j\.?d\.?|m\.?d\.?|ph\.?d\.?'
SUFFIX_RE = '([js]r\.?|%s|[IVX]{2,})' % DEGREE_RE


class Name(object):
scottish_re = r'(?i)\b(?P<mc>ma?c)(?!hin)(?P<first_letter>\w)\w+'

def primary_name_parts(self):
raise NotImplementedError("Subclasses of Name must implement primary_name_parts.")

def non_empty_primary_name_parts(self):
return ' '.join([ x for x in self.primary_name_parts() if x ])
return ' '.join([x for x in self.primary_name_parts() if x])

def is_mixed_case(self):
return re.search(r'[A-Z][a-z]', self.non_empty_primary_name_parts())
Expand Down Expand Up @@ -39,7 +42,7 @@ class OrganizationName(Name):
'inst': 'Institute',
'corp': 'Corporation',
'co': 'Company',
'fedn' : 'Federation',
'fedn': 'Federation',
'fed': 'Federal',
'fzco': 'Company',
'usa': 'USA',
Expand All @@ -65,7 +68,7 @@ class OrganizationName(Name):

name = None

#suffix = None
# suffix = None

def new(self, name):
self.name = name
Expand All @@ -77,32 +80,30 @@ def case_name_parts(self):
self.name = self.uppercase_the_scots(self.name)

if re.match(r'(?i)^\w*PAC$', self.name):
self.name = self.name.upper() # if there's only one word that ends in PAC, make the whole thing uppercase
# if there's only one word that ends in PAC, make the whole thing uppercase
self.name = self.name.upper()
else:
self.name = re.sub(r'(?i)\bpac\b', 'PAC', self.name) # otherwise just uppercase the PAC part
self.name = re.sub(r'(?i)\bpac\b', 'PAC', self.name) # otherwise just uppercase the PAC part

self.name = self.uppercase_the_scots(self.name)
self.name = self.fix_case_for_possessives(self.name)

return self

def primary_name_parts(self):
return [ self.without_extra_phrases() ]

def __unicode__(self):
return unicode(self.name)
return [self.without_extra_phrases()]

def __str__(self):
return unicode(self.name).encode('utf-8')
return self.name

def without_extra_phrases(self):
"""Removes parenthethical and dashed phrases"""
"""Removes parenthetical and dashed phrases"""
# the last parenthesis is optional, because sometimes they are truncated
name = re.sub(r'\s*\([^)]*\)?\s*$', '', self.name)
name = re.sub(r'(?i)\s* formerly.*$', '', name)
name = re.sub(r'(?i)\s*and its affiliates$', '', name)
name = re.sub(r'\bet al\b', '', name)

# in some datasets, the name of an organization is followed by a hyphen and an abbreviated name, or a specific
# department or geographic subdivision; we want to remove this extraneous stuff without breaking names like
# Wal-Mart or Williams-Sonoma
Expand All @@ -111,10 +112,12 @@ def without_extra_phrases(self):
if "-" in name:
hyphen_parts = name.rsplit("-", 1)
# if the part after the hyphen is shorter than the part before,
# AND isn't either a number (often occurs in Union names) or a single letter (e.g., Tech-X),
# AND the hyphen is preceded by either whitespace or at least four characters,
# AND isn't either a number (often occurs in Union names) or a single letter (e.g., Tech-X),
# discard the hyphen and whatever follows
if len(hyphen_parts[1]) < len(hyphen_parts[0]) and re.search(r'(\w{4,}|\s+)$', hyphen_parts[0]) and not re.match(r'^([a-zA-Z]|[0-9]+)$', hyphen_parts[1]):
if len(hyphen_parts[1]) < len(hyphen_parts[0]) \
and re.search(r'^(\s+)|^(\w{0,4})$', hyphen_parts[1]) \
and not re.match(r'^([a-zA-Z]|[0-9]+)$', hyphen_parts[1]):
name = hyphen_parts[0].strip()

return name
Expand All @@ -128,8 +131,8 @@ def expand(self):

def kernel(self):
""" The 'kernel' is an attempt to get at just the most pithy words in the name """
stop_words = [ y.lower() for y in self.abbreviations.values() + self.filler_words ]
kernel = ' '.join([ x for x in self.expand().split() if x.lower() not in stop_words ])
stop_words = [y.lower() for y in list(self.abbreviations.values()) + self.filler_words]
kernel = ' '.join([x for x in self.expand().split() if x.lower() not in stop_words])

# this is a hack to get around the fact that this is the only two-word phrase we want to block
# amongst our stop words. if we end up with more, we may need a better way to do this
Expand All @@ -144,6 +147,7 @@ def crp_style_firm_name(self, with_et_al=True):
return ', '.join(self.kernel().split()[0:2])


@python_2_unicode_compatible
class PersonName(Name):
honorific = None
first = None
Expand Down Expand Up @@ -195,9 +199,9 @@ def new_from_tokens(self, *args, **kwargs):
"""

if kwargs.get('allow_quoted_nicknames'):
args = [ x.strip() for x in args if not re.match(r'^[(]', x) ]
args = [x.strip() for x in args if not re.match(r'^[(]', x)]
else:
args = [ x.strip() for x in args if not re.match(r'^[("]', x) ]
args = [x.strip() for x in args if not re.match(r'^[("]', x)]

if len(args) > 2:
self.detect_and_fix_two_part_surname(args)
Expand Down Expand Up @@ -261,17 +265,14 @@ def detect_and_fix_two_part_surname(self, args):
i = 0
while i < len(args) - 1:
if args[i].lower() in self.family_name_prefixes:
args[i] = ' '.join(args[i:i+2])
del(args[i+1])
args[i] = ' '.join(args[i:i + 2])
del (args[i + 1])
break
else:
i += 1

def __unicode__(self):
return unicode(self.name_str())

def __str__(self):
return unicode(self.name_str()).encode('utf-8')
return self.name_str()

def name_str(self):
return ' '.join([x.strip() for x in [
Expand Down Expand Up @@ -320,25 +321,27 @@ def is_only_initials(self, name_part):
def capitalize_and_punctuate_initials(self, name_part):
if self.is_only_initials(name_part):
if '.' not in name_part:
return ''.join([ '{0}.'.format(x.upper()) for x in name_part])
return ''.join(['{0}.'.format(x.upper()) for x in name_part])
else:
return name_part
else:
return name_part

def primary_name_parts(self, include_middle=False):
if include_middle:
return [ self.first, self.middle, self.last ]
return [self.first, self.middle, self.last]
else:
return [ self.first, self.last ]
return [self.first, self.last]

def as_dict(self):
return { 'first': self.first, 'middle': self.middle, 'last': self.last, 'honorific': self.honorific, 'suffix': self.suffix }
return {'first': self.first, 'middle': self.middle, 'last': self.last, 'honorific': self.honorific,
'suffix': self.suffix}

def __repr__(self):
return self.as_dict()


@python_2_unicode_compatible
class PoliticalMetadata(object):
party = None
state = None
Expand All @@ -351,30 +354,30 @@ def plus_metadata(self, party, state):

def __str__(self):
if self.party or self.state:
party_state = u"-".join([ x for x in [self.party, self.state] if x ]) # because presidential candidates are listed without a state
return unicode(u"{0} ({1})".format(unicode(self.name_str()), party_state)).encode('utf-8')
party_state = u"-".join([x for x in [self.party, self.state] if
x]) # because presidential candidates are listed without a state
return u"{0} ({1})".format(self.name_str(), party_state)
else:
return unicode(self.name_str()).encode('utf-8')
return self.name_str()


class PoliticianName(PoliticalMetadata, PersonName):
pass


class RunningMatesNames(PoliticalMetadata):

def __init__(self, mate1, mate2):
self.mate1 = mate1
self.mate2 = mate2

def name_str(self):
return u' & '.join([unicode(self.mate1), unicode(self.mate2)])
return u' & '.join([str(self.mate1), str(self.mate2)])

def __repr__(self):
return self.__str__()

def mates(self):
return [ self.mate1, self.mate2 ]
return [self.mate1, self.mate2]

def is_mixed_case(self):
for mate in self.mates():
Expand All @@ -388,5 +391,3 @@ def case_name_parts(self):
mate.case_name_parts()

return self


Loading