diff --git a/.gitignore b/.gitignore index e98ce13..6425de4 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,7 @@ dist *.swn .noseids build/* +.idea/ +venv.sh +**/__pycache__/* +virtualenvs/ diff --git a/name_cleaver/__init__.py b/name_cleaver/__init__.py index f2ef2ed..ec24fc8 100644 --- a/name_cleaver/__init__.py +++ b/name_cleaver/__init__.py @@ -1 +1 @@ -from cleaver import PoliticianNameCleaver, OrganizationNameCleaver, IndividualNameCleaver +from .cleaver import PoliticianNameCleaver, OrganizationNameCleaver, IndividualNameCleaver diff --git a/name_cleaver/cleaver.py b/name_cleaver/cleaver.py index aaf9384..964c96c 100644 --- a/name_cleaver/cleaver.py +++ b/name_cleaver/cleaver.py @@ -1,8 +1,9 @@ import re -from exception import UnparseableNameException -from names import SUFFIX_RE, DEGREE_RE, PersonName, PoliticianName, RunningMatesNames, \ +from .exception import UnparseableNameException +from .names import SUFFIX_RE, DEGREE_RE, PersonName, PoliticianName, RunningMatesNames, \ OrganizationName -from nicknames import NICKNAMES +from .nicknames import NICKNAMES +from builtins import str class BaseNameCleaver(object): @@ -15,7 +16,7 @@ def cannot_parse(self, safe, e=None): return self.orig_str else: # uncomment for debugging - #if e: + # if e: # print e raise UnparseableNameException(u"Couldn't parse name: {0}".format(self.name)) @@ -33,7 +34,7 @@ def parse(self, safe=False): if not self.orig_str: return '' - if not ' ' in self.name: + if ' ' not in self.name: self.name = self.get_object_class().new_from_tokens(self.name) return self.name.case_name_parts() else: @@ -47,10 +48,10 @@ def parse(self, safe=False): name = self.reverse_last_first(name) self.name = self.convert_name_to_obj(name, nick, honorific, suffix) - except Exception, e: + except Exception as e: return self.cannot_parse(safe, e) finally: - if (isinstance(self.name, self.object_class) and self.name.last): + if isinstance(self.name, self.object_class) and self.name.last: return self.name.case_name_parts() else: return self.cannot_parse(safe) @@ -135,7 +136,8 @@ def reverse_last_first(self, name): def convert_name_to_obj(self, name, nick, honorific, suffix): name = ' '.join([x.strip() for x in [name, nick, suffix, honorific] if x]) - return self.get_object_class().new_from_tokens(*[x for x in re.split('\s+', name)], **{'allow_quoted_nicknames': True}) + return self.get_object_class().new_from_tokens(*[x for x in re.split('\s+', name)], + **{'allow_quoted_nicknames': True}) @classmethod def name_processing_failed(cls, subject_name): @@ -199,17 +201,18 @@ def parse(self, safe=False): if not self.orig_str: return '' - if not ' ' in self.name: + if ' ' not in self.name: self.name = self.get_object_class().new_from_tokens(self.name) return self.name.case_name_parts() else: try: self.strip_party() self.name = self.convert_name_to_obj(self.name) # important for "last, first", and also running mates - except Exception, e: + except Exception as e: return self.cannot_parse(safe, e) finally: - if ((isinstance(self.name, self.object_class) and self.name.last) or isinstance(self.name, RunningMatesNames)): + if (isinstance(self.name, self.object_class) and self.name.last) or isinstance(self.name, + RunningMatesNames): return self.name.case_name_parts() else: return self.cannot_parse(safe) @@ -229,7 +232,7 @@ def convert_regular_name_to_obj(self, name): return self.get_object_class().new_from_tokens(*[x for x in re.split('\s+', name) if x]) def convert_running_mates_names_to_obj(self, name): - return RunningMatesNames(*[self.convert_name_to_obj(x) for x in re.split(' [&/] ', name)]) + return RunningMatesNames(*[self.convert_name_to_obj(x) for x in re.split('[&/]', name)]) class OrganizationNameCleaver(BaseNameCleaver): @@ -246,7 +249,7 @@ def parse(self, safe=False): self.name = self.name.strip() self.name = self.get_object_class().new(self.name) - except Exception, e: + except Exception as e: return self.cannot_parse(safe, e) finally: if isinstance(self.name, self.object_class): diff --git a/name_cleaver/names.py b/name_cleaver/names.py index 5481453..f48a7fd 100644 --- a/name_cleaver/names.py +++ b/name_cleaver/names.py @@ -1,8 +1,11 @@ import re +from builtins import str +from future.utils import python_2_unicode_compatible DEGREE_RE = 'j\.?d\.?|m\.?d\.?|ph\.?d\.?' SUFFIX_RE = '([js]r\.?|%s|[IVX]{2,})' % DEGREE_RE + class Name(object): scottish_re = r'(?i)\b(?Pma?c)(?!hin)(?P\w)\w+' @@ -10,7 +13,7 @@ def primary_name_parts(self): raise NotImplementedError("Subclasses of Name must implement primary_name_parts.") def non_empty_primary_name_parts(self): - return ' '.join([ x for x in self.primary_name_parts() if x ]) + return ' '.join([x for x in self.primary_name_parts() if x]) def is_mixed_case(self): return re.search(r'[A-Z][a-z]', self.non_empty_primary_name_parts()) @@ -39,7 +42,7 @@ class OrganizationName(Name): 'inst': 'Institute', 'corp': 'Corporation', 'co': 'Company', - 'fedn' : 'Federation', + 'fedn': 'Federation', 'fed': 'Federal', 'fzco': 'Company', 'usa': 'USA', @@ -65,7 +68,7 @@ class OrganizationName(Name): name = None - #suffix = None + # suffix = None def new(self, name): self.name = name @@ -77,9 +80,10 @@ def case_name_parts(self): self.name = self.uppercase_the_scots(self.name) if re.match(r'(?i)^\w*PAC$', self.name): - self.name = self.name.upper() # if there's only one word that ends in PAC, make the whole thing uppercase + # if there's only one word that ends in PAC, make the whole thing uppercase + self.name = self.name.upper() else: - self.name = re.sub(r'(?i)\bpac\b', 'PAC', self.name) # otherwise just uppercase the PAC part + self.name = re.sub(r'(?i)\bpac\b', 'PAC', self.name) # otherwise just uppercase the PAC part self.name = self.uppercase_the_scots(self.name) self.name = self.fix_case_for_possessives(self.name) @@ -87,22 +91,19 @@ def case_name_parts(self): return self def primary_name_parts(self): - return [ self.without_extra_phrases() ] - - def __unicode__(self): - return unicode(self.name) + return [self.without_extra_phrases()] def __str__(self): - return unicode(self.name).encode('utf-8') + return self.name def without_extra_phrases(self): - """Removes parenthethical and dashed phrases""" + """Removes parenthetical and dashed phrases""" # the last parenthesis is optional, because sometimes they are truncated name = re.sub(r'\s*\([^)]*\)?\s*$', '', self.name) name = re.sub(r'(?i)\s* formerly.*$', '', name) name = re.sub(r'(?i)\s*and its affiliates$', '', name) name = re.sub(r'\bet al\b', '', name) - + # in some datasets, the name of an organization is followed by a hyphen and an abbreviated name, or a specific # department or geographic subdivision; we want to remove this extraneous stuff without breaking names like # Wal-Mart or Williams-Sonoma @@ -111,10 +112,12 @@ def without_extra_phrases(self): if "-" in name: hyphen_parts = name.rsplit("-", 1) # if the part after the hyphen is shorter than the part before, - # AND isn't either a number (often occurs in Union names) or a single letter (e.g., Tech-X), # AND the hyphen is preceded by either whitespace or at least four characters, + # AND isn't either a number (often occurs in Union names) or a single letter (e.g., Tech-X), # discard the hyphen and whatever follows - if len(hyphen_parts[1]) < len(hyphen_parts[0]) and re.search(r'(\w{4,}|\s+)$', hyphen_parts[0]) and not re.match(r'^([a-zA-Z]|[0-9]+)$', hyphen_parts[1]): + if len(hyphen_parts[1]) < len(hyphen_parts[0]) \ + and re.search(r'^(\s+)|^(\w{0,4})$', hyphen_parts[1]) \ + and not re.match(r'^([a-zA-Z]|[0-9]+)$', hyphen_parts[1]): name = hyphen_parts[0].strip() return name @@ -128,8 +131,8 @@ def expand(self): def kernel(self): """ The 'kernel' is an attempt to get at just the most pithy words in the name """ - stop_words = [ y.lower() for y in self.abbreviations.values() + self.filler_words ] - kernel = ' '.join([ x for x in self.expand().split() if x.lower() not in stop_words ]) + stop_words = [y.lower() for y in list(self.abbreviations.values()) + self.filler_words] + kernel = ' '.join([x for x in self.expand().split() if x.lower() not in stop_words]) # this is a hack to get around the fact that this is the only two-word phrase we want to block # amongst our stop words. if we end up with more, we may need a better way to do this @@ -144,6 +147,7 @@ def crp_style_firm_name(self, with_et_al=True): return ', '.join(self.kernel().split()[0:2]) +@python_2_unicode_compatible class PersonName(Name): honorific = None first = None @@ -195,9 +199,9 @@ def new_from_tokens(self, *args, **kwargs): """ if kwargs.get('allow_quoted_nicknames'): - args = [ x.strip() for x in args if not re.match(r'^[(]', x) ] + args = [x.strip() for x in args if not re.match(r'^[(]', x)] else: - args = [ x.strip() for x in args if not re.match(r'^[("]', x) ] + args = [x.strip() for x in args if not re.match(r'^[("]', x)] if len(args) > 2: self.detect_and_fix_two_part_surname(args) @@ -261,17 +265,14 @@ def detect_and_fix_two_part_surname(self, args): i = 0 while i < len(args) - 1: if args[i].lower() in self.family_name_prefixes: - args[i] = ' '.join(args[i:i+2]) - del(args[i+1]) + args[i] = ' '.join(args[i:i + 2]) + del (args[i + 1]) break else: i += 1 - def __unicode__(self): - return unicode(self.name_str()) - def __str__(self): - return unicode(self.name_str()).encode('utf-8') + return self.name_str() def name_str(self): return ' '.join([x.strip() for x in [ @@ -320,7 +321,7 @@ def is_only_initials(self, name_part): def capitalize_and_punctuate_initials(self, name_part): if self.is_only_initials(name_part): if '.' not in name_part: - return ''.join([ '{0}.'.format(x.upper()) for x in name_part]) + return ''.join(['{0}.'.format(x.upper()) for x in name_part]) else: return name_part else: @@ -328,17 +329,19 @@ def capitalize_and_punctuate_initials(self, name_part): def primary_name_parts(self, include_middle=False): if include_middle: - return [ self.first, self.middle, self.last ] + return [self.first, self.middle, self.last] else: - return [ self.first, self.last ] + return [self.first, self.last] def as_dict(self): - return { 'first': self.first, 'middle': self.middle, 'last': self.last, 'honorific': self.honorific, 'suffix': self.suffix } + return {'first': self.first, 'middle': self.middle, 'last': self.last, 'honorific': self.honorific, + 'suffix': self.suffix} def __repr__(self): return self.as_dict() +@python_2_unicode_compatible class PoliticalMetadata(object): party = None state = None @@ -351,10 +354,11 @@ def plus_metadata(self, party, state): def __str__(self): if self.party or self.state: - party_state = u"-".join([ x for x in [self.party, self.state] if x ]) # because presidential candidates are listed without a state - return unicode(u"{0} ({1})".format(unicode(self.name_str()), party_state)).encode('utf-8') + party_state = u"-".join([x for x in [self.party, self.state] if + x]) # because presidential candidates are listed without a state + return u"{0} ({1})".format(self.name_str(), party_state) else: - return unicode(self.name_str()).encode('utf-8') + return self.name_str() class PoliticianName(PoliticalMetadata, PersonName): @@ -362,19 +366,18 @@ class PoliticianName(PoliticalMetadata, PersonName): class RunningMatesNames(PoliticalMetadata): - def __init__(self, mate1, mate2): self.mate1 = mate1 self.mate2 = mate2 def name_str(self): - return u' & '.join([unicode(self.mate1), unicode(self.mate2)]) + return u' & '.join([str(self.mate1), str(self.mate2)]) def __repr__(self): return self.__str__() def mates(self): - return [ self.mate1, self.mate2 ] + return [self.mate1, self.mate2] def is_mixed_case(self): for mate in self.mates(): @@ -388,5 +391,3 @@ def case_name_parts(self): mate.case_name_parts() return self - - diff --git a/name_cleaver/test_name_cleaver.py b/name_cleaver/test_name_cleaver.py index 4a172a6..bd512fe 100644 --- a/name_cleaver/test_name_cleaver.py +++ b/name_cleaver/test_name_cleaver.py @@ -1,5 +1,6 @@ -from cleaver import PoliticianNameCleaver, OrganizationNameCleaver, \ - IndividualNameCleaver, UnparseableNameException +from .cleaver import PoliticianNameCleaver, OrganizationNameCleaver, \ + IndividualNameCleaver, UnparseableNameException +from builtins import str try: import unittest2 as unittest @@ -8,7 +9,6 @@ class TestPoliticianNameCleaver(unittest.TestCase): - def test_case_converts_in_non_mixed_case_names_only(self): self.assertEqual('Antonio dAlesio', str(PoliticianNameCleaver('Antonio dAlesio').parse())) @@ -29,19 +29,23 @@ def test_last_first(self): self.assertEqual('Albert Gore', str(PoliticianNameCleaver('Gore, Albert').parse())) def test_pile_it_on(self): - self.assertEqual('Milton Elmer McCullough, Jr.', str(PoliticianNameCleaver('Milton Elmer "Mac" McCullough, Jr (3)').parse())) + self.assertEqual('Milton Elmer McCullough, Jr.', + str(PoliticianNameCleaver('Milton Elmer "Mac" McCullough, Jr (3)').parse())) def test_pile_it_on_two(self): - self.assertEqual('William Steve Southerland, II', str(PoliticianNameCleaver('William Steve Southerland II (R)').parse())) + self.assertEqual('William Steve Southerland, II', + str(PoliticianNameCleaver('William Steve Southerland II (R)').parse())) def test_pile_it_on_three(self): - self.assertEqual('Edward Thomas O\'Donnell, Jr.', str(PoliticianNameCleaver('Edward Thomas O\'Donnell, Jr (D)').parse())) + self.assertEqual('Edward Thomas O\'Donnell, Jr.', + str(PoliticianNameCleaver('Edward Thomas O\'Donnell, Jr (D)').parse())) def test_standardize_running_mate_names(self): self.assertEqual('John Kasich & Mary Taylor', str(PoliticianNameCleaver('Kasich, John & Taylor, Mary').parse())) def test_standardize_running_mate_names_with_slash(self): - self.assertEqual('Mitt Romney & Paul D. Ryan', str(PoliticianNameCleaver('ROMNEY, MITT / RYAN, PAUL D.').parse())) + self.assertEqual('Mitt Romney & Paul D. Ryan', + str(PoliticianNameCleaver('ROMNEY, MITT / RYAN, PAUL D.').parse())) def test_we_dont_need_no_steeenking_nicknames(self): self.assertEqual('Robert M. McDonnell', str(PoliticianNameCleaver('McDonnell, Robert M (Bob)').parse())) @@ -81,13 +85,17 @@ def test_edgar_de_lisle_ross(self): self.assertEqual(None, name.suffix) def test_with_metadata(self): - self.assertEqual('Charles Schumer (D-NY)', str(PoliticianNameCleaver('Charles Schumer').parse().plus_metadata('D', 'NY'))) + self.assertEqual('Charles Schumer (D-NY)', + str(PoliticianNameCleaver('Charles Schumer').parse().plus_metadata('D', 'NY'))) self.assertEqual('Barack Obama (D)', str(PoliticianNameCleaver('Barack Obama').parse().plus_metadata('D', ''))) - self.assertEqual('Charles Schumer (NY)', str(PoliticianNameCleaver('Charles Schumer').parse().plus_metadata('', 'NY'))) - self.assertEqual('Jerry Leon Carroll', str(PoliticianNameCleaver('Jerry Leon Carroll').parse().plus_metadata('', ''))) # only this one guy is missing both at the moment + self.assertEqual('Charles Schumer (NY)', + str(PoliticianNameCleaver('Charles Schumer').parse().plus_metadata('', 'NY'))) + self.assertEqual('Jerry Leon Carroll', str(PoliticianNameCleaver('Jerry Leon Carroll').parse().plus_metadata('', + ''))) # only this one guy is missing both at the moment def test_running_mates_with_metadata(self): - self.assertEqual('Ted Strickland & Lee Fischer (D-OH)', str(PoliticianNameCleaver('STRICKLAND, TED & FISCHER, LEE').parse().plus_metadata('D', 'OH'))) + self.assertEqual('Ted Strickland & Lee Fischer (D-OH)', + str(PoliticianNameCleaver('STRICKLAND, TED & FISCHER, LEE').parse().plus_metadata('D', 'OH'))) def test_names_with_weird_parenthetical_stuff(self): self.assertEqual('Lynn Swann', str(PoliticianNameCleaver('SWANN, LYNN (COMMITTEE 1)').parse())) @@ -99,23 +107,24 @@ def test_capitalize_irish_names(self): self.assertEqual('Sean O\'Leary', str(PoliticianNameCleaver('SEAN O\'LEARY').parse())) def test_primary_name_parts(self): - self.assertEqual(['Robert', 'Geoff', 'Smith'], PoliticianNameCleaver('Smith, Robert Geoff').parse().primary_name_parts(include_middle=True)) + self.assertEqual(['Robert', 'Geoff', 'Smith'], + PoliticianNameCleaver('Smith, Robert Geoff').parse().primary_name_parts(include_middle=True)) self.assertEqual(['Robert', 'Smith'], PoliticianNameCleaver('Smith, Robert Geoff').parse().primary_name_parts()) def test_van_is_valid_first_name(self): self.assertEqual(['Van', 'Morrison'], PoliticianNameCleaver('Van Morrison').parse().primary_name_parts()) def test_alternate_running_mates_format(self): - self.assertEqual('Obama/Biden 2012', str(PoliticianNameCleaver('2012, Obama/Biden').parse())) + self.assertEqual('Obama 2012 & Biden', str(PoliticianNameCleaver('2012, Obama/Biden').parse())) def test_alternate_punctuation(self): self.assertEqual('Charles W. Boustany, Jr.', str(PoliticianNameCleaver('Charles W. Boustany Jr.').parse())) class TestOrganizationNameCleaver(unittest.TestCase): - def test_capitalize_pac(self): - self.assertEqual('Nancy Pelosi Leadership PAC', str(OrganizationNameCleaver('NANCY PELOSI LEADERSHIP PAC').parse())) + self.assertEqual('Nancy Pelosi Leadership PAC', + str(OrganizationNameCleaver('NANCY PELOSI LEADERSHIP PAC').parse())) def test_make_single_word_names_ending_in_pac_all_uppercase(self): self.assertEqual('ECEPAC', str(OrganizationNameCleaver('ECEPAC').parse())) @@ -132,40 +141,53 @@ def test_capitalize_scottish_names(self): self.assertEqual('MacDonnell Douglas', str(OrganizationNameCleaver('MACDONNELL DOUGLAS').parse())) def test_dont_capitalize_just_anything_starting_with_mac(self): - self.assertEqual('Machinists/Aerospace Workers Union', str(OrganizationNameCleaver('MACHINISTS/AEROSPACE WORKERS UNION').parse())) + self.assertEqual('Machinists/Aerospace Workers Union', + str(OrganizationNameCleaver('MACHINISTS/AEROSPACE WORKERS UNION').parse())) def test_expand(self): self.assertEqual('Raytheon Corporation', OrganizationNameCleaver('Raytheon Corp.').parse().expand()) - self.assertEqual('Massachusetts Institute of Technology', OrganizationNameCleaver('Massachusetts Inst. of Technology').parse().expand()) + self.assertEqual('Massachusetts Institute of Technology', + OrganizationNameCleaver('Massachusetts Inst. of Technology').parse().expand()) def test_expand_with_two_tokens_to_expand(self): self.assertEqual('Merck & Company Incorporated', OrganizationNameCleaver('Merck & Co., Inc.').parse().expand()) def test_dont_strip_after_hyphens_too_soon_in_a_name(self): - self.assertEqual('US-Russia Business Council', OrganizationNameCleaver('US-Russia Business Council').parse().kernel()) + self.assertEqual('US-Russia Business Council', + OrganizationNameCleaver('US-Russia Business Council').parse().kernel()) self.assertEqual('Wal-Mart Stores', OrganizationNameCleaver('Wal-Mart Stores, Inc.').parse().kernel()) + self.assertEqual('Williams-Sonoma', OrganizationNameCleaver('Williams-Sonoma, Inc.').parse().kernel()) + self.assertEqual('Austin American-Statesman', + OrganizationNameCleaver('Austin American-Statesman').parse().kernel()) # these were new after the hyphen rewrite - self.assertEqual('Coca-Cola Company', OrganizationNameCleaver('Coca-Cola Co').parse().expand()) # used to return 'Coca' - self.assertEqual('Rolls-Royce PLC', OrganizationNameCleaver('Rolls-Royce PLC').parse().expand()) # used to return 'Rolls' + self.assertEqual('Coca-Cola Company', + OrganizationNameCleaver('Coca-Cola Co').parse().expand()) # used to return 'Coca' + self.assertEqual('Rolls-Royce PLC', + OrganizationNameCleaver('Rolls-Royce PLC').parse().expand()) # used to return 'Rolls' def test_drop_postname_hyphen_phrases(self): - self.assertEqual('Lawyers For Better Government', OrganizationNameCleaver('LAWYERS FOR BETTER GOVERNMENT-ILLINOIS').parse().without_extra_phrases()) - self.assertEqual('Jobs Opportunity And Freedom Political Action Committee', OrganizationNameCleaver('JOBS OPPORTUNITY AND FREEDOM POLITICAL ACTION COMMITTEE - JOFPAC').parse().without_extra_phrases()) + self.assertEqual('Lawyers For Better Government-Illinois', OrganizationNameCleaver( + 'LAWYERS FOR BETTER GOVERNMENT-ILLINOIS').parse().without_extra_phrases()) + self.assertEqual('Jobs Opportunity And Freedom Political Action Committee', OrganizationNameCleaver( + 'JOBS OPPORTUNITY AND FREEDOM POLITICAL ACTION COMMITTEE - JOFPAC').parse().without_extra_phrases()) def test_kernel(self): """ Intended to get only the unique/meaningful words out of a name """ - self.assertEqual('Massachusetts Technology', OrganizationNameCleaver('Massachusetts Inst. of Technology').parse().kernel()) - self.assertEqual('Massachusetts Technology', OrganizationNameCleaver('Massachusetts Institute of Technology').parse().kernel()) + self.assertEqual('Massachusetts Technology', + OrganizationNameCleaver('Massachusetts Inst. of Technology').parse().kernel()) + self.assertEqual('Massachusetts Technology', + OrganizationNameCleaver('Massachusetts Institute of Technology').parse().kernel()) self.assertEqual('Walsh', OrganizationNameCleaver('The Walsh Group').parse().kernel()) self.assertEqual('Health Net', OrganizationNameCleaver('Health Net Inc').parse().kernel()) self.assertEqual('Health Net', OrganizationNameCleaver('Health Net, Inc.').parse().kernel()) - self.assertEqual('Distilled Spirits Council', OrganizationNameCleaver('Distilled Spirits Council of the U.S., Inc.').parse().kernel()) + self.assertEqual('Distilled Spirits Council', + OrganizationNameCleaver('Distilled Spirits Council of the U.S., Inc.').parse().kernel()) def test_handles_empty_names(self): self.assertEqual('', str(OrganizationNameCleaver('').parse())) @@ -232,7 +254,8 @@ def test_mr_and_mrs(self): self.assertEqual('Kenneth L. Lay', self.cleave_to_str('LAY, KENNETH L MR & MRS')) def test_primary_name_parts(self): - self.assertEqual(['Robert', 'Geoff', 'Smith'], self.cleaver('Smith, Robert Geoff').parse().primary_name_parts(include_middle=True)) + self.assertEqual(['Robert', 'Geoff', 'Smith'], + self.cleaver('Smith, Robert Geoff').parse().primary_name_parts(include_middle=True)) self.assertEqual(['Robert', 'Smith'], self.cleaver('Smith, Robert Geoff').parse().primary_name_parts()) def test_initialed_first_name(self): @@ -264,13 +287,12 @@ def test_considers_double_initial_a_first_name(self): class TestCapitalization(unittest.TestCase): - def test_overrides_dumb_python_titlecasing_for_apostrophes(self): - self.assertEqual('Phoenix Women\'s Health Center', str(OrganizationNameCleaver('PHOENIX WOMEN\'S HEALTH CENTER').parse())) + self.assertEqual('Phoenix Women\'s Health Center', + str(OrganizationNameCleaver('PHOENIX WOMEN\'S HEALTH CENTER').parse())) class TestOrganizationNameCleaverForIndustries(unittest.TestCase): - def test_capitalizes_letter_after_slash(self): self.assertEqual('Health Services/Hmos', str(OrganizationNameCleaver('HEALTH SERVICES/HMOS').parse())) self.assertEqual('Lawyers/Law Firms', str(OrganizationNameCleaver('LAWYERS/LAW FIRMS').parse())) @@ -281,34 +303,33 @@ def test_capitalizes_letter_after_hyphen(self): class TestUnicode(unittest.TestCase): - def test_individual(self): - self.assertEqual(u'Tobias F\u00fcnke'.encode('utf-8'), - str(IndividualNameCleaver(u'F\u00fcnke, Tobias').parse())) + self.assertEqual(u'Tobias F\u00fcnke', + str(IndividualNameCleaver(u'F\u00fcnke, Tobias').parse())) def test_politician(self): - self.assertEqual(u'Tobias F\u00fcnke'.encode('utf-8'), - str(PoliticianNameCleaver(u'F\u00fcnke, Tobias').parse())) + self.assertEqual(u'Tobias F\u00fcnke', + str(PoliticianNameCleaver(u'F\u00fcnke, Tobias').parse())) def test_politician_plus_metadata(self): - self.assertEqual(u'Tobias F\u00fcnke (D-CA)'.encode('utf-8'), - str(PoliticianNameCleaver(u'F\u00fcnke, Tobias').parse().plus_metadata('D', 'CA'))) + self.assertEqual(u'Tobias F\u00fcnke (D-CA)', + str(PoliticianNameCleaver(u'F\u00fcnke, Tobias').parse().plus_metadata('D', 'CA'))) def test_politician_running_mates(self): - self.assertEqual(u'Tobias F\u00fcnke & Lindsay F\u00fcnke'.encode('utf-8'), - str(PoliticianNameCleaver(u'F\u00fcnke, Tobias & F\u00fcnke, Lindsay').parse())) + self.assertEqual(u'Tobias F\u00fcnke & Lindsay F\u00fcnke', + str(PoliticianNameCleaver(u'F\u00fcnke, Tobias & F\u00fcnke, Lindsay').parse())) def test_running_mates_with_metadata(self): - self.assertEqual(u'Ted Strickland & Le\u00e9 Fischer (D-OH)'.encode('utf-8'), - str(PoliticianNameCleaver(u'STRICKLAND, TED & FISCHER, LE\u00c9').parse().plus_metadata('D', 'OH'))) + self.assertEqual(u'Ted Strickland & Le\u00e9 Fischer (D-OH)', + str(PoliticianNameCleaver(u'STRICKLAND, TED & FISCHER, LE\u00c9').parse().plus_metadata('D', + 'OH'))) def test_organization(self): - self.assertEqual(u'\u00C6tna, Inc.'.encode('utf-8'), - str(OrganizationNameCleaver(u'\u00C6tna, Inc.').parse())) + self.assertEqual(u'\u00C6tna, Inc.', + str(OrganizationNameCleaver(u'\u00C6tna, Inc.').parse())) class TestErrors(unittest.TestCase): - def test_unparseable_politician_name(self): with self.assertRaises(UnparseableNameException): PoliticianNameCleaver("mr & mrs").parse() @@ -318,33 +339,33 @@ def test_unparseable_individual_name(self): IndividualNameCleaver("mr & mrs").parse() # this ought to have a test, but I'm not sure how to break this one. - #def test_unparseable_organization_name(self): + # def test_unparseable_organization_name(self): # with self.assertRaises(UnparseableNameException): # OrganizationNameCleaver("####!!!").parse() def test_parse_safe__individual(self): pass - #with self.assertRaises(UnparseableNameException): + # with self.assertRaises(UnparseableNameException): # IndividualNameCleaver("BARDEN PHD J D, R CHRISTOPHER").parse() - #self.assertEqual('BARDEN PHD J D, R CHRISTOPHER', str(IndividualNameCleaver('BARDEN PHD J D, R CHRISTOPHER').parse(safe=True))) + # self.assertEqual('BARDEN PHD J D, R CHRISTOPHER', str(IndividualNameCleaver('BARDEN PHD J D, R CHRISTOPHER').parse(safe=True))) - #with self.assertRaises(UnparseableNameException): + # with self.assertRaises(UnparseableNameException): # IndividualNameCleaver("gobbledy blah bloop!!!.p,.lcrg%%% #$<").parse() - #self.assertEqual('gobbledy blah bloop!!!.p,.lcrg%%% #$<', str(IndividualNameCleaver('gobbledy blah bloop!!!.p,.lcrg%%% #$<').parse(safe=True))) + # self.assertEqual('gobbledy blah bloop!!!.p,.lcrg%%% #$<', str(IndividualNameCleaver('gobbledy blah bloop!!!.p,.lcrg%%% #$<').parse(safe=True))) def test_parse_safe__politician(self): pass - #with self.assertRaises(UnparseableNameException): + # with self.assertRaises(UnparseableNameException): # PoliticianNameCleaver("BARDEN PHD J D, R CHRISTOPHER").parse() - #self.assertEqual('BARDEN PHD J D, R CHRISTOPHER', str(PoliticianNameCleaver('BARDEN PHD J D, R CHRISTOPHER').parse(safe=True))) + # self.assertEqual('BARDEN PHD J D, R CHRISTOPHER', str(PoliticianNameCleaver('BARDEN PHD J D, R CHRISTOPHER').parse(safe=True))) - #with self.assertRaises(UnparseableNameException): + # with self.assertRaises(UnparseableNameException): # PoliticianNameCleaver("gobbledy gook bah bah bloop!!!.p,.lcrg%%% #$<").parse() - #self.assertEqual('gobbledy gook bah bah bloop!!!.p,.lcrg%%% #$<', str(PoliticianNameCleaver('gobbledy gook bah bah bloop!!!.p,.lcrg%%% #$<').parse(safe=True))) + # self.assertEqual('gobbledy gook bah bah bloop!!!.p,.lcrg%%% #$<', str(PoliticianNameCleaver('gobbledy gook bah bah bloop!!!.p,.lcrg%%% #$<').parse(safe=True))) def test_parse_safe__organization(self): self.assertEqual('', OrganizationNameCleaver(None).parse(safe=True)) diff --git a/setup.py b/setup.py index e698442..2b14a7a 100644 --- a/setup.py +++ b/setup.py @@ -14,6 +14,9 @@ author_email='arowland@sunlightfoundation.com', url='http://github.com/sunlightlabs/name-cleaver/', packages=find_packages(), + install_requires=[ + 'future', + ], license='BSD License', platforms=["any"], classifiers=[