From 99a5460b6af785494f3531bf27fe7555d111b344 Mon Sep 17 00:00:00 2001 From: Victor Trac Date: Wed, 20 Aug 2014 14:06:42 -0500 Subject: [PATCH 1/7] fixing bug with cleaving hyphens --- name_cleaver/names.py | 6 ++++-- name_cleaver/test_name_cleaver.py | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/name_cleaver/names.py b/name_cleaver/names.py index 5481453..a98ee5a 100644 --- a/name_cleaver/names.py +++ b/name_cleaver/names.py @@ -111,10 +111,12 @@ def without_extra_phrases(self): if "-" in name: hyphen_parts = name.rsplit("-", 1) # if the part after the hyphen is shorter than the part before, - # AND isn't either a number (often occurs in Union names) or a single letter (e.g., Tech-X), # AND the hyphen is preceded by either whitespace or at least four characters, + # AND isn't either a number (often occurs in Union names) or a single letter (e.g., Tech-X), # discard the hyphen and whatever follows - if len(hyphen_parts[1]) < len(hyphen_parts[0]) and re.search(r'(\w{4,}|\s+)$', hyphen_parts[0]) and not re.match(r'^([a-zA-Z]|[0-9]+)$', hyphen_parts[1]): + if len(hyphen_parts[1]) < len(hyphen_parts[0]) \ + and re.search(r'^(\s+)|^(\w{0,4})$', hyphen_parts[1]) \ + and not re.match(r'^([a-zA-Z]|[0-9]+)$', hyphen_parts[1]): name = hyphen_parts[0].strip() return name diff --git a/name_cleaver/test_name_cleaver.py b/name_cleaver/test_name_cleaver.py index 4a172a6..e38868a 100644 --- a/name_cleaver/test_name_cleaver.py +++ b/name_cleaver/test_name_cleaver.py @@ -144,13 +144,15 @@ def test_expand_with_two_tokens_to_expand(self): def test_dont_strip_after_hyphens_too_soon_in_a_name(self): self.assertEqual('US-Russia Business Council', OrganizationNameCleaver('US-Russia Business Council').parse().kernel()) self.assertEqual('Wal-Mart Stores', OrganizationNameCleaver('Wal-Mart Stores, Inc.').parse().kernel()) + self.assertEqual('Williams-Sonoma', OrganizationNameCleaver('Williams-Sonoma, Inc.').parse().kernel()) + self.assertEqual('Austin American-Statesman', OrganizationNameCleaver('Austin American-Statesman').parse().kernel()) # these were new after the hyphen rewrite self.assertEqual('Coca-Cola Company', OrganizationNameCleaver('Coca-Cola Co').parse().expand()) # used to return 'Coca' self.assertEqual('Rolls-Royce PLC', OrganizationNameCleaver('Rolls-Royce PLC').parse().expand()) # used to return 'Rolls' def test_drop_postname_hyphen_phrases(self): - self.assertEqual('Lawyers For Better Government', OrganizationNameCleaver('LAWYERS FOR BETTER GOVERNMENT-ILLINOIS').parse().without_extra_phrases()) + self.assertEqual('Lawyers For Better Government-Illinois', OrganizationNameCleaver('LAWYERS FOR BETTER GOVERNMENT-ILLINOIS').parse().without_extra_phrases()) self.assertEqual('Jobs Opportunity And Freedom Political Action Committee', OrganizationNameCleaver('JOBS OPPORTUNITY AND FREEDOM POLITICAL ACTION COMMITTEE - JOFPAC').parse().without_extra_phrases()) def test_kernel(self): From 99d9208f270d2a7e2227eda01bf65a82f0c8f838 Mon Sep 17 00:00:00 2001 From: Bidhan Date: Thu, 18 May 2017 12:17:21 +0545 Subject: [PATCH 2/7] - Add .idea to .gitignore - Reformat files to meet PEP 8 standards --- .gitignore | 1 + name_cleaver/cleaver.py | 14 ++-- name_cleaver/names.py | 47 ++++++------- name_cleaver/test_name_cleaver.py | 108 +++++++++++++++++------------- 4 files changed, 96 insertions(+), 74 deletions(-) diff --git a/.gitignore b/.gitignore index e98ce13..e5d82b7 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ dist *.swn .noseids build/* +.idea/ diff --git a/name_cleaver/cleaver.py b/name_cleaver/cleaver.py index aaf9384..ecc09cd 100644 --- a/name_cleaver/cleaver.py +++ b/name_cleaver/cleaver.py @@ -15,7 +15,7 @@ def cannot_parse(self, safe, e=None): return self.orig_str else: # uncomment for debugging - #if e: + # if e: # print e raise UnparseableNameException(u"Couldn't parse name: {0}".format(self.name)) @@ -33,7 +33,7 @@ def parse(self, safe=False): if not self.orig_str: return '' - if not ' ' in self.name: + if ' ' not in self.name: self.name = self.get_object_class().new_from_tokens(self.name) return self.name.case_name_parts() else: @@ -50,7 +50,7 @@ def parse(self, safe=False): except Exception, e: return self.cannot_parse(safe, e) finally: - if (isinstance(self.name, self.object_class) and self.name.last): + if isinstance(self.name, self.object_class) and self.name.last: return self.name.case_name_parts() else: return self.cannot_parse(safe) @@ -135,7 +135,8 @@ def reverse_last_first(self, name): def convert_name_to_obj(self, name, nick, honorific, suffix): name = ' '.join([x.strip() for x in [name, nick, suffix, honorific] if x]) - return self.get_object_class().new_from_tokens(*[x for x in re.split('\s+', name)], **{'allow_quoted_nicknames': True}) + return self.get_object_class().new_from_tokens(*[x for x in re.split('\s+', name)], + **{'allow_quoted_nicknames': True}) @classmethod def name_processing_failed(cls, subject_name): @@ -199,7 +200,7 @@ def parse(self, safe=False): if not self.orig_str: return '' - if not ' ' in self.name: + if ' ' not in self.name: self.name = self.get_object_class().new_from_tokens(self.name) return self.name.case_name_parts() else: @@ -209,7 +210,8 @@ def parse(self, safe=False): except Exception, e: return self.cannot_parse(safe, e) finally: - if ((isinstance(self.name, self.object_class) and self.name.last) or isinstance(self.name, RunningMatesNames)): + if (isinstance(self.name, self.object_class) and self.name.last) or isinstance(self.name, + RunningMatesNames): return self.name.case_name_parts() else: return self.cannot_parse(safe) diff --git a/name_cleaver/names.py b/name_cleaver/names.py index a98ee5a..dd91777 100644 --- a/name_cleaver/names.py +++ b/name_cleaver/names.py @@ -3,6 +3,7 @@ DEGREE_RE = 'j\.?d\.?|m\.?d\.?|ph\.?d\.?' SUFFIX_RE = '([js]r\.?|%s|[IVX]{2,})' % DEGREE_RE + class Name(object): scottish_re = r'(?i)\b(?Pma?c)(?!hin)(?P\w)\w+' @@ -10,7 +11,7 @@ def primary_name_parts(self): raise NotImplementedError("Subclasses of Name must implement primary_name_parts.") def non_empty_primary_name_parts(self): - return ' '.join([ x for x in self.primary_name_parts() if x ]) + return ' '.join([x for x in self.primary_name_parts() if x]) def is_mixed_case(self): return re.search(r'[A-Z][a-z]', self.non_empty_primary_name_parts()) @@ -39,7 +40,7 @@ class OrganizationName(Name): 'inst': 'Institute', 'corp': 'Corporation', 'co': 'Company', - 'fedn' : 'Federation', + 'fedn': 'Federation', 'fed': 'Federal', 'fzco': 'Company', 'usa': 'USA', @@ -65,7 +66,7 @@ class OrganizationName(Name): name = None - #suffix = None + # suffix = None def new(self, name): self.name = name @@ -77,9 +78,10 @@ def case_name_parts(self): self.name = self.uppercase_the_scots(self.name) if re.match(r'(?i)^\w*PAC$', self.name): - self.name = self.name.upper() # if there's only one word that ends in PAC, make the whole thing uppercase + # if there's only one word that ends in PAC, make the whole thing uppercase + self.name = self.name.upper() else: - self.name = re.sub(r'(?i)\bpac\b', 'PAC', self.name) # otherwise just uppercase the PAC part + self.name = re.sub(r'(?i)\bpac\b', 'PAC', self.name) # otherwise just uppercase the PAC part self.name = self.uppercase_the_scots(self.name) self.name = self.fix_case_for_possessives(self.name) @@ -87,7 +89,7 @@ def case_name_parts(self): return self def primary_name_parts(self): - return [ self.without_extra_phrases() ] + return [self.without_extra_phrases()] def __unicode__(self): return unicode(self.name) @@ -96,13 +98,13 @@ def __str__(self): return unicode(self.name).encode('utf-8') def without_extra_phrases(self): - """Removes parenthethical and dashed phrases""" + """Removes parenthetical and dashed phrases""" # the last parenthesis is optional, because sometimes they are truncated name = re.sub(r'\s*\([^)]*\)?\s*$', '', self.name) name = re.sub(r'(?i)\s* formerly.*$', '', name) name = re.sub(r'(?i)\s*and its affiliates$', '', name) name = re.sub(r'\bet al\b', '', name) - + # in some datasets, the name of an organization is followed by a hyphen and an abbreviated name, or a specific # department or geographic subdivision; we want to remove this extraneous stuff without breaking names like # Wal-Mart or Williams-Sonoma @@ -130,8 +132,8 @@ def expand(self): def kernel(self): """ The 'kernel' is an attempt to get at just the most pithy words in the name """ - stop_words = [ y.lower() for y in self.abbreviations.values() + self.filler_words ] - kernel = ' '.join([ x for x in self.expand().split() if x.lower() not in stop_words ]) + stop_words = [y.lower() for y in self.abbreviations.values() + self.filler_words] + kernel = ' '.join([x for x in self.expand().split() if x.lower() not in stop_words]) # this is a hack to get around the fact that this is the only two-word phrase we want to block # amongst our stop words. if we end up with more, we may need a better way to do this @@ -197,9 +199,9 @@ def new_from_tokens(self, *args, **kwargs): """ if kwargs.get('allow_quoted_nicknames'): - args = [ x.strip() for x in args if not re.match(r'^[(]', x) ] + args = [x.strip() for x in args if not re.match(r'^[(]', x)] else: - args = [ x.strip() for x in args if not re.match(r'^[("]', x) ] + args = [x.strip() for x in args if not re.match(r'^[("]', x)] if len(args) > 2: self.detect_and_fix_two_part_surname(args) @@ -263,8 +265,8 @@ def detect_and_fix_two_part_surname(self, args): i = 0 while i < len(args) - 1: if args[i].lower() in self.family_name_prefixes: - args[i] = ' '.join(args[i:i+2]) - del(args[i+1]) + args[i] = ' '.join(args[i:i + 2]) + del (args[i + 1]) break else: i += 1 @@ -322,7 +324,7 @@ def is_only_initials(self, name_part): def capitalize_and_punctuate_initials(self, name_part): if self.is_only_initials(name_part): if '.' not in name_part: - return ''.join([ '{0}.'.format(x.upper()) for x in name_part]) + return ''.join(['{0}.'.format(x.upper()) for x in name_part]) else: return name_part else: @@ -330,12 +332,13 @@ def capitalize_and_punctuate_initials(self, name_part): def primary_name_parts(self, include_middle=False): if include_middle: - return [ self.first, self.middle, self.last ] + return [self.first, self.middle, self.last] else: - return [ self.first, self.last ] + return [self.first, self.last] def as_dict(self): - return { 'first': self.first, 'middle': self.middle, 'last': self.last, 'honorific': self.honorific, 'suffix': self.suffix } + return {'first': self.first, 'middle': self.middle, 'last': self.last, 'honorific': self.honorific, + 'suffix': self.suffix} def __repr__(self): return self.as_dict() @@ -353,7 +356,8 @@ def plus_metadata(self, party, state): def __str__(self): if self.party or self.state: - party_state = u"-".join([ x for x in [self.party, self.state] if x ]) # because presidential candidates are listed without a state + party_state = u"-".join([x for x in [self.party, self.state] if + x]) # because presidential candidates are listed without a state return unicode(u"{0} ({1})".format(unicode(self.name_str()), party_state)).encode('utf-8') else: return unicode(self.name_str()).encode('utf-8') @@ -364,7 +368,6 @@ class PoliticianName(PoliticalMetadata, PersonName): class RunningMatesNames(PoliticalMetadata): - def __init__(self, mate1, mate2): self.mate1 = mate1 self.mate2 = mate2 @@ -376,7 +379,7 @@ def __repr__(self): return self.__str__() def mates(self): - return [ self.mate1, self.mate2 ] + return [self.mate1, self.mate2] def is_mixed_case(self): for mate in self.mates(): @@ -390,5 +393,3 @@ def case_name_parts(self): mate.case_name_parts() return self - - diff --git a/name_cleaver/test_name_cleaver.py b/name_cleaver/test_name_cleaver.py index e38868a..0a939e6 100644 --- a/name_cleaver/test_name_cleaver.py +++ b/name_cleaver/test_name_cleaver.py @@ -1,5 +1,5 @@ from cleaver import PoliticianNameCleaver, OrganizationNameCleaver, \ - IndividualNameCleaver, UnparseableNameException + IndividualNameCleaver, UnparseableNameException try: import unittest2 as unittest @@ -8,7 +8,6 @@ class TestPoliticianNameCleaver(unittest.TestCase): - def test_case_converts_in_non_mixed_case_names_only(self): self.assertEqual('Antonio dAlesio', str(PoliticianNameCleaver('Antonio dAlesio').parse())) @@ -29,19 +28,23 @@ def test_last_first(self): self.assertEqual('Albert Gore', str(PoliticianNameCleaver('Gore, Albert').parse())) def test_pile_it_on(self): - self.assertEqual('Milton Elmer McCullough, Jr.', str(PoliticianNameCleaver('Milton Elmer "Mac" McCullough, Jr (3)').parse())) + self.assertEqual('Milton Elmer McCullough, Jr.', + str(PoliticianNameCleaver('Milton Elmer "Mac" McCullough, Jr (3)').parse())) def test_pile_it_on_two(self): - self.assertEqual('William Steve Southerland, II', str(PoliticianNameCleaver('William Steve Southerland II (R)').parse())) + self.assertEqual('William Steve Southerland, II', + str(PoliticianNameCleaver('William Steve Southerland II (R)').parse())) def test_pile_it_on_three(self): - self.assertEqual('Edward Thomas O\'Donnell, Jr.', str(PoliticianNameCleaver('Edward Thomas O\'Donnell, Jr (D)').parse())) + self.assertEqual('Edward Thomas O\'Donnell, Jr.', + str(PoliticianNameCleaver('Edward Thomas O\'Donnell, Jr (D)').parse())) def test_standardize_running_mate_names(self): self.assertEqual('John Kasich & Mary Taylor', str(PoliticianNameCleaver('Kasich, John & Taylor, Mary').parse())) def test_standardize_running_mate_names_with_slash(self): - self.assertEqual('Mitt Romney & Paul D. Ryan', str(PoliticianNameCleaver('ROMNEY, MITT / RYAN, PAUL D.').parse())) + self.assertEqual('Mitt Romney & Paul D. Ryan', + str(PoliticianNameCleaver('ROMNEY, MITT / RYAN, PAUL D.').parse())) def test_we_dont_need_no_steeenking_nicknames(self): self.assertEqual('Robert M. McDonnell', str(PoliticianNameCleaver('McDonnell, Robert M (Bob)').parse())) @@ -81,13 +84,17 @@ def test_edgar_de_lisle_ross(self): self.assertEqual(None, name.suffix) def test_with_metadata(self): - self.assertEqual('Charles Schumer (D-NY)', str(PoliticianNameCleaver('Charles Schumer').parse().plus_metadata('D', 'NY'))) + self.assertEqual('Charles Schumer (D-NY)', + str(PoliticianNameCleaver('Charles Schumer').parse().plus_metadata('D', 'NY'))) self.assertEqual('Barack Obama (D)', str(PoliticianNameCleaver('Barack Obama').parse().plus_metadata('D', ''))) - self.assertEqual('Charles Schumer (NY)', str(PoliticianNameCleaver('Charles Schumer').parse().plus_metadata('', 'NY'))) - self.assertEqual('Jerry Leon Carroll', str(PoliticianNameCleaver('Jerry Leon Carroll').parse().plus_metadata('', ''))) # only this one guy is missing both at the moment + self.assertEqual('Charles Schumer (NY)', + str(PoliticianNameCleaver('Charles Schumer').parse().plus_metadata('', 'NY'))) + self.assertEqual('Jerry Leon Carroll', str(PoliticianNameCleaver('Jerry Leon Carroll').parse().plus_metadata('', + ''))) # only this one guy is missing both at the moment def test_running_mates_with_metadata(self): - self.assertEqual('Ted Strickland & Lee Fischer (D-OH)', str(PoliticianNameCleaver('STRICKLAND, TED & FISCHER, LEE').parse().plus_metadata('D', 'OH'))) + self.assertEqual('Ted Strickland & Lee Fischer (D-OH)', + str(PoliticianNameCleaver('STRICKLAND, TED & FISCHER, LEE').parse().plus_metadata('D', 'OH'))) def test_names_with_weird_parenthetical_stuff(self): self.assertEqual('Lynn Swann', str(PoliticianNameCleaver('SWANN, LYNN (COMMITTEE 1)').parse())) @@ -99,7 +106,8 @@ def test_capitalize_irish_names(self): self.assertEqual('Sean O\'Leary', str(PoliticianNameCleaver('SEAN O\'LEARY').parse())) def test_primary_name_parts(self): - self.assertEqual(['Robert', 'Geoff', 'Smith'], PoliticianNameCleaver('Smith, Robert Geoff').parse().primary_name_parts(include_middle=True)) + self.assertEqual(['Robert', 'Geoff', 'Smith'], + PoliticianNameCleaver('Smith, Robert Geoff').parse().primary_name_parts(include_middle=True)) self.assertEqual(['Robert', 'Smith'], PoliticianNameCleaver('Smith, Robert Geoff').parse().primary_name_parts()) def test_van_is_valid_first_name(self): @@ -113,9 +121,9 @@ def test_alternate_punctuation(self): class TestOrganizationNameCleaver(unittest.TestCase): - def test_capitalize_pac(self): - self.assertEqual('Nancy Pelosi Leadership PAC', str(OrganizationNameCleaver('NANCY PELOSI LEADERSHIP PAC').parse())) + self.assertEqual('Nancy Pelosi Leadership PAC', + str(OrganizationNameCleaver('NANCY PELOSI LEADERSHIP PAC').parse())) def test_make_single_word_names_ending_in_pac_all_uppercase(self): self.assertEqual('ECEPAC', str(OrganizationNameCleaver('ECEPAC').parse())) @@ -132,42 +140,53 @@ def test_capitalize_scottish_names(self): self.assertEqual('MacDonnell Douglas', str(OrganizationNameCleaver('MACDONNELL DOUGLAS').parse())) def test_dont_capitalize_just_anything_starting_with_mac(self): - self.assertEqual('Machinists/Aerospace Workers Union', str(OrganizationNameCleaver('MACHINISTS/AEROSPACE WORKERS UNION').parse())) + self.assertEqual('Machinists/Aerospace Workers Union', + str(OrganizationNameCleaver('MACHINISTS/AEROSPACE WORKERS UNION').parse())) def test_expand(self): self.assertEqual('Raytheon Corporation', OrganizationNameCleaver('Raytheon Corp.').parse().expand()) - self.assertEqual('Massachusetts Institute of Technology', OrganizationNameCleaver('Massachusetts Inst. of Technology').parse().expand()) + self.assertEqual('Massachusetts Institute of Technology', + OrganizationNameCleaver('Massachusetts Inst. of Technology').parse().expand()) def test_expand_with_two_tokens_to_expand(self): self.assertEqual('Merck & Company Incorporated', OrganizationNameCleaver('Merck & Co., Inc.').parse().expand()) def test_dont_strip_after_hyphens_too_soon_in_a_name(self): - self.assertEqual('US-Russia Business Council', OrganizationNameCleaver('US-Russia Business Council').parse().kernel()) + self.assertEqual('US-Russia Business Council', + OrganizationNameCleaver('US-Russia Business Council').parse().kernel()) self.assertEqual('Wal-Mart Stores', OrganizationNameCleaver('Wal-Mart Stores, Inc.').parse().kernel()) self.assertEqual('Williams-Sonoma', OrganizationNameCleaver('Williams-Sonoma, Inc.').parse().kernel()) - self.assertEqual('Austin American-Statesman', OrganizationNameCleaver('Austin American-Statesman').parse().kernel()) + self.assertEqual('Austin American-Statesman', + OrganizationNameCleaver('Austin American-Statesman').parse().kernel()) # these were new after the hyphen rewrite - self.assertEqual('Coca-Cola Company', OrganizationNameCleaver('Coca-Cola Co').parse().expand()) # used to return 'Coca' - self.assertEqual('Rolls-Royce PLC', OrganizationNameCleaver('Rolls-Royce PLC').parse().expand()) # used to return 'Rolls' + self.assertEqual('Coca-Cola Company', + OrganizationNameCleaver('Coca-Cola Co').parse().expand()) # used to return 'Coca' + self.assertEqual('Rolls-Royce PLC', + OrganizationNameCleaver('Rolls-Royce PLC').parse().expand()) # used to return 'Rolls' def test_drop_postname_hyphen_phrases(self): - self.assertEqual('Lawyers For Better Government-Illinois', OrganizationNameCleaver('LAWYERS FOR BETTER GOVERNMENT-ILLINOIS').parse().without_extra_phrases()) - self.assertEqual('Jobs Opportunity And Freedom Political Action Committee', OrganizationNameCleaver('JOBS OPPORTUNITY AND FREEDOM POLITICAL ACTION COMMITTEE - JOFPAC').parse().without_extra_phrases()) + self.assertEqual('Lawyers For Better Government-Illinois', OrganizationNameCleaver( + 'LAWYERS FOR BETTER GOVERNMENT-ILLINOIS').parse().without_extra_phrases()) + self.assertEqual('Jobs Opportunity And Freedom Political Action Committee', OrganizationNameCleaver( + 'JOBS OPPORTUNITY AND FREEDOM POLITICAL ACTION COMMITTEE - JOFPAC').parse().without_extra_phrases()) def test_kernel(self): """ Intended to get only the unique/meaningful words out of a name """ - self.assertEqual('Massachusetts Technology', OrganizationNameCleaver('Massachusetts Inst. of Technology').parse().kernel()) - self.assertEqual('Massachusetts Technology', OrganizationNameCleaver('Massachusetts Institute of Technology').parse().kernel()) + self.assertEqual('Massachusetts Technology', + OrganizationNameCleaver('Massachusetts Inst. of Technology').parse().kernel()) + self.assertEqual('Massachusetts Technology', + OrganizationNameCleaver('Massachusetts Institute of Technology').parse().kernel()) self.assertEqual('Walsh', OrganizationNameCleaver('The Walsh Group').parse().kernel()) self.assertEqual('Health Net', OrganizationNameCleaver('Health Net Inc').parse().kernel()) self.assertEqual('Health Net', OrganizationNameCleaver('Health Net, Inc.').parse().kernel()) - self.assertEqual('Distilled Spirits Council', OrganizationNameCleaver('Distilled Spirits Council of the U.S., Inc.').parse().kernel()) + self.assertEqual('Distilled Spirits Council', + OrganizationNameCleaver('Distilled Spirits Council of the U.S., Inc.').parse().kernel()) def test_handles_empty_names(self): self.assertEqual('', str(OrganizationNameCleaver('').parse())) @@ -234,7 +253,8 @@ def test_mr_and_mrs(self): self.assertEqual('Kenneth L. Lay', self.cleave_to_str('LAY, KENNETH L MR & MRS')) def test_primary_name_parts(self): - self.assertEqual(['Robert', 'Geoff', 'Smith'], self.cleaver('Smith, Robert Geoff').parse().primary_name_parts(include_middle=True)) + self.assertEqual(['Robert', 'Geoff', 'Smith'], + self.cleaver('Smith, Robert Geoff').parse().primary_name_parts(include_middle=True)) self.assertEqual(['Robert', 'Smith'], self.cleaver('Smith, Robert Geoff').parse().primary_name_parts()) def test_initialed_first_name(self): @@ -266,13 +286,12 @@ def test_considers_double_initial_a_first_name(self): class TestCapitalization(unittest.TestCase): - def test_overrides_dumb_python_titlecasing_for_apostrophes(self): - self.assertEqual('Phoenix Women\'s Health Center', str(OrganizationNameCleaver('PHOENIX WOMEN\'S HEALTH CENTER').parse())) + self.assertEqual('Phoenix Women\'s Health Center', + str(OrganizationNameCleaver('PHOENIX WOMEN\'S HEALTH CENTER').parse())) class TestOrganizationNameCleaverForIndustries(unittest.TestCase): - def test_capitalizes_letter_after_slash(self): self.assertEqual('Health Services/Hmos', str(OrganizationNameCleaver('HEALTH SERVICES/HMOS').parse())) self.assertEqual('Lawyers/Law Firms', str(OrganizationNameCleaver('LAWYERS/LAW FIRMS').parse())) @@ -283,34 +302,33 @@ def test_capitalizes_letter_after_hyphen(self): class TestUnicode(unittest.TestCase): - def test_individual(self): self.assertEqual(u'Tobias F\u00fcnke'.encode('utf-8'), - str(IndividualNameCleaver(u'F\u00fcnke, Tobias').parse())) + str(IndividualNameCleaver(u'F\u00fcnke, Tobias').parse())) def test_politician(self): self.assertEqual(u'Tobias F\u00fcnke'.encode('utf-8'), - str(PoliticianNameCleaver(u'F\u00fcnke, Tobias').parse())) + str(PoliticianNameCleaver(u'F\u00fcnke, Tobias').parse())) def test_politician_plus_metadata(self): self.assertEqual(u'Tobias F\u00fcnke (D-CA)'.encode('utf-8'), - str(PoliticianNameCleaver(u'F\u00fcnke, Tobias').parse().plus_metadata('D', 'CA'))) + str(PoliticianNameCleaver(u'F\u00fcnke, Tobias').parse().plus_metadata('D', 'CA'))) def test_politician_running_mates(self): self.assertEqual(u'Tobias F\u00fcnke & Lindsay F\u00fcnke'.encode('utf-8'), - str(PoliticianNameCleaver(u'F\u00fcnke, Tobias & F\u00fcnke, Lindsay').parse())) + str(PoliticianNameCleaver(u'F\u00fcnke, Tobias & F\u00fcnke, Lindsay').parse())) def test_running_mates_with_metadata(self): self.assertEqual(u'Ted Strickland & Le\u00e9 Fischer (D-OH)'.encode('utf-8'), - str(PoliticianNameCleaver(u'STRICKLAND, TED & FISCHER, LE\u00c9').parse().plus_metadata('D', 'OH'))) + str(PoliticianNameCleaver(u'STRICKLAND, TED & FISCHER, LE\u00c9').parse().plus_metadata('D', + 'OH'))) def test_organization(self): self.assertEqual(u'\u00C6tna, Inc.'.encode('utf-8'), - str(OrganizationNameCleaver(u'\u00C6tna, Inc.').parse())) + str(OrganizationNameCleaver(u'\u00C6tna, Inc.').parse())) class TestErrors(unittest.TestCase): - def test_unparseable_politician_name(self): with self.assertRaises(UnparseableNameException): PoliticianNameCleaver("mr & mrs").parse() @@ -320,33 +338,33 @@ def test_unparseable_individual_name(self): IndividualNameCleaver("mr & mrs").parse() # this ought to have a test, but I'm not sure how to break this one. - #def test_unparseable_organization_name(self): + # def test_unparseable_organization_name(self): # with self.assertRaises(UnparseableNameException): # OrganizationNameCleaver("####!!!").parse() def test_parse_safe__individual(self): pass - #with self.assertRaises(UnparseableNameException): + # with self.assertRaises(UnparseableNameException): # IndividualNameCleaver("BARDEN PHD J D, R CHRISTOPHER").parse() - #self.assertEqual('BARDEN PHD J D, R CHRISTOPHER', str(IndividualNameCleaver('BARDEN PHD J D, R CHRISTOPHER').parse(safe=True))) + # self.assertEqual('BARDEN PHD J D, R CHRISTOPHER', str(IndividualNameCleaver('BARDEN PHD J D, R CHRISTOPHER').parse(safe=True))) - #with self.assertRaises(UnparseableNameException): + # with self.assertRaises(UnparseableNameException): # IndividualNameCleaver("gobbledy blah bloop!!!.p,.lcrg%%% #$<").parse() - #self.assertEqual('gobbledy blah bloop!!!.p,.lcrg%%% #$<', str(IndividualNameCleaver('gobbledy blah bloop!!!.p,.lcrg%%% #$<').parse(safe=True))) + # self.assertEqual('gobbledy blah bloop!!!.p,.lcrg%%% #$<', str(IndividualNameCleaver('gobbledy blah bloop!!!.p,.lcrg%%% #$<').parse(safe=True))) def test_parse_safe__politician(self): pass - #with self.assertRaises(UnparseableNameException): + # with self.assertRaises(UnparseableNameException): # PoliticianNameCleaver("BARDEN PHD J D, R CHRISTOPHER").parse() - #self.assertEqual('BARDEN PHD J D, R CHRISTOPHER', str(PoliticianNameCleaver('BARDEN PHD J D, R CHRISTOPHER').parse(safe=True))) + # self.assertEqual('BARDEN PHD J D, R CHRISTOPHER', str(PoliticianNameCleaver('BARDEN PHD J D, R CHRISTOPHER').parse(safe=True))) - #with self.assertRaises(UnparseableNameException): + # with self.assertRaises(UnparseableNameException): # PoliticianNameCleaver("gobbledy gook bah bah bloop!!!.p,.lcrg%%% #$<").parse() - #self.assertEqual('gobbledy gook bah bah bloop!!!.p,.lcrg%%% #$<', str(PoliticianNameCleaver('gobbledy gook bah bah bloop!!!.p,.lcrg%%% #$<').parse(safe=True))) + # self.assertEqual('gobbledy gook bah bah bloop!!!.p,.lcrg%%% #$<', str(PoliticianNameCleaver('gobbledy gook bah bah bloop!!!.p,.lcrg%%% #$<').parse(safe=True))) def test_parse_safe__organization(self): self.assertEqual('', OrganizationNameCleaver(None).parse(safe=True)) From 0f621f42a20529179ce705596296d53d96195345 Mon Sep 17 00:00:00 2001 From: Bidhan Date: Thu, 18 May 2017 13:04:38 +0545 Subject: [PATCH 3/7] Ignore files --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index e5d82b7..6425de4 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,6 @@ dist .noseids build/* .idea/ +venv.sh +**/__pycache__/* +virtualenvs/ From ecfdb53d7cd02699f82f4764576a0abcd25403d7 Mon Sep 17 00:00:00 2001 From: Bidhan Date: Thu, 18 May 2017 16:29:29 +0545 Subject: [PATCH 4/7] - Add support for Python 3 - Remove spaces from regex in convert_running_mates_names_to_obj method of PoliticianNameCleaver - Fix test cases --- name_cleaver/cleaver.py | 9 +++++---- name_cleaver/names.py | 22 ++++++++++------------ name_cleaver/test_name_cleaver.py | 15 ++++++++------- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/name_cleaver/cleaver.py b/name_cleaver/cleaver.py index ecc09cd..b99baff 100644 --- a/name_cleaver/cleaver.py +++ b/name_cleaver/cleaver.py @@ -3,6 +3,7 @@ from names import SUFFIX_RE, DEGREE_RE, PersonName, PoliticianName, RunningMatesNames, \ OrganizationName from nicknames import NICKNAMES +from builtins import str class BaseNameCleaver(object): @@ -47,7 +48,7 @@ def parse(self, safe=False): name = self.reverse_last_first(name) self.name = self.convert_name_to_obj(name, nick, honorific, suffix) - except Exception, e: + except Exception as e: return self.cannot_parse(safe, e) finally: if isinstance(self.name, self.object_class) and self.name.last: @@ -207,7 +208,7 @@ def parse(self, safe=False): try: self.strip_party() self.name = self.convert_name_to_obj(self.name) # important for "last, first", and also running mates - except Exception, e: + except Exception as e: return self.cannot_parse(safe, e) finally: if (isinstance(self.name, self.object_class) and self.name.last) or isinstance(self.name, @@ -231,7 +232,7 @@ def convert_regular_name_to_obj(self, name): return self.get_object_class().new_from_tokens(*[x for x in re.split('\s+', name) if x]) def convert_running_mates_names_to_obj(self, name): - return RunningMatesNames(*[self.convert_name_to_obj(x) for x in re.split(' [&/] ', name)]) + return RunningMatesNames(*[self.convert_name_to_obj(x) for x in re.split('[&/]', name)]) class OrganizationNameCleaver(BaseNameCleaver): @@ -248,7 +249,7 @@ def parse(self, safe=False): self.name = self.name.strip() self.name = self.get_object_class().new(self.name) - except Exception, e: + except Exception as e: return self.cannot_parse(safe, e) finally: if isinstance(self.name, self.object_class): diff --git a/name_cleaver/names.py b/name_cleaver/names.py index dd91777..f48a7fd 100644 --- a/name_cleaver/names.py +++ b/name_cleaver/names.py @@ -1,4 +1,6 @@ import re +from builtins import str +from future.utils import python_2_unicode_compatible DEGREE_RE = 'j\.?d\.?|m\.?d\.?|ph\.?d\.?' SUFFIX_RE = '([js]r\.?|%s|[IVX]{2,})' % DEGREE_RE @@ -91,11 +93,8 @@ def case_name_parts(self): def primary_name_parts(self): return [self.without_extra_phrases()] - def __unicode__(self): - return unicode(self.name) - def __str__(self): - return unicode(self.name).encode('utf-8') + return self.name def without_extra_phrases(self): """Removes parenthetical and dashed phrases""" @@ -132,7 +131,7 @@ def expand(self): def kernel(self): """ The 'kernel' is an attempt to get at just the most pithy words in the name """ - stop_words = [y.lower() for y in self.abbreviations.values() + self.filler_words] + stop_words = [y.lower() for y in list(self.abbreviations.values()) + self.filler_words] kernel = ' '.join([x for x in self.expand().split() if x.lower() not in stop_words]) # this is a hack to get around the fact that this is the only two-word phrase we want to block @@ -148,6 +147,7 @@ def crp_style_firm_name(self, with_et_al=True): return ', '.join(self.kernel().split()[0:2]) +@python_2_unicode_compatible class PersonName(Name): honorific = None first = None @@ -271,11 +271,8 @@ def detect_and_fix_two_part_surname(self, args): else: i += 1 - def __unicode__(self): - return unicode(self.name_str()) - def __str__(self): - return unicode(self.name_str()).encode('utf-8') + return self.name_str() def name_str(self): return ' '.join([x.strip() for x in [ @@ -344,6 +341,7 @@ def __repr__(self): return self.as_dict() +@python_2_unicode_compatible class PoliticalMetadata(object): party = None state = None @@ -358,9 +356,9 @@ def __str__(self): if self.party or self.state: party_state = u"-".join([x for x in [self.party, self.state] if x]) # because presidential candidates are listed without a state - return unicode(u"{0} ({1})".format(unicode(self.name_str()), party_state)).encode('utf-8') + return u"{0} ({1})".format(self.name_str(), party_state) else: - return unicode(self.name_str()).encode('utf-8') + return self.name_str() class PoliticianName(PoliticalMetadata, PersonName): @@ -373,7 +371,7 @@ def __init__(self, mate1, mate2): self.mate2 = mate2 def name_str(self): - return u' & '.join([unicode(self.mate1), unicode(self.mate2)]) + return u' & '.join([str(self.mate1), str(self.mate2)]) def __repr__(self): return self.__str__() diff --git a/name_cleaver/test_name_cleaver.py b/name_cleaver/test_name_cleaver.py index 0a939e6..99ff78c 100644 --- a/name_cleaver/test_name_cleaver.py +++ b/name_cleaver/test_name_cleaver.py @@ -1,5 +1,6 @@ from cleaver import PoliticianNameCleaver, OrganizationNameCleaver, \ IndividualNameCleaver, UnparseableNameException +from builtins import str try: import unittest2 as unittest @@ -114,7 +115,7 @@ def test_van_is_valid_first_name(self): self.assertEqual(['Van', 'Morrison'], PoliticianNameCleaver('Van Morrison').parse().primary_name_parts()) def test_alternate_running_mates_format(self): - self.assertEqual('Obama/Biden 2012', str(PoliticianNameCleaver('2012, Obama/Biden').parse())) + self.assertEqual('Obama 2012 & Biden', str(PoliticianNameCleaver('2012, Obama/Biden').parse())) def test_alternate_punctuation(self): self.assertEqual('Charles W. Boustany, Jr.', str(PoliticianNameCleaver('Charles W. Boustany Jr.').parse())) @@ -303,28 +304,28 @@ def test_capitalizes_letter_after_hyphen(self): class TestUnicode(unittest.TestCase): def test_individual(self): - self.assertEqual(u'Tobias F\u00fcnke'.encode('utf-8'), + self.assertEqual(u'Tobias F\u00fcnke', str(IndividualNameCleaver(u'F\u00fcnke, Tobias').parse())) def test_politician(self): - self.assertEqual(u'Tobias F\u00fcnke'.encode('utf-8'), + self.assertEqual(u'Tobias F\u00fcnke', str(PoliticianNameCleaver(u'F\u00fcnke, Tobias').parse())) def test_politician_plus_metadata(self): - self.assertEqual(u'Tobias F\u00fcnke (D-CA)'.encode('utf-8'), + self.assertEqual(u'Tobias F\u00fcnke (D-CA)', str(PoliticianNameCleaver(u'F\u00fcnke, Tobias').parse().plus_metadata('D', 'CA'))) def test_politician_running_mates(self): - self.assertEqual(u'Tobias F\u00fcnke & Lindsay F\u00fcnke'.encode('utf-8'), + self.assertEqual(u'Tobias F\u00fcnke & Lindsay F\u00fcnke', str(PoliticianNameCleaver(u'F\u00fcnke, Tobias & F\u00fcnke, Lindsay').parse())) def test_running_mates_with_metadata(self): - self.assertEqual(u'Ted Strickland & Le\u00e9 Fischer (D-OH)'.encode('utf-8'), + self.assertEqual(u'Ted Strickland & Le\u00e9 Fischer (D-OH)', str(PoliticianNameCleaver(u'STRICKLAND, TED & FISCHER, LE\u00c9').parse().plus_metadata('D', 'OH'))) def test_organization(self): - self.assertEqual(u'\u00C6tna, Inc.'.encode('utf-8'), + self.assertEqual(u'\u00C6tna, Inc.', str(OrganizationNameCleaver(u'\u00C6tna, Inc.').parse())) From 955234ca49a995517df6a2126aab8b1ad53d7816 Mon Sep 17 00:00:00 2001 From: Bidhan Date: Thu, 18 May 2017 17:24:22 +0545 Subject: [PATCH 5/7] Fix imports --- name_cleaver/__init__.py | 2 +- name_cleaver/cleaver.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/name_cleaver/__init__.py b/name_cleaver/__init__.py index f2ef2ed..ec24fc8 100644 --- a/name_cleaver/__init__.py +++ b/name_cleaver/__init__.py @@ -1 +1 @@ -from cleaver import PoliticianNameCleaver, OrganizationNameCleaver, IndividualNameCleaver +from .cleaver import PoliticianNameCleaver, OrganizationNameCleaver, IndividualNameCleaver diff --git a/name_cleaver/cleaver.py b/name_cleaver/cleaver.py index b99baff..964c96c 100644 --- a/name_cleaver/cleaver.py +++ b/name_cleaver/cleaver.py @@ -1,8 +1,8 @@ import re -from exception import UnparseableNameException -from names import SUFFIX_RE, DEGREE_RE, PersonName, PoliticianName, RunningMatesNames, \ +from .exception import UnparseableNameException +from .names import SUFFIX_RE, DEGREE_RE, PersonName, PoliticianName, RunningMatesNames, \ OrganizationName -from nicknames import NICKNAMES +from .nicknames import NICKNAMES from builtins import str From e9584a318d2ed4f975e3b2578323f891a443790d Mon Sep 17 00:00:00 2001 From: Bidhan Date: Thu, 18 May 2017 17:41:05 +0545 Subject: [PATCH 6/7] Fix import in test --- name_cleaver/test_name_cleaver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/name_cleaver/test_name_cleaver.py b/name_cleaver/test_name_cleaver.py index 99ff78c..bd512fe 100644 --- a/name_cleaver/test_name_cleaver.py +++ b/name_cleaver/test_name_cleaver.py @@ -1,4 +1,4 @@ -from cleaver import PoliticianNameCleaver, OrganizationNameCleaver, \ +from .cleaver import PoliticianNameCleaver, OrganizationNameCleaver, \ IndividualNameCleaver, UnparseableNameException from builtins import str From 6f7bcf88355634a37a669c8dff7f95c0602f6394 Mon Sep 17 00:00:00 2001 From: Bidhan Date: Thu, 18 May 2017 18:05:05 +0545 Subject: [PATCH 7/7] Add future as dependency --- setup.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.py b/setup.py index e698442..2b14a7a 100644 --- a/setup.py +++ b/setup.py @@ -14,6 +14,9 @@ author_email='arowland@sunlightfoundation.com', url='http://github.com/sunlightlabs/name-cleaver/', packages=find_packages(), + install_requires=[ + 'future', + ], license='BSD License', platforms=["any"], classifiers=[