From 99a5460b6af785494f3531bf27fe7555d111b344 Mon Sep 17 00:00:00 2001 From: Victor Trac Date: Wed, 20 Aug 2014 14:06:42 -0500 Subject: [PATCH] fixing bug with cleaving hyphens --- name_cleaver/names.py | 6 ++++-- name_cleaver/test_name_cleaver.py | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/name_cleaver/names.py b/name_cleaver/names.py index 5481453..a98ee5a 100644 --- a/name_cleaver/names.py +++ b/name_cleaver/names.py @@ -111,10 +111,12 @@ def without_extra_phrases(self): if "-" in name: hyphen_parts = name.rsplit("-", 1) # if the part after the hyphen is shorter than the part before, - # AND isn't either a number (often occurs in Union names) or a single letter (e.g., Tech-X), # AND the hyphen is preceded by either whitespace or at least four characters, + # AND isn't either a number (often occurs in Union names) or a single letter (e.g., Tech-X), # discard the hyphen and whatever follows - if len(hyphen_parts[1]) < len(hyphen_parts[0]) and re.search(r'(\w{4,}|\s+)$', hyphen_parts[0]) and not re.match(r'^([a-zA-Z]|[0-9]+)$', hyphen_parts[1]): + if len(hyphen_parts[1]) < len(hyphen_parts[0]) \ + and re.search(r'^(\s+)|^(\w{0,4})$', hyphen_parts[1]) \ + and not re.match(r'^([a-zA-Z]|[0-9]+)$', hyphen_parts[1]): name = hyphen_parts[0].strip() return name diff --git a/name_cleaver/test_name_cleaver.py b/name_cleaver/test_name_cleaver.py index 4a172a6..e38868a 100644 --- a/name_cleaver/test_name_cleaver.py +++ b/name_cleaver/test_name_cleaver.py @@ -144,13 +144,15 @@ def test_expand_with_two_tokens_to_expand(self): def test_dont_strip_after_hyphens_too_soon_in_a_name(self): self.assertEqual('US-Russia Business Council', OrganizationNameCleaver('US-Russia Business Council').parse().kernel()) self.assertEqual('Wal-Mart Stores', OrganizationNameCleaver('Wal-Mart Stores, Inc.').parse().kernel()) + self.assertEqual('Williams-Sonoma', OrganizationNameCleaver('Williams-Sonoma, Inc.').parse().kernel()) + self.assertEqual('Austin American-Statesman', OrganizationNameCleaver('Austin American-Statesman').parse().kernel()) # these were new after the hyphen rewrite self.assertEqual('Coca-Cola Company', OrganizationNameCleaver('Coca-Cola Co').parse().expand()) # used to return 'Coca' self.assertEqual('Rolls-Royce PLC', OrganizationNameCleaver('Rolls-Royce PLC').parse().expand()) # used to return 'Rolls' def test_drop_postname_hyphen_phrases(self): - self.assertEqual('Lawyers For Better Government', OrganizationNameCleaver('LAWYERS FOR BETTER GOVERNMENT-ILLINOIS').parse().without_extra_phrases()) + self.assertEqual('Lawyers For Better Government-Illinois', OrganizationNameCleaver('LAWYERS FOR BETTER GOVERNMENT-ILLINOIS').parse().without_extra_phrases()) self.assertEqual('Jobs Opportunity And Freedom Political Action Committee', OrganizationNameCleaver('JOBS OPPORTUNITY AND FREEDOM POLITICAL ACTION COMMITTEE - JOFPAC').parse().without_extra_phrases()) def test_kernel(self):