sunlightlabs · bidhan-a · Aug 20, 2014 · May 18, 2017 · May 18, 2017 · May 18, 2017
diff --git a/.gitignore b/.gitignore
@@ -6,3 +6,7 @@ dist
 *.swn
 .noseids
 build/*
+.idea/
+venv.sh
+**/__pycache__/*
+virtualenvs/
diff --git a/name_cleaver/__init__.py b/name_cleaver/__init__.py
@@ -1 +1 @@
-from cleaver import PoliticianNameCleaver, OrganizationNameCleaver, IndividualNameCleaver
+from .cleaver import PoliticianNameCleaver, OrganizationNameCleaver, IndividualNameCleaver
diff --git a/name_cleaver/cleaver.py b/name_cleaver/cleaver.py
@@ -1,8 +1,9 @@
 import re
-from exception import UnparseableNameException
-from names import SUFFIX_RE, DEGREE_RE, PersonName, PoliticianName, RunningMatesNames, \
+from .exception import UnparseableNameException
+from .names import SUFFIX_RE, DEGREE_RE, PersonName, PoliticianName, RunningMatesNames, \
     OrganizationName
-from nicknames import NICKNAMES
+from .nicknames import NICKNAMES
+from builtins import str
 
 
 class BaseNameCleaver(object):
@@ -15,7 +16,7 @@ def cannot_parse(self, safe, e=None):
             return self.orig_str
         else:
             # uncomment for debugging
-            #if e:
+            # if e:
             #   print e
             raise UnparseableNameException(u"Couldn't parse name: {0}".format(self.name))
 
@@ -33,7 +34,7 @@ def parse(self, safe=False):
         if not self.orig_str:
             return ''
 
-        if not ' ' in self.name:
+        if ' ' not in self.name:
             self.name = self.get_object_class().new_from_tokens(self.name)
             return self.name.case_name_parts()
         else:
@@ -47,10 +48,10 @@ def parse(self, safe=False):
 
                 name = self.reverse_last_first(name)
                 self.name = self.convert_name_to_obj(name, nick, honorific, suffix)
-            except Exception, e:
+            except Exception as e:
                 return self.cannot_parse(safe, e)
             finally:
-                if (isinstance(self.name, self.object_class) and self.name.last):
+                if isinstance(self.name, self.object_class) and self.name.last:
                     return self.name.case_name_parts()
                 else:
                     return self.cannot_parse(safe)
@@ -135,7 +136,8 @@ def reverse_last_first(self, name):
     def convert_name_to_obj(self, name, nick, honorific, suffix):
         name = ' '.join([x.strip() for x in [name, nick, suffix, honorific] if x])
 
-        return self.get_object_class().new_from_tokens(*[x for x in re.split('\s+', name)], **{'allow_quoted_nicknames': True})
+        return self.get_object_class().new_from_tokens(*[x for x in re.split('\s+', name)],
+                                                       **{'allow_quoted_nicknames': True})
 
     @classmethod
     def name_processing_failed(cls, subject_name):
@@ -199,17 +201,18 @@ def parse(self, safe=False):
         if not self.orig_str:
             return ''
 
-        if not ' ' in self.name:
+        if ' ' not in self.name:
             self.name = self.get_object_class().new_from_tokens(self.name)
             return self.name.case_name_parts()
         else:
             try:
                 self.strip_party()
                 self.name = self.convert_name_to_obj(self.name)  # important for "last, first", and also running mates
-            except Exception, e:
+            except Exception as e:
                 return self.cannot_parse(safe, e)
             finally:
-                if ((isinstance(self.name, self.object_class) and self.name.last) or isinstance(self.name, RunningMatesNames)):
+                if (isinstance(self.name, self.object_class) and self.name.last) or isinstance(self.name,
+                                                                                               RunningMatesNames):
                     return self.name.case_name_parts()
                 else:
                     return self.cannot_parse(safe)
@@ -229,7 +232,7 @@ def convert_regular_name_to_obj(self, name):
         return self.get_object_class().new_from_tokens(*[x for x in re.split('\s+', name) if x])
 
     def convert_running_mates_names_to_obj(self, name):
-        return RunningMatesNames(*[self.convert_name_to_obj(x) for x in re.split(' [&/] ', name)])
+        return RunningMatesNames(*[self.convert_name_to_obj(x) for x in re.split('[&/]', name)])
 
 
 class OrganizationNameCleaver(BaseNameCleaver):
@@ -246,7 +249,7 @@ def parse(self, safe=False):
             self.name = self.name.strip()
 
             self.name = self.get_object_class().new(self.name)
-        except Exception, e:
+        except Exception as e:
             return self.cannot_parse(safe, e)
         finally:
             if isinstance(self.name, self.object_class):

diff --git a/name_cleaver/names.py b/name_cleaver/names.py
@@ -1,16 +1,19 @@
 import re
+from builtins import str
+from future.utils import python_2_unicode_compatible
 
 DEGREE_RE = 'j\.?d\.?|m\.?d\.?|ph\.?d\.?'
 SUFFIX_RE = '([js]r\.?|%s|[IVX]{2,})' % DEGREE_RE
 
+
 class Name(object):
     scottish_re = r'(?i)\b(?P<mc>ma?c)(?!hin)(?P<first_letter>\w)\w+'
 
     def primary_name_parts(self):
         raise NotImplementedError("Subclasses of Name must implement primary_name_parts.")
 
     def non_empty_primary_name_parts(self):
-        return ' '.join([ x for x in self.primary_name_parts() if x ])
+        return ' '.join([x for x in self.primary_name_parts() if x])
 
     def is_mixed_case(self):
         return re.search(r'[A-Z][a-z]', self.non_empty_primary_name_parts())
@@ -39,7 +42,7 @@ class OrganizationName(Name):
         'inst': 'Institute',
         'corp': 'Corporation',
         'co': 'Company',
-        'fedn' : 'Federation',
+        'fedn': 'Federation',
         'fed': 'Federal',
         'fzco': 'Company',
         'usa': 'USA',
@@ -65,7 +68,7 @@ class OrganizationName(Name):
 
     name = None
 
-    #suffix = None
+    # suffix = None
 
     def new(self, name):
         self.name = name
@@ -77,32 +80,30 @@ def case_name_parts(self):
             self.name = self.uppercase_the_scots(self.name)
 
             if re.match(r'(?i)^\w*PAC$', self.name):
-                self.name = self.name.upper() # if there's only one word that ends in PAC, make the whole thing uppercase
+                # if the entire name is a single word ending in PAC, make the whole thing uppercase
+                self.name = self.name.upper()
             else:
-                self.name = re.sub(r'(?i)\bpac\b', 'PAC', self.name) # otherwise just uppercase the PAC part
+                self.name = re.sub(r'(?i)\bpac\b', 'PAC', self.name)  # otherwise just uppercase the PAC part
 
             self.name = self.uppercase_the_scots(self.name)
             self.name = self.fix_case_for_possessives(self.name)
 
         return self
 
     def primary_name_parts(self):
-        return [ self.without_extra_phrases() ]
-
-    def __unicode__(self):
-        return unicode(self.name)
+        return [self.without_extra_phrases()]
 
     def __str__(self):
-        return unicode(self.name).encode('utf-8')
+        return self.name
 
     def without_extra_phrases(self):
-        """Removes parenthethical and dashed phrases"""
+        """Removes parenthetical and dashed phrases"""
         # the last parenthesis is optional, because sometimes they are truncated
         name = re.sub(r'\s*\([^)]*\)?\s*$', '', self.name)
         name = re.sub(r'(?i)\s* formerly.*$', '', name)
         name = re.sub(r'(?i)\s*and its affiliates$', '', name)
         name = re.sub(r'\bet al\b', '', name)
-        
+
         # in some datasets, the name of an organization is followed by a hyphen and an abbreviated name, or a specific
         # department or geographic subdivision; we want to remove this extraneous stuff without breaking names like
         # Wal-Mart or Williams-Sonoma
@@ -111,10 +112,12 @@ def without_extra_phrases(self):
         if "-" in name:
             hyphen_parts = name.rsplit("-", 1)
             # if the part after the hyphen is shorter than the part before,
-            # AND isn't either a number (often occurs in Union names) or a single letter (e.g., Tech-X),
             # AND the hyphen is preceded by either whitespace or at least four characters,
+            # AND the part after the hyphen is neither a number (often occurs in Union names) nor a single letter (e.g., Tech-X),
             # discard the hyphen and whatever follows
-            if len(hyphen_parts[1]) < len(hyphen_parts[0]) and re.search(r'(\w{4,}|\s+)$', hyphen_parts[0]) and not re.match(r'^([a-zA-Z]|[0-9]+)$', hyphen_parts[1]):
+            if len(hyphen_parts[1]) < len(hyphen_parts[0]) \
+                    and re.search(r'^(\s+)|^(\w{0,4})$', hyphen_parts[1]) \
+                    and not re.match(r'^([a-zA-Z]|[0-9]+)$', hyphen_parts[1]):
                 name = hyphen_parts[0].strip()
 
         return name
@@ -128,8 +131,8 @@ def expand(self):
 
     def kernel(self):
         """ The 'kernel' is an attempt to get at just the most pithy words in the name """
-        stop_words = [ y.lower() for y in self.abbreviations.values() + self.filler_words ]
-        kernel = ' '.join([ x for x in self.expand().split() if x.lower() not in stop_words ])
+        stop_words = [y.lower() for y in list(self.abbreviations.values()) + self.filler_words]
+        kernel = ' '.join([x for x in self.expand().split() if x.lower() not in stop_words])
 
         # this is a hack to get around the fact that this is the only two-word phrase we want to block
         # amongst our stop words. if we end up with more, we may need a better way to do this
@@ -144,6 +147,7 @@ def crp_style_firm_name(self, with_et_al=True):
             return ', '.join(self.kernel().split()[0:2])
 
 
+@python_2_unicode_compatible
 class PersonName(Name):
     honorific = None
     first = None
@@ -195,9 +199,9 @@ def new_from_tokens(self, *args, **kwargs):
         """
 
         if kwargs.get('allow_quoted_nicknames'):
-            args = [ x.strip() for x in args if not re.match(r'^[(]', x) ]
+            args = [x.strip() for x in args if not re.match(r'^[(]', x)]
         else:
-            args = [ x.strip() for x in args if not re.match(r'^[("]', x) ]
+            args = [x.strip() for x in args if not re.match(r'^[("]', x)]
 
         if len(args) > 2:
             self.detect_and_fix_two_part_surname(args)
@@ -261,17 +265,14 @@ def detect_and_fix_two_part_surname(self, args):
         i = 0
         while i < len(args) - 1:
             if args[i].lower() in self.family_name_prefixes:
-                args[i] = ' '.join(args[i:i+2])
-                del(args[i+1])
+                args[i] = ' '.join(args[i:i + 2])
+                del (args[i + 1])
                 break
             else:
                 i += 1
 
-    def __unicode__(self):
-        return unicode(self.name_str())
-
     def __str__(self):
-        return unicode(self.name_str()).encode('utf-8')
+        return self.name_str()
 
     def name_str(self):
         return ' '.join([x.strip() for x in [
@@ -320,25 +321,27 @@ def is_only_initials(self, name_part):
     def capitalize_and_punctuate_initials(self, name_part):
         if self.is_only_initials(name_part):
             if '.' not in name_part:
-                return ''.join([ '{0}.'.format(x.upper()) for x in name_part])
+                return ''.join(['{0}.'.format(x.upper()) for x in name_part])
             else:
                 return name_part
         else:
             return name_part
 
     def primary_name_parts(self, include_middle=False):
         if include_middle:
-            return [ self.first, self.middle, self.last ]
+            return [self.first, self.middle, self.last]
         else:
-            return [ self.first, self.last ]
+            return [self.first, self.last]
 
     def as_dict(self):
-        return { 'first': self.first, 'middle': self.middle, 'last': self.last, 'honorific': self.honorific, 'suffix': self.suffix }
+        return {'first': self.first, 'middle': self.middle, 'last': self.last, 'honorific': self.honorific,
+                'suffix': self.suffix}
 
     def __repr__(self):
         return self.as_dict()
 
 
+@python_2_unicode_compatible
 class PoliticalMetadata(object):
     party = None
     state = None
@@ -351,30 +354,30 @@ def plus_metadata(self, party, state):
 
     def __str__(self):
         if self.party or self.state:
-            party_state = u"-".join([ x for x in [self.party, self.state] if x ]) # because presidential candidates are listed without a state
-            return unicode(u"{0} ({1})".format(unicode(self.name_str()), party_state)).encode('utf-8')
+            party_state = u"-".join([x for x in [self.party, self.state] if
+                                     x])  # because presidential candidates are listed without a state
+            return u"{0} ({1})".format(self.name_str(), party_state)
         else:
-            return unicode(self.name_str()).encode('utf-8')
+            return self.name_str()
 
 
 class PoliticianName(PoliticalMetadata, PersonName):
     pass
 
 
 class RunningMatesNames(PoliticalMetadata):
-
     def __init__(self, mate1, mate2):
         self.mate1 = mate1
         self.mate2 = mate2
 
     def name_str(self):
-        return u' & '.join([unicode(self.mate1), unicode(self.mate2)])
+        return u' & '.join([str(self.mate1), str(self.mate2)])
 
     def __repr__(self):
         return self.__str__()
 
     def mates(self):
-        return [ self.mate1, self.mate2 ]
+        return [self.mate1, self.mate2]
 
     def is_mixed_case(self):
         for mate in self.mates():
@@ -388,5 +391,3 @@ def case_name_parts(self):
             mate.case_name_parts()
 
         return self
-
-