Skip to content

Commit

Permalink
Parse articles as unicode. Fixes bad offsets problem. (#166)
Browse files Browse the repository at this point in the history
  • Loading branch information
normangilmore authored Oct 25, 2017
1 parent 61ed3e0 commit ec960ea
Showing 1 changed file with 6 additions and 7 deletions.
13 changes: 6 additions & 7 deletions data/parse_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ def parse_document(fullpath, filename):
return parse_article(raw_text, filename)

def parse_article(raw_text, filename):

# Decode the UTF-8 bytes into unicode text, removing the Windows BOM if present
raw_text = raw_text.decode('utf-8-sig', errors='strict')
# extract info from the file name
article_number, city, state, periodical, periodical_code = parse_filename(filename)

Expand Down Expand Up @@ -144,10 +147,6 @@ def parse_header(raw_text):
lines = raw_text.splitlines()
header_rownum = 1
for i, line in enumerate(lines):
# Remove windows UTF-8 junk
if line.startswith(codecs.BOM_UTF8):
line = line[3:]

# Skip blank lines
if not line.strip():
continue
Expand Down Expand Up @@ -358,9 +357,9 @@ def merge_safe(a, b):
return reduce(merge_safe, bits_to_join, '')

def requires_separator(a, b):
    """Tell whether a separator is required between ``a`` and ``b``.

    Args:
        a: Text on the left-hand side of the join.
        b: Text on the right-hand side of the join.

    Returns:
        ``False`` when either side is empty, when ``a`` ends with a newline
        or U+009C, when ``b`` starts with a newline, comma or period, or
        when the first two characters of ``b`` are a double quote followed
        by a newline or a space. ``True`` otherwise.
    """
    # Nothing to separate when one side is empty; this also avoids the
    # IndexError that a[-1] / b[0] would raise on an empty string.
    if not a or not b:
        return False

    # Unicode literals, so comparisons are made against decoded text.
    no_separator_chars_left = (u'\n', u'\u009c')
    no_separator_chars_right = (u'\n', u',', u'.')
    no_separator_pair_right = (u'"\n', u'" ')

    return (a[-1] not in no_separator_chars_left
            and b[0] not in no_separator_chars_right
            and b[0:2] not in no_separator_pair_right)
Expand Down

0 comments on commit ec960ea

Please sign in to comment.