Skip to content

Commit

Permalink
fixed bug (hyphen at end of char class) in regexes.py
Browse files Browse the repository at this point in the history
  • Loading branch information
Hobson Lane committed Sep 3, 2018
1 parent a138a73 commit fd112fe
Show file tree
Hide file tree
Showing 58 changed files with 19 additions and 9 deletions.
Empty file modified .coveragerc
100644 → 100755
Empty file.
Empty file modified .gitignore
100644 → 100755
Empty file.
Empty file modified .pre-commit-config.yaml
100644 → 100755
Empty file.
Empty file modified .travis.yml
100644 → 100755
Empty file.
Empty file modified AUTHORS.rst
100644 → 100755
Empty file.
Empty file modified CHANGELOG.rst
100644 → 100755
Empty file.
Empty file modified CHANGES.rst
100644 → 100755
Empty file.
Empty file modified LICENSE.txt
100644 → 100755
Empty file.
Empty file modified README.rst
100644 → 100755
Empty file.
Empty file modified docs/Makefile
100644 → 100755
Empty file.
Empty file modified docs/_static/.gitignore
100644 → 100755
Empty file.
Empty file modified docs/authors.rst
100644 → 100755
Empty file.
Empty file modified docs/changelog.rst
100644 → 100755
Empty file.
Empty file modified docs/conf.py
100644 → 100755
Empty file.
Empty file modified docs/index.rst
100644 → 100755
Empty file.
Empty file modified docs/license.rst
100644 → 100755
Empty file.
Empty file modified pytest.ini
100644 → 100755
Empty file.
Empty file modified requirements.txt
100644 → 100755
Empty file.
Empty file modified setup.cfg
100644 → 100755
Empty file.
Empty file modified setup.py
100644 → 100755
Empty file.
Empty file modified src/pugnlp/__init__.py
100644 → 100755
Empty file.
Empty file modified src/pugnlp/charlist.py
100644 → 100755
Empty file.
Empty file modified src/pugnlp/constants.py
100644 → 100755
Empty file.
Empty file modified src/pugnlp/data.py
100644 → 100755
Empty file.
Empty file modified src/pugnlp/data/ascii-equivalents.csv
100644 → 100755
Empty file.
Empty file modified src/pugnlp/data/ascii-equivalents.ods
100644 → 100755
Empty file.
Empty file modified src/pugnlp/data/emojione-activities.txt
100644 → 100755
Empty file.
Empty file modified src/pugnlp/data/emojione-animals-and-nature.txt
100644 → 100755
Empty file.
Empty file modified src/pugnlp/data/emojione-flags.txt
100644 → 100755
Empty file.
Empty file modified src/pugnlp/data/emojione-food-and-drink.txt
100644 → 100755
Empty file.
Empty file modified src/pugnlp/data/emojione-objects.txt
100644 → 100755
Empty file.
Empty file modified src/pugnlp/data/emojione-smilies.txt
100644 → 100755
Empty file.
Empty file modified src/pugnlp/data/emojione-symbols.txt
100644 → 100755
Empty file.
Empty file modified src/pugnlp/data/emojione-travel-and-places.txt
100644 → 100755
Empty file.
Empty file modified src/pugnlp/data/emoticons-from-wikipedia.csv
100644 → 100755
Empty file.
Empty file modified src/pugnlp/data/emoticons-from-wikipedia.ods
100644 → 100755
Empty file.
Empty file modified src/pugnlp/data/goodreads-omniscient-books.txt
100644 → 100755
Empty file.
Empty file modified src/pugnlp/data/tlds-from-iana.csv
100644 → 100755
Empty file.
Empty file modified src/pugnlp/data/uri-schemes.csv
100644 → 100755
Empty file.
Empty file modified src/pugnlp/data/uri-schemes.xhtml.csv
100644 → 100755
Empty file.
Empty file modified src/pugnlp/data/wsj_pugnlp.detector_morse.Detector.json.gz
100644 → 100755
Empty file.
Empty file modified src/pugnlp/detector_morse.py
100644 → 100755
Empty file.
2 changes: 1 addition & 1 deletion src/pugnlp/futil.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,7 @@ def touch_p(filepath, times=None, mkdir=True):
def sudo_yield_file_lines(file_path='/etc/NetworkManager/system-connections/*'):
r"""Cat a file iterating/yielding one line at a time,
shell will exeucte: `sudo cat $file_path` so if your shell doesn't have sudo or cat, no joy
shell will execute: `sudo cat $file_path` so if your shell doesn't have sudo or cat, no joy
Input:
file_path(str): glob stars are fine
Expand Down
Empty file modified src/pugnlp/penn_treebank_tokenizer.py
100644 → 100755
Empty file.
Empty file modified src/pugnlp/plots.py
100644 → 100755
Empty file.
10 changes: 5 additions & 5 deletions src/pugnlp/regexes.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -156,18 +156,18 @@
nonphrase = re.compile(r"[^-\w\s/&']")
parenthetical_time = re.compile(r'([^(]*)\(\s*(\d+)\s*(?:min)?\s*\)([^(]*)', re.IGNORECASE)

fqdn = r'(\b[a-zA-Z0-9-.]+\b([.]' + r'|'.join(constants.tld_iana) + r'\b)\b)' # noqa
fqdn_popular = r'(\b[a-zA-Z0-9-.]+\b([.]' + r'|'.join(constants.tld_popular) + r'\b)\b)'
username = r'(\b[a-zA-Z0-9-.!#$%&*+-/=?^_`{|}~]+\b)'
fqdn = r'(\b[-.a-zA-Z0-9]+\b([.]' + r'|'.join(constants.tld_iana) + r'\b)\b)' # noqa
fqdn_popular = r'(\b[-.a-zA-Z0-9]+\b([.]' + r'|'.join(constants.tld_popular) + r'\b)\b)'
username = r'(\b[-.a-zA-Z0-9!#$%&*+/=?^_`{|}~]+\b)'

email = re.compile(r'(\b' + username + r'\b@\b' + fqdn + r'\b)')
email_popular = re.compile(r'(\b' + username + r'\b@\b' + fqdn_popular + r'\b)')

# TODO: unmatched surrounding symbols are accepted/consumed, likewise for multiple dots/ats
at = r'(([-@="_(\[{\|\s]+(at|At|AT)[-@="_)\]\}\|\s]+)|[@])'
dot = r'(([-.="_(\[{\|\s]+(dot|dt|Dot|DOT)[-.="_)\]\}\|\s]+)|[.])'
fqdn_obfuscated = r'(\b(([a-zA-Z0-9-]+' + dot + r'){1,7})(' + r'|'.join(constants.tld_iana) + r')\b)'
fqdn_popular_obfuscated = r'(\b(([a-zA-Z0-9-]+' + dot + r'){1,7})(' + r'|'.join(constants.tld_popular) + r')\b)'
fqdn_obfuscated = r'(\b(([-a-zA-Z0-9]+' + dot + r'){1,7})(' + r'|'.join(constants.tld_iana) + r')\b)'
fqdn_popular_obfuscated = r'(\b(([-a-zA-Z0-9]+' + dot + r'){1,7})(' + r'|'.join(constants.tld_popular) + r')\b)'
username_obfuscated = r'(([a-zA-Z0-9!#$%&*+/?^`~]+' + dot + r'?){1,7})'
email_obfuscated = re.compile(r'(\b' + username_obfuscated + at + fqdn_obfuscated + r'\b)')
email_popular_obfuscated = re.compile(r'(\b' + username_obfuscated + at + fqdn_popular_obfuscated + r'\b)')
Expand Down
Empty file modified src/pugnlp/scripts/__init__.py
100644 → 100755
Empty file.
Empty file modified src/pugnlp/scripts/bon_lsi.py
100644 → 100755
Empty file.
Empty file modified src/pugnlp/segmentation.py
100644 → 100755
Empty file.
Empty file modified src/pugnlp/skeleton.py
100644 → 100755
Empty file.
Empty file modified src/pugnlp/stats.py
100644 → 100755
Empty file.
Empty file modified src/pugnlp/tutil.py
100644 → 100755
Empty file.
16 changes: 13 additions & 3 deletions src/pugnlp/util.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -100,18 +100,28 @@ def remove_invalid_chars(str_or_seq, valid_regex=r'\w'):
return seq[0] if isinstance(str_or_seq, str) else seq


def clean_columns(columns, valid_regex=r'\w', lower=True):
def clean_columns(columns, valid_regex=r'\w', lower=True, max_len=32):
""" Ensure all column name strings are valid python variable/attribute names
>>> df = pd.DataFrame(np.zeros((2, 3)), columns=['WAT??', "Don't do th!s, way too long. ya-think????", 'ok-this123.456'])
>>> df.columns = clean_columns(df.columns, max_len=12)
>>> df.head()
   wat  dont_do_ths_  okthis123456
0  0.0           0.0           0.0
1  0.0           0.0           0.0
"""
rettype = None
if isinstance(columns, str):
rettype = type(columns)
columns = [columns]

columns = [c.strip() for c in columns]
# # unneccessary because these are invalid characters removed below
# # unnecessary because these are invalid characters removed below
# columns = [(c[1:-1] if c[0] in '\'"' and c[-1] == c[0] else c) for c in columns]
# columns = [(c[1:-1] if c[0] in '{([<' and c[-1] in '})]>' else c) for c in columns]
columns = [re.sub('\s', '_', c).lower() for c in columns]
columns = [re.sub('\s+', '_', c).lower() for c in columns]
columns = remove_invalid_chars(columns, valid_regex=r'\w')
columns = [c[:max_len] for c in columns]
columns = np.array(columns) if rettype is None else rettype(columns[0])
return columns

Expand Down
Empty file modified test-requirements.txt
100644 → 100755
Empty file.
Empty file modified tests/conftest.py
100644 → 100755
Empty file.
Empty file modified tests/test_skeleton.py
100644 → 100755
Empty file.
Empty file modified tests/travis_install.sh
100644 → 100755
Empty file.
Empty file modified tox.ini
100644 → 100755
Empty file.

0 comments on commit fd112fe

Please sign in to comment.