AsciiDammit.py

#!/usr/bin/python2
#coding=utf-8
"""ASCII, Dammit

Stupid library to turn MS chars (like smart quotes) and ISO-Latin
chars into ASCII, dammit. Will do plain text approximations, or more
accurate HTML representations. Can also be jiggered to just fix the
smart quotes and leave the rest of ISO-Latin alone. Now fixed to
handle unicode strings correctly

Sources:
 http://www.cs.tut.fi/~jkorpela/latin1/all.html
 http://www.webreference.com/html/reference/character/isolat1.html

1.0 Initial Release (2004-11-28)
1.1 Fixed handling of unicode strings (2012-01-15) - Tom Najdek

The author hereby irrevocably places this work in the public domain.
To the extent that this statement does not divest the copyright,
the copyright holder hereby grants irrevocably to every recipient
all rights in this work otherwise reserved under copyright.
"""

__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "$Revision: 1.3 $"
__date__ = "$Date: 2009/04/28 10:45:03 $"
__license__ = "Public domain"

import re
import string
import types

CHARS = { '\x80' : ('EUR', 'euro'),
          '\x81' : ' ',
          '\x82' : (',', 'sbquo'),
          '\x83' : ('f', 'fnof'),
          '\x84' : (',,', 'bdquo'),
          '\x85' : ('...', 'hellip'),
          '\x86' : ('+', 'dagger'),
          '\x87' : ('++', 'Dagger'),
          '\x88' : ('^', 'caret'),
          '\x89' : '%',
          '\x8A' : ('S', 'Scaron'),
          '\x8B' : ('<', 'lt;'),
          '\x8C' : ('OE', 'OElig'),
          '\x8D' : '?',
          '\x8E' : 'Z',
          '\x8F' : '?',
          '\x90' : '?',
          '\x91' : ("'", 'lsquo'),
          '\x92' : ("'", 'rsquo'),
          '\x93' : ('"', 'ldquo'),
          '\x94' : ('"', 'rdquo'),
          '\x95' : ('*', 'bull'),
          '\x96' : ('-', 'ndash'),
          '\x97' : ('--', 'mdash'),
          '\x98' : ('~', 'tilde'),
          '\x99' : ('(TM)', 'trade'),
          '\x9a' : ('s', 'scaron'),
          '\x9b' : ('>', 'gt'),
          '\x9c' : ('oe', 'oelig'),
          '\x9d' : '?',
          '\x9e' : 'z',
          '\x9f' : ('Y', 'Yuml'),
          '\xa0' : (' ', 'nbsp'),
          '\xa1' : ('!', 'iexcl'),
          '\xa2' : ('c', 'cent'),
          '\xa3' : ('GBP', 'pound'),
          '\xa4' : ('$', 'curren'), #This approximation is especially lame.
          '\xa5' : ('YEN', 'yen'),
          '\xa6' : ('|', 'brvbar'),
          '\xa7' : ('S', 'sect'),
          '\xa8' : ('..', 'uml'),
          '\xa9' : ('', 'copy'),
          '\xaa' : ('(th)', 'ordf'),
          '\xab' : ('<<', 'laquo'),
          '\xac' : ('!', 'not'),
          '\xad' : (' ', 'shy'),
          '\xae' : ('(R)', 'reg'),
          '\xaf' : ('-', 'macr'),
          '\xb0' : ('o', 'deg'),
          '\xb1' : ('+-', 'plusmm'),
          '\xb2' : ('2', 'sup2'),
          '\xb3' : ('3', 'sup3'),
          '\xb4' : ("'", 'acute'),
          '\xb5' : ('u', 'micro'),
          '\xb6' : ('P', 'para'),
          '\xb7' : ('*', 'middot'),
          '\xb8' : (',', 'cedil'),
          '\xb9' : ('1', 'sup1'),
          '\xba' : ('(th)', 'ordm'),
          '\xbb' : ('>>', 'raquo'),
          '\xbc' : ('1/4', 'frac14'),
          '\xbd' : ('1/2', 'frac12'),
          '\xbe' : ('3/4', 'frac34'),
          '\xbf' : ('?', 'iquest'),          
          '\xc0' : ('A', "Agrave"),
          '\xc1' : ('A', "Aacute"),
          '\xc2' : ('A', "Acirc"),
          '\xc3' : ('A', "Atilde"),
          '\xc4' : ('A', "Auml"),
          '\xc5' : ('A', "Aring"),
          '\xc6' : ('AE', "Aelig"),
          '\xc7' : ('C', "Ccedil"),
          '\xc8' : ('E', "Egrave"),
          '\xc9' : ('E', "Eacute"),
          '\xca' : ('E', "Ecirc"),
          '\xcb' : ('E', "Euml"),
          '\xcc' : ('I', "Igrave"),
          '\xcd' : ('I', "Iacute"),
          '\xce' : ('I', "Icirc"),
          '\xcf' : ('I', "Iuml"),
          '\xd0' : ('D', "Eth"),
          '\xd1' : ('N', "Ntilde"),
          '\xd2' : ('O', "Ograve"),
          '\xd3' : ('O', "Oacute"),
          '\xd4' : ('O', "Ocirc"),
          '\xd5' : ('O', "Otilde"),
          '\xd6' : ('O', "Ouml"),
          '\xd7' : ('*', "times"),
          '\xd8' : ('O', "Oslash"),
          '\xd9' : ('U', "Ugrave"),
          '\xda' : ('U', "Uacute"),
          '\xdb' : ('U', "Ucirc"),
          '\xdc' : ('U', "Uuml"),
          '\xdd' : ('Y', "Yacute"),
          '\xde' : ('b', "Thorn"),
          '\xdf' : ('B', "szlig"),
          '\xe0' : ('a', "agrave"),
          '\xe1' : ('a', "aacute"),
          '\xe2' : ('a', "acirc"),
          '\xe3' : ('a', "atilde"),
          '\xe4' : ('a', "auml"),
          '\xe5' : ('a', "aring"),
          '\xe6' : ('ae', "aelig"),
          '\xe7' : ('c', "ccedil"),
          '\xe8' : ('e', "egrave"),
          '\xe9' : ('e', "eacute"),
          '\xea' : ('e', "ecirc"),
          '\xeb' : ('e', "euml"),
          '\xec' : ('i', "igrave"),
          '\xed' : ('i', "iacute"),
          '\xee' : ('i', "icirc"),
          '\xef' : ('i', "iuml"),
          '\xf0' : ('o', "eth"),
          '\xf1' : ('n', "ntilde"),
          '\xf2' : ('o', "ograve"),
          '\xf3' : ('o', "oacute"),
          '\xf4' : ('o', "ocirc"),
          '\xf5' : ('o', "otilde"),
          '\xf6' : ('o', "ouml"),
          '\xf7' : ('/', "divide"),
          '\xf8' : ('o', "oslash"),
          '\xf9' : ('u', "ugrave"),
          '\xfa' : ('u', "uacute"),
          '\xfb' : ('u', "ucirc"),
          '\xfc' : ('u', "uuml"),
          '\xfd' : ('y', "yacute"),
          '\xfe' : ('b', "thorn"),
          '\xff' : ('y', "yuml"),
          }

def _makeRE(limit):
    """Returns a regular expression object that will match special characters
    up to the given limit."""
    return re.compile("([\x80-\\x%s])" % limit, re.M)
ALL = _makeRE('ff')
ONLY_WINDOWS = _makeRE('9f')

def _replHTML(match):
    "Replace the matched character with its HTML equivalent."
    return _repl(match, 1)
          
def _repl(match, html=0):
    "Replace the matched character with its HTML or ASCII equivalent."
    g = match.group(0)
    a = CHARS.get(g,g)
    if type(a) == types.TupleType:
        a = a[html]
        if html:
            a = '&' + a + ';'
    return a

def _dammit(t, html=0, fixWindowsOnly=0):
    "Turns ISO-Latin-1 into an ASCII representation, dammit."
    if(type(t) == unicode):
      t = t.encode('raw_unicode_escape')
    r = ALL
    if fixWindowsOnly:
        r = ONLY_WINDOWS
    m = _repl
    if html:
        m = _replHTML

    return re.sub(r, m, t)

def asciiDammit(t, fixWindowsOnly=0):
    "Turns ISO-Latin-1 into a plain ASCII approximation, dammit."
    return _dammit(t, 0, fixWindowsOnly)

def htmlDammit(t, fixWindowsOnly=0):
    "Turns ISO-Latin-1 into plain ASCII with HTML codes, dammit."
    return _dammit(t, 1, fixWindowsOnly=fixWindowsOnly)

def demoronise(t):
    """Helper method named in honor of the original smart quotes
    remover, The Demoroniser:

    http://www.fourmilab.ch/webtools/demoroniser/"""
    return asciiDammit(t, 1)

if __name__ == '__main__':
    french = '\x93Sacr\xe9 bleu!\x93'
    print "\nFirst we mangle some French."
    print asciiDammit(french)
    print htmlDammit(french)

    print "\nAnd now we fix the MS-quotes but leave the French alone."
    print demoronise(french)
    print htmlDammit(french, 1)

    print "\nLet's try some french in unicode"
    frenchu = u'sacré bleu'
    print asciiDammit(frenchu)

    dejavu = u'Déjà Vu'
    print "\nIt's usually a glitch in the Matrix. It happens when they change something."
    print asciiDammit(dejavu)