Here is a script for tolerant unescaping of HTML character references in web pages. It does assume the references are terminated with a semicolon, e.g. &deg; as in 'Preheat oven to 350&deg; F':
from htmlentitydefs import name2codepoint

# Get the whitespace characters
nums_dict = {0: ' ', 1: '\t', 2: '\r', 3: '\n'}
chars_dict = dict((x, y) for y, x in nums_dict.items())
nums_dict2XML = {0: '&#32;', 1: '&#9;', 2: '&#13;', 3: '&#10;'}
chars_dict2XML = dict((nums_dict[i], nums_dict2XML[i]) for i in nums_dict2XML)

s = '1234567890ABCDEF'
hex_dict = {}
for i in s:
    hex_dict[i.lower()] = None
    hex_dict[i.upper()] = None
del s

def is_hex(s):
    if not s:
        return False
    for i in s:
        if i not in hex_dict:
            return False
    return True

class Unescape:
    def __init__(self, s, ignore_whitespace=False):
        # Converts HTML character references into a unicode string
        # to allow manipulation
        self.s = s
        self.ignore_whitespace = ignore_whitespace
        self.lst = self.process(ignore_whitespace)

    def process(self, ignore_whitespace):
        def get_char(c):
            if ignore_whitespace:
                return c
            else:
                if c in chars_dict:
                    return chars_dict[c]
                else:
                    return c

        r = []
        lst = self.s.split('&')
        xx = 0
        yy = 0
        for item in lst:
            if xx:
                split = item.split(';')
                if split[0].lower() in name2codepoint:
                    # A named character reference, e.g. '&amp;'
                    a = unichr(name2codepoint[split[0].lower()])
                    r.append(get_char(a))  # TOKEN CHECK?
                    r.append(';'.join(split[1:]))
                elif split[0] and split[0][0] == '#' and split[0][1:].isdigit():
                    # A decimal character reference, e.g. '&#52;'
                    a = unichr(int(split[0][1:]))
                    r.append(get_char(a))
                    r.append(';'.join(split[1:]))
                elif split[0] and split[0][0] == '#' and split[0][1:2].lower() == 'x' and is_hex(split[0][2:]):
                    # A hexadecimal character reference, e.g. '&#x34;'
                    a = unichr(int(split[0][2:].lower(), 16))  # Hex -> base 16
                    r.append(get_char(a))
                    r.append(';'.join(split[1:]))
                else:
                    r.append('&%s' % ';'.join(split))
            else:
                r.append(item)
            xx += 1
            yy += len(r[-1])
        return r

    def get_value(self):
        # Convert back into HTML, preserving whitespace
        # if self.ignore_whitespace is `False`
        r = []
        for i in self.lst:
            if type(i) == int:
                r.append(nums_dict2XML[i])
            else:
                r.append(i)
        return ''.join(r)

def unescape(s):
    # Get the string value from escaped HTML `s`, ignoring
    # explicit whitespace like tabs/spaces etc
    inst = Unescape(s, ignore_whitespace=True)
    return ''.join(inst.lst)

if __name__ == '__main__':
    print unescape('Preheat oven to 350&deg; F')
    print unescape('Welcome to Lorem Ipsum Inc&reg;')
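As a short usage sketch (not part of the original answer), the Unescape class above can also round-trip whitespace: with ignore_whitespace=False, whitespace decoded from a reference is stored as a token, and get_value() re-emits it as a numeric character reference:

# Illustrative usage of the Unescape class defined above
inst = Unescape('Tab:&#9;end', ignore_whitespace=False)
print repr(unescape('Tab:&#9;end'))  # u'Tab:\tend' -- tab fully decoded
print inst.get_value()               # 'Tab:&#9;end' -- whitespace kept as a reference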
EDIT: Here's a much simpler solution that just replaces named character references with the corresponding characters; note that it does not handle numeric references such as &#xx;:
from htmlentitydefs import name2codepoint

def unescape(s):
    for name in name2codepoint:
        s = s.replace('&%s;' % name, unichr(name2codepoint[name]))
    return s

print unescape('Preheat oven to 350&deg; F')
print unescape('Welcome to Lorem Ipsum Inc&reg;')
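If numeric references also need to be covered, here is a minimal sketch along the same lines (not from the original answer; the regex and the unescape_all name are my own), combining the name2codepoint lookup with decimal and hexadecimal forms:

import re
from htmlentitydefs import name2codepoint

def unescape_all(s):
    # Sketch only: resolves &name;, &#NNN; and &#xHH; references.
    def repl(m):
        ref = m.group(1)
        if ref.startswith('#'):
            num = ref[1:]
            base = 16 if num[:1].lower() == 'x' else 10
            try:
                return unichr(int(num.lstrip('xX'), base))
            except ValueError:
                return m.group(0)  # leave malformed references untouched
        if ref.lower() in name2codepoint:  # named reference, e.g. &deg;
            return unichr(name2codepoint[ref.lower()])
        return m.group(0)  # unknown name: keep the original text
    return re.sub(r'&([#\w]+);', repl, s)

print unescape_all('Preheat oven to 350&#176; F')     # decimal reference
print unescape_all('Welcome to Lorem Ipsum Inc&reg;')  # named reference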