How to disable special characters from BeautifulSoup output?

I am facing issues with HTML entities such as &deg; and &reg;, which stand for the degree sign (as in 350° F) and the registered-trademark sign.

When I print a line that contains these entities, it produces the following output:

Preheat oven to 350&deg; F Welcome to Lorem Ipsum Inc&reg;

Is there a way to produce exact characters, not their codes? Please let me know.

+4
source share
4 answers
 # Beautiful Soup 3 (Python 2): passing convertEntities=BeautifulSoup.HTML_ENTITIES
 # makes the parser turn &deg; / &reg; into the characters themselves.
 # The leading "> " on the continuation lines is the shell's secondary prompt,
 # not part of the command; the last two lines are the program's output.
 $ python -c 'from BeautifulSoup import BeautifulSoup
 > print BeautifulSoup("""<html>Preheat oven to 350&deg; F
 > Welcome to Lorem Ipsum Inc&reg;""",
 > convertEntities=BeautifulSoup.HTML_ENTITIES).contents[0].string'
 Preheat oven to 350° F
 Welcome to Lorem Ipsum Inc®
+8
source

Here is a script that tolerantly decodes HTML character references (entities) found in web pages. It does assume that each reference is terminated by a semicolon, as in &deg; (for example, Preheat oven to 350&deg; F):

 # Tolerant decoder for HTML character references; runs on Python 2 and Python 3.
try:
    from htmlentitydefs import name2codepoint  # Python 2
except ImportError:
    from html.entities import name2codepoint  # Python 3

try:
    unichr  # Python 2 builtin
except NameError:
    unichr = chr  # Python 3: chr() already returns a text character

# Whitespace characters that `Unescape` can track as integer tokens so that
# `Unescape.get_value` is able to write them back as numeric references.
nums_dict = {0: ' ', 1: '\t', 2: '\r', 3: '\n'}
chars_dict = dict((char, num) for num, char in nums_dict.items())
nums_dict2XML = {0: '&#32;', 1: '&#09;', 2: '&#13;', 3: '&#10;'}
chars_dict2XML = dict((nums_dict[i], nums_dict2XML[i]) for i in nums_dict2XML)

# Keys are the hexadecimal digits in both cases; the values are unused.
hex_dict = dict.fromkeys('1234567890abcdefABCDEF')


def is_hex(s):
    """Return True if `s` is a non-empty string made only of hex digits."""
    return bool(s) and all(c in hex_dict for c in s)


class Unescape:
    """Split a string into literal text and decoded character references.

    After construction, `self.lst` holds the pieces in document order.
    Joining them yields the decoded text.  When `ignore_whitespace` is false,
    a space, tab, CR or LF that came from a character reference is stored as
    its integer key from `nums_dict` instead of as a character, which lets
    `get_value` emit it as a numeric reference again.
    """

    def __init__(self, s, ignore_whitespace=False):
        self.s = s
        self.ignore_whitespace = ignore_whitespace
        self.lst = self.process(ignore_whitespace)

    def process(self, ignore_whitespace):
        """Return the list of pieces for `self.s` (see the class docstring).

        Text that follows an ampersand but is not a recognisable reference is
        kept verbatim, ampersand included.
        """
        def get_char(c):
            # Replace decoded whitespace by its integer token unless the
            # caller asked for plain characters.
            if not ignore_whitespace and c in chars_dict:
                return chars_dict[c]
            return c

        def decode(ref):
            # Return the character named by `ref` (the text between '&' and
            # the first ';'), or None when it is not a valid reference.
            if ref in name2codepoint:
                # Exact match first: entity names are case-sensitive
                # ('Aacute' and 'aacute' are different characters).
                return unichr(name2codepoint[ref])
            lowered = ref.lower()
            if lowered in name2codepoint:
                # Tolerate wrongly-cased names such as '&AMP;'.
                return unichr(name2codepoint[lowered])
            if ref[:1] != '#':
                return None
            try:
                if ref[1:].isdigit():
                    # Decimal reference, e.g. '&#52;'
                    return unichr(int(ref[1:]))
                if ref[1:2].lower() == 'x' and is_hex(ref[2:]):
                    # Hexadecimal reference, e.g. '&#x34;'
                    return unichr(int(ref[2:], 16))
            except (ValueError, OverflowError):
                # Code point outside the valid range: leave the text as-is
                # instead of aborting the whole conversion.
                return None
            return None

        segments = self.s.split('&')
        # Everything before the first '&' is plain text.
        r = [segments[0]]
        for item in segments[1:]:
            split = item.split(';')
            char = decode(split[0])
            if char is None:
                r.append('&%s' % ';'.join(split))
            else:
                r.append(get_char(char))
                r.append(';'.join(split[1:]))
        return r

    def get_value(self):
        """Return the decoded text, with integer whitespace tokens written
        back as the numeric references listed in `nums_dict2XML`."""
        return ''.join(
            nums_dict2XML[i] if isinstance(i, int) else i for i in self.lst
        )


def unescape(s):
    """Return `s` with its character references replaced by characters.

    Whitespace references are decoded to plain whitespace characters.
    """
    return ''.join(Unescape(s, ignore_whitespace=True).lst)


if __name__ == '__main__':
    print(unescape('Preheat oven to 350&deg; F'))
    print(unescape('Welcome to Lorem Ipsum Inc&reg;'))

EDIT: Here's a simpler solution that replaces only named character references (such as &deg;) with the corresponding characters; it does not handle numeric &#xx; references:

 # Replace named HTML character references with their characters (Python 2 and 3).
import re

try:
    from htmlentitydefs import name2codepoint  # Python 2
except ImportError:
    from html.entities import name2codepoint  # Python 3

try:
    unichr  # Python 2 builtin
except NameError:
    unichr = chr  # Python 3: chr() already returns a text character

# A named reference: '&', a name made of letters/digits, then ';'.
_NAMED_REFERENCE = re.compile(r'&([A-Za-z][A-Za-z0-9]*);')


def unescape(s):
    """Return `s` with every known named reference replaced by its character.

    The string is scanned once, so text produced by one replacement is never
    decoded a second time ('&amp;deg;' becomes '&deg;', not the degree sign).
    Unknown names and numeric references are left untouched.
    """
    def replace(match):
        name = match.group(1)
        if name in name2codepoint:
            return unichr(name2codepoint[name])
        return match.group(0)

    return _NAMED_REFERENCE.sub(replace, s)


print(unescape('Preheat oven to 350&deg; F'))
print(unescape('Welcome to Lorem Ipsum Inc&reg;'))
+2
source

In Beautiful Soup 4:

 # Beautiful Soup 4 decodes character references such as &deg; and &reg; while parsing.
from bs4 import BeautifulSoup

my_text = """Preheat oven to 350&deg; F Welcome to Lorem Ipsum Inc&reg; """
# 'html.parser' selects the HTML parser from Python's standard library.
soup = BeautifulSoup(my_text, 'html.parser')
# Printing the soup shows the literal characters (see the result below).
print(soup)

Result:

 Preheat oven to 350° F Welcome to Lorem Ipsum Inc® 
+1
source

I think that somewhere the program emits &deg and &reg without the trailing semicolon. Try using "&deg" + ";" and "&reg" + ";" in your HTML file, if it really is an HTML file. Also, please explain the context.

0
source

Source: https://habr.com/ru/post/1310230/


All Articles