BeautifulSoup replaceWith() method inserts escaped HTML; I need it unescaped

I have a Python method (thanks to this snippet) that accepts some HTML and wraps <a> tags around ONLY the unformatted links, using BeautifulSoup and Django's urlize:

from django.utils.html import urlize
from bs4 import BeautifulSoup

def html_urlize(self, text):
    """Wrap bare URLs in the HTML string ``text`` with ``<a>`` tags.

    The input is parsed with BeautifulSoup's ``html.parser``; every text node
    whose direct parent is not an ``<a>`` element is passed through Django's
    ``urlize`` and replaced with the result. ``self`` is not used.

    Returns the serialized document as a ``str``.

    NOTE: ``replaceWith`` is given a plain string here, so the replacement is
    inserted as a text node and the ``<a ...>`` markup produced by ``urlize``
    comes out entity-escaped (``&lt;a href=...&gt;``) in the returned string.
    """
    soup = BeautifulSoup(text, "html.parser")

    print(soup)  # debug output: the document as parsed, before any change

    # text=True selects the string nodes of the tree rather than tags.
    textNodes = soup.findAll(text=True)
    for textNode in textNodes:
        # Only the direct parent is inspected.
        if textNode.parent and getattr(textNode.parent, 'name') == 'a':
            continue  # skip already formatted links
        urlizedText = urlize(textNode)
        # The replacement is a str, not a parsed node, hence the escaping
        # described in the docstring.
        textNode.replaceWith(urlizedText)

    print(soup)  # debug output: the document after the replacements

    return str(soup)

Example input text (as the output of the first print statement):

this is a formatted link <a href="http://google.ca">http://google.ca</a>, this one is unformatted and should become formatted: http://google.ca

The resulting return text (as the output of the second print statement) is as follows:

this is a formatted link <a href="http://google.ca">http://google.ca</a>, this one is unformatted and should become formatted: &lt;a href="http://google.ca"&gt;http://google.ca&lt;/a&gt;

As you can see, the link does get wrapped, but the markup is HTML-escaped, so when I render it in the template with {{ my.html|safe }}, it shows up as literal text rather than as a link.

Does anyone know why this happens, or how to get urlize to produce unescaped HTML? Is there a setting for this somewhere, perhaps in django.utils.html?

Edit: it looks like the problem is in this line: textNode.replaceWith(urlizedText).

+4
2

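You can turn urlizedText into a new BeautifulSoup object, so that it is inserted as tags rather than as text (text gets escaped, as you would expect):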

from django.utils.html import urlize
from bs4 import BeautifulSoup

def html_urlize(self, text):
    """Wrap bare URLs in the HTML string ``text`` with ``<a>`` tags.

    The input is parsed with BeautifulSoup's ``html.parser``. Each plain text
    node is passed through Django's ``urlize`` and the resulting markup is
    parsed again, so the generated ``<a>`` elements are inserted as real tags
    instead of escaped text. ``self`` is not used.

    Text is left untouched when it is:
      * not a plain ``NavigableString`` (comments, doctypes, CDATA, ...),
        which would otherwise be turned into visible page text;
      * anywhere inside an ``<a>`` element (already a link), or inside
        ``<script>`` / ``<style>``, whose content is not prose.

    Returns the serialized document as a ``str``.
    """
    # Function-scope import: only needs the bs4 package the file already uses.
    from bs4 import NavigableString

    soup = BeautifulSoup(text, "html.parser")

    print(soup)

    textNodes = soup.findAll(text=True)
    for textNode in textNodes:
        # Exact type check on purpose: Comment, Doctype, CData etc. are
        # subclasses of NavigableString and must not be rewritten.
        if type(textNode) is not NavigableString:
            continue
        # Look at every ancestor, not just the direct parent, so that
        # <a><b>http://...</b></a> does not get a nested link.
        if textNode.find_parent(['a', 'script', 'style']) is not None:
            continue
        # The text node holds *decoded* text (e.g. "&lt;b&gt;" is "<b>" here).
        # autoescape=True makes urlize re-escape everything that is not part
        # of the generated <a> tag, so re-parsing cannot turn user text into
        # markup.
        urlizedText = urlize(textNode, autoescape=True)
        textNode.replaceWith(BeautifulSoup(urlizedText, "html.parser"))

    print(soup)

    return str(soup)
+3

, BeautifulSoup node , HTML.

urlize (, , , ).

from django.utils.html import urlize
from bs4 import BeautifulSoup

def html_urlize(self, text):
    """Build a urlized string from the text nodes of the HTML ``text``.

    The input is parsed with ``html.parser`` and every text node contributes
    one fragment, in document order: the serialized parent element when that
    parent is an ``<a>`` tag, otherwise the node's text run through Django's
    ``urlize``. The fragments are concatenated and returned as a ``str``.
    ``self`` is not used.
    """
    parsed = BeautifulSoup(text, "html.parser")

    def render(node):
        # Text directly inside a link is emitted as the whole <a> element.
        if getattr(node.parent, 'name') == 'a':
            return str(node.parent)
        return urlize(node)

    return "".join([render(node) for node in parsed.findAll(text=True)])

Alternatively, if the input does not need any other processing, you can simply apply urlize as a template filter:

{{input_string|urlize}}
0

Source: https://habr.com/ru/post/1610192/


All Articles