Extract text from HTML string using JavaScript

I am trying to get the inner text of an HTML string using a JS function (the string is passed as an argument). Here is the code:

// Attempts to collect the text found between the HTML tags of `value` and
// log it to the console.
// As written it always logs an empty string: `continue` jumps straight to the
// next iteration of the `for` loop, so the `while` that follows it is
// unreachable and content_holder is never appended to. The `===` comparison
// is not the cause. Even if the `while` were reached it would not terminate,
// because `i` is never advanced inside it.
function extractContent(value) { var content_holder = ""; for(var i=0;i<value.length;i++) { if(value.charAt(i) === '>') { continue; while(value.charAt(i) != '<') { content_holder += value.charAt(i); } } } console.log(content_holder); } extractContent("<p>Hello</p><a href='http://w3c.org'>W3C</a>"); 

The problem is that nothing is printed on the console (content_holder remains empty). I think the problem is caused by the operator ===.

+11
source share
7 answers

Create an element, save the HTML in it and get its textContent :

 /**
  * Returns the plain text contained in an HTML string.
  *
  * The markup is parsed inside a <template> element, whose content is inert:
  * scripts do not run and resources such as images are not loaded, so inline
  * handlers like onerror never fire. Assigning untrusted markup to the
  * innerHTML of an ordinary element can trigger such handlers even while the
  * element is detached from the page.
  *
  * @param {string} s - HTML markup to extract the text from.
  * @returns {string} The concatenated text of every node in the markup.
  */
 function extractContent(s) {
   const template = document.createElement('template');
   template.innerHTML = s;
   // template.content is a DocumentFragment; its textContent joins the text
   // of all descendants.
   return template.content.textContent;
 }

 alert(extractContent("<p>Hello</p><a href='http://w3c.org'>W3C</a>"));

Here is a version that allows you to have spaces between nodes, although you probably only want this for block level elements:

 /**
  * Returns the plain text contained in an HTML string, optionally separating
  * the text of individual elements with a single space.
  *
  * The markup is parsed inside an inert <template> element so that untrusted
  * input cannot run scripts or inline event handlers while being parsed.
  *
  * @param {string} s - HTML markup to extract the text from.
  * @param {boolean} [space] - When truthy, a space is added after the content
  *   of every element (block-level and inline alike).
  * @returns {string} The extracted text with runs of spaces collapsed to one.
  */
 function extractContent(s, space) {
   const template = document.createElement('template');
   template.innerHTML = s;
   const root = template.content;

   if (space) {
     const elements = root.querySelectorAll('*');
     for (let i = 0; i < elements.length; i++) {
       // Append a separate text node instead of assigning to textContent:
       // assigning textContent replaces all children of the element, which
       // would detach nested elements before they are visited and lose the
       // spaces between them.
       elements[i].appendChild(document.createTextNode(' '));
     }
   }

   return (root.textContent || '').replace(/ +/g, ' ');
 }

 console.log(extractContent("<p>Hello</p><a href='http://w3c.org'>W3C</a>. Nice to <em>see</em><strong><em>you!</em></strong>"));
 console.log(extractContent("<p>Hello</p><a href='http://w3c.org'>W3C</a>. Nice to <em>see</em><strong><em>you!</em></strong>", true));
+25
source

One-line version (more precisely, a single statement):

 /**
  * Parses `html` as a standalone "text/html" document with DOMParser and
  * returns the text content of that document's root element.
  *
  * @param {string} html - HTML markup to extract the text from.
  * @returns {string} The text content of the parsed document.
  */
 function extractContent(html) {
   const parser = new DOMParser();
   const parsedDocument = parser.parseFromString(html, "text/html");
   const rootElement = parsedDocument.documentElement;
   return rootElement.textContent;
 }
+16
source

Use this regex to remove the HTML tags and keep only the inner text of the HTML string.

For the example string it yields only HelloW3C; try it:

 // Strip every tag, including tags that span several lines. [\s\S] matches
 // any character at all, whereas "." skips \r, \u2028 and \u2029, so a
 // (?:.|\n) pattern would leave a tag containing a Windows line break (\r\n)
 // in place. The "m" flag only affects ^ and $, so it is not needed here.
 var content_holder = value.replace(/<[\s\S]*?>/g, '');
+2
source

Try the following:

 <!DOCTYPE html>
 <html>
 <body>
 <script type="text/javascript">
   /**
    * Returns the text content of an HTML string by parsing it inside a
    * detached <div>.
    *
    * @param {string} value - HTML markup to extract the text from.
    * @returns {string} The text content of the parsed markup.
    */
   function extractContent(value) {
     // The terminating semicolon is required here: without it the next
     // statement on the same line is a syntax error.
     var div = document.createElement('div');
     div.innerHTML = value;
     var text = div.textContent;
     return text;
   }

   window.onload = function () {
     alert(extractContent("<p>Hello</p><a href='http://w3c.org'>W3C</a>"));
   };
 </script>
 </body>
 </html>
+1
source

textContent is a very good way to achieve the desired result, but sometimes we do not want to involve the DOM. In that case, a simple solution is the following regular expression:

 // Remove everything that looks like a tag ("<", one or more characters other
 // than ">", then ">") and keep the remaining text. No DOM is involved.
 // Each statement is terminated explicitly so that the snippet stays valid
 // even when both statements end up on the same line.
 let htmlString = "<p>Hello</p><a href='http://w3c.org'>W3C</a>";
 let plainText = htmlString.replace(/<[^>]+>/g, '');
0
source

You can temporarily write it to a block-level element that is positioned outside the visible page. Something like this:

HTML:

 <div id="tmp" style="position:absolute;top:-400px;left:-400px;"> </div> 

JavaScript:

 <script type="text/javascript">
   /**
    * Writes the markup into the off-screen element with id "tmp", then logs
    * and returns the text of the whole markup.
    *
    * @param {string} value - HTML markup to extract the text from.
    * @returns {string} The text content of the parsed markup.
    */
   function extractContent(value) {
     var div = document.getElementById('tmp');
     div.innerHTML = value;
     // Read the text of the entire container; children[0].innerHTML would
     // only yield the markup inside the first element.
     var text = div.textContent;
     console.log(text);
     return text;
   }

   extractContent("<p>Hello</p><a href='http://w3c.org'>W3C</a>");
 </script>
-1
source

You need an array to store the values:

  /**
   * Collects every character of `value` that lies outside an HTML tag, logs
   * the resulting text and returns the collected characters.
   *
   * This is a plain character scanner, not an HTML parser: an unmatched "<"
   * in the text is treated as the start of a tag, and entities such as
   * "&amp;" are not decoded.
   *
   * @param {string} value - HTML markup to extract the text from.
   * @returns {string[]} The characters found outside of tags, in order.
   */
  function extractContent(value) {
    const contentHolder = [];
    let insideTag = false;

    for (const ch of value) {
      if (ch === '<') {
        insideTag = true;
      } else if (ch === '>' && insideTag) {
        // Only a ">" that closes an open tag is markup; a stray ">" in the
        // text falls through to the branch below and is kept.
        insideTag = false;
      } else if (!insideTag) {
        contentHolder.push(ch);
      }
    }

    console.log(contentHolder.join(''));
    return contentHolder;
  }

  extractContent("<p>Hello</p><a href='http://w3c.org'>W3C</a>");
-3
source

Source: https://habr.com/ru/post/983423/


All Articles