Extract title tags from plain text

I am working on a task to extract title tags from a given plain text (this is not an HTML DOM). The cases below show what needs to be extracted from the title tags:

Case 1:

<html>
<head>
           <title>Title of the document</title>
</head>
<body>
The content of the document......
</body>
</html>

Expected: Title of the document

Case 2:

<html>
<head>
           <title>Title of the document</title>
           <title>Continuing title</title>
</head>
<body>
The content of the document......
</body>
</html>

Expected: Title of the document Continuing title

Case 3 (nested title tags)

<html>
<head>
           <title>Title of the document
           <title>Continuing title</title></title>
</head>
<body>
The content of the document......
</body>
</html>

Expected: Title of the document Continuing title

I want to extract the title tags using a regular expression in JavaScript. The regex should work for all of the above cases.

Does anyone know how to do this? Please let me know. Thanks in advance.

+3
source share
2 answers

HTML regexen! , . , , regexen. , , , <title>/</title>, .

( 1: , , DOM, , . .)

, ? , . JavaScript, . JavaScript, , HTML, , , . JavaScript, , JavaScript.

: title HTML, . , , , , , . : , CDATA .. ( , .) , ! , Safari, Firefox , , Title of the document <title> Continuing title. , , , . ( !) - :

// Edit 2: Made this function case-insensitive where it needed to be.
// Edit 3: Used substring() instead of replace() to remove the extraneous
//         title tags and fixed the "not matching" case.
/**
 * Extracts the text of every <title>...</title> pair found in a string of
 * HTML-like text and joins the pieces with single spaces.
 *
 * The input is handled purely as a string (no DOM parsing):
 *   1. CDATA sections are replaced by their body with &, < and > escaped,
 *      so markup-like text inside CDATA cannot be mistaken for a tag.
 *   2. HTML comments are removed.
 *   3. Each <title>...</title> span (case-insensitive) is located; any
 *      nested <title> / </title> markers inside it are dropped and runs of
 *      whitespace, including line breaks, are collapsed to one space.
 *
 * @param {string} html - Text to scan for title tags.
 * @returns {string} The combined title text, or '' when no title is found.
 */
function getTitle(html) {
  var cleaned = html
    // [\s\S] rather than "." so CDATA sections and comments may span lines.
    .replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, function (_match, body) {
      return body.replace(/&/g, '&amp;')
                 .replace(/</g, '&lt;')
                 .replace(/>/g, '&gt;');
    })
    .replace(/<!--[\s\S]*?-->/g, '');

  // Lazy match: each span runs from an opening tag to the nearest closing tag,
  // and may cross line breaks.
  var matches = cleaned.match(/<title>[\s\S]*?<\/title>/ig) || [];

  return matches
    .map(function (t) {
      // 7 === '<title>'.length, 8 === '</title>'.length
      return t.substring(7, t.length - 8)
              // A nested opening tag ends up inside the span; remove it.
              .replace(/<\/?title>/ig, ' ')
              .replace(/\s+/g, ' ')
              .trim();
    })
    // Skip titles that were empty or whitespace-only.
    .filter(function (text) { return text !== ''; })
    .join(' ');
}

HTML, , , , . -, CDATA. <![CDATA[ ]]>. . ( ), - title. 3: , -, .match() null [], ; , . ( edit 3: regexen ) , , . , , . ( ), ( }) .match(/<title>(.+?)<\/title>/)[0]. , ( ), ( (, , ), , () <title>...</title> s) , , - . , .


1: , ; , DOM. , .

DOM JavaScript, , HTML title:

// Look up the document's <title> elements and take the text of the first
// one; titleText stays an empty string when there is none.
var titles = document.getElementsByTagName('title');
var titleText = '';
if (titles.length > 0) {
  titleText = titles[0].text;
}

, HTML, , (, , ), - . Firefox, Safari , , Title of the document <title> Continuing title. , , :

// Concatenate the text of every <title> element in the document.
// A single space is placed between consecutive titles so that two titles
// such as "Title of the document" and "Continuing title" do not run together.
// titleText stays an empty string when the document has no <title> element.
var titles    = document.getElementsByTagName('title');
var tlength   = titles.length;
var titleText = '';
for (var i = 0; i < tlength; ++i) {
  // Separator goes before every entry except the first.
  titleText += (i > 0 ? ' ' : '') + titles[i].text;
}

, <title>, , , , . , <title> , - HTML, , replace, . <title>

// Edit 2: Case-insensitivity
// Take the text of the first <title> element and remove any literal
// "<title>" marker (in any letter case) that appears inside that text.
// titleText stays an empty string when there is no <title> element.
var titles = document.getElementsByTagName('title');
var titleText = '';
if (titles.length > 0) {
  titleText = titles[0].text.replace(/<title>/ig,'');
}

<title>

// Edit 2: Case-insensitivity
// Concatenate the text of every <title> element in the document, removing
// any literal "<title>" marker (in any letter case) found inside each text.
// A single space is placed between consecutive titles so they do not run
// together. titleText stays an empty string when there is no <title> element.
var titles    = document.getElementsByTagName('title');
var tlength   = titles.length;
var titleText = '';
for (var i = 0; i < tlength; ++i) {
  // Separator goes before every entry except the first.
  titleText += (i > 0 ? ' ' : '') + titles[i].text.replace(/<title>/ig, '');
}

<title> , ; , , , . , . , ( ), HTML.

+2

"-HTML". HTML:

/**
 * Extracts the title text from a string of HTML-like text using a regular
 * expression (no DOM parsing).
 *
 * Everything between the first opening <title> and the last closing
 * </title> is captured, so consecutive and nested title tags are both
 * covered. Tag markers inside the captured text are removed and runs of
 * whitespace, including line breaks, are collapsed to a single space.
 *
 * @param {string} text - Text to scan for title tags.
 * @returns {string|undefined} The combined title text, or undefined when no
 *   title pair is found or the captured content is empty.
 */
function extractTitle(text) {
  // [\s\S] rather than "." so the capture can cross line breaks; the "i"
  // flag accepts <TITLE> as well as <title>.
  var m = /<title>([\s\S]*)<\/title>/i.exec(text);
  if (m && m[1]) {
    return m[1]
      .replace(/<\/?title>/ig, ' ')
      // Global flag: collapse every whitespace run, not just the first.
      .replace(/\s+/g, ' ')
      .trim();
  }
  return; // returns undefined
}
+1

Source: https://habr.com/ru/post/1748809/