I want to extract useful text from an html document, and I used html-agility-pack for the same. Here is my code:
string convertedContent = HttpUtility.HtmlDecode(
ConvertHtml(HtmlAgilityPack.HtmlEntity.DeEntitize(htmlAsString))
);
ConvertHtml:
public string ConvertHtml(string html)
{
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(html);
StringWriter sw = new StringWriter();
ConvertTo(doc.DocumentNode, sw);
sw.Flush();
return sw.ToString();
}
ConvertTo:
public void ConvertTo(HtmlAgilityPack.HtmlNode node, TextWriter outText)
{
string html;
switch (node.NodeType)
{
case HtmlAgilityPack.HtmlNodeType.Comment:
break;
case HtmlAgilityPack.HtmlNodeType.Document:
foreach (HtmlNode subnode in node.ChildNodes)
{
ConvertTo(subnode, outText);
}
break;
case HtmlAgilityPack.HtmlNodeType.Text:
string parentName = node.ParentNode.Name;
if ((parentName == "script") || (parentName == "style"))
break;
html = ((HtmlTextNode)node).Text;
if (HtmlNode.IsOverlappedClosingElement(html))
break;
if (html.Trim().Length > 0)
{
outText.Write(HtmlEntity.DeEntitize(html) + " ");
}
break;
case HtmlAgilityPack.HtmlNodeType.Element:
switch (node.Name)
{
case "p":
outText.Write("\r\n");
break;
}
if (node.HasChildNodes)
{
foreach (HtmlNode subnode in node.ChildNodes)
{
ConvertTo(subnode, outText);
}
}
break;
}
}
Now in some cases when the html pages are distorted (for example, the next page - http://rareseeds.com/cart/products/Purple_of_Romagna_Artichoke-646-72.html has the wrong meta tag, for example <meta content="text/html; charset=uft-8" http-equiv="Content-Type">) [Note that "uft" instead utf] my code runs while I am trying to load an html document.
Can someone tell me how I can overcome these distorted html pages and still extract the corresponding text from the html document?
Thanks Kapil
Kapil