How to use libxml2 to parse dirty (malformed) HTML in C

The HTML may be dirty (malformed); for example, the parser reports "premature end of data in tag".

How can I do this? Thanks.

+4
source share
2 answers

Use the libxml2 HTML parser: it turns dirty HTML into a normalized tree. See `htmlDocPtr htmlParseFile(const char *filename, const char *encoding)`.

http://xmlsoft.org/html/libxml-HTMLparser.html

+9
source

I struggled a lot with this problem for lack of knowledge, so I am posting a complete demo program for parsing HTML with the libxml2 library.

 #include <stdio.h> #include <string.h> #include <stdlib.h> #include <libxml/HTMLparser.h> void traverse_dom_trees(xmlNode * a_node) { xmlNode *cur_node = NULL; if(NULL == a_node) { //printf("Invalid argument a_node %p\n", a_node); return; } for (cur_node = a_node; cur_node; cur_node = cur_node->next) { if (cur_node->type == XML_ELEMENT_NODE) { /* Check for if current node should be exclude or not */ printf("Node type: Text, name: %s\n", cur_node->name); } else if(cur_node->type == XML_TEXT_NODE) { /* Process here text node, It is available in cpStr :TODO: */ printf("node type: Text, node content: %s, content length %d\n", (char *)cur_node->content, strlen((char *)cur_node->content)); } traverse_dom_trees(cur_node->children); } } int main(int argc, char **argv) { htmlDocPtr doc; xmlNode *roo_element = NULL; if (argc != 2) { printf("\nInvalid argument\n"); return(1); } /* Macro to check API for match with the DLL we are using */ LIBXML_TEST_VERSION doc = htmlReadFile(argv[1], NULL, HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET); if (doc == NULL) { fprintf(stderr, "Document not parsed successfully.\n"); return 0; } roo_element = xmlDocGetRootElement(doc); if (roo_element == NULL) { fprintf(stderr, "empty document\n"); xmlFreeDoc(doc); return 0; } printf("Root Node is %s\n", roo_element->name); traverse_dom_trees(roo_element); xmlFreeDoc(doc); // free document xmlCleanupParser(); // Free globals return 0; } 
+4
source

Source: https://habr.com/ru/post/1402180/


All Articles