Here is an example of parsing HTML with the Jericho library. The identifiers and messages are in Spanish, but the code should still be easy to follow.
This code extracts the following features of an HTML page and builds a string from them (here the page was downloaded to a file beforehand, but you can also open it the way YoK's answer above mine does): the name (title) of the page, its description, its keywords, the text of the page, the links in the page, and the URLs of the page's images.
private void parserHtml()throws IOException{ MicrosoftTagTypes.register(); PHPTagTypes.register(); PHPTagTypes.PHP_SHORT.deregister(); // remove PHP short tags for this example otherwise they override processing instructions MasonTagTypes.register(); fl = new File(pathFich); if (!fl.isFile()){ Log.e("Lector","ERROR_LeerHtml: El fichero no existe."); pagina ="Error al abrir el fichero."; return; } Source source=new Source(new FileInputStream(fl)); // Call fullSequentialParse manually as most of the source will be parsed. source.fullSequentialParse(); //Titulo Element titleElement=source.getFirstElement(HTMLElementName.TITLE); setTitulo(( (titleElement != null) && (titleElement.getContent().toString() != "") ) ? titleElement.getContent().toString() : "(Sin T\u00edtulo)"); //Caracteristicas de la pagina encoding = source.getEncoding(); encodingSpecificactionInfo = source.getEncodingSpecificationInfo(); encodingPreliminaryInfo = source.getPreliminaryEncodingInfo(); //Descripcion //setDescripcion(getMetaValue(source,"description")); String aux = getMetaValue(source,"description"); setDescripcion(( (aux != null) && (aux != "") ) ? aux : "Sin Descripci\u00f3n" ); //Keywords de la pagina //setKeywords(getMetaValue(source,"keywords")); aux = getMetaValue(source,"keywords"); setKeywords(( (aux != null) && (aux != "") ) ? 
aux : "Sin Palabras Clave" ); //Links de la pagina List<Element> urls = source.getAllElements(HTMLElementName.A); for (Element linkElement : urls) { String href=linkElement.getAttributeValue("href"); if (href==null) continue; String etiqu = linkElement.getContent().getTextExtractor().toString(); links.addUrl(href, etiqu); } //Imagenes de la pagina List<Element> imgs = source.getAllElements(HTMLElementName.IMG); for (Element linkElement : imgs) { String src = linkElement.getAttributeValue("src"); if (src == null) continue; String alt = linkElement.getAttributeValue("alt"); if (alt == null) alt = "Sin Etiqueta"; imagenes.addUrl(src, alt); } //Obtenemos el texto de la p\u00e1gina y lo dividimos en frases para que no salga en una sola //linea. Para ello usamos el metodo subDivText setTextoPagina(subDivText(source.getTextExtractor().setIncludeAttributes(true).toString())); setTextoPaginaCompleto(textoPagina); pagina = "T\u00edtulo: " + titulo + "\n\nDescripci\u00f3n: " + descripcion + "\n\nPalabras Clave: " + keywords + "\n\nTexto: " + textoPagina + "\n\nLinks: " + links.toString() + "\n\nIm\u00e1genes: " +imagenes.toString(); /* pagina = "T\u00edtulo: " + titulo + "\n\nDescripci\u00f3n: " + descripcion + "\n\nPalabras Clave: " + keywords + "\n\nTexto: " + textoPagina;*/ return; }
source share