Extract text and images from PDF using iText5

I need help extracting text and images from a PDF file with display or links to images in the extracted text using the Java iText5 library. If iText5 is the wrong tool for this, please advise me by recommending another Java library that performs the same function.

This is what I have done so far:

// ---- Iconverter.java ----
import java.io.IOException;

import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Paragraph;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;

/**
 * Extracts the text of every page of a PDF and saves the images found on
 * those pages as separate files, using the iText 5 parser API.
 */
public class Iconverter {

    /**
     * Page count of the PDF most recently opened by {@link #main(String[])}.
     * Kept for backward compatibility; {@link #extractImages(String)} no
     * longer depends on it.
     */
    static int PAGE_NUMBER;

    /**
     * Format pattern for the extracted image files. The first {@code %s} is
     * an identifier of the image, the second one the file extension.
     */
    public static final String RESULT = "/home/sarah/Java for Dummies 4th Edition/Img%s.%s";

    /**
     * Extracts the text of all pages, saves all images, then prints the
     * collected text to standard output.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String pdfName = "/home/sarah/Java for Dummies 4th Edition.pdf";
        // Accumulate the text of all pages; assigning to a single String in
        // the loop would keep only the last page.
        StringBuilder docText = new StringBuilder();
        try {
            PdfReader reader = new PdfReader(pdfName);
            try {
                PAGE_NUMBER = reader.getNumberOfPages();
                for (int i = 1; i <= PAGE_NUMBER; i++) {
                    docText.append(PdfTextExtractor.getTextFromPage(reader, i)).append('\n');
                }
            } finally {
                reader.close();
            }
            extractImages(pdfName);
            System.out.println(docText);
        } catch (IOException | DocumentException e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses a PDF and writes every image found on its pages to a file whose
     * name is built from {@link #RESULT}.
     *
     * @param filename path of the source PDF
     * @throws IOException if the PDF cannot be read or parsed
     * @throws DocumentException declared for backward compatibility
     */
    public static void extractImages(String filename) throws IOException, DocumentException {
        PdfReader reader = new PdfReader(filename);
        try {
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            MyImageRenderListener listener = new MyImageRenderListener(RESULT);
            // Ask the reader for the page count instead of relying on the
            // static PAGE_NUMBER, which is 0 unless main() ran first.
            int pageCount = reader.getNumberOfPages();
            for (int i = 1; i <= pageCount; i++) {
                parser.processContent(i, listener);
            }
        } finally {
            reader.close();
        }
    }
}

// ---- MyImageRenderListener.java ----
import java.awt.image.BufferedImage;
import java.io.FileOutputStream;
import java.io.IOException;

import javax.imageio.ImageIO;

import com.itextpdf.text.pdf.PdfIndirectReference;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfObject;
import com.itextpdf.text.pdf.parser.ImageRenderInfo;
import com.itextpdf.text.pdf.parser.PdfImageObject;
import com.itextpdf.text.pdf.parser.RenderListener;
import com.itextpdf.text.pdf.parser.TextRenderInfo;

/**
 * A {@link RenderListener} that ignores text and writes every image it is
 * notified about to a file.
 */
public class MyImageRenderListener implements RenderListener {

    /**
     * Format pattern for the output file names; receives an image identifier
     * and a file extension, in that order.
     */
    protected String path = "";

    /** Counter used to name images that have no indirect reference. */
    private int unreferencedImageCount;

    /**
     * Creates a RenderListener that will look for images.
     *
     * @param path format pattern with two {@code %s} placeholders
     *             (image identifier, file extension)
     */
    public MyImageRenderListener(String path) {
        this.path = path;
    }

    /**
     * @see com.itextpdf.text.pdf.parser.RenderListener#beginTextBlock()
     */
    @Override
    public void beginTextBlock() {
    }

    /**
     * @see com.itextpdf.text.pdf.parser.RenderListener#endTextBlock()
     */
    @Override
    public void endTextBlock() {
    }

    /**
     * Saves the rendered image. JPEG and JPEG 2000 streams are written as
     * they are stored in the PDF, JBIG2 images are skipped, and everything
     * else is re-encoded as PNG. An I/O failure on one image is reported and
     * does not stop the processing of the remaining images.
     *
     * @see com.itextpdf.text.pdf.parser.RenderListener#renderImage(
     *     com.itextpdf.text.pdf.parser.ImageRenderInfo)
     */
    @Override
    public void renderImage(ImageRenderInfo renderInfo) {
        try {
            PdfImageObject image = renderInfo.getImage();
            if (image == null) {
                // Defensive: nothing to save.
                return;
            }
            String id = imageId(renderInfo);
            // Do not cast to PdfName: /Filter may also be an array of names.
            // A non-name filter simply falls through to the PNG branch.
            PdfObject filter = image.get(PdfName.FILTER);
            if (PdfName.DCTDECODE.equals(filter)) {
                writeBytes(String.format(path, id, "JPG"), image.getStreamBytes());
            } else if (PdfName.JPXDECODE.equals(filter)) {
                writeBytes(String.format(path, id, "jp2"), image.getStreamBytes());
            } else if (PdfName.JBIG2DECODE.equals(filter)) {
                // ignore: filter not supported.
            } else {
                BufferedImage awtimage = image.getBufferedImage();
                if (awtimage != null) {
                    try (FileOutputStream os = new FileOutputStream(String.format(path, id, "png"))) {
                        ImageIO.write(awtimage, "png", os);
                    }
                }
            }
        } catch (IOException e) {
            // Best effort: report the failure and continue with the next image.
            e.printStackTrace();
        }
    }

    /**
     * @see com.itextpdf.text.pdf.parser.RenderListener#renderText(
     *     com.itextpdf.text.pdf.parser.TextRenderInfo)
     */
    @Override
    public void renderText(TextRenderInfo renderInfo) {
    }

    /** Returns the object number of the image, or a generated name if it has no reference. */
    private String imageId(ImageRenderInfo renderInfo) {
        PdfIndirectReference ref = renderInfo.getRef();
        if (ref != null) {
            return String.valueOf(ref.getNumber());
        }
        // NOTE(review): presumably this is the inline-image case -- confirm.
        unreferencedImageCount++;
        return "inline" + unreferencedImageCount;
    }

    /** Writes the given bytes to the named file, closing the stream even on failure. */
    private static void writeBytes(String filename, byte[] data) throws IOException {
        try (FileOutputStream os = new FileOutputStream(filename)) {
            os.write(data);
        }
    }
}
+6
source share
1 answer

As an alternative to iText, Apache PDFBox can help you.

Take a look at these classes:

org.apache.pdfbox.ExtractImages

org.apache.pdfbox.ExtractText

0
source

Source: https://habr.com/ru/post/904894/


All Articles