I need help extracting text and images from a PDF file using the Java iText 5 library, so that the extracted text either displays the images or contains links to them. If iText 5 is the wrong tool for this, please recommend another Java library that can do the same job.
This is what I have done so far:
import java.io.IOException;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import com.itextpdf.text.Paragraph;

/**
 * Extracts the text and the embedded images of a PDF file with iText 5.
 *
 * <p>Text is collected page by page through {@link PdfTextExtractor}; images are
 * handed to a {@code MyImageRenderListener}, which is given the {@link #RESULT}
 * pattern as its output path.
 */
public class Iconverter {

    /**
     * Page count of the PDF most recently read by {@link #main(String[])}.
     * Kept for backward compatibility; {@link #extractImages(String)} no longer
     * depends on it.
     */
    static int PAGE_NUMBER;

    /** Output path pattern passed to the image listener; it has two {@code %s} placeholders. */
    public static final String RESULT = "/home/sarah/Java for Dummies 4th Edition/Img%s.%s";

    /**
     * Extracts the text of every page, prints it to standard output, and then
     * extracts the images of the same PDF.
     *
     * <p>An earlier version also opened a {@code Document} that was never attached
     * to a writer, so it had no destination; that dead code has been removed.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String pdfName = "/home/sarah/Java for Dummies 4th Edition.pdf";
        try {
            String docText = extractText(pdfName);
            System.out.println(docText);
            extractImages(pdfName);
        } catch (Exception e) {
            // Top-level boundary of a command-line tool: report and exit normally.
            e.printStackTrace();
        }
    }

    /**
     * Returns the text of all pages of the given PDF, in page order, with a
     * newline appended after each page.
     */
    private static String extractText(String filename) throws IOException {
        PdfReader reader = new PdfReader(filename);
        try {
            PAGE_NUMBER = reader.getNumberOfPages();
            // Accumulate every page; assigning inside the loop would keep only the last one.
            StringBuilder text = new StringBuilder();
            for (int page = 1; page <= PAGE_NUMBER; page++) {
                text.append(PdfTextExtractor.getTextFromPage(reader, page)).append('\n');
            }
            return text.toString();
        } finally {
            reader.close();
        }
    }

    /**
     * Runs a {@code MyImageRenderListener} built from {@link #RESULT} over every
     * page of the given PDF.
     *
     * @param filename path of the PDF to read
     * @throws IOException if the PDF cannot be read or parsed
     * @throws DocumentException declared for backward compatibility with existing callers
     */
    public static void extractImages(String filename) throws IOException, DocumentException {
        PdfReader reader = new PdfReader(filename);
        try {
            // Ask this reader for its own page count so the method also works
            // when it is called without main() having run first.
            int pageCount = reader.getNumberOfPages();
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            MyImageRenderListener listener = new MyImageRenderListener(RESULT);
            for (int page = 1; page <= pageCount; page++) {
                parser.processContent(page, listener);
            }
        } finally {
            reader.close();
        }
    }
}

// Second listing (MyImageRenderListener); unchanged, continues on the following line.
import java.awt.image.BufferedImage; import java.io.FileOutputStream; import java.io.IOException; import javax.imageio.ImageIO; import com.itextpdf.text.pdf.PdfName; import com.itextpdf.text.pdf.parser.ImageRenderInfo; import com.itextpdf.text.pdf.parser.PdfImageObject; import com.itextpdf.text.pdf.parser.RenderListener; import com.itextpdf.text.pdf.parser.TextRenderInfo; public class MyImageRenderListener implements RenderListener { protected String path = ""; public MyImageRenderListener(String path) { this.path = path; } public void beginTextBlock() { } public void endTextBlock() { } public void renderImage(ImageRenderInfo renderInfo) { try { String filename; FileOutputStream os; PdfImageObject image = 
renderInfo.getImage(); PdfName filter = (PdfName)image.get(PdfName.FILTER); if (PdfName.DCTDECODE.equals(filter)) { filename = String.format(path, renderInfo.getRef().getNumber(), "JPG"); os = new FileOutputStream(filename); os.write(image.getStreamBytes()); os.flush(); os.close(); } else if (PdfName.JPXDECODE.equals(filter)) { filename = String.format(path, renderInfo.getRef().getNumber(), "jp2"); os = new FileOutputStream(filename); os.write(image.getStreamBytes()); os.flush(); os.close(); } else if (PdfName.JBIG2DECODE.equals(filter)) {
source share