I have a unit test in which I use the following to extract all text from an Excel file without any formatting; for some use cases this may be faster than iterating over all elements one at a time:
/**
 * Opens the given file and builds a text extractor for it.
 *
 * <p>The file is wrapped in a {@link PushbackInputStream} with a pushback
 * buffer of 8 bytes before it is handed to
 * {@code POIFSFileSystem.hasPOIFSHeader} (presumably so the header check can
 * look at the leading bytes without consuming them - confirm against the POI
 * version in use). The stream is always closed before this method returns.
 *
 * @param file the file to read
 * @return the extractor produced by {@link #createExtractor(POIFSFileSystem)}
 * @throws IOException if opening, reading or closing the file fails
 * @throws IllegalArgumentException if the header check fails, or if no
 *         supported document is found inside the file system
 */
private POITextExtractor extractText(File file) throws IOException {
    // If opening the file fails there is nothing to close yet, so the
    // stream is created before entering the try/finally.
    InputStream stream = new PushbackInputStream(new FileInputStream(file), 8);
    try {
        if (!POIFSFileSystem.hasPOIFSHeader(stream)) {
            throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file");
        }
        return createExtractor(new POIFSFileSystem(stream));
    } finally {
        stream.close();
    }
}

/**
 * Builds a text extractor starting from the root directory of the given
 * file system.
 *
 * @param fs the file system to inspect
 * @return the extractor created for the root directory
 * @throws IOException propagated from the directory-based overload
 * @throws IllegalArgumentException if the root directory holds no supported document
 */
private static POITextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
    DirectoryNode root = fs.getRoot();
    return createExtractor(root, fs);
}

/**
 * Scans the entries of the given directory and returns an
 * {@code ExcelExtractor} as soon as an entry named {@code "Workbook"} is
 * encountered.
 *
 * @param poifsDir the directory whose entries are scanned
 * @param fs       the file system passed on to the extractor
 * @return a new {@code ExcelExtractor} for {@code poifsDir}
 * @throws IOException declared for the extractor construction
 * @throws IllegalArgumentException if no entry is named {@code "Workbook"}
 */
private static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
    Iterator<Entry> remaining = poifsDir.getEntries();
    while (remaining.hasNext()) {
        Entry candidate = remaining.next();
        if (candidate.getName().equals("Workbook")) {
            return new ExcelExtractor(poifsDir, fs);
        }
    }
    throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
}

/**
 * Asserts that the given file exists and that the text extracted from it
 * contains every one of the expected fragments.
 *
 * @param file     the file to check
 * @param contents the fragments that must each occur in the extracted text
 * @return the complete extracted text
 * @throws IOException if the text extraction fails
 */
private String assertContains(File file, String... contents) throws IOException {
    assertTrue(file.exists());

    POITextExtractor textExtractor = extractText(file);
    assertNotNull(textExtractor);

    String str = textExtractor.getText();
    for (int i = 0; i < contents.length; i++) {
        String s = contents[i];
        // The failure message carries the whole extracted text for diagnosis.
        String failureMessage = "Did expect to find text '" + s + "' in resulting Excel file, but did not find it in str: " + str;
        assertTrue(failureMessage, str.contains(s));
    }
    return str;
}
source share