diff --git a/src/documentation/content/xdocs/spreadsheet/quick-guide.xml b/src/documentation/content/xdocs/spreadsheet/quick-guide.xml index 859adf961..d577d0974 100644 --- a/src/documentation/content/xdocs/spreadsheet/quick-guide.xml +++ b/src/documentation/content/xdocs/spreadsheet/quick-guide.xml @@ -73,6 +73,7 @@
  • How to adjust column width to fit the contents
  • Hyperlinks
  • Data Validation
  • +
  • Embedded Objects
  • Features @@ -1659,5 +1660,84 @@ Examples: dvConstraint = DVConstraint.createFormulaListConstraint("'Sheet1'!$A$1:$A$3");
    + +
    Embedded Objects +

    It is possible to perform more detailed processing of an embedded Excel, Word or PowerPoint document, + or to work with any other type of embedded object.

    +

    HSSF:

    + + POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream("excel_with_embeded.xls")); + HSSFWorkbook workbook = new HSSFWorkbook(fs); + for (HSSFObjectData obj : workbook.getAllEmbeddedObjects()) { + // The OLE2 class name of the object; the branches below dispatch on it + String oleName = obj.getOLE2ClassName(); + if (oleName.equals("Worksheet")) { + DirectoryNode dn = (DirectoryNode) obj.getDirectory(); + HSSFWorkbook embeddedWorkbook = new HSSFWorkbook(dn, fs, false); + //System.out.println(oleName + ": " + embeddedWorkbook.getNumberOfSheets()); + } else if (oleName.equals("Document")) { + DirectoryNode dn = (DirectoryNode) obj.getDirectory(); + HWPFDocument embeddedWordDocument = new HWPFDocument(dn, fs); + //System.out.println(oleName + ": " + embeddedWordDocument.getRange().text()); + } else if (oleName.equals("Presentation")) { + DirectoryNode dn = (DirectoryNode) obj.getDirectory(); + SlideShow embeddedPowerPointDocument = new SlideShow(new HSLFSlideShow(dn, fs)); + //System.out.println(oleName + ": " + embeddedPowerPointDocument.getSlides().length); + } else { + if(obj.hasDirectoryEntry()){ + // The object has a directory entry (cast to DirectoryNode below). Examine its entries to find out what it is + DirectoryNode dn = (DirectoryNode) obj.getDirectory(); + for (Iterator entries = dn.getEntries(); entries.hasNext();) { + Entry entry = (Entry) entries.next(); + //System.out.println(oleName + "." + entry.getName()); + } + } else { + // There is no DirectoryEntry + // Recover the object's data from the HSSFObjectData instance. + byte[] objectData = obj.getObjectData(); + } + } + } + +

    XSSF:

    + + XSSFWorkbook workbook = new XSSFWorkbook("excel_with_embeded.xlsx"); + for (PackagePart pPart : workbook.getAllEmbedds()) { + String contentType = pPart.getContentType(); + // Excel Workbook - binary (OLE2CDF) file format + if (contentType.equals("application/vnd.ms-excel")) { + HSSFWorkbook embeddedWorkbook = new HSSFWorkbook(pPart.getInputStream()); + } + // Excel Workbook - OpenXML file format + else if (contentType.equals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) { + OPCPackage docPackage = OPCPackage.open(pPart.getInputStream()); + XSSFWorkbook embeddedWorkbook = new XSSFWorkbook(docPackage); + } + // Word Document - binary (OLE2CDF) file format + else if (contentType.equals("application/msword")) { + HWPFDocument document = new HWPFDocument(pPart.getInputStream()); + } + // Word Document - OpenXML file format + else if (contentType.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) { + OPCPackage docPackage = OPCPackage.open(pPart.getInputStream()); + XWPFDocument document = new XWPFDocument(docPackage); + } + // PowerPoint Document - binary file format + else if (contentType.equals("application/vnd.ms-powerpoint")) { + HSLFSlideShow slideShow = new HSLFSlideShow(pPart.getInputStream()); + } + // PowerPoint Document - OpenXML file format + else if (contentType.equals("application/vnd.openxmlformats-officedocument.presentationml.presentation")) { + OPCPackage docPackage = OPCPackage.open(pPart.getInputStream()); + XSLFSlideShow slideShow = new XSLFSlideShow(docPackage); + } + // Any other type of embedded object. + else { + System.out.println("Unknown Embedded Document: " + contentType); + InputStream inputStream = pPart.getInputStream(); + } + } + +
    diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 73abd537d..2af0bd1e9 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -33,6 +33,7 @@ + 47535 - fixed WordExtractor to tolerate files with empty footnote block 47517 - Fixed ExtractorFactory to support .xltx and .dotx files 45556 - Support for extraction of footnotes from docx files 45555 - Support for extraction of endnotes from docx files diff --git a/src/documentation/content/xdocs/text-extraction.xml b/src/documentation/content/xdocs/text-extraction.xml index d71a0bf10..fa7474bc0 100644 --- a/src/documentation/content/xdocs/text-extraction.xml +++ b/src/documentation/content/xdocs/text-extraction.xml @@ -102,6 +102,50 @@ org.apache.poi.hdgf.extractor.VisioTextExtractor, which will return text for your file.

    + +
    Embedded Objects +

    Extractors already exist for Excel, Word, PowerPoint and Visio; + if one of these objects is embedded into a worksheet, the ExtractorFactory class can be used to recover an extractor for it. +

    + + FileInputStream fis = new FileInputStream(inputFile); + POIFSFileSystem fileSystem = new POIFSFileSystem(fis); + // Firstly, get an extractor for the Workbook + POIOLE2TextExtractor oleTextExtractor = ExtractorFactory.createExtractor(fileSystem); + // Then an array of extractors for any Excel, Word, PowerPoint + // or Visio objects embedded into it. + POITextExtractor[] embeddedExtractors = ExtractorFactory.getEmbededDocsTextExtractors(oleTextExtractor); + for (POITextExtractor textExtractor : embeddedExtractors) { + // If the embedded object was an Excel spreadsheet. + if (textExtractor instanceof ExcelExtractor) { + ExcelExtractor excelExtractor = (ExcelExtractor) textExtractor; + System.out.println(excelExtractor.getText()); + } + // A Word Document + else if (textExtractor instanceof WordExtractor) { + WordExtractor wordExtractor = (WordExtractor) textExtractor; + String[] paragraphText = wordExtractor.getParagraphText(); + for (String paragraph : paragraphText) { + System.out.println(paragraph); + } + // Display the document's header and footer text + System.out.println("Footer text: " + wordExtractor.getFooterText()); + System.out.println("Header text: " + wordExtractor.getHeaderText()); + } + // PowerPoint Presentation. + else if (textExtractor instanceof PowerPointExtractor) { + PowerPointExtractor powerPointExtractor = (PowerPointExtractor) textExtractor; + System.out.println("Text: " + powerPointExtractor.getText()); + System.out.println("Notes: " + powerPointExtractor.getNotes()); + } + // Visio Drawing + else if (textExtractor instanceof VisioTextExtractor) { + VisioTextExtractor visioTextExtractor = (VisioTextExtractor) textExtractor; + System.out.println("Text: " + visioTextExtractor.getText()); + } + } + +