updated docs on extraction of embedded objects, misc changes in HSSF
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@795394 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
77987258b8
commit
bf76476b0e
@ -73,6 +73,7 @@
|
||||
<li><link href="#Autofit">How to adjust column width to fit the contents</link></li>
|
||||
<li><link href="#Hyperlinks">Hyperlinks</link></li>
|
||||
<li><link href="#Validation">Data Validation</link></li>
|
||||
<li><link href="#Embedded">Embedded Objects</link></li>
|
||||
</ul>
|
||||
</section>
|
||||
<section><title>Features</title>
|
||||
@ -1659,5 +1660,84 @@ Examples:
|
||||
dvConstraint = DVConstraint.createFormulaListConstraint("'Sheet1'!$A$1:$A$3");
|
||||
</source>
|
||||
</section>
|
||||
<anchor id="Embedded"/>
|
||||
<section><title>Embedded Objects</title>
|
||||
<p>It is possible to perform more detailed processing of an embedded Excel, Word or PowerPoint document,
|
||||
or to work with any other type of embedded object.</p>
|
||||
<p><strong>HSSF:</strong></p>
|
||||
<source>
|
||||
POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream("excel_with_embeded.xls"));
|
||||
HSSFWorkbook workbook = new HSSFWorkbook(fs);
|
||||
for (HSSFObjectData obj : workbook.getAllEmbeddedObjects()) {
|
||||
//the OLE2 Class Name of the object
|
||||
String oleName = obj.getOLE2ClassName();
|
||||
if (oleName.equals("Worksheet")) {
|
||||
DirectoryNode dn = (DirectoryNode) obj.getDirectory();
|
||||
HSSFWorkbook embeddedWorkbook = new HSSFWorkbook(dn, fs, false);
|
||||
//System.out.println(entry.getName() + ": " + embeddedWorkbook.getNumberOfSheets());
|
||||
} else if (oleName.equals("Document")) {
|
||||
DirectoryNode dn = (DirectoryNode) obj.getDirectory();
|
||||
HWPFDocument embeddedWordDocument = new HWPFDocument(dn, fs);
|
||||
//System.out.println(entry.getName() + ": " + embeddedWordDocument.getRange().text());
|
||||
} else if (oleName.equals("Presentation")) {
|
||||
DirectoryNode dn = (DirectoryNode) obj.getDirectory();
|
||||
SlideShow embeddedPowerPointDocument = new SlideShow(new HSLFSlideShow(dn, fs));
|
||||
//System.out.println(entry.getName() + ": " + embeddedPowerPointDocument.getSlides().length);
|
||||
} else {
|
||||
if(obj.hasDirectoryEntry()){
|
||||
// The DirectoryEntry is a DirectoryNode. Examine its entries to find out what it is
|
||||
DirectoryNode dn = (DirectoryNode) obj.getDirectory();
|
||||
for (Iterator entries = dn.getEntries(); entries.hasNext();) {
|
||||
Entry entry = (Entry) entries.next();
|
||||
//System.out.println(oleName + "." + entry.getName());
|
||||
}
|
||||
} else {
|
||||
// There is no DirectoryEntry
|
||||
// Recover the object's data from the HSSFObjectData instance.
|
||||
byte[] objectData = obj.getObjectData();
|
||||
}
|
||||
}
|
||||
}
|
||||
</source>
|
||||
<p><strong>XSSF:</strong></p>
|
||||
<source>
|
||||
XSSFWorkbook workbook = new XSSFWorkbook("excel_with_embeded.xlsx");
|
||||
for (PackagePart pPart : workbook.getAllEmbedds()) {
|
||||
String contentType = pPart.getContentType();
|
||||
// Excel Workbook - binary file format
|
||||
if (contentType.equals("application/vnd.ms-excel")) {
|
||||
HSSFWorkbook embeddedWorkbook = new HSSFWorkbook(pPart.getInputStream());
|
||||
}
|
||||
// Excel Workbook - OpenXML file format
|
||||
else if (contentType.equals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) {
|
||||
OPCPackage docPackage = OPCPackage.open(pPart.getInputStream());
|
||||
XSSFWorkbook embeddedWorkbook = new XSSFWorkbook(docPackage);
|
||||
}
|
||||
// Word Document - binary (OLE2CDF) file format
|
||||
else if (contentType.equals("application/msword")) {
|
||||
HWPFDocument document = new HWPFDocument(pPart.getInputStream());
|
||||
}
|
||||
// Word Document - OpenXML file format
|
||||
else if (contentType.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
|
||||
OPCPackage docPackage = OPCPackage.open(pPart.getInputStream());
|
||||
XWPFDocument document = new XWPFDocument(docPackage);
|
||||
}
|
||||
// PowerPoint Document - binary file format
|
||||
else if (contentType.equals("application/vnd.ms-powerpoint")) {
|
||||
HSLFSlideShow slideShow = new HSLFSlideShow(pPart.getInputStream());
|
||||
}
|
||||
// PowerPoint Document - OpenXML file format
|
||||
else if (contentType.equals("application/vnd.openxmlformats-officedocument.presentationml.presentation")) {
|
||||
OPCPackage docPackage = OPCPackage.open(pPart.getInputStream());
|
||||
XSLFSlideShow slideShow = new XSLFSlideShow(docPackage);
|
||||
}
|
||||
// Any other type of embedded object.
|
||||
else {
|
||||
System.out.println("Unknown Embedded Document: " + contentType);
|
||||
InputStream inputStream = pPart.getInputStream();
|
||||
}
|
||||
}
|
||||
</source>
|
||||
</section>
|
||||
</body>
|
||||
</document>
|
||||
|
@ -33,6 +33,7 @@
|
||||
|
||||
<changes>
|
||||
<release version="3.5-beta7" date="2009-??-??">
|
||||
<action dev="POI-DEVELOPERS" type="fix">47535 - fixed WordExtractor to tolerate files with empty footnote block</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">47517 - Fixed ExtractorFactory to support .xltx and .dotx files</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">45556 - Support for extraction of footnotes from docx files</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">45555 - Support for extraction of endnotes from docx files</action>
|
||||
|
@ -102,6 +102,50 @@
|
||||
<em>org.apache.poi.hdgf.extractor.VisioTextExtractor</em>, which
|
||||
will return text for your file.</p>
|
||||
</section>
|
||||
|
||||
<section><title>Embedded Objects</title>
|
||||
<p>Extractors already exist for Excel, Word, PowerPoint and Visio;
|
||||
if one of these objects is embedded into a worksheet, the ExtractorFactory class can be used to recover an extractor for it.
|
||||
</p>
|
||||
<source>
|
||||
FileInputStream fis = new FileInputStream(inputFile);
|
||||
POIFSFileSystem fileSystem = new POIFSFileSystem(fis);
|
||||
// Firstly, get an extractor for the Workbook
|
||||
POIOLE2TextExtractor oleTextExtractor = ExtractorFactory.createExtractor(fileSystem);
|
||||
// Then an array of extractors for any Excel, Word, PowerPoint
|
||||
// or Visio objects embedded into it.
|
||||
POITextExtractor[] embeddedExtractors = ExtractorFactory.getEmbededDocsTextExtractors(oleTextExtractor);
|
||||
for (POITextExtractor textExtractor : embeddedExtractors) {
|
||||
// If the embedded object was an Excel spreadsheet.
|
||||
if (textExtractor instanceof ExcelExtractor) {
|
||||
ExcelExtractor excelExtractor = (ExcelExtractor) textExtractor;
|
||||
System.out.println(excelExtractor.getText());
|
||||
}
|
||||
// A Word Document
|
||||
else if (textExtractor instanceof WordExtractor) {
|
||||
WordExtractor wordExtractor = (WordExtractor) textExtractor;
|
||||
String[] paragraphText = wordExtractor.getParagraphText();
|
||||
for (String paragraph : paragraphText) {
|
||||
System.out.println(paragraph);
|
||||
}
|
||||
// Display the document's header and footer text
|
||||
System.out.println("Footer text: " + wordExtractor.getFooterText());
|
||||
System.out.println("Header text: " + wordExtractor.getHeaderText());
|
||||
}
|
||||
// PowerPoint Presentation.
|
||||
else if (textExtractor instanceof PowerPointExtractor) {
|
||||
PowerPointExtractor powerPointExtractor = (PowerPointExtractor) textExtractor;
|
||||
System.out.println("Text: " + powerPointExtractor.getText());
|
||||
System.out.println("Notes: " + powerPointExtractor.getNotes());
|
||||
}
|
||||
// Visio Drawing
|
||||
else if (textExtractor instanceof VisioTextExtractor) {
|
||||
VisioTextExtractor visioTextExtractor = (VisioTextExtractor) textExtractor;
|
||||
System.out.println("Text: " + visioTextExtractor.getText());
|
||||
}
|
||||
}
|
||||
</source>
|
||||
</section>
|
||||
</body>
|
||||
|
||||
<footer>
|
||||
|
68
src/examples/src/org/apache/poi/hssf/usermodel/examples/EmeddedObjects.java
Executable file
68
src/examples/src/org/apache/poi/hssf/usermodel/examples/EmeddedObjects.java
Executable file
@ -0,0 +1,68 @@
|
||||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.hssf.usermodel.examples;
|
||||
|
||||
import org.apache.poi.hssf.usermodel.*;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.Entry;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.hwpf.HWPFDocument;
|
||||
import org.apache.poi.hslf.HSLFSlideShow;
|
||||
import org.apache.poi.hslf.usermodel.SlideShow;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Demonstrates how you can extract embedded data from a .xls file
* <p>
* The workbook is read from the path given as the first command-line argument.
* Each embedded object is dispatched on its OLE2 class name: "Worksheet" is opened
* as an HSSFWorkbook, "Document" as an HWPFDocument and "Presentation" as a SlideShow.
* For any other class name the object's directory entries are iterated if it has a
* directory entry; otherwise its raw bytes are fetched.
* <p>
* NOTE(review): the class name is spelled "EmeddedObjects" (sic).
|
||||
*/
|
||||
public class EmeddedObjects {
|
||||
public static void main(String[] args) throws Exception {
|
||||
// args[0] is the path of the .xls file to inspect
POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(args[0]));
|
||||
HSSFWorkbook workbook = new HSSFWorkbook(fs);
|
||||
for (HSSFObjectData obj : workbook.getAllEmbeddedObjects()) {
|
||||
//the OLE2 Class Name of the object
|
||||
String oleName = obj.getOLE2ClassName();
|
||||
// Embedded Excel workbook
if (oleName.equals("Worksheet")) {
|
||||
DirectoryNode dn = (DirectoryNode) obj.getDirectory();
|
||||
HSSFWorkbook embeddedWorkbook = new HSSFWorkbook(dn, fs, false);
|
||||
// Disabled sample output. NOTE(review): 'entry' is not declared in this branch.
//System.out.println(entry.getName() + ": " + embeddedWorkbook.getNumberOfSheets());
|
||||
// Embedded Word document
} else if (oleName.equals("Document")) {
|
||||
DirectoryNode dn = (DirectoryNode) obj.getDirectory();
|
||||
HWPFDocument embeddedWordDocument = new HWPFDocument(dn, fs);
|
||||
// Disabled sample output. NOTE(review): 'entry' is not declared in this branch.
//System.out.println(entry.getName() + ": " + embeddedWordDocument.getRange().text());
|
||||
// Embedded PowerPoint presentation
} else if (oleName.equals("Presentation")) {
|
||||
DirectoryNode dn = (DirectoryNode) obj.getDirectory();
|
||||
SlideShow embeddedPowerPointDocument = new SlideShow(new HSLFSlideShow(dn, fs));
|
||||
// Disabled sample output. NOTE(review): 'entry' is not declared in this branch.
//System.out.println(entry.getName() + ": " + embeddedPowerPointDocument.getSlides().length);
|
||||
// Any other OLE2 class name
} else {
|
||||
if(obj.hasDirectoryEntry()){
|
||||
// The DirectoryEntry is a DirectoryNode. Examine its entries to find out what it is
|
||||
DirectoryNode dn = (DirectoryNode) obj.getDirectory();
|
||||
for (Iterator entries = dn.getEntries(); entries.hasNext();) {
|
||||
Entry entry = (Entry) entries.next();
|
||||
//System.out.println(oleName + "." + entry.getName());
|
||||
}
|
||||
} else {
|
||||
// There is no DirectoryEntry
|
||||
// Recover the object's data from the HSSFObjectData instance.
|
||||
byte[] objectData = obj.getObjectData();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
72
src/examples/src/org/apache/poi/xssf/usermodel/examples/EmbeddedObjects.java
Executable file
72
src/examples/src/org/apache/poi/xssf/usermodel/examples/EmbeddedObjects.java
Executable file
@ -0,0 +1,72 @@
|
||||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.xssf.usermodel.examples;
|
||||
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||
import org.apache.poi.openxml4j.opc.PackagePart;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
||||
import org.apache.poi.hslf.HSLFSlideShow;
|
||||
import org.apache.poi.hwpf.HWPFDocument;
|
||||
import org.apache.poi.xslf.XSLFSlideShow;
|
||||
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
||||
|
||||
import java.io.InputStream;
|
||||
|
||||
/**
|
||||
* Demonstrates how you can extract embedded data from a .xlsx file
* <p>
* The workbook is opened from the path given as the first command-line argument.
* Each embedded package part is dispatched on its content type and opened with the
* matching document class; parts with any other content type are reported on
* standard output and only their input stream is obtained.
|
||||
*/
|
||||
public class EmbeddedObjects {
|
||||
public static void main(String[] args) throws Exception {
|
||||
// args[0] is the path of the .xlsx file to inspect
XSSFWorkbook workbook = new XSSFWorkbook(args[0]);
|
||||
for (PackagePart pPart : workbook.getAllEmbedds()) {
|
||||
// The content type of the part decides which document class is used below
String contentType = pPart.getContentType();
|
||||
// Excel Workbook - binary file format
|
||||
if (contentType.equals("application/vnd.ms-excel")) {
|
||||
HSSFWorkbook embeddedWorkbook = new HSSFWorkbook(pPart.getInputStream());
|
||||
}
|
||||
// Excel Workbook - OpenXML file format
|
||||
else if (contentType.equals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) {
|
||||
OPCPackage docPackage = OPCPackage.open(pPart.getInputStream());
|
||||
XSSFWorkbook embeddedWorkbook = new XSSFWorkbook(docPackage);
|
||||
}
|
||||
// Word Document - binary (OLE2CDF) file format
|
||||
else if (contentType.equals("application/msword")) {
|
||||
HWPFDocument document = new HWPFDocument(pPart.getInputStream());
|
||||
}
|
||||
// Word Document - OpenXML file format
|
||||
else if (contentType.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
|
||||
OPCPackage docPackage = OPCPackage.open(pPart.getInputStream());
|
||||
XWPFDocument document = new XWPFDocument(docPackage);
|
||||
}
|
||||
// PowerPoint Document - binary file format
|
||||
else if (contentType.equals("application/vnd.ms-powerpoint")) {
|
||||
HSLFSlideShow slideShow = new HSLFSlideShow(pPart.getInputStream());
|
||||
}
|
||||
// PowerPoint Document - OpenXML file format
|
||||
else if (contentType.equals("application/vnd.openxmlformats-officedocument.presentationml.presentation")) {
|
||||
OPCPackage docPackage = OPCPackage.open(pPart.getInputStream());
|
||||
XSLFSlideShow slideShow = new XSLFSlideShow(docPackage);
|
||||
}
|
||||
// Any other type of embedded object.
|
||||
else {
|
||||
System.out.println("Unknown Embedded Document: " + contentType);
|
||||
// The raw stream is obtained but not processed further in this example
InputStream inputStream = pPart.getInputStream();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -31,7 +31,7 @@ public class HSSFShapeGroup
|
||||
extends HSSFShape
|
||||
implements HSSFShapeContainer
|
||||
{
|
||||
List shapes = new ArrayList();
|
||||
List<HSSFShape> shapes = new ArrayList<HSSFShape>();
|
||||
int x1 = 0;
|
||||
int y1 = 0 ;
|
||||
int x2 = 1023;
|
||||
@ -115,7 +115,7 @@ public class HSSFShapeGroup
|
||||
/**
|
||||
* Return all children contained by this shape.
|
||||
*/
|
||||
public List getChildren()
|
||||
public List<HSSFShape> getChildren()
|
||||
{
|
||||
return shapes;
|
||||
}
|
||||
|
@ -1568,10 +1568,10 @@ public class HSSFWorkbook extends POIDocument implements org.apache.poi.ss.userm
|
||||
*
|
||||
* @return the list of pictures (a list of {@link HSSFPictureData} objects.)
|
||||
*/
|
||||
public List getAllPictures()
|
||||
public List<HSSFPictureData> getAllPictures()
|
||||
{
|
||||
// The drawing group record always exists at the top level, so we won't need to do this recursively.
|
||||
List pictures = new ArrayList();
|
||||
List<HSSFPictureData> pictures = new ArrayList<HSSFPictureData>();
|
||||
Iterator recordIter = workbook.getRecords().iterator();
|
||||
while (recordIter.hasNext())
|
||||
{
|
||||
@ -1592,7 +1592,7 @@ public class HSSFWorkbook extends POIDocument implements org.apache.poi.ss.userm
|
||||
* @param escherRecords the escher records.
|
||||
* @param pictures the list to populate with the pictures.
|
||||
*/
|
||||
private void searchForPictures(List escherRecords, List pictures)
|
||||
private void searchForPictures(List escherRecords, List<HSSFPictureData> pictures)
|
||||
{
|
||||
Iterator recordIter = escherRecords.iterator();
|
||||
while (recordIter.hasNext())
|
||||
@ -1646,9 +1646,9 @@ public class HSSFWorkbook extends POIDocument implements org.apache.poi.ss.userm
|
||||
*
|
||||
* @return the list of embedded objects (a list of {@link HSSFObjectData} objects.)
|
||||
*/
|
||||
public List getAllEmbeddedObjects()
|
||||
public List<HSSFObjectData> getAllEmbeddedObjects()
|
||||
{
|
||||
List objects = new ArrayList();
|
||||
List<HSSFObjectData> objects = new ArrayList<HSSFObjectData>();
|
||||
for (int i = 0; i < getNumberOfSheets(); i++)
|
||||
{
|
||||
getAllEmbeddedObjects(getSheetAt(i).getSheet().getRecords(), objects);
|
||||
@ -1662,7 +1662,7 @@ public class HSSFWorkbook extends POIDocument implements org.apache.poi.ss.userm
|
||||
* @param records the list of records to search.
|
||||
* @param objects the list of embedded objects to populate.
|
||||
*/
|
||||
private void getAllEmbeddedObjects(List records, List objects)
|
||||
private void getAllEmbeddedObjects(List records, List<HSSFObjectData> objects)
|
||||
{
|
||||
Iterator recordIter = records.iterator();
|
||||
while (recordIter.hasNext())
|
||||
|
Loading…
Reference in New Issue
Block a user