updated docs on extraction of embedded objects, misc changes in HSSF

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@795394 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yegor Kozlov 2009-07-18 16:49:56 +00:00
parent 77987258b8
commit bf76476b0e
7 changed files with 273 additions and 8 deletions

View File

@ -73,6 +73,7 @@
<li><link href="#Autofit">How to adjust column width to fit the contents</link></li> <li><link href="#Autofit">How to adjust column width to fit the contents</link></li>
<li><link href="#Hyperlinks">Hyperlinks</link></li> <li><link href="#Hyperlinks">Hyperlinks</link></li>
<li><link href="#Validation">Data Validation</link></li> <li><link href="#Validation">Data Validation</link></li>
<li><link href="#Embedded">Embedded Objects</link></li>
</ul> </ul>
</section> </section>
<section><title>Features</title> <section><title>Features</title>
@ -1659,5 +1660,84 @@ Examples:
dvConstraint = DVConstraint.createFormulaListConstraint("'Sheet1'!$A$1:$A$3"); dvConstraint = DVConstraint.createFormulaListConstraint("'Sheet1'!$A$1:$A$3");
</source> </source>
</section> </section>
<anchor id="Embedded"/>
<section><title>Embedded Objects</title>
<p>It is possible to perform more detailed processing of an embedded Excel, Word or PowerPoint document,
or to work with any other type of embedded object.</p>
<p><strong>HSSF:</strong></p>
<source>
POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream("excel_with_embeded.xls"));
HSSFWorkbook workbook = new HSSFWorkbook(fs);
for (HSSFObjectData obj : workbook.getAllEmbeddedObjects()) {
//the OLE2 Class Name of the object
String oleName = obj.getOLE2ClassName();
if (oleName.equals("Worksheet")) {
DirectoryNode dn = (DirectoryNode) obj.getDirectory();
HSSFWorkbook embeddedWorkbook = new HSSFWorkbook(dn, fs, false);
//System.out.println(entry.getName() + ": " + embeddedWorkbook.getNumberOfSheets());
} else if (oleName.equals("Document")) {
DirectoryNode dn = (DirectoryNode) obj.getDirectory();
HWPFDocument embeddedWordDocument = new HWPFDocument(dn, fs);
//System.out.println(entry.getName() + ": " + embeddedWordDocument.getRange().text());
} else if (oleName.equals("Presentation")) {
DirectoryNode dn = (DirectoryNode) obj.getDirectory();
SlideShow embeddedPowerPointDocument = new SlideShow(new HSLFSlideShow(dn, fs));
//System.out.println(entry.getName() + ": " + embeddedPowerPointDocument.getSlides().length);
} else {
if(obj.hasDirectoryEntry()){
// The DirectoryEntry is a DirectoryNode. Examine its entries to find out what it is
DirectoryNode dn = (DirectoryNode) obj.getDirectory();
for (Iterator entries = dn.getEntries(); entries.hasNext();) {
Entry entry = (Entry) entries.next();
//System.out.println(oleName + "." + entry.getName());
}
} else {
// There is no DirectoryEntry
// Recover the object's data from the HSSFObjectData instance.
byte[] objectData = obj.getObjectData();
}
}
}
</source>
<p><strong>XSSF:</strong></p>
<source>
XSSFWorkbook workbook = new XSSFWorkbook("excel_with_embeded.xlsx");
for (PackagePart pPart : workbook.getAllEmbedds()) {
String contentType = pPart.getContentType();
// Excel Workbook - binary (OLE2) file format
if (contentType.equals("application/vnd.ms-excel")) {
HSSFWorkbook embeddedWorkbook = new HSSFWorkbook(pPart.getInputStream());
}
// Excel Workbook - OpenXML file format
else if (contentType.equals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) {
OPCPackage docPackage = OPCPackage.open(pPart.getInputStream());
XSSFWorkbook embeddedWorkbook = new XSSFWorkbook(docPackage);
}
// Word Document - binary (OLE2CDF) file format
else if (contentType.equals("application/msword")) {
HWPFDocument document = new HWPFDocument(pPart.getInputStream());
}
// Word Document - OpenXML file format
else if (contentType.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
OPCPackage docPackage = OPCPackage.open(pPart.getInputStream());
XWPFDocument document = new XWPFDocument(docPackage);
}
// PowerPoint Document - binary file format
else if (contentType.equals("application/vnd.ms-powerpoint")) {
HSLFSlideShow slideShow = new HSLFSlideShow(pPart.getInputStream());
}
// PowerPoint Document - OpenXML file format
else if (contentType.equals("application/vnd.openxmlformats-officedocument.presentationml.presentation")) {
OPCPackage docPackage = OPCPackage.open(pPart.getInputStream());
XSLFSlideShow slideShow = new XSLFSlideShow(docPackage);
}
// Any other type of embedded object.
else {
System.out.println("Unknown Embedded Document: " + contentType);
InputStream inputStream = pPart.getInputStream();
}
}
</source>
</section>
</body> </body>
</document> </document>

View File

@ -33,6 +33,7 @@
<changes> <changes>
<release version="3.5-beta7" date="2009-??-??"> <release version="3.5-beta7" date="2009-??-??">
<action dev="POI-DEVELOPERS" type="fix">47535 - fixed WordExtractor to tolerate files with empty footnote block</action>
<action dev="POI-DEVELOPERS" type="fix">47517 - Fixed ExtractorFactory to support .xltx and .dotx files</action> <action dev="POI-DEVELOPERS" type="fix">47517 - Fixed ExtractorFactory to support .xltx and .dotx files</action>
<action dev="POI-DEVELOPERS" type="add">45556 - Support for extraction of footnotes from docx files</action> <action dev="POI-DEVELOPERS" type="add">45556 - Support for extraction of footnotes from docx files</action>
<action dev="POI-DEVELOPERS" type="add">45555 - Support for extraction of endnotes from docx files</action> <action dev="POI-DEVELOPERS" type="add">45555 - Support for extraction of endnotes from docx files</action>

View File

@ -102,6 +102,50 @@
<em>org.apache.poi.hdgf.extractor.VisioTextExtractor</em>, which <em>org.apache.poi.hdgf.extractor.VisioTextExtractor</em>, which
will return text for your file.</p> will return text for your file.</p>
</section> </section>
<section><title>Embedded Objects</title>
<p>Extractors already exist for Excel, Word, PowerPoint and Visio;
if one of these objects is embedded into a worksheet, the ExtractorFactory class can be used to recover an extractor for it.
</p>
<source>
FileInputStream fis = new FileInputStream(inputFile);
POIFSFileSystem fileSystem = new POIFSFileSystem(fis);
// Firstly, get an extractor for the Workbook
POIOLE2TextExtractor oleTextExtractor = ExtractorFactory.createExtractor(fileSystem);
// Then get an array of extractors for any Excel, Word, PowerPoint
// or Visio objects embedded into it.
POITextExtractor[] embeddedExtractors = ExtractorFactory.getEmbededDocsTextExtractors(oleTextExtractor);
for (POITextExtractor textExtractor : embeddedExtractors) {
// If the embedded object was an Excel spreadsheet.
if (textExtractor instanceof ExcelExtractor) {
ExcelExtractor excelExtractor = (ExcelExtractor) textExtractor;
System.out.println(excelExtractor.getText());
}
// A Word Document
else if (textExtractor instanceof WordExtractor) {
WordExtractor wordExtractor = (WordExtractor) textExtractor;
String[] paragraphText = wordExtractor.getParagraphText();
for (String paragraph : paragraphText) {
System.out.println(paragraph);
}
// Display the document's header and footer text
System.out.println("Footer text: " + wordExtractor.getFooterText());
System.out.println("Header text: " + wordExtractor.getHeaderText());
}
// PowerPoint Presentation.
else if (textExtractor instanceof PowerPointExtractor) {
PowerPointExtractor powerPointExtractor = (PowerPointExtractor) textExtractor;
System.out.println("Text: " + powerPointExtractor.getText());
System.out.println("Notes: " + powerPointExtractor.getNotes());
}
// Visio Drawing
else if (textExtractor instanceof VisioTextExtractor) {
VisioTextExtractor visioTextExtractor = (VisioTextExtractor) textExtractor;
System.out.println("Text: " + visioTextExtractor.getText());
}
}
</source>
</section>
</body> </body>
<footer> <footer>

View File

@ -0,0 +1,68 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hssf.usermodel.examples;
import org.apache.poi.hssf.usermodel.*;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.usermodel.SlideShow;
import java.io.FileInputStream;
import java.util.Iterator;
/**
 * Demonstrates how you can extract embedded data from a .xls file.
 * <p>
 * Usage: <code>EmeddedObjects &lt;file.xls&gt;</code>
 */
public class EmeddedObjects {
    /**
     * Opens the workbook named by the first argument and visits every object embedded in it.
     * Embedded Excel, Word and PowerPoint objects are opened with the matching POI document
     * class; for anything else the raw directory entries or the raw bytes are retrieved.
     *
     * @param args command-line arguments; <code>args[0]</code> is the path of the .xls file
     * @throws Exception if the file cannot be read or an embedded document cannot be opened
     */
    public static void main(String[] args) throws Exception {
        // Fail with a readable message instead of an ArrayIndexOutOfBoundsException
        if (args == null || args.length < 1) {
            System.err.println("Usage: EmeddedObjects <file.xls>");
            return;
        }

        POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(args[0]));
        HSSFWorkbook workbook = new HSSFWorkbook(fs);
        for (HSSFObjectData obj : workbook.getAllEmbeddedObjects()) {
            // The OLE2 Class Name of the object. The comparisons below are written
            // constant-first so that an object without a class name (null) does not
            // cause a NullPointerException and is handled by the generic branch.
            String oleName = obj.getOLE2ClassName();
            if ("Worksheet".equals(oleName)) {
                DirectoryNode dn = (DirectoryNode) obj.getDirectory();
                HSSFWorkbook embeddedWorkbook = new HSSFWorkbook(dn, fs, false);
                //System.out.println(oleName + ": " + embeddedWorkbook.getNumberOfSheets());
            } else if ("Document".equals(oleName)) {
                DirectoryNode dn = (DirectoryNode) obj.getDirectory();
                HWPFDocument embeddedWordDocument = new HWPFDocument(dn, fs);
                //System.out.println(oleName + ": " + embeddedWordDocument.getRange().text());
            } else if ("Presentation".equals(oleName)) {
                DirectoryNode dn = (DirectoryNode) obj.getDirectory();
                SlideShow embeddedPowerPointDocument = new SlideShow(new HSLFSlideShow(dn, fs));
                //System.out.println(oleName + ": " + embeddedPowerPointDocument.getSlides().length);
            } else {
                if (obj.hasDirectoryEntry()) {
                    // The DirectoryEntry is a DirectoryNode. Examine its entries to find out what it is
                    DirectoryNode dn = (DirectoryNode) obj.getDirectory();
                    for (Iterator entries = dn.getEntries(); entries.hasNext();) {
                        Entry entry = (Entry) entries.next();
                        //System.out.println(oleName + "." + entry.getName());
                    }
                } else {
                    // There is no DirectoryEntry.
                    // Recover the object's data from the HSSFObjectData instance.
                    byte[] objectData = obj.getObjectData();
                }
            }
        }
    }
}

View File

@ -0,0 +1,72 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xssf.usermodel.examples;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.xslf.XSLFSlideShow;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import java.io.InputStream;
/**
 * Shows how the objects embedded in a .xlsx file can be located and opened
 */
public class EmbeddedObjects {
    public static void main(String[] args) throws Exception {
        XSSFWorkbook workbook = new XSSFWorkbook(args[0]);
        for (PackagePart part : workbook.getAllEmbedds()) {
            openEmbeddedPart(part);
        }
    }

    /**
     * Opens a single embedded part with the POI class that matches its content type.
     * Parts with an unrecognised content type are reported on standard output and
     * only their raw input stream is obtained.
     */
    private static void openEmbeddedPart(PackagePart part) throws Exception {
        String type = part.getContentType();

        // Excel Workbook - binary (OLE2) file format
        if (type.equals("application/vnd.ms-excel")) {
            HSSFWorkbook embeddedWorkbook = new HSSFWorkbook(part.getInputStream());
            return;
        }

        // Excel Workbook - OpenXML file format
        if (type.equals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) {
            OPCPackage workbookPackage = OPCPackage.open(part.getInputStream());
            XSSFWorkbook embeddedWorkbook = new XSSFWorkbook(workbookPackage);
            return;
        }

        // Word Document - binary (OLE2CDF) file format
        if (type.equals("application/msword")) {
            HWPFDocument document = new HWPFDocument(part.getInputStream());
            return;
        }

        // Word Document - OpenXML file format
        if (type.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
            OPCPackage documentPackage = OPCPackage.open(part.getInputStream());
            XWPFDocument document = new XWPFDocument(documentPackage);
            return;
        }

        // PowerPoint Document - binary file format
        if (type.equals("application/vnd.ms-powerpoint")) {
            HSLFSlideShow slideShow = new HSLFSlideShow(part.getInputStream());
            return;
        }

        // PowerPoint Document - OpenXML file format
        if (type.equals("application/vnd.openxmlformats-officedocument.presentationml.presentation")) {
            OPCPackage slideShowPackage = OPCPackage.open(part.getInputStream());
            XSLFSlideShow slideShow = new XSLFSlideShow(slideShowPackage);
            return;
        }

        // Any other type of embedded object.
        System.out.println("Unknown Embedded Document: " + type);
        InputStream rawData = part.getInputStream();
    }
}

View File

@ -31,7 +31,7 @@ public class HSSFShapeGroup
extends HSSFShape extends HSSFShape
implements HSSFShapeContainer implements HSSFShapeContainer
{ {
List shapes = new ArrayList(); List<HSSFShape> shapes = new ArrayList<HSSFShape>();
int x1 = 0; int x1 = 0;
int y1 = 0 ; int y1 = 0 ;
int x2 = 1023; int x2 = 1023;
@ -115,7 +115,7 @@ public class HSSFShapeGroup
/** /**
* Return all children contained by this shape. * Return all children contained by this shape.
*/ */
public List getChildren() public List<HSSFShape> getChildren()
{ {
return shapes; return shapes;
} }

View File

@ -1568,10 +1568,10 @@ public class HSSFWorkbook extends POIDocument implements org.apache.poi.ss.userm
* *
* @return the list of pictures (a list of {@link HSSFPictureData} objects.) * @return the list of pictures (a list of {@link HSSFPictureData} objects.)
*/ */
public List getAllPictures() public List<HSSFPictureData> getAllPictures()
{ {
// The drawing group record always exists at the top level, so we won't need to do this recursively. // The drawing group record always exists at the top level, so we won't need to do this recursively.
List pictures = new ArrayList(); List<HSSFPictureData> pictures = new ArrayList<HSSFPictureData>();
Iterator recordIter = workbook.getRecords().iterator(); Iterator recordIter = workbook.getRecords().iterator();
while (recordIter.hasNext()) while (recordIter.hasNext())
{ {
@ -1592,7 +1592,7 @@ public class HSSFWorkbook extends POIDocument implements org.apache.poi.ss.userm
* @param escherRecords the escher records. * @param escherRecords the escher records.
* @param pictures the list to populate with the pictures. * @param pictures the list to populate with the pictures.
*/ */
private void searchForPictures(List escherRecords, List pictures) private void searchForPictures(List escherRecords, List<HSSFPictureData> pictures)
{ {
Iterator recordIter = escherRecords.iterator(); Iterator recordIter = escherRecords.iterator();
while (recordIter.hasNext()) while (recordIter.hasNext())
@ -1646,9 +1646,9 @@ public class HSSFWorkbook extends POIDocument implements org.apache.poi.ss.userm
* *
* @return the list of embedded objects (a list of {@link HSSFObjectData} objects.) * @return the list of embedded objects (a list of {@link HSSFObjectData} objects.)
*/ */
public List getAllEmbeddedObjects() public List<HSSFObjectData> getAllEmbeddedObjects()
{ {
List objects = new ArrayList(); List<HSSFObjectData> objects = new ArrayList<HSSFObjectData>();
for (int i = 0; i < getNumberOfSheets(); i++) for (int i = 0; i < getNumberOfSheets(); i++)
{ {
getAllEmbeddedObjects(getSheetAt(i).getSheet().getRecords(), objects); getAllEmbeddedObjects(getSheetAt(i).getSheet().getRecords(), objects);
@ -1662,7 +1662,7 @@ public class HSSFWorkbook extends POIDocument implements org.apache.poi.ss.userm
* @param records the list of records to search. * @param records the list of records to search.
* @param objects the list of embedded objects to populate. * @param objects the list of embedded objects to populate.
*/ */
private void getAllEmbeddedObjects(List records, List objects) private void getAllEmbeddedObjects(List records, List<HSSFObjectData> objects)
{ {
Iterator recordIter = records.iterator(); Iterator recordIter = records.iterator();
while (recordIter.hasNext()) while (recordIter.hasNext())