#60519 - Extractor for *SSF embeddings

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1776819 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Andreas Beeker 2016-12-31 21:50:47 +00:00
parent db4bdaf29a
commit 1d9c74b1bf
11 changed files with 1064 additions and 37 deletions

View File

@ -23,6 +23,8 @@ import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.extractor.EmbeddedData;
import org.apache.poi.ss.extractor.EmbeddedExtractor;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
@ -55,6 +57,8 @@ public abstract class SpreadsheetHandler extends AbstractFileHandler {
readContent(read);
extractEmbedded(read);
modifyContent(read);
read.close();
@ -92,6 +96,18 @@ public abstract class SpreadsheetHandler extends AbstractFileHandler {
}
}
/**
 * Extracts the embedded objects of every sheet in the workbook and asserts that
 * each result carries a filename, a payload and the shape it was found on.
 *
 * @param wb the workbook whose sheets are scanned
 * @throws IOException if the extractor fails to read an embedding
 */
private void extractEmbedded(Workbook wb) throws IOException {
    final EmbeddedExtractor extractor = new EmbeddedExtractor();
    for (Sheet sheet : wb) {
        for (EmbeddedData embedded : extractor.extractAll(sheet)) {
            assertNotNull(embedded.getFilename());
            assertNotNull(embedded.getEmbeddedData());
            assertNotNull(embedded.getShape());
        }
    }
}
private void modifyContent(Workbook wb) {
/* a number of files fail because of various things: udf, unimplemented functions, ...
we would need quite a list of excludes and the large regression tests would probably

View File

@ -25,6 +25,7 @@ import org.apache.poi.ddf.*;
import org.apache.poi.hssf.record.*;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.ss.usermodel.ObjectData;
import org.apache.poi.util.HexDump;
/**
@ -32,7 +33,7 @@ import org.apache.poi.util.HexDump;
* <p/>
* As of 13 July 2012 it cannot be created from scratch
*/
public final class HSSFObjectData extends HSSFPicture {
public final class HSSFObjectData extends HSSFPicture implements ObjectData {
/**
* Reference to the filesystem root, required for retrieving the object data.
*/
@ -43,20 +44,12 @@ public final class HSSFObjectData extends HSSFPicture {
this._root = _root;
}
/**
 * Looks up the embedded-object sub record of this shape and reports its OLE class name.
 *
 * @return the OLE2 class name of the object
 */
@Override
public String getOLE2ClassName() {
    final EmbeddedObjectRefSubRecord oleRecord = findObjectRecord();
    return oleRecord.getOLEClassName();
}
/**
* Gets the object data. Only call for ones that have
* data though. See {@link #hasDirectoryEntry()}
*
* @return the object data as an OLE2 directory.
* @throws IOException if there was an error reading the data.
*/
@Override
public DirectoryEntry getDirectory() throws IOException {
EmbeddedObjectRefSubRecord subRecord = findObjectRecord();
@ -70,20 +63,12 @@ public final class HSSFObjectData extends HSSFPicture {
throw new IOException("Stream " + streamName + " was not an OLE2 directory");
}
/**
 * Hands out the data portion stored in the embedded-object sub record.
 * This is meant for an ObjectData that has no associated POIFS directory entry.
 *
 * @return the object data held by the sub record
 */
@Override
public byte[] getObjectData() {
    final EmbeddedObjectRefSubRecord oleRecord = findObjectRecord();
    return oleRecord.getObjectData();
}
/**
* Does this ObjectData have an associated POIFS
* Directory Entry?
* (Not all do, those that don't have a data portion)
*/
@Override
public boolean hasDirectoryEntry() {
EmbeddedObjectRefSubRecord subRecord = findObjectRecord();

View File

@ -0,0 +1,65 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.ss.usermodel;
import java.io.IOException;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
/**
* Common interface for OLE shapes, i.e. shapes linked to embedded documents.
* <p>
* An implementation exposes the embedded payload either as raw bytes
* ({@link #getObjectData()}) or as an OLE2 directory ({@link #getDirectory()});
* {@link #hasDirectoryEntry()} tells which of the two is applicable.
*
* @since POI 3.16-beta2
*/
public interface ObjectData extends SimpleShape {
/**
* Returns the raw data portion of the embedded object.
*
* @return the data portion, for an ObjectData that doesn't have an associated POIFS Directory Entry
* @throws IOException if the data can't be read
*/
byte[] getObjectData() throws IOException;
/**
* Tells whether the embedded object is backed by a POIFS directory.
*
* @return does this ObjectData have an associated POIFS Directory Entry?
* (Not all do, those that don't have a data portion)
*/
boolean hasDirectoryEntry();
/**
* Gets the object data. Only call for ones that have
* data though. See {@link #hasDirectoryEntry()}.
* The caller has to close the corresponding POIFSFileSystem
*
* @return the object data as an OLE2 directory.
* @throws IOException if there was an error reading the data.
*/
DirectoryEntry getDirectory() throws IOException;
/**
* Returns the class name that identifies the type of the embedded OLE2 object.
*
* @return the OLE2 Class Name of the object
*/
String getOLE2ClassName();
/**
* Returns a name that can be used when storing the embedded object as a file.
*
* @return a filename suggestion - inspecting/interpreting the Directory object probably gives a better result
*/
String getFileName();
/**
* Returns the picture shown in place of the embedded object.
*
* @return the preview picture
*/
PictureData getPictureData();
}

View File

@ -41,6 +41,11 @@ public interface PackageRelationshipTypes {
*/
String CORE_PROPERTIES_ECMA376 = "http://schemas.openxmlformats.org/officedocument/2006/relationships/metadata/core-properties";
/**
* Namespace of Core properties relationship type as defined in ECMA 376
*/
String CORE_PROPERTIES_ECMA376_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
/**
* Digital signature relationship type.
*/

View File

@ -0,0 +1,104 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.ss.extractor;
import org.apache.poi.ss.usermodel.Shape;
/**
* A collection of embedded object informations and content
*/
public class EmbeddedData {
private String filename;
private byte[] embeddedData;
private Shape shape;
private String contentType = "binary/octet-stream";
public EmbeddedData(String filename, byte[] embeddedData, String contentType) {
setFilename(filename);
setEmbeddedData(embeddedData);
setContentType(contentType);
}
/**
* @return the filename
*/
public String getFilename() {
return filename;
}
/**
* Sets the filename
*
* @param filename the filename
*/
public void setFilename(String filename) {
if (filename == null) {
this.filename = "unknown.bin";
} else {
this.filename = filename.replaceAll("[^/\\\\]*[/\\\\]", "").trim();
}
}
/**
* @return the embedded object byte array
*/
public byte[] getEmbeddedData() {
return embeddedData;
}
/**
* Sets the embedded object as byte array
*
* @param embeddedData the embedded object byte array
*/
public void setEmbeddedData(byte[] embeddedData) {
this.embeddedData = (embeddedData == null) ? null : embeddedData.clone();
}
/**
* @return the shape which links to the embedded object
*/
public Shape getShape() {
return shape;
}
/**
* Sets the shape which links to the embedded object
*
* @param shape the shape
*/
public void setShape(Shape shape) {
this.shape = shape;
}
/**
* @return the content-/mime-type of the embedded object, the default (if unknown) is {@code binary/octet-stream}
*/
public String getContentType() {
return contentType;
}
/**
* Sets the content-/mime-type
*
* @param contentType the content-type
*/
public void setContentType(String contentType) {
this.contentType = contentType;
}
}

View File

@ -0,0 +1,353 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.ss.extractor;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import org.apache.poi.hpsf.ClassID;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.ss.usermodel.Drawing;
import org.apache.poi.ss.usermodel.ObjectData;
import org.apache.poi.ss.usermodel.Picture;
import org.apache.poi.ss.usermodel.PictureData;
import org.apache.poi.ss.usermodel.Shape;
import org.apache.poi.ss.usermodel.ShapeContainer;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.util.IOUtils;
import org.apache.poi.util.LocaleUtil;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> {
private static final POILogger LOG = POILogFactory.getLogger(EmbeddedExtractor.class);
/**
 * Supplies the known extractors in the order in which they are asked.
 * The first extractor that accepts a source is used, so the catch-all
 * {@link FsExtractor} comes last. Override this method to provide custom extractors.
 *
 * @return an iterator over freshly created extractor instances
 */
@Override
public Iterator<EmbeddedExtractor> iterator() {
    final List<EmbeddedExtractor> known = Arrays.<EmbeddedExtractor>asList(
        new Ole10Extractor(),
        new PdfExtractor(),
        new WordExtractor(),
        new ExcelExtractor(),
        new FsExtractor()
    );
    return known.iterator();
}
/**
 * Extracts an OLE2 directory with the first extractor that accepts it.
 *
 * @param src the directory node of the embedded object
 * @return the extracted data or {@code null} if no extractor accepts the node
 * @throws IOException if the chosen extractor fails to read the node
 */
public EmbeddedData extractOne(DirectoryNode src) throws IOException {
    final Iterator<EmbeddedExtractor> candidates = iterator();
    while (candidates.hasNext()) {
        final EmbeddedExtractor candidate = candidates.next();
        if (!candidate.canExtract(src)) {
            continue;
        }
        return candidate.extract(src);
    }
    return null;
}

/**
 * Extracts the payload of a picture with the first extractor that accepts it.
 *
 * @param src the picture shape
 * @return the extracted data or {@code null} if no extractor accepts the picture
 * @throws IOException if the chosen extractor fails to read the picture
 */
public EmbeddedData extractOne(Picture src) throws IOException {
    final Iterator<EmbeddedExtractor> candidates = iterator();
    while (candidates.hasNext()) {
        final EmbeddedExtractor candidate = candidates.next();
        if (!candidate.canExtract(src)) {
            continue;
        }
        return candidate.extract(src);
    }
    return null;
}
/**
 * Collects the embedded objects of all shapes on the sheet's drawing.
 *
 * @param sheet the sheet to scan
 * @return the extracted embeddings, an empty list if the sheet has no drawing patriarch
 * @throws IOException if an extractor fails to read an embedding
 */
public List<EmbeddedData> extractAll(Sheet sheet) throws IOException {
    final Drawing<?> drawing = sheet.getDrawingPatriarch();
    if (drawing != null) {
        final List<EmbeddedData> found = new ArrayList<EmbeddedData>();
        extractAll(drawing, found);
        return found;
    }
    return Collections.emptyList();
}
/**
 * Walks the shapes of {@code parent} - descending into nested shape containers -
 * and appends the data of every embedding found to {@code embeddings}.
 * <p>
 * OLE shapes which can't be read are logged and skipped. Each extracted item gets
 * its shape assigned and a cleaned-up filename: generic names (empty, {@code MBD...},
 * {@code Root Entry...}) are replaced by the shape name plus the original extension,
 * and if there is still no usable name a {@code picture_<n>} dummy is used.
 *
 * @param parent the container whose shapes are inspected
 * @param embeddings the list receiving the extracted embeddings
 * @throws IOException if extracting the payload of a picture fails
 */
protected void extractAll(ShapeContainer<?> parent, List<EmbeddedData> embeddings) throws IOException {
    for (Shape shape : parent) {
        EmbeddedData data = null;
        if (shape instanceof ObjectData) {
            ObjectData od = (ObjectData)shape;
            try {
                if (od.hasDirectoryEntry()) {
                    data = extractOne((DirectoryNode)od.getDirectory());
                } else {
                    data = new EmbeddedData(od.getFileName(), od.getObjectData(), "binary/octet-stream");
                }
            } catch (Exception e) {
                // best effort on purpose: one broken embedding must not abort the whole extraction
                LOG.log(POILogger.WARN, "Entry not found / readable - ignoring OLE embedding", e);
            }
        } else if (shape instanceof Picture) {
            data = extractOne((Picture)shape);
        } else if (shape instanceof ShapeContainer) {
            extractAll((ShapeContainer<?>)shape, embeddings);
        }

        if (data == null) {
            continue;
        }

        data.setShape(shape);
        String filename = data.getFilename();

        // the extension starts at the LAST dot, otherwise "a.b.pdf" would yield ".b.pdf"
        int dotIdx = (filename == null) ? -1 : filename.lastIndexOf('.');
        String extension = (dotIdx == -1) ? ".bin" : filename.substring(dotIdx);

        // try to find an alternative name
        if (filename == null || "".equals(filename) || filename.startsWith("MBD") || filename.startsWith("Root Entry")) {
            String shapeName = shape.getShapeName();
            // a blank shape name is as useless as a missing one - fall through to the dummy name
            filename = (shapeName == null || "".equals(shapeName.trim())) ? null : shapeName + extension;
        }

        // default to dummy name
        if (filename == null || "".equals(filename)) {
            filename = "picture_"+embeddings.size()+extension;
        }
        filename = filename.trim();

        data.setFilename(filename);
        embeddings.add(data);
    }
}
/**
* Tells whether this extractor is able to handle the given OLE2 directory.
* This base implementation accepts nothing; the nested extractor classes override it.
*
* @param source the directory node of the embedded object
* @return always {@code false} in this base implementation
*/
public boolean canExtract(DirectoryNode source) {
return false;
}
/**
* Tells whether this extractor is able to handle the given picture.
* This base implementation accepts nothing; extractors that understand
* pictures override it.
*
* @param source the picture shape
* @return always {@code false} in this base implementation
*/
public boolean canExtract(Picture source) {
return false;
}
/**
 * Copies the directory node into a new, stand-alone OLE2 filesystem and
 * returns its serialized form under the node's name.
 *
 * @param dn the directory node to extract, expected to be accepted by {@link #canExtract(DirectoryNode)}
 * @return the embedded data with content type {@code binary/octet-stream}
 * @throws IOException if the node can't be copied or the filesystem can't be written
 */
protected EmbeddedData extract(DirectoryNode dn) throws IOException {
    assert(canExtract(dn));
    // start with a reasonable big size
    ByteArrayOutputStream bos = new ByteArrayOutputStream(20000);
    POIFSFileSystem dest = new POIFSFileSystem();
    try {
        copyNodes(dn, dest.getRoot());
        dest.writeFilesystem(bos);
    } finally {
        // release the temporary filesystem even if copying or writing failed
        dest.close();
    }

    return new EmbeddedData(dn.getName(), bos.toByteArray(), "binary/octet-stream");
}
/**
* Extracts the payload embedded in a picture.
* This base implementation extracts nothing; extractors that understand
* pictures override it.
*
* @param source the picture shape
* @return always {@code null} in this base implementation
* @throws IOException declared for overriding implementations
*/
protected EmbeddedData extract(Picture source) throws IOException {
return null;
}
/**
 * Extractor for OLE 1.0 package embeddings, recognized by the
 * {@code ClassID.OLE10_PACKAGE} storage class id.
 */
public static class Ole10Extractor extends EmbeddedExtractor {
    @Override
    public boolean canExtract(DirectoryNode dn) {
        return ClassID.OLE10_PACKAGE.equals(dn.getStorageClsid());
    }

    /**
     * Reads the Ole10Native record of the node and returns its filename and data buffer.
     *
     * @throws IOException if the node can't be read; an {@link Ole10NativeException} is wrapped
     */
    @Override
    public EmbeddedData extract(DirectoryNode dn) throws IOException {
        try {
            final Ole10Native nativeEntry = Ole10Native.createFromEmbeddedOleObject(dn);
            final String name = nativeEntry.getFileName();
            final byte[] payload = nativeEntry.getDataBuffer();
            return new EmbeddedData(name, payload, "binary/octet-stream");
        } catch (Ole10NativeException e) {
            throw new IOException(e);
        }
    }
}
/**
 * Extractor for PDF embeddings, either stored in a {@code CONTENTS} stream of an
 * OLE2 directory or hidden inside the bytes of an EMF picture.
 */
static class PdfExtractor extends EmbeddedExtractor {
    static ClassID PdfClassID = new ClassID("{B801CA65-A1FC-11D0-85AD-444553540000}");

    /**
     * @return {@code true} if the node carries the PDF class id or has a {@code CONTENTS} entry
     */
    @Override
    public boolean canExtract(DirectoryNode dn) {
        ClassID clsId = dn.getStorageClsid();
        return (PdfClassID.equals(clsId)
            || dn.hasEntry("CONTENTS"));
    }

    /**
     * Returns the bytes of the {@code CONTENTS} stream as a PDF named after the node.
     *
     * @throws IOException if the stream can't be opened or read
     */
    @Override
    public EmbeddedData extract(DirectoryNode dn) throws IOException {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        InputStream is = dn.createDocumentInputStream("CONTENTS");
        try {
            IOUtils.copy(is, bos);
        } finally {
            // don't leak the stream when copying fails
            is.close();
        }
        return new EmbeddedData(dn.getName()+".pdf", bos.toByteArray(), "application/pdf");
    }

    /**
     * @return {@code true} if the picture has picture data of type EMF
     */
    @Override
    public boolean canExtract(Picture source) {
        PictureData pd = source.getPictureData();
        // a picture without data can't contain anything
        return (pd != null && pd.getPictureType() == Workbook.PICTURE_TYPE_EMF);
    }

    /**
     * Mac Office encodes embedded objects inside the picture, e.g. PDF is part of an EMF.
     * If an embedded stream is inside an EMF picture, this method extracts the payload,
     * i.e. the bytes from the first {@code %PDF-} marker up to the first following
     * {@code %%EOF} marker.
     *
     * @return the embedded data in an EMF picture or null if none is found
     */
    @Override
    protected EmbeddedData extract(Picture source) throws IOException {
        // check for emf+ embedded pdf (poor mans style :( )
        // Mac Excel 2011 embeds pdf files with this method.
        PictureData pd = source.getPictureData();
        if (pd == null || pd.getPictureType() != Workbook.PICTURE_TYPE_EMF) {
            return null;
        }

        // TODO: investigate if this is just an EMF-hack or if other formats are also embedded in EMF
        byte pictureBytes[] = pd.getData();
        int idxStart = indexOf(pictureBytes, 0, "%PDF-".getBytes(LocaleUtil.CHARSET_1252));
        if (idxStart == -1) {
            return null;
        }

        int idxEnd = indexOf(pictureBytes, idxStart, "%%EOF".getBytes(LocaleUtil.CHARSET_1252));
        if (idxEnd == -1) {
            return null;
        }

        // the 5 marker bytes plus one following byte (presumably the line break) are kept,
        // but never more than the picture actually contains
        int idxLimit = Math.min(idxEnd + 6, pictureBytes.length);
        int pictureBytesLen = idxLimit - idxStart;
        byte[] pdfBytes = new byte[pictureBytesLen];
        System.arraycopy(pictureBytes, idxStart, pdfBytes, 0, pictureBytesLen);

        String shapeName = source.getShapeName();
        String filename = (shapeName == null) ? "" : shapeName.trim();
        if (!filename.toLowerCase(Locale.ROOT).endsWith(".pdf")) {
            filename += ".pdf";
        }

        return new EmbeddedData(filename, pdfBytes, "application/pdf");
    }
}
/**
 * Extractor for binary Word documents, recognized by the Word 95/97 class ids
 * or by a {@code WordDocument} entry.
 */
static class WordExtractor extends EmbeddedExtractor {
    @Override
    public boolean canExtract(DirectoryNode dn) {
        final ClassID storageId = dn.getStorageClsid();
        if (ClassID.WORD95.equals(storageId) || ClassID.WORD97.equals(storageId)) {
            return true;
        }
        return dn.hasEntry("WordDocument");
    }

    /**
     * Copies the node via the base implementation and names the result {@code <node name>.doc}.
     */
    @Override
    public EmbeddedData extract(DirectoryNode dn) throws IOException {
        final EmbeddedData document = super.extract(dn);
        document.setFilename(dn.getName() + ".doc");
        return document;
    }
}
/**
 * Extractor for binary Excel workbooks, recognized by the Excel 95/97 class ids
 * or by a {@code Workbook} entry.
 */
static class ExcelExtractor extends EmbeddedExtractor {
    @Override
    public boolean canExtract(DirectoryNode dn) {
        final ClassID storageId = dn.getStorageClsid();
        if (ClassID.EXCEL95.equals(storageId) || ClassID.EXCEL97.equals(storageId)) {
            return true;
        }
        // further entry names could be checked here
        return dn.hasEntry("Workbook");
    }

    /**
     * Copies the node via the base implementation and names the result {@code <node name>.xls}.
     */
    @Override
    public EmbeddedData extract(DirectoryNode dn) throws IOException {
        final EmbeddedData workbook = super.extract(dn);
        workbook.setFilename(dn.getName() + ".xls");
        return workbook;
    }
}
/**
 * Catch-all extractor: accepts every directory node and stores it as a
 * generic OLE2 file.
 */
static class FsExtractor extends EmbeddedExtractor {
    @Override
    public boolean canExtract(DirectoryNode dn) {
        return true;
    }

    /**
     * Copies the node via the base implementation and names the result {@code <node name>.ole}.
     */
    @Override
    public EmbeddedData extract(DirectoryNode dn) throws IOException {
        final EmbeddedData generic = super.extract(dn);
        generic.setFilename(dn.getName() + ".ole");
        // TODO: read the content type from CombObj stream
        return generic;
    }
}
/**
 * Recursively copies all entries of {@code src} into {@code dest}.
 * Sub directories are recreated with the same name and storage class id,
 * documents are copied by streaming their content.
 *
 * @param src the directory to copy from
 * @param dest the directory to copy into
 * @throws IOException if an entry can't be read or created
 */
protected static void copyNodes(DirectoryNode src, DirectoryNode dest) throws IOException {
    for (Entry e : src) {
        if (e instanceof DirectoryNode) {
            DirectoryNode srcDir = (DirectoryNode)e;
            DirectoryNode destDir = (DirectoryNode)dest.createDirectory(srcDir.getName());
            destDir.setStorageClsid(srcDir.getStorageClsid());
            copyNodes(srcDir, destDir);
        } else {
            InputStream is = src.createDocumentInputStream(e);
            try {
                dest.createDocument(e.getName(), is);
            } finally {
                // close the source stream even if creating the document fails
                is.close();
            }
        }
    }
}
/**
 * Knuth-Morris-Pratt Algorithm for Pattern Matching
 * Finds the first occurrence of the pattern in the text.
 *
 * @param data the bytes to search in
 * @param offset the index to start searching at, negative values are treated as 0
 * @param pattern the bytes to search for
 * @return the index of the first match at or after {@code offset}, or -1 if there is none
 *  or {@code data} is empty; an empty pattern matches at the start position as long as
 *  that position is not behind the end of {@code data}
 */
private static int indexOf(byte[] data, int offset, byte[] pattern) {
    if (data.length == 0) return -1;

    int start = Math.max(offset, 0);
    if (pattern.length == 0) {
        // nothing to compare - avoids reading pattern[0] below
        return (start <= data.length) ? start : -1;
    }

    int[] failure = computeFailure(pattern);

    // j is the number of pattern bytes matched so far
    int j = 0;
    for (int i = start; i < data.length; i++) {
        while (j > 0 && pattern[j] != data[i]) {
            // mismatch: fall back to the longest prefix that is still matched
            j = failure[j - 1];
        }
        if (pattern[j] == data[i]) { j++; }
        if (j == pattern.length) {
            return i - pattern.length + 1;
        }
    }
    return -1;
}
/**
 * Builds the KMP failure table by matching the pattern against itself.
 * Entry {@code i} is the length of the longest proper prefix of the pattern
 * which is also a suffix of {@code pattern[0..i]}.
 *
 * @param pattern the search pattern
 * @return the failure table, one entry per pattern byte
 */
private static int[] computeFailure(byte[] pattern) {
    final int[] table = new int[pattern.length];

    int matched = 0;
    int pos = 1;
    while (pos < pattern.length) {
        final byte current = pattern[pos];
        while (matched > 0 && pattern[matched] != current) {
            matched = table[matched - 1];
        }
        if (pattern[matched] == current) {
            matched++;
        }
        table[pos] = matched;
        pos++;
    }

    return table;
}
}

View File

@ -20,8 +20,10 @@ package org.apache.poi.xssf.usermodel;
import static org.apache.poi.POIXMLTypeLoader.DEFAULT_XML_OPTIONS;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import javax.xml.namespace.QName;
@ -32,13 +34,21 @@ import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.ss.usermodel.ClientAnchor;
import org.apache.poi.ss.usermodel.Drawing;
import org.apache.poi.ss.util.CellAddress;
import org.apache.poi.ss.util.ImageUtils;
import org.apache.poi.util.Internal;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
import org.apache.poi.util.Units;
import org.apache.poi.xssf.model.CommentsTable;
import org.apache.xmlbeans.XmlCursor;
import org.apache.xmlbeans.XmlException;
import org.apache.xmlbeans.XmlObject;
import org.apache.xmlbeans.XmlOptions;
import org.apache.xmlbeans.impl.values.XmlAnyTypeImpl;
import org.openxmlformats.schemas.drawingml.x2006.main.CTGroupTransform2D;
import org.openxmlformats.schemas.drawingml.x2006.main.CTPoint2D;
import org.openxmlformats.schemas.drawingml.x2006.main.CTPositiveSize2D;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTransform2D;
import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTConnector;
import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTDrawing;
import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTGraphicalObjectFrame;
@ -53,7 +63,9 @@ import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.STEditAs;
/**
* Represents a SpreadsheetML drawing
*/
public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing {
public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing<XSSFShape> {
private static final POILogger LOG = POILogFactory.getLogger(XSSFDrawing.class);
/**
* Root element of the SpreadsheetML Drawing part
*/
@ -86,7 +98,12 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing {
XmlOptions options = new XmlOptions(DEFAULT_XML_OPTIONS);
//Removing root element
options.setLoadReplaceDocumentElement(null);
drawing = CTDrawing.Factory.parse(part.getInputStream(),options);
InputStream is = part.getInputStream();
try {
drawing = CTDrawing.Factory.parse(is,options);
} finally {
is.close();
}
}
/**
@ -176,6 +193,8 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing {
XSSFPicture shape = new XSSFPicture(this, ctShape);
shape.anchor = anchor;
shape.setPictureReference(rel);
ctShape.getSpPr().setXfrm(createXfrm(anchor));
return shape;
}
@ -202,6 +221,7 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing {
XSSFGraphicFrame frame = createGraphicFrame(anchor);
frame.setChart(chart, chartRelId);
frame.getCTGraphicalObjectFrame().setXfrm(createXfrm(anchor));
return chart;
}
@ -241,6 +261,7 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing {
CTShape ctShape = ctAnchor.addNewSp();
ctShape.set(XSSFSimpleShape.prototype());
ctShape.getNvSpPr().getCNvPr().setId(shapeId);
ctShape.getSpPr().setXfrm(createXfrm(anchor));
XSSFSimpleShape shape = new XSSFSimpleShape(this, ctShape);
shape.anchor = anchor;
return shape;
@ -278,6 +299,11 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing {
CTTwoCellAnchor ctAnchor = createTwoCellAnchor(anchor);
CTGroupShape ctGroup = ctAnchor.addNewGrpSp();
ctGroup.set(XSSFShapeGroup.prototype());
CTTransform2D xfrm = createXfrm(anchor);
CTGroupTransform2D grpXfrm =ctGroup.getGrpSpPr().getXfrm();
grpXfrm.setOff(xfrm.getOff());
grpXfrm.setExt(xfrm.getExt());
grpXfrm.setChExt(xfrm.getExt());
XSSFShapeGroup shape = new XSSFShapeGroup(this, ctGroup);
shape.anchor = anchor;
@ -333,6 +359,7 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing {
CTTwoCellAnchor ctAnchor = createTwoCellAnchor(anchor);
CTGraphicalObjectFrame ctGraphicFrame = ctAnchor.addNewGraphicFrame();
ctGraphicFrame.set(XSSFGraphicFrame.prototype());
ctGraphicFrame.setXfrm(createXfrm(anchor));
long frameId = numOfGraphicFrames++;
XSSFGraphicFrame graphicFrame = new XSSFGraphicFrame(this, ctGraphicFrame);
@ -378,39 +405,159 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing {
return ctAnchor;
}
/**
 * Builds a 2D transform for the anchor: the offset is taken from dx1/dy1, the
 * extent is the pixel size of the columns/rows between the two anchor cells
 * converted to EMU, reduced by dx1/dy1 and enlarged by dx2/dy2.
 *
 * @param anchor the anchor of the shape
 * @return a new transform holding offset and extent
 */
private CTTransform2D createXfrm(XSSFClientAnchor anchor) {
    final XSSFSheet sheet = (XSSFSheet)getParent();

    // sum up the columns from col1 (inclusive) to col2 (exclusive)
    double pixelWidth = 0;
    int col = anchor.getCol1();
    while (col < anchor.getCol2()) {
        pixelWidth += sheet.getColumnWidthInPixels(col);
        col++;
    }

    // sum up the rows from row1 (inclusive) to row2 (exclusive)
    double pixelHeight = 0;
    int row = anchor.getRow1();
    while (row < anchor.getRow2()) {
        pixelHeight += ImageUtils.getRowHeightInPixels(sheet, row);
        row++;
    }

    final int emuWidth = Units.pixelToEMU((int)pixelWidth);
    final int emuHeight = Units.pixelToEMU((int)pixelHeight);

    final CTTransform2D xfrm = CTTransform2D.Factory.newInstance();

    final CTPoint2D offset = xfrm.addNewOff();
    offset.setX(anchor.getDx1());
    offset.setY(anchor.getDy1());

    final CTPositiveSize2D extent = xfrm.addNewExt();
    extent.setCx(emuWidth - anchor.getDx1() + anchor.getDx2());
    extent.setCy(emuHeight - anchor.getDy1() + anchor.getDy2());

    // TODO: handle vflip/hflip
    return xfrm;
}
/**
* Derives the id for a new shape from the number of two-cell anchors
* currently in the drawing.
*
* @return the number of two-cell anchors plus one
*/
private long newShapeId(){
return drawing.sizeOfTwoCellAnchorArray() + 1;
}
/**
*
* @return list of shapes in this drawing
*/
public List<XSSFShape> getShapes(){
List<XSSFShape> lst = new ArrayList<XSSFShape>();
for(XmlObject obj : drawing.selectPath("./*/*")) {
XSSFShape shape = null;
if(obj instanceof CTPicture) shape = new XSSFPicture(this, (CTPicture)obj) ;
else if(obj instanceof CTConnector) shape = new XSSFConnector(this, (CTConnector)obj) ;
else if(obj instanceof CTShape) shape = new XSSFSimpleShape(this, (CTShape)obj) ;
else if(obj instanceof CTGraphicalObjectFrame) shape = new XSSFGraphicFrame(this, (CTGraphicalObjectFrame)obj) ;
else if(obj instanceof CTGroupShape) shape = new XSSFShapeGroup(this, (CTGroupShape)obj) ;
if(shape != null){
shape.anchor = getAnchorFromParent(obj);
lst.add(shape);
XmlCursor cur = drawing.newCursor();
try {
if (cur.toFirstChild()) {
addShapes(cur, lst);
}
} finally {
cur.dispose();
}
return lst;
}
/**
 * Collects the shapes found below the given group shape.
 *
 * @param groupshape the group whose content is listed
 * @return list of shapes in this shape group
 */
public List<XSSFShape> getShapes(XSSFShapeGroup groupshape){
    final List<XSSFShape> children = new ArrayList<XSSFShape>();
    final XmlCursor groupCursor = groupshape.getCTGroupShape().newCursor();
    try {
        addShapes(groupCursor, children);
    } finally {
        groupCursor.dispose();
    }
    return children;
}
/**
* Walks the element the cursor points to and all of its following siblings and,
* for each of them, inspects the direct child elements. Every child of a known
* shape type is wrapped into the matching XSSF shape, gets the anchor of its
* parent element assigned and is appended to {@code lst}.
* <p>
* Children of type {@code XmlAnyTypeImpl} are re-parsed as {@code CTDrawing} and
* processed recursively; shapes found that way are not linked to the underlying
* xml content anymore (see the logged warning).
* <p>
* The cursor is disposed by this method when it returns.
*
* @param cur the cursor positioned on the first element to inspect
* @param lst the list receiving the shapes
*/
private void addShapes(XmlCursor cur, List<XSSFShape> lst) {
try {
do {
// remember the current element, so the cursor can return to it after visiting the children
cur.push();
if (cur.toFirstChild()) {
do {
XmlObject obj = cur.getObject();
XSSFShape shape;
if (obj instanceof CTMarker) {
// ignore anchor elements
continue;
} else if (obj instanceof CTPicture) {
shape = new XSSFPicture(this, (CTPicture)obj) ;
} else if(obj instanceof CTConnector) {
shape = new XSSFConnector(this, (CTConnector)obj) ;
} else if(obj instanceof CTShape) {
// shapes carrying the ole extension entry are exposed as object data
shape = hasOleLink(obj)
? new XSSFObjectData(this, (CTShape)obj)
: new XSSFSimpleShape(this, (CTShape)obj) ;
} else if(obj instanceof CTGraphicalObjectFrame) {
shape = new XSSFGraphicFrame(this, (CTGraphicalObjectFrame)obj) ;
} else if(obj instanceof CTGroupShape) {
shape = new XSSFShapeGroup(this, (CTGroupShape)obj) ;
} else if(obj instanceof XmlAnyTypeImpl) {
LOG.log(POILogger.WARN, "trying to parse AlternateContent, "
+ "this unlinks the returned Shapes from the underlying xml content, "
+ "so those shapes can't be used to modify the drawing, "
+ "i.e. modifications will be ignored!");
// XmlAnyTypeImpl is returned for AlternateContent parts, which might contain a CTDrawing
cur.push();
cur.toFirstChild();
XmlCursor cur2 = null;
try {
// need to parse AlternateContent again, otherwise the child elements aren't typed,
// but also XmlAnyTypes
CTDrawing alterWS = CTDrawing.Factory.parse(cur.newXMLStreamReader());
cur2 = alterWS.newCursor();
if (cur2.toFirstChild()) {
// NOTE(review): the recursive call disposes cur2 in its own finally block,
// the dispose below is then a second call on the same cursor - confirm that this is harmless
addShapes(cur2, lst);
}
} catch (XmlException e) {
LOG.log(POILogger.WARN, "unable to parse CTDrawing in alternate content.", e);
} finally {
if (cur2 != null) {
cur2.dispose();
}
// back to the AlternateContent element
cur.pop();
}
continue;
} else {
// ignore anything else
continue;
}
assert(shape != null);
shape.anchor = getAnchorFromParent(obj);
lst.add(shape);
// "continue" above jumps to this loop condition, i.e. it advances to the next sibling
} while (cur.toNextSibling());
}
// return to the element remembered by the push at the top of the outer loop
cur.pop();
} while (cur.toNextSibling());
} finally {
cur.dispose();
}
}
/**
 * Checks whether the shape contains - at any depth - a DrawingML extension list
 * entry ({@code a:extLst/a:ext}) whose {@code uri} attribute is the id this
 * class treats as the marker of an OLE link.
 *
 * @param shape the xml object of the shape
 * @return {@code true} if such an extension entry exists
 */
private boolean hasOleLink(XmlObject shape) {
    QName uriName = new QName(null, "uri");
    String xquery = "declare namespace a='"+XSSFRelation.NS_DRAWINGML+"' .//a:extLst/a:ext";
    XmlCursor cur = shape.newCursor();
    try {
        // selecting inside the try makes sure the cursor is disposed even if the query fails
        cur.selectPath(xquery);
        while (cur.toNextSelection()) {
            String uri = cur.getAttributeText(uriName);
            if ("{63B3BB69-23CF-44E3-9099-C40C66FF867C}".equals(uri)) {
                return true;
            }
        }
    } finally {
        cur.dispose();
    }
    return false;
}
private XSSFAnchor getAnchorFromParent(XmlObject obj){
XSSFAnchor anchor = null;
XmlObject parentXbean = null;
XmlCursor cursor = obj.newCursor();
if(cursor.toParent()) parentXbean = cursor.getObject();
if(cursor.toParent()) {
parentXbean = cursor.getObject();
}
cursor.dispose();
if(parentXbean != null){
if (parentXbean instanceof CTTwoCellAnchor) {
@ -424,4 +571,8 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing {
return anchor;
}
/**
 * Iterates over a newly built list of the shapes of this drawing.
 *
 * @return an iterator over the result of {@link #getShapes()}
 */
@Override
public Iterator<XSSFShape> iterator() {
    final List<XSSFShape> shapes = getShapes();
    return shapes.iterator();
}
}

View File

@ -0,0 +1,169 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xssf.usermodel;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import javax.xml.namespace.QName;
import org.apache.poi.POIXMLDocumentPart;
import org.apache.poi.POIXMLException;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.ss.usermodel.ObjectData;
import org.apache.poi.util.IOUtils;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
import org.apache.xmlbeans.XmlCursor;
import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShape;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTOleObject;
/**
* Represents binary object (i.e. OLE) data stored in the file. Eg. A GIF, JPEG etc...
*/
public class XSSFObjectData extends XSSFSimpleShape implements ObjectData {
private static final POILogger LOG = POILogFactory.getLogger(XSSFObjectData.class);
/**
* A default instance of CTShape used for creating new shapes.
*/
private static CTShape prototype = null;
private CTOleObject oleObject;
protected XSSFObjectData(XSSFDrawing drawing, CTShape ctShape) {
super(drawing, ctShape);
}
/**
* Prototype with the default structure of a new auto-shape.
*/
protected static CTShape prototype() {
if(prototype == null) {
prototype = XSSFSimpleShape.prototype();
}
return prototype;
}
@Override
public String getOLE2ClassName() {
return getOleObject().getProgId();
}
/**
* @return the CTOleObject associated with the shape
*/
public CTOleObject getOleObject() {
if (oleObject == null) {
long shapeId = getCTShape().getNvSpPr().getCNvPr().getId();
oleObject = getSheet().readOleObject(shapeId);
if (oleObject == null) {
throw new POIXMLException("Ole object not found in sheet container - it's probably a control element");
}
}
return oleObject;
}
@Override
public byte[] getObjectData() throws IOException {
InputStream is = getObjectPart().getInputStream();
ByteArrayOutputStream bos = new ByteArrayOutputStream();
IOUtils.copy(is, bos);
is.close();
return bos.toByteArray();
}
/**
* @return the package part of the object data
*/
public PackagePart getObjectPart() {
if (!getOleObject().isSetId()) {
throw new POIXMLException("Invalid ole object found in sheet container");
}
POIXMLDocumentPart pdp = getSheet().getRelationById(getOleObject().getId());
return (pdp == null) ? null : pdp.getPackagePart();
}
@Override
public boolean hasDirectoryEntry() {
InputStream is = null;
try {
is = getObjectPart().getInputStream();
// If clearly doesn't do mark/reset, wrap up
if (! is.markSupported()) {
is = new PushbackInputStream(is, 8);
}
// Ensure that there is at least some data there
byte[] header8 = IOUtils.peekFirst8Bytes(is);
// Try to create
return NPOIFSFileSystem.hasPOIFSHeader(header8);
} catch (IOException e) {
LOG.log(POILogger.WARN, "can't determine if directory entry exists", e);
return false;
} finally {
IOUtils.closeQuietly(is);
}
}
/**
 * Opens the object part as a POIFS filesystem and returns its root entry.
 * The part's input stream is closed before returning; the filesystem object
 * itself is not closed here, hence the suppressed resource warning.
 *
 * @return the root directory of the embedded filesystem
 * @throws IOException propagated from opening or reading the part's stream
 */
@Override
@SuppressWarnings("resource")
public DirectoryEntry getDirectory() throws IOException {
    InputStream in = null;
    try {
        in = getObjectPart().getInputStream();
        final POIFSFileSystem fs = new POIFSFileSystem(in);
        return fs.getRoot();
    } finally {
        IOUtils.closeQuietly(in);
    }
}
/**
 * @return the name of the package part that holds the object data
 */
@Override
public String getFileName() {
    final PackagePart part = getObjectPart();
    return part.getPartName().getName();
}
/**
 * @return the parent of this shape's drawing, cast to {@link XSSFSheet}
 */
protected XSSFSheet getSheet() {
    final Object parent = getDrawing().getParent();
    return (XSSFSheet) parent;
}
/**
 * Looks up the picture referenced by the {@code objectPr} child of the OLE
 * object.
 *
 * @return the picture data the drawing relates to the {@code id} attribute
 *         of {@code objectPr}, or {@code null} if the OLE object has no
 *         {@code objectPr} child
 */
@Override
public XSSFPictureData getPictureData() {
    final XmlCursor cursor = getOleObject().newCursor();
    try {
        if (!cursor.toChild(XSSFRelation.NS_SPREADSHEETML, "objectPr")) {
            return null;
        }
        final QName idAttr = new QName(PackageRelationshipTypes.CORE_PROPERTIES_ECMA376_NS, "id");
        final String blipId = cursor.getAttributeText(idAttr);
        return (XSSFPictureData) getDrawing().getRelationById(blipId);
    } finally {
        cursor.dispose();
    }
}
}

View File

@ -40,6 +40,8 @@ import java.util.SortedMap;
import java.util.TreeMap;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.poi.POIXMLDocumentPart;
import org.apache.poi.POIXMLException;
@ -86,7 +88,9 @@ import org.apache.poi.xssf.usermodel.XSSFPivotTable.PivotTableReferenceConfigura
import org.apache.poi.xssf.usermodel.helpers.ColumnHelper;
import org.apache.poi.xssf.usermodel.helpers.XSSFIgnoredErrorHelper;
import org.apache.poi.xssf.usermodel.helpers.XSSFRowShifter;
import org.apache.xmlbeans.XmlCursor;
import org.apache.xmlbeans.XmlException;
import org.apache.xmlbeans.XmlObject;
import org.apache.xmlbeans.XmlOptions;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.*;
@ -4371,4 +4375,64 @@ public class XSSFSheet extends POIXMLDocumentPart implements Sheet {
CTIgnoredError ctIgnoredError = ctIgnoredErrors.addNewIgnoredError();
XSSFIgnoredErrorHelper.addIgnoredErrors(ctIgnoredError, ref, ignoredErrorTypes);
}
/**
 * Finds the OleObject entry that links a shape with its embedded resource.
 * <p>
 * All {@code oleObject} elements below the worksheet's {@code oleObjects}
 * element are visited, whether or not they are wrapped in AlternateContent.
 * Of the entries matching the shape id, the search stops at the first one
 * that has an {@code objectPr} child; otherwise the last usable match wins.
 *
 * @param shapeId the shape id
 * @return the CTOleObject of the shape, or {@code null} if the worksheet has
 *         no oleObjects element or none of its entries matches
 */
protected CTOleObject readOleObject(long shapeId) {
    if (!getCTWorksheet().isSetOleObjects()) {
        return null;
    }

    // a cursor with a path query copes with oleObject elements with-/out AlternateContent wrappers
    final String oleObjectPath = "declare namespace p='"+XSSFRelation.NS_SPREADSHEETML+"' .//p:oleObject";
    final QName shapeIdAttr = new QName(null, "shapeId");
    final XmlCursor cursor = getCTWorksheet().getOleObjects().newCursor();
    try {
        cursor.selectPath(oleObjectPath);
        CTOleObject match = null;
        while (cursor.toNextSelection()) {
            final String idText = cursor.getAttributeText(shapeIdAttr);
            final boolean sameShape = (idText != null) && (Long.parseLong(idText) == shapeId);
            if (!sameShape) {
                continue;
            }

            final XmlObject selected = cursor.getObject();
            if (selected instanceof CTOleObject) {
                // the unusual case: the selection is already typed
                match = (CTOleObject) selected;
            } else {
                // otherwise re-parse the selected element from a stream reader
                final XMLStreamReader xmlReader = cursor.newXMLStreamReader();
                try {
                    final CTOleObjects parsed = CTOleObjects.Factory.parse(xmlReader);
                    if (parsed.sizeOfOleObjectArray() == 0) {
                        continue;
                    }
                    match = parsed.getOleObjectArray(0);
                } catch (XmlException e) {
                    logger.log(POILogger.INFO, "can't parse CTOleObjects", e);
                } finally {
                    try {
                        xmlReader.close();
                    } catch (XMLStreamException e) {
                        logger.log(POILogger.INFO, "can't close reader", e);
                    }
                }
            }

            // choice and fallback variants both exist; the one carrying objectPr
            // (the choice element) is preferred, so stop as soon as it is seen
            if (cursor.toChild(XSSFRelation.NS_SPREADSHEETML, "objectPr")) {
                break;
            }
        }
        return match;
    } finally {
        cursor.dispose();
    }
}
}

View File

@ -0,0 +1,115 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.ss.extractor;
import static org.junit.Assert.assertEquals;
import java.io.IOException;
import java.io.InputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.List;
import javax.xml.bind.DatatypeConverter;
import org.apache.poi.EncryptedDocumentException;
import org.apache.poi.POIDataSamples;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
import org.junit.Test;
/**
 * Tests for {@link EmbeddedExtractor}: extracts the embedded objects of sample
 * workbooks and checks their file names, shape names and content hashes.
 */
public class TestEmbeddedExtractor {
    private static final POIDataSamples samples = POIDataSamples.getSpreadSheetInstance();

    @Test
    public void extractPDFfromEMF() throws Exception {
        List<EmbeddedData> edList = extractAll("Basic_Expense_Template_2011.xls");
        assertEquals(2, edList.size());

        String filename1 = "Sample.pdf";
        assertEmbedded(edList.get(0), filename1, filename1, "uNplB1QpYug+LWappiTh0w==");

        String filename2 = "kalastuslupa_jiyjhnj_yuiyuiyuio_uyte_sldfsdfsdf_sfsdfsdf_sfsssfsf_sdfsdfsdfsdf_sdfsdfsdf.pdf";
        assertEmbedded(edList.get(1), filename2, filename2, "QjLuAZ+cd7KbhVz4sj+QdA==");
    }

    @Test
    public void extractFromXSSF() throws IOException, EncryptedDocumentException, InvalidFormatException {
        List<EmbeddedData> edList = extractAll("58325_db.xlsx");
        assertEquals(4, edList.size());

        assertEmbedded(edList.get(0), "Object 1.pdf", "Object 1", "Oyys6UtQU1gbHYBYqA4NFA==");
        assertEmbedded(edList.get(1), "Object 2.pdf", "Object 2", "xLScPUS0XH+5CTZ2A3neNw==");
        assertEmbedded(edList.get(2), "Object 3.pdf", "Object 3", "rX4klZqJAeM5npb54Gi2+Q==");
        assertEmbedded(edList.get(3), "Microsoft_Excel_Worksheet1.xlsx", "Object 1", "4m4N8ji2tjpEGPQuw2YwGA==");
    }

    /**
     * Opens a sample workbook and collects the embedded data of all its sheets.
     * The input stream and the workbook are closed even if extraction fails.
     */
    private static List<EmbeddedData> extractAll(String sampleName)
            throws IOException, EncryptedDocumentException, InvalidFormatException {
        Workbook wb;
        InputStream fis = samples.openResourceAsStream(sampleName);
        try {
            wb = WorkbookFactory.create(fis);
        } finally {
            fis.close();
        }

        try {
            EmbeddedExtractor ee = new EmbeddedExtractor();
            List<EmbeddedData> edList = new ArrayList<EmbeddedData>();
            for (Sheet s : wb) {
                edList.addAll(ee.extractAll(s));
            }
            return edList;
        } finally {
            wb.close();
        }
    }

    /**
     * Asserts file name, trimmed shape name and MD5 hash of one extracted entry.
     */
    private static void assertEmbedded(EmbeddedData ed, String filename, String shapeName, String md5) {
        assertEquals(filename, ed.getFilename());
        assertEquals(shapeName, ed.getShape().getShapeName().trim());
        assertEquals(md5, md5hash(ed.getEmbeddedData()));
    }

    /**
     * Computes the MD5 digest of the input.
     *
     * @param input the bytes to hash
     * @return the digest, Base64 encoded
     * @throws IllegalStateException if the platform offers no MD5 digest
     */
    public static String md5hash(byte[] input) {
        try {
            MessageDigest md = MessageDigest.getInstance("MD5");
            byte[] hash = md.digest(input);
            return DatatypeConverter.printBase64Binary(hash);
        } catch (NoSuchAlgorithmException e) {
            // fail loudly: an empty string would only show up as a confusing hash mismatch
            throw new IllegalStateException("MD5 digest is not available", e);
        }
    }
}

Binary file not shown.