From bug #38616 - support for extracting images from word files, plus tests for this

git-svn-id: https://svn.apache.org/repos/asf/jakarta/poi/trunk@450066 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2006-09-26 14:46:39 +00:00
parent 31e27577bd
commit 34878d8f01
8 changed files with 645 additions and 0 deletions

View File

@ -89,6 +89,9 @@ public class HWPFDocument extends POIDocument
/** Holds the save history for this document. */
protected SavedByTable _sbt;
/** Holds pictures table */
protected PicturesTable _pictures;
protected HWPFDocument()
{
@ -185,6 +188,9 @@ public class HWPFDocument extends POIDocument
_dataStream = new byte[0];
}
// read in the pictures stream
_pictures = new PicturesTable(_dataStream);
// get the start of text in the main stream
int fcMin = _fib.getFcMin();
@ -288,6 +294,13 @@ public class HWPFDocument extends POIDocument
return _sbt;
}
/**
 * Returns the pictures table that was created from this document's data
 * stream while the document was being read.
 *
 * @return the PicturesTable object, which is able to extract images from
 *         this document
 */
public PicturesTable getPicturesTable() {
return _pictures;
}
/**
* Writes out the word file that is represented by an instance of this class.
*

View File

@ -0,0 +1,152 @@
/* ====================================================================
Copyright 2002-2006 Apache Software Foundation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.model;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Picture;
import java.util.List;
import java.util.ArrayList;
/**
 * Holds information about all pictures embedded in a Word document, either via
 * "Insert -> Picture -> From File" or via the clipboard. Responsible for image
 * extraction and for determining whether some piece of the document contains
 * an embedded image. Analyzes the raw "Data" byte stream (where Word stores all
 * embedded objects) provided by HWPFDocument.
 *
 * Word stores images as is within the so called "Data stream" - the stream
 * within a Word docfile containing various data that hang off of characters in
 * the main stream. For example, binary data describing in-line pictures and/or
 * form fields, and also the native data of embedded objects. Word picture
 * structures are concatenated one after the other in the data stream if the
 * document contains pictures.
 * The data stream is easily reachable via the HWPFDocument._dataStream property.
 * A picture is represented in the document text stream as a special character,
 * a Unicode \u0001 whose CharacterRun.isSpecialCharacter() returns true. The
 * file location of the picture in the Word binary file is accessed via
 * CharacterRun.getPicOffset(), which is a byte offset into the data stream.
 * Beginning at the position recorded in picOffset, a header data structure
 * will be stored.
 *
 * @author Dmitry Romanov
 */
public class PicturesTable
{
  static final int TYPE_IMAGE = 0x08;
  static final int TYPE_IMAGE_WORD2000 = 0x00;
  static final int TYPE_IMAGE_PASTED_FROM_CLIPBOARD = 0xA;
  static final int TYPE_IMAGE_PASTED_FROM_CLIPBOARD_WORD2000 = 0x2;
  static final int TYPE_HORIZONTAL_LINE = 0xE;
  static final int BLOCK_TYPE_OFFSET = 0xE;
  static final int MM_MODE_TYPE_OFFSET = 0x6;

  /**
   * Mapping mode value required for Word 2000 picture blocks and for
   * horizontal lines.
   * NOTE(review): presumably MM_SHAPE from the [MS-DOC] MFPF structure - confirm.
   */
  private static final int MM_MODE_SHAPE = 0x64;

  /**
   * Number of bytes that must be available at a block start so that the block
   * length (int at 0), the mapping mode (short at 0x6) and the block type
   * (short at 0xE) can all be read.
   */
  private static final int MIN_HEADER_SIZE = BLOCK_TYPE_OFFSET + 2;

  private byte[] _dataStream;

  /** @link dependency
   * @stereotype instantiate*/
  /*# Picture lnkPicture; */

  /**
   * Creates a pictures table over the given "Data" stream.
   *
   * @param _dataStream the raw bytes of the document's data stream; an empty
   *        array means the document has no pictures
   */
  public PicturesTable(byte[] _dataStream)
  {
    this._dataStream = _dataStream;
  }

  /**
   * Determines whether the specified CharacterRun contains a reference to a
   * picture.
   *
   * @param run the character run to inspect
   * @return true if the run is a lone \u0001 special character (not an object,
   *         OLE2 or data run) whose picture offset points at a recognized
   *         picture block inside the data stream; false otherwise, including
   *         when the offset lies outside the data stream
   */
  public boolean hasPicture(CharacterRun run) {
    if (run.isSpecialCharacter() && !run.isObj() && !run.isOle2() && !run.isData() && "\u0001".equals(run.text())) {
      return isBlockContainsImage(run.getPicOffset());
    }
    return false;
  }

  /**
   * Determines whether the specified CharacterRun contains a reference to a
   * horizontal line.
   *
   * @param run the character run to inspect
   * @return true if the run is a lone \u0001 special character whose picture
   *         offset points at a horizontal line block inside the data stream;
   *         false otherwise, including when the offset lies outside the data
   *         stream
   */
  public boolean hasHorizontalLine(CharacterRun run) {
    if (run.isSpecialCharacter() && "\u0001".equals(run.text())) {
      return isBlockContainsHorizontalLine(run.getPicOffset());
    }
    return false;
  }

  /**
   * Decides whether a block type / mapping mode pair denotes a picture this
   * class knows how to extract. The Word 2000 block types only count when the
   * mapping mode is {@link #MM_MODE_SHAPE}.
   */
  private static boolean isPictureRecognized(short blockType, short mappingModeOfMETAFILEPICT) {
    if (blockType == TYPE_IMAGE || blockType == TYPE_IMAGE_PASTED_FROM_CLIPBOARD) {
      return true;
    }
    boolean word2000Type = blockType == TYPE_IMAGE_WORD2000
        || blockType == TYPE_IMAGE_PASTED_FROM_CLIPBOARD_WORD2000;
    return word2000Type && mappingModeOfMETAFILEPICT == MM_MODE_SHAPE;
  }

  /** Reads the block type short of the block starting at pictOffset. */
  private static short getBlockType(byte[] dataStream, int pictOffset) {
    return LittleEndian.getShort(dataStream, pictOffset + BLOCK_TYPE_OFFSET);
  }

  /** Reads the mapping mode short of the block starting at pictOffset. */
  private static short getMmMode(byte[] dataStream, int pictOffset) {
    return LittleEndian.getShort(dataStream, pictOffset + MM_MODE_TYPE_OFFSET);
  }

  /**
   * Returns the picture object tied to the specified CharacterRun.
   *
   * @param run the character run that may reference a picture
   * @param fillBytes if true, the Picture will be returned with a filled byte
   *        array that represents the picture's contents. If you don't want to
   *        have that byte array in memory but only want to write the picture's
   *        contents to a stream, pass false and then use
   *        Picture.writeImageContent
   * @see Picture#writeImageContent(java.io.OutputStream)
   * @return a Picture object if a picture exists for the specified
   *         CharacterRun, null otherwise. PicturesTable.hasPicture is used to
   *         determine this.
   * @see #hasPicture(org.apache.poi.hwpf.usermodel.CharacterRun)
   */
  public Picture extractPicture(CharacterRun run, boolean fillBytes) {
    if (hasPicture(run)) {
      return new Picture(run.getPicOffset(), _dataStream, fillBytes);
    }
    return null;
  }

  /**
   * Walks the data stream block by block, using the length stored in the
   * first four bytes of every block to find the next one, and collects every
   * block recognized as a picture. The walk stops at the first block whose
   * header does not fit into the remaining bytes, or whose length is not
   * positive or reaches past the end of the stream.
   *
   * @return a list of Picture objects found in the current document; empty if
   *         there are none
   */
  public List getAllPictures() {
    List pictures = new ArrayList();

    int pos = 0;
    while (isHeaderReadable(pos)) {
      if (isBlockContainsImage(pos)) {
        pictures.add(new Picture(pos, _dataStream, false));
      }

      int skipOn = LittleEndian.getInt(_dataStream, pos);
      // Comparing against the remaining byte count (instead of adding first)
      // keeps a corrupt, huge length from overflowing pos into a negative value.
      if (skipOn <= 0 || skipOn > _dataStream.length - pos) {
        break;
      }
      pos += skipOn;
    }

    return pictures;
  }

  /**
   * @return true if a complete block header can be read at the given offset
   *         without leaving the data stream
   */
  private boolean isHeaderReadable(int offset)
  {
    return offset >= 0 && offset <= _dataStream.length - MIN_HEADER_SIZE;
  }

  /**
   * @return true if the block starting at offset i lies inside the data
   *         stream and is a recognized picture block
   */
  private boolean isBlockContainsImage(int i)
  {
    return isHeaderReadable(i)
        && isPictureRecognized(getBlockType(_dataStream, i), getMmMode(_dataStream, i));
  }

  /**
   * @return true if the block starting at offset i lies inside the data
   *         stream and is a horizontal line block
   */
  private boolean isBlockContainsHorizontalLine(int i)
  {
    return isHeaderReadable(i)
        && getBlockType(_dataStream, i) == TYPE_HORIZONTAL_LINE
        && getMmMode(_dataStream, i) == MM_MODE_SHAPE;
  }
}

View File

@ -0,0 +1,350 @@
/* ====================================================================
Copyright 2002-2006 Apache Software Foundation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.usermodel;
import org.apache.poi.util.LittleEndian;
import java.io.OutputStream;
import java.io.IOException;
/**
 * Represents an embedded picture extracted from a Word document.
 *
 * A Picture is a view onto one picture block of the document's "Data" stream:
 * it records where the block starts, where the actual image bytes start inside
 * it, and how many image bytes there are. The image bytes are only copied into
 * their own array on request.
 *
 * @author Dmitry Romanov
 */
public class Picture
{
  //  public static final int FILENAME_OFFSET = 0x7C;
  //  public static final int FILENAME_SIZE_OFFSET = 0x6C;
  static final int BLOCK_TYPE_OFFSET = 0xE;
  static final int PICT_HEADER_OFFSET = 0x4;
  static final int UNKNOWN_HEADER_SIZE = 0x49;

  public static final byte[] GIF = new byte[]{'G', 'I', 'F'};
  public static final byte[] PNG = new byte[]{ (byte)0x89, 0x50, 0x4E, 0x47,0x0D,0x0A,0x1A,0x0A};
  public static final byte[] JPG = new byte[]{(byte)0xFF, (byte)0xD8};
  public static final byte[] BMP = new byte[]{'B', 'M'};
  public static final byte[] TIFF = new byte[]{0x49, 0x49, 0x2A, 0x00};
  public static final byte[] TIFF1 = new byte[]{0x4D, 0x4D, 0x00, 0x2A};
  public static final byte[] IHDR = new byte[]{'I', 'H', 'D', 'R'};

  private int dataBlockStartOffset;
  private int pictureBytesStartOffset;
  private int dataBlockSize;
  private int size;
  //  private String fileName;
  private byte[] content;
  private byte[] _dataStream;
  private int aspectRatioX;
  private int aspectRatioY;
  private int height = -1;
  private int width = -1;

  /**
   * Creates a picture from the picture block found at the given offset.
   *
   * @param dataBlockStartOfsset offset of the picture block inside the data stream
   * @param _dataStream the raw bytes of the document's data stream
   * @param fillBytes if true the image bytes are copied into their own array
   *        immediately; if false they stay in the data stream until
   *        {@link #getContent()} is called
   */
  public Picture(int dataBlockStartOfsset, byte[] _dataStream, boolean fillBytes)
  {
    this._dataStream = _dataStream;
    this.dataBlockStartOffset = dataBlockStartOfsset;
    this.dataBlockSize = LittleEndian.getInt(_dataStream, dataBlockStartOfsset);
    this.pictureBytesStartOffset = getPictureBytesStartOffset(dataBlockStartOfsset, _dataStream, dataBlockSize);
    // A corrupt header can yield a negative size or one that reaches past the
    // end of the stream; either would make the copy / write methods throw, so
    // the size is limited to the bytes that really exist.
    this.size = clampSize(dataBlockSize - (pictureBytesStartOffset - dataBlockStartOfsset),
                          pictureBytesStartOffset, _dataStream.length);

    this.aspectRatioX = extractAspectRatioX(_dataStream, dataBlockStartOfsset);
    this.aspectRatioY = extractAspectRatioY(_dataStream, dataBlockStartOfsset);
    //    this.fileName = extractFileName(dataBlockStartOfsset, _dataStream);
    //    if (fileName==null || fileName.length()==0) {
    //      fileName = "clipboard";
    //    }

    if (fillBytes)
    {
      fillImageContent(_dataStream);
    }

    String ext = suggestFileExtension();
    // trying to extract width and height from pictures content:
    if ("jpg".equalsIgnoreCase(ext)) {
      fillJPGWidthHeight();
    } else if ("png".equalsIgnoreCase(ext)) {
      fillPNGWidthHeight();
    }
  }

  /**
   * Limits a size computed from the block header to the range
   * [0, bytes available after startOffset].
   */
  private static int clampSize(int declaredSize, int startOffset, int streamLength)
  {
    if (declaredSize <= 0 || startOffset < 0 || startOffset >= streamLength) {
      return 0;
    }
    return Math.min(declaredSize, streamLength - startOffset);
  }

  /** Reads the short at block offset 0x20 and divides it by 10. */
  private static int extractAspectRatioX(byte[] _dataStream, int dataBlockStartOffset)
  {
    return LittleEndian.getShort(_dataStream, dataBlockStartOffset+0x20)/10;
  }

  /** Reads the short at block offset 0x22 and divides it by 10. */
  private static int extractAspectRatioY(byte[] _dataStream, int dataBlockStartOffset)
  {
    return LittleEndian.getShort(_dataStream, dataBlockStartOffset+0x22)/10;
  }

  /**
   * Tries to suggest a filename: hex representation of the picture structure
   * offset in the "Data" stream plus an extension that is determined from the
   * first bytes of the picture's content.
   *
   * @return suggested file name; without an extension if the image format
   *         was not recognized
   */
  public String suggestFullFileName()
  {
    String fileExt = suggestFileExtension();
    return Integer.toHexString(dataBlockStartOffset) + (fileExt.length()>0 ? "."+fileExt : "");
  }

  /**
   * Writes the Picture's content bytes to the specified OutputStream.
   * Is useful when there is a need to write picture bytes directly to a
   * stream, omitting their representation in memory as a distinct byte array.
   *
   * @param out a stream to write to
   * @throws IOException if some exception occurs while writing to the specified out
   */
  public void writeImageContent(OutputStream out) throws IOException
  {
    if (content!=null && content.length>0) {
      out.write(content, 0, size);
    } else {
      out.write(_dataStream, pictureBytesStartOffset, size);
    }
  }

  /**
   * @return picture's content as byte array; the array is created on the
   *         first call if it was not filled by the constructor
   */
  public byte[] getContent()
  {
    if (content == null || content.length<=0)
    {
      fillImageContent(this._dataStream);
    }
    return content;
  }

  /**
   *
   * @return size in bytes of the picture
   */
  public int getSize()
  {
    return size;
  }

  /**
   * returns horizontal aspect ratio for picture provided by user
   */
  public int getAspectRatioX()
  {
    return aspectRatioX;
  }

  /**
   * returns vertical aspect ratio for picture provided by user
   */
  public int getAspectRatioY()
  {
    return aspectRatioY;
  }

  /**
   * tries to suggest extension for picture's file by matching signatures of
   * popular image formats to first bytes of picture's contents
   * @return suggested file extension ("jpg", "png", "gif", "bmp" or "tiff"),
   *         or an empty string if no signature matched
   */
  public String suggestFileExtension()
  {
    if (content!=null && content.length>0) {
      return suggestFileExtension(content, 0);
    }
    return suggestFileExtension(_dataStream, pictureBytesStartOffset);
  }

  /** Matches the bytes at the given offset against the known signatures. */
  private String suggestFileExtension(byte[] data, int offset)
  {
    if (matchSignature(data, JPG, offset)) {
      return "jpg";
    } else if (matchSignature(data, PNG, offset)) {
      return "png";
    } else if (matchSignature(data, GIF, offset)) {
      return "gif";
    } else if (matchSignature(data, BMP, offset)) {
      return "bmp";
    } else if (matchSignature(data, TIFF, offset)) {
      return "tiff";
    } else if (matchSignature(data, TIFF1, offset)) {
      return "tiff";
    }
    return "";
  }

  /**
   * @return true only if the complete signature is present in dataStream at
   *         pictureBytesOffset; a signature that is cut off by the end of the
   *         stream does not match
   */
  private static boolean matchSignature(byte[] dataStream, byte[] signature, int pictureBytesOffset)
  {
    if (pictureBytesOffset < 0 || pictureBytesOffset > dataStream.length - signature.length) {
      return false;
    }
    for (int i = 0; i < signature.length; i++)
    {
      if (dataStream[i+pictureBytesOffset] != signature[i])
      {
        return false;
      }
    }
    return true;
  }

  //  public String getFileName()
  //  {
  //    return fileName;
  //  }

  //  private static String extractFileName(int blockStartIndex, byte[] dataStream) {
  //        int fileNameStartOffset = blockStartIndex + 0x7C;
  //        int fileNameSizeOffset = blockStartIndex + FILENAME_SIZE_OFFSET;
  //        int fileNameSize = LittleEndian.getShort(dataStream, fileNameSizeOffset);
  //
  //        int fileNameIndex = fileNameStartOffset;
  //        char[] fileNameChars = new char[(fileNameSize-1)/2];
  //        int charIndex = 0;
  //        while(charIndex<fileNameChars.length) {
  //            short aChar = LittleEndian.getShort(dataStream, fileNameIndex);
  //            fileNameChars[charIndex] = (char)aChar;
  //            charIndex++;
  //            fileNameIndex += 2;
  //        }
  //        String fileName = new String(fileNameChars);
  //        return fileName.trim();
  //  }

  /** Copies the picture's bytes out of the data stream into {@link #content}. */
  private void fillImageContent(byte[] dataStream)
  {
    this.content = new byte[size];
    System.arraycopy(dataStream, pictureBytesStartOffset, content, 0, size);
  }

  /**
   * Works out where the image bytes start inside a picture block by skipping
   * the two length-prefixed header structures and a further header of
   * UNKNOWN_HEADER_SIZE bytes. If that lands on or past the end of the block,
   * the UNKNOWN_HEADER_SIZE skip is undone.
   *
   * @return absolute offset into the data stream of the first image byte
   */
  private static int getPictureBytesStartOffset(int dataBlockStartOffset, byte[] _dataStream, int dataBlockSize)
  {
    final int dataBlockEndOffset = dataBlockSize + dataBlockStartOffset;
    int realPicoffset = dataBlockStartOffset;

    int PICTFBlockSize = LittleEndian.getShort(_dataStream, dataBlockStartOffset +PICT_HEADER_OFFSET);
    int PICTF1BlockOffset = PICTFBlockSize + PICT_HEADER_OFFSET;
    int PICTF1BlockSize = LittleEndian.getShort(_dataStream, dataBlockStartOffset +PICTF1BlockOffset);

    // NOTE(review): the left side is relative to the block start while
    // dataBlockEndOffset is absolute; kept as in the original - confirm intent.
    int unknownHeaderOffset = (PICTF1BlockSize + PICTF1BlockOffset) < dataBlockEndOffset ? (PICTF1BlockSize + PICTF1BlockOffset) : PICTF1BlockOffset;
    realPicoffset += (unknownHeaderOffset + UNKNOWN_HEADER_SIZE);
    if (realPicoffset>=dataBlockEndOffset) {
      realPicoffset -= UNKNOWN_HEADER_SIZE;
    }
    return realPicoffset;
  }

  /**
   * Tries to read the pixel dimensions from the JPEG frame header. Leaves
   * width and height at -1 if no frame header is found inside the picture.
   */
  private void fillJPGWidthHeight() {
    /*
    http://www.codecomments.com/archive281-2004-3-158083.html

    Algorithm proposed by Patrick TJ McPhee:

    read 2 bytes
    make sure they are 'ffd8'x
    repeatedly:
      read 2 bytes
      make sure the first one is 'ff'x
      if the second one is 'd9'x stop
      else if the second one is c0 or c2 (or possibly other values ...)
        skip 2 bytes
        read one byte into depth
        read two bytes into height
        read two bytes into width
      else
        read two bytes into length
        skip forward length-2 bytes

    Also used Ruby code snippet from: http://www.bigbold.com/snippets/posts/show/805 for reference
    */
    // Never look beyond the picture or beyond the stream itself.
    final int endOfPicture = Math.min(pictureBytesStartOffset + size, _dataStream.length);
    int pointer = pictureBytesStartOffset+2;   // skip the FFD8 start marker

    while(pointer<endOfPicture-1) {
      int firstByte = _dataStream[pointer];
      int secondByte = _dataStream[pointer+1];

      if (firstByte!=(byte)0xFF) {
        // Not positioned on a marker: move on one byte at a time until one is
        // found, so that unexpected data cannot stall the scan.
        pointer++;
        continue;
      }

      if (secondByte==(byte)0xD9 || secondByte==(byte)0xDA) {
        // end of image or start of scan: no frame header will follow
        break;
      } else if ( (secondByte & 0xF0) == 0xC0 && secondByte!=(byte)0xC4 && secondByte!=(byte)0xC8 && secondByte!=(byte)0xCC) {
        // frame header: marker (2), length (2), depth (1), height (2), width (2)
        int dimensionsOffset = pointer + 5;
        if (dimensionsOffset + 4 <= endOfPicture) {
          this.height = getBigEndianShort(_dataStream, dimensionsOffset);
          this.width = getBigEndianShort(_dataStream, dimensionsOffset+2);
        }
        break;
      } else {
        // any other segment: marker (2) followed by a length that counts itself
        int lengthOffset = pointer + 2;
        if (lengthOffset + 2 > endOfPicture) {
          break;
        }
        pointer = lengthOffset + getBigEndianShort(_dataStream, lengthOffset);
      }
    }
  }

  /**
   * Tries to read the pixel dimensions from the PNG IHDR chunk. Leaves width
   * and height at -1 if the chunk is not where it is expected or is cut off.
   */
  private void fillPNGWidthHeight()
  {
    /*
     Used PNG file format description from http://www.wotsit.org/download.asp?f=png
    */
    // signature, then the 4 byte chunk length, then the chunk type
    int headerStart = pictureBytesStartOffset + PNG.length + 4;
    if (matchSignature(_dataStream, IHDR, headerStart)) {
      int widthOffset = headerStart + IHDR.length;
      // width and height are two consecutive 4 byte integers
      if (widthOffset + 8 <= _dataStream.length) {
        this.width = getBigEndianInt(_dataStream, widthOffset);
        this.height = getBigEndianInt(_dataStream, widthOffset + 4);
      }
    }
  }

  /**
   * returns pixel width of the picture or -1 if determining the dimensions failed
   */
  public int getWidth()
  {
    return width;
  }

  /**
   * returns pixel height of the picture or -1 if determining the dimensions failed
   */
  public int getHeight()
  {
    return height;
  }

  /** Reads four bytes at offset as a big-endian int. */
  private static int getBigEndianInt(byte[] data, int offset)
  {
    return (((data[offset] & 0xFF)<< 24) + ((data[offset +1] & 0xFF) << 16) + ((data[offset + 2] & 0xFF) << 8) + (data[offset +3] & 0xFF));
  }

  /** Reads two bytes at offset as an unsigned big-endian value (0..65535). */
  private static int getBigEndianShort(byte[] data, int offset)
  {
    return (((data[offset] & 0xFF)<< 8) + (data[offset +1] & 0xFF));
  }
}

View File

@ -0,0 +1,130 @@
/* ====================================================================
Copyright 2002-2004 Apache Software Foundation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.util.List;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.usermodel.Picture;
import junit.framework.TestCase;
/**
 * Test picture support in HWPF.
 *
 * The test documents and reference images are looked up in the directory named
 * by the "HWPF.testdata.path" system property.
 *
 * @author nick
 */
public class TestHWPFPictures extends TestCase {
  private HWPFDocument docA;
  private HWPFDocument docB;
  private String docAFile;
  private String docBFile;
  private String imgAFile;
  private String imgBFile;

  /**
   * Resolves the test file names; fails straight away with a clear message if
   * the test data directory has not been configured.
   */
  protected void setUp() throws Exception {
    String dirname = System.getProperty("HWPF.testdata.path");
    assertNotNull("System property HWPF.testdata.path must be set", dirname);

    docAFile = dirname + "/testPictures.doc";
    docBFile = dirname + "/two_images.doc";

    imgAFile = dirname + "/simple_image.jpg";
    imgBFile = dirname + "/simple_image.png";
  }

  /**
   * Test just opening the files
   */
  public void testOpen() throws Exception {
    docA = new HWPFDocument(new FileInputStream(docAFile));
    docB = new HWPFDocument(new FileInputStream(docBFile));
  }

  /**
   * Test that we have the right numbers of images in each file
   */
  public void testImageCount() throws Exception {
    docA = new HWPFDocument(new FileInputStream(docAFile));
    docB = new HWPFDocument(new FileInputStream(docBFile));

    assertNotNull(docA.getPicturesTable());
    assertNotNull(docB.getPicturesTable());

    PicturesTable picA = docA.getPicturesTable();
    PicturesTable picB = docB.getPicturesTable();

    List picturesA = picA.getAllPictures();
    List picturesB = picB.getAllPictures();

    assertEquals(7, picturesA.size());
    assertEquals(2, picturesB.size());
  }

  /**
   * Test that we have the right images in at least one file
   */
  public void testImageData() throws Exception {
    docB = new HWPFDocument(new FileInputStream(docBFile));
    PicturesTable picB = docB.getPicturesTable();
    List picturesB = picB.getAllPictures();

    assertEquals(2, picturesB.size());

    Picture pic1 = (Picture)picturesB.get(0);
    Picture pic2 = (Picture)picturesB.get(1);
    assertNotNull(pic1);
    assertNotNull(pic2);

    // The extracted bytes must be identical to the original image files
    byte[] pic1B = readFile(imgAFile);
    byte[] pic2B = readFile(imgBFile);

    assertEquals(pic1B.length, pic1.getContent().length);
    assertEquals(pic2B.length, pic2.getContent().length);

    assertBytesSame(pic1B, pic1.getContent());
    assertBytesSame(pic2B, pic2.getContent());
  }

  /**
   * Asserts that both arrays have the same length and the same bytes,
   * reporting the index of the first difference.
   */
  private void assertBytesSame(byte[] a, byte[] b) {
    assertEquals(a.length, b.length);
    for(int i=0; i<a.length; i++) {
      assertEquals("byte at index " + i, a[i], b[i]);
    }
  }

  /**
   * Reads a whole file into memory.
   *
   * @param file name of the file to read
   * @return the file's bytes
   */
  private byte[] readFile(String file) throws Exception {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    FileInputStream fis = new FileInputStream(file);
    try {
      byte[] buffer = new byte[1024];
      int read;
      while((read = fis.read(buffer)) != -1) {
        baos.write(buffer,0,read);
      }
    } finally {
      // release the file handle even when reading fails
      fis.close();
    }
    return baos.toByteArray();
  }
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 643 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 KiB