diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java index 7657019e7..b425e093e 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java @@ -88,6 +88,9 @@ public class HWPFDocument extends POIDocument /** Holds the save history for this document. */ protected SavedByTable _sbt; + + /** Holds pictures table */ + protected PicturesTable _pictures; protected HWPFDocument() { @@ -184,6 +187,9 @@ public class HWPFDocument extends POIDocument { _dataStream = new byte[0]; } + + // read in the pictures stream + _pictures = new PicturesTable(_dataStream); // get the start of text in the main stream int fcMin = _fib.getFcMin(); @@ -287,6 +293,13 @@ public class HWPFDocument extends POIDocument { return _sbt; } + + /** + * @return PicturesTable object, that is able to extract images from this document + */ + public PicturesTable getPicturesTable() { + return _pictures; + } /** * Writes out the word file that is represented by an instance of this class. diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/PicturesTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/PicturesTable.java new file mode 100644 index 000000000..69bde3aa4 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PicturesTable.java @@ -0,0 +1,152 @@ +/* ==================================================================== + Copyright 2002-2006 Apache Software Foundation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + + +package org.apache.poi.hwpf.model; + +import org.apache.poi.util.LittleEndian; +import org.apache.poi.hwpf.usermodel.CharacterRun; +import org.apache.poi.hwpf.usermodel.Picture; + +import java.util.List; +import java.util.ArrayList; + + +/** + * Holds information about all pictures embedded in a Word document either via "Insert -> Picture -> From File" or via + * the clipboard. Responsible for image extraction and for determining whether some piece of the document contains an embedded image. + * Analyzes the raw "Data" byte stream (where Word stores all embedded objects) provided by HWPFDocument. + * + * Word stores images as-is within the so-called "Data stream" - the stream within a Word docfile containing various data + * that hang off of characters in the main stream, for example binary data describing in-line pictures and/or + * form fields, and also the native data of embedded objects. Word picture structures are concatenated one after the other in + * the data stream if the document contains pictures. + * The data stream is easily reachable via the HWPFDocument._dataStream property. + * A picture is represented in the document text stream as a special character, a Unicode \u0001 whose + * CharacterRun.isSpecialCharacter() returns true. The file location of the picture in the Word binary file is accessed + * via CharacterRun.getPicOffset(). The CharacterRun.getPicOffset() is a byte offset into the data stream. + * Beginning at the position recorded in picOffset, a header data structure is stored. 
+ * + * @author Dmitry Romanov + */ +public class PicturesTable +{ + static final int TYPE_IMAGE = 0x08; + static final int TYPE_IMAGE_WORD2000 = 0x00; + static final int TYPE_IMAGE_PASTED_FROM_CLIPBOARD = 0xA; + static final int TYPE_IMAGE_PASTED_FROM_CLIPBOARD_WORD2000 = 0x2; + static final int TYPE_HORIZONTAL_LINE = 0xE; + static final int BLOCK_TYPE_OFFSET = 0xE; /* added to a block offset by getBlockType to locate the block type short */ + static final int MM_MODE_TYPE_OFFSET = 0x6; /* added to a block offset by getMmMode to locate the mapping mode short */ + + private byte[] _dataStream; + + /** @link dependency + * @stereotype instantiate*/ + /*# Picture lnkPicture; */ + + /** + * Creates a pictures table that reads picture blocks from the given byte array. + * @param _dataStream the bytes to scan; the array is kept by reference, not copied + */ + public PicturesTable(byte[] _dataStream) + { + this._dataStream = _dataStream; + } + + /** + * Determines whether the specified CharacterRun refers to a recognized picture: the run must be a special character that is not flagged as obj, OLE2 or data, its text must be the single character U+0001, and the block at its picture offset must have a recognized image type. + * @param run the character run to examine + */ + public boolean hasPicture(CharacterRun run) { + if (run.isSpecialCharacter() && !run.isObj() && !run.isOle2() && !run.isData() && "\u0001".equals(run.text())) { + return isBlockContainsImage(run.getPicOffset()); + } + return false; + } + + /** + * Determines whether the specified CharacterRun refers to a horizontal line: the run must be a special character whose text is the single character U+0001, and the block at its picture offset must have the horizontal-line type with a mapping mode of 0x64. + * @param run the character run to examine + */ + public boolean hasHorizontalLine(CharacterRun run) { + if (run.isSpecialCharacter() && "\u0001".equals(run.text())) { + return isBlockContainsHorizontalLine(run.getPicOffset()); + } + return false; + } + + private boolean isPictureRecognized(short blockType, short mappingModeOfMETAFILEPICT) { /* the two WORD2000 block types count as images only when the mapping mode is 0x64 */ + return (blockType == TYPE_IMAGE || blockType == TYPE_IMAGE_PASTED_FROM_CLIPBOARD || (blockType==TYPE_IMAGE_WORD2000 && mappingModeOfMETAFILEPICT==0x64) || (blockType==TYPE_IMAGE_PASTED_FROM_CLIPBOARD_WORD2000 && mappingModeOfMETAFILEPICT==0x64)); + } + + private static short getBlockType(byte[] dataStream, int pictOffset) { /* pictOffset is the offset of the block start within dataStream */ + return LittleEndian.getShort(dataStream, pictOffset + BLOCK_TYPE_OFFSET); + } + + private static short getMmMode(byte[] dataStream, int pictOffset) { /* pictOffset is the offset of the block start within dataStream */ + return LittleEndian.getShort(dataStream, pictOffset + MM_MODE_TYPE_OFFSET); + } + + /** + * Returns the Picture object 
tied to the specified CharacterRun. + * @param run the character run that may reference a picture + * @param fillBytes if true, the Picture is returned with a filled byte array that represents the picture's contents. If you don't want + * to keep that byte array in memory but only write the picture's contents to a stream, pass false and then use Picture.writeImageContent + * @see Picture#writeImageContent(java.io.OutputStream) + * @return a Picture object if a picture exists for the specified CharacterRun, null otherwise. PicturesTable.hasPicture is used to determine this. + * @see #hasPicture(org.apache.poi.hwpf.usermodel.CharacterRun) + */ + public Picture extractPicture(CharacterRun run, boolean fillBytes) { + if (hasPicture(run)) { + return new Picture(run.getPicOffset(), _dataStream, fillBytes); + } + return null; + } + + /** + * @return a list of Picture objects, one for every block of the data stream that is recognized as an image; the scan starts at offset 0 and the pictures are created with fillBytes set to false + */ + public List getAllPictures() { + ArrayList pictures = new ArrayList(); + + int pos = 0; + boolean atEnd = false; + + while(pos<_dataStream.length && !atEnd) { + if (isBlockContainsImage(pos)) { + pictures.add(new Picture(pos, _dataStream, false)); + } + + int skipOn = LittleEndian.getInt(_dataStream, pos); /* the int at the block start is used as the distance to the next block */ + if(skipOn <= 0) { atEnd = true; } /* a non-positive distance cannot move the scan forward, so stop */ + pos += skipOn; + } + + return pictures; + } + + private boolean isBlockContainsImage(int i) /* i is the offset of a block start within _dataStream */ + { + return isPictureRecognized(getBlockType(_dataStream, i), getMmMode(_dataStream, i)); + } + + private boolean isBlockContainsHorizontalLine(int i) /* i is the offset of a block start within _dataStream */ + { + return getBlockType(_dataStream, i)==TYPE_HORIZONTAL_LINE && getMmMode(_dataStream, i)==0x64; + } + +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Picture.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Picture.java new file mode 100644 index 000000000..8adbf090f --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Picture.java @@ -0,0 +1,350 @@ +/* ==================================================================== + Copyright 2002-2006 Apache Software Foundation + + Licensed under the Apache License, Version 
2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + + +package org.apache.poi.hwpf.usermodel; + +import org.apache.poi.util.LittleEndian; + +import java.io.OutputStream; +import java.io.IOException; + +/** + * Represents embedded picture extracted from Word Document + * @author Dmitry Romanov + */ +public class Picture +{ +// public static final int FILENAME_OFFSET = 0x7C; +// public static final int FILENAME_SIZE_OFFSET = 0x6C; + static final int BLOCK_TYPE_OFFSET = 0xE; + static final int PICT_HEADER_OFFSET = 0x4; + static final int UNKNOWN_HEADER_SIZE = 0x49; + + public static final byte[] GIF = new byte[]{'G', 'I', 'F'}; + public static final byte[] PNG = new byte[]{ (byte)0x89, 0x50, 0x4E, 0x47,0x0D,0x0A,0x1A,0x0A}; + public static final byte[] JPG = new byte[]{(byte)0xFF, (byte)0xD8}; + public static final byte[] BMP = new byte[]{'B', 'M'}; + public static final byte[] TIFF = new byte[]{0x49, 0x49, 0x2A, 0x00}; + public static final byte[] TIFF1 = new byte[]{0x4D, 0x4D, 0x00, 0x2A}; + + public static final byte[] IHDR = new byte[]{'I', 'H', 'D', 'R'}; + + private int dataBlockStartOfsset; + private int pictureBytesStartOffset; + private int dataBlockSize; + private int size; +// private String fileName; + private byte[] content; + private byte[] _dataStream; + private int aspectRatioX; + private int aspectRatioY; + private int height = -1; + private int width = -1; + + + public Picture(int dataBlockStartOfsset, byte[] _dataStream, boolean fillBytes) + 
{ + this._dataStream = _dataStream; + this.dataBlockStartOfsset = dataBlockStartOfsset; + this.dataBlockSize = LittleEndian.getInt(_dataStream, dataBlockStartOfsset); + this.pictureBytesStartOffset = getPictureBytesStartOffset(dataBlockStartOfsset, _dataStream, dataBlockSize); + this.size = dataBlockSize - (pictureBytesStartOffset - dataBlockStartOfsset); + + if (size<0) { + + } + + this.aspectRatioX = extractAspectRatioX(_dataStream, dataBlockStartOfsset); + this.aspectRatioY = extractAspectRatioY(_dataStream, dataBlockStartOfsset); +// this.fileName = extractFileName(dataBlockStartOfsset, _dataStream); +// if (fileName==null || fileName.length()==0) { +// fileName = "clipboard"; +// } + + if (fillBytes) + { + fillImageContent(_dataStream); + } + + String ext = suggestFileExtension(); + // trying to extract width and height from pictures content: + if ("jpg".equalsIgnoreCase(ext)) { + fillJPGWidthHeight(); + } else if ("png".equalsIgnoreCase(ext)) { + fillPNGWidthHeight(); + } + } + + private static int extractAspectRatioX(byte[] _dataStream, int dataBlockStartOffset) + { + return LittleEndian.getShort(_dataStream, dataBlockStartOffset+0x20)/10; + } + + private static int extractAspectRatioY(byte[] _dataStream, int dataBlockStartOffset) + { + return LittleEndian.getShort(_dataStream, dataBlockStartOffset+0x22)/10; + } + + /** + * Tries to suggest a filename: hex representation of picture structure offset in "Data" stream plus extension that + * is tried to determine from first byte of picture's content. + * + * @return suggested file name + */ + public String suggestFullFileName() + { + String fileExt = suggestFileExtension(); + return Integer.toHexString(dataBlockStartOfsset) + (fileExt.length()>0 ? "."+fileExt : ""); + } + + /** + * Writes Picture's content bytes to specified OutputStream. + * Is useful when there is need to write picture bytes directly to stream, omitting its representation in + * memory as distinct byte array. 
+ * + * @param out a stream to write to + * @throws IOException if some exception is occured while writing to specified out + */ + public void writeImageContent(OutputStream out) throws IOException + { + if (content!=null && content.length>0) { + out.write(content, 0, size); + } else { + out.write(_dataStream, pictureBytesStartOffset, size); + } + } + + /** + * @return picture's content as byte array + */ + public byte[] getContent() + { + if (content == null || content.length<=0) + { + fillImageContent(this._dataStream); + } + return content; + } + + /** + * + * @return size in bytes of the picture + */ + public int getSize() + { + return size; + } + + /** + * returns horizontal aspect ratio for picture provided by user + */ + public int getAspectRatioX() + { + return aspectRatioX; + } + /** + * returns vertical aspect ratio for picture provided by user + */ + public int getAspectRatioY() + { + return aspectRatioY; + } + + /** + * tries to suggest extension for picture's file by matching signatures of popular image formats to first bytes + * of picture's contents + * @return suggested file extension + */ + public String suggestFileExtension() + { + if (content!=null && content.length>0) { + return suggestFileExtension(content, 0); + } + return suggestFileExtension(_dataStream, pictureBytesStartOffset); + } + + + private String suggestFileExtension(byte[] _dataStream, int pictureBytesStartOffset) + { + if (matchSignature(_dataStream, JPG, pictureBytesStartOffset)) { + return "jpg"; + } else if (matchSignature(_dataStream, PNG, pictureBytesStartOffset)) { + return "png"; + } else if (matchSignature(_dataStream, GIF, pictureBytesStartOffset)) { + return "gif"; + } else if (matchSignature(_dataStream, BMP, pictureBytesStartOffset)) { + return "bmp"; + } else if (matchSignature(_dataStream, TIFF, pictureBytesStartOffset)) { + return "tiff"; + } else if (matchSignature(_dataStream, TIFF1, pictureBytesStartOffset)) { + return "tiff"; + } + return ""; + } + + private 
static boolean matchSignature(byte[] dataStream, byte[] signature, int pictureBytesOffset) + { + boolean matched = pictureBytesOffset < dataStream.length; + for (int i = 0; (i+pictureBytesOffset) < dataStream.length && i < signature.length; i++) + { + if (dataStream[i+pictureBytesOffset] != signature[i]) + { + matched = false; + break; + } + } + return matched; + } + +// public String getFileName() +// { +// return fileName; +// } + +// private static String extractFileName(int blockStartIndex, byte[] dataStream) { +// int fileNameStartOffset = blockStartIndex + 0x7C; +// int fileNameSizeOffset = blockStartIndex + FILENAME_SIZE_OFFSET; +// int fileNameSize = LittleEndian.getShort(dataStream, fileNameSizeOffset); +// +// int fileNameIndex = fileNameStartOffset; +// char[] fileNameChars = new char[(fileNameSize-1)/2]; +// int charIndex = 0; +// while(charIndex=dataBlockEndOffset) { + realPicoffset -= UNKNOWN_HEADER_SIZE; + } + return realPicoffset; + } + + private void fillJPGWidthHeight() { + /* + http://www.codecomments.com/archive281-2004-3-158083.html + + Algorhitm proposed by Patrick TJ McPhee: + + read 2 bytes + make sure they are 'ffd8'x + repeatedly: + read 2 bytes + make sure the first one is 'ff'x + if the second one is 'd9'x stop + else if the second one is c0 or c2 (or possibly other values ...) 
+ skip 2 bytes + read one byte into depth + read two bytes into height + read two bytes into width + else + read two bytes into length + skip forward length-2 bytes + + Also used Ruby code snippet from: http://www.bigbold.com/snippets/posts/show/805 for reference + */ + int pointer = pictureBytesStartOffset+2; + int firstByte = _dataStream[pointer]; + int secondByte = _dataStream[pointer+1]; + + int endOfPicture = pictureBytesStartOffset + size; + while(pointer -1) { + read = fis.read(buffer); + if(read > 0) { + baos.write(buffer,0,read); + } + } + + return baos.toByteArray(); + } +} diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/data/simple_image.jpg b/src/scratchpad/testcases/org/apache/poi/hwpf/data/simple_image.jpg new file mode 100644 index 000000000..af68bcabf Binary files /dev/null and b/src/scratchpad/testcases/org/apache/poi/hwpf/data/simple_image.jpg differ diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/data/simple_image.png b/src/scratchpad/testcases/org/apache/poi/hwpf/data/simple_image.png new file mode 100644 index 000000000..a9120d70d Binary files /dev/null and b/src/scratchpad/testcases/org/apache/poi/hwpf/data/simple_image.png differ diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/data/testPictures.doc b/src/scratchpad/testcases/org/apache/poi/hwpf/data/testPictures.doc new file mode 100644 index 000000000..d031993cd Binary files /dev/null and b/src/scratchpad/testcases/org/apache/poi/hwpf/data/testPictures.doc differ diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/data/two_images.doc b/src/scratchpad/testcases/org/apache/poi/hwpf/data/two_images.doc new file mode 100755 index 000000000..f94867d22 Binary files /dev/null and b/src/scratchpad/testcases/org/apache/poi/hwpf/data/two_images.doc differ