Record level dumper of powerpoint files, and their constituent escher parts
Prints out in indented text what occurs where (Code has existed for a while, but the lookup table used to be just a cut'n'paste from something of unknown copyright status. Now converted to using Yegor's new RecordTypes class, so can be released) git-svn-id: https://svn.apache.org/repos/asf/jakarta/poi/trunk@353743 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
3a1dcb4a85
commit
e00185760d
412
src/scratchpad/src/org/apache/poi/hslf/dev/SlideShowDumper.java
Normal file
412
src/scratchpad/src/org/apache/poi/hslf/dev/SlideShowDumper.java
Normal file
@ -0,0 +1,412 @@
|
||||
|
||||
/* ====================================================================
|
||||
Copyright 2002-2004 Apache Software Foundation
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
|
||||
|
||||
package org.apache.poi.hslf.dev;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.poifs.filesystem.POIFSDocument;
|
||||
import org.apache.poi.poifs.filesystem.DocumentEntry;
|
||||
|
||||
import org.apache.poi.ddf.*;
|
||||
import org.apache.poi.hslf.record.RecordTypes;
|
||||
|
||||
import org.apache.poi.util.LittleEndian;
|
||||
|
||||
/**
|
||||
* This class provides a way to "peek" inside a powerpoint file. It
|
||||
* will print out all the types it finds, and for those it knows aren't
|
||||
* atoms, what they contain
|
||||
*
|
||||
* To figure out what things are, and if they are atoms or not, it uses the
|
||||
* list in hslf.record.RecordTypes
|
||||
*
|
||||
* To peek inside PPDrawings, which hold Escher drawings, we use the
|
||||
* DDF package from POI (but we can fake it by using the Escher listings
|
||||
* from hslf.record.RecordTypes also)
|
||||
*
|
||||
* @author Nick Burch
|
||||
*/
|
||||
|
||||
public class SlideShowDumper
|
||||
{
|
||||
private InputStream istream;
|
||||
private POIFSFileSystem filesystem;
|
||||
|
||||
private byte[] _docstream;
|
||||
|
||||
/** Do we try to use DDF to understand the escher objects? */
|
||||
private boolean ddfEscher = false;
|
||||
/** Do we use our own built-in basic escher groker to understand the escher objects? */
|
||||
private boolean basicEscher = false;
|
||||
|
||||
/**
|
||||
* right now this function takes one parameter: a ppt file, and outputs
|
||||
* a dump of what it contains
|
||||
*/
|
||||
public static void main(String args[]) throws IOException
|
||||
{
|
||||
if(args.length == 0) {
|
||||
System.err.println("Useage: SlideShowDumper [-escher|-basicescher] <filename>");
|
||||
return;
|
||||
}
|
||||
|
||||
String filename = args[0];
|
||||
if(args.length > 1) {
|
||||
filename = args[1];
|
||||
}
|
||||
|
||||
SlideShowDumper foo = new SlideShowDumper(filename);
|
||||
|
||||
if(args.length > 1) {
|
||||
if(args[0].equalsIgnoreCase("-escher")) {
|
||||
foo.setDDFEscher(true);
|
||||
} else {
|
||||
foo.setBasicEscher(true);
|
||||
}
|
||||
}
|
||||
|
||||
foo.printDump();
|
||||
foo.close();
|
||||
}
|
||||
|
||||
|
||||
/**
 * Creates a dumper for the PowerPoint file with the given name.
 * The file is opened and handed to the InputStream based
 *  constructor; nothing is printed until printDump() is called.
 *
 * @param fileName The name of the file to read.
 * @throws IOException if the file can't be opened, or there is a
 *  problem while reading the document.
 */
public SlideShowDumper(String fileName) throws IOException
{
    this(new FileInputStream(fileName));
}
|
||||
|
||||
/**
 * Creates a dumper for a PowerPoint file supplied as an input stream.
 * The stream is wrapped in a POIFSFileSystem, and is remembered so
 *  that close() can close it later.
 *
 * @param inputStream the source of the data
 * @throws IOException if there is a problem while reading the document.
 */
public SlideShowDumper(InputStream inputStream) throws IOException
{
    // Let POIFS do the OLE2 work, then keep hold of the stream for close()
    this(new POIFSFileSystem(inputStream));
    this.istream = inputStream;
}
|
||||
|
||||
/**
|
||||
* Constructs a Powerpoint dump from a POIFS Filesystem. Parses the
|
||||
* document and dumps out the contents
|
||||
*
|
||||
* @param filesystem the POIFS FileSystem to read from
|
||||
* @throws IOException if there is a problem while parsing the document.
|
||||
*/
|
||||
public SlideShowDumper(POIFSFileSystem filesystem) throws IOException
|
||||
{
|
||||
this.filesystem = filesystem;
|
||||
|
||||
// Get the main document stream
|
||||
DocumentEntry docProps =
|
||||
(DocumentEntry)filesystem.getRoot().getEntry("PowerPoint Document");
|
||||
|
||||
// Grab the document stream
|
||||
_docstream = new byte[docProps.getSize()];
|
||||
filesystem.createDocumentInputStream("PowerPoint Document").read(_docstream);
|
||||
}
|
||||
|
||||
/**
|
||||
* Control dumping of any Escher records found - should DDF be used?
|
||||
*/
|
||||
public void setDDFEscher(boolean grok) {
|
||||
ddfEscher = grok;
|
||||
basicEscher = !(grok);
|
||||
}
|
||||
|
||||
/**
|
||||
* Control dumping of any Escher records found - should our built in
|
||||
* basic groker be used?
|
||||
*/
|
||||
public void setBasicEscher(boolean grok) {
|
||||
basicEscher = grok;
|
||||
ddfEscher = !(grok);
|
||||
}
|
||||
|
||||
/**
|
||||
* Shuts things down. Closes underlying streams etc
|
||||
*
|
||||
* @throws IOException
|
||||
*/
|
||||
public void close() throws IOException
|
||||
{
|
||||
if(istream != null) {
|
||||
istream.close();
|
||||
}
|
||||
filesystem = null;
|
||||
}
|
||||
|
||||
|
||||
public void printDump() {
|
||||
// The format of records in a powerpoint file are:
|
||||
// <little endian 2 byte "info">
|
||||
// <little endian 2 byte "type">
|
||||
// <little endian 4 byte "length">
|
||||
// If it has a zero length, following it will be another record
|
||||
// <xx xx yy yy 00 00 00 00> <xx xx yy yy zz zz zz zz>
|
||||
// If it has a length, depending on its type it may have children or data
|
||||
// If it has children, these will follow straight away
|
||||
// <xx xx yy yy zz zz zz zz <xx xx yy yy zz zz zz zz>>
|
||||
// If it has data, this will come straigh after, and run for the length
|
||||
// <xx xx yy yy zz zz zz zz dd dd dd dd dd dd dd>
|
||||
// All lengths given exclude the 8 byte record header
|
||||
// (Data records are known as Atoms)
|
||||
|
||||
// Document should start with:
|
||||
// 0F 00 E8 03 ## ## ## ##
|
||||
// (type 1000 = document, info 00 0f is normal, rest is document length)
|
||||
// 01 00 E9 03 28 00 00 00
|
||||
// (type 1001 = document atom, info 00 01 normal, 28 bytes long)
|
||||
|
||||
// When parsing a document, look to see if you know about that type
|
||||
// of the current record. If you know it's a type that has children,
|
||||
// process the record's data area looking for more records
|
||||
// If you know about the type and it doesn't have children, either do
|
||||
// something with the data (eg TextRun) or skip over it
|
||||
// Otherwise, check the first byte. If you do a BINARY_AND on it with
|
||||
// 0x0f (15) and get back 0x0f, you know it has children. Otherwise
|
||||
// it doesn't
|
||||
|
||||
walkTree(0,0,_docstream.length);
|
||||
}
|
||||
|
||||
public String makeHex(short s) {
|
||||
String hex = Integer.toHexString((int)s).toUpperCase();
|
||||
if(hex.length() == 1) { return "0" + hex; }
|
||||
return hex;
|
||||
}
|
||||
public String makeHex(int i) {
|
||||
String hex = Integer.toHexString(i).toUpperCase();
|
||||
if(hex.length() == 1) { return "000" + hex; }
|
||||
if(hex.length() == 2) { return "00" + hex; }
|
||||
if(hex.length() == 3) { return "0" + hex; }
|
||||
return hex;
|
||||
}
|
||||
|
||||
public void walkTree(int depth, int startPos, int maxLen) {
|
||||
int pos = startPos;
|
||||
int endPos = startPos + maxLen;
|
||||
int indent = depth;
|
||||
while(pos <= endPos - 8) {
|
||||
long type = LittleEndian.getUShort(_docstream,pos+2);
|
||||
long len = LittleEndian.getUInt(_docstream,pos+4);
|
||||
byte opt = _docstream[pos];
|
||||
|
||||
String ind = "";
|
||||
for(int i=0; i<indent; i++) { ind += " "; }
|
||||
|
||||
System.out.println(ind + "At position " + pos + " (" + makeHex(pos) + "):");
|
||||
System.out.println(ind + "Type is " + type + " (" + makeHex((int)type) + "), len is " + len + " (" + makeHex((int)len) + ")");
|
||||
|
||||
// See if we know about the type of it
|
||||
String recordName = RecordTypes.recordName((int)type);
|
||||
|
||||
// Jump over header, and think about going on more
|
||||
pos += 8;
|
||||
if(recordName != null) {
|
||||
System.out.println(ind + "That's a " + recordName);
|
||||
|
||||
// Now check if it's a container or not
|
||||
int container = (int)opt & 0x0f;
|
||||
|
||||
if(type == 0L || (container != 0x0f)) {
|
||||
System.out.println();
|
||||
} else if (type == 1035l || type == 1036l) {
|
||||
// Special Handling of 1035=PPDrawingGroup and 1036=PPDrawing
|
||||
System.out.println();
|
||||
|
||||
if(ddfEscher) {
|
||||
// Seems to be:
|
||||
walkEscherDDF((indent+3),pos+8,(int)len-8);
|
||||
} else if(basicEscher) {
|
||||
walkEscherBasic((indent+3),pos+8,(int)len-8);
|
||||
}
|
||||
} else {
|
||||
// General container record handling code
|
||||
System.out.println();
|
||||
walkTree((indent+2),pos,(int)len);
|
||||
}
|
||||
} else {
|
||||
System.out.println(ind + "** unknown record **");
|
||||
System.out.println();
|
||||
}
|
||||
pos += (int)len;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Use the DDF code to walk the Escher records
|
||||
*/
|
||||
public void walkEscherDDF(int indent, int pos, int len) {
|
||||
if(len < 8) { return; }
|
||||
|
||||
String ind = "";
|
||||
for(int i=0; i<indent; i++) { ind += " "; }
|
||||
|
||||
byte[] contents = new byte[len];
|
||||
System.arraycopy(_docstream,pos,contents,0,len);
|
||||
DefaultEscherRecordFactory erf = new DefaultEscherRecordFactory();
|
||||
EscherRecord record = erf.createRecord(contents,0);
|
||||
|
||||
// For now, try filling in the fields
|
||||
record.fillFields(contents,0,erf);
|
||||
|
||||
long atomType = LittleEndian.getUShort(contents,2);
|
||||
// This lacks the 8 byte header size
|
||||
long atomLen = LittleEndian.getUShort(contents,4);
|
||||
// This (should) include the 8 byte header size
|
||||
int recordLen = record.getRecordSize();
|
||||
|
||||
|
||||
System.out.println(ind + "At position " + pos + " (" + makeHex(pos) + "):");
|
||||
System.out.println(ind + "Type is " + atomType + " (" + makeHex((int)atomType) + "), len is " + atomLen + " (" + makeHex((int)atomLen) + ") (" + (atomLen+8) + ") - record claims " + recordLen);
|
||||
|
||||
// Check for corrupt / lying ones
|
||||
if(recordLen != 8 && (recordLen != (atomLen+8))) {
|
||||
System.out.println(ind + "** Atom length of " + atomLen + " (" + (atomLen+8) + ") doesn't match record length of " + recordLen);
|
||||
}
|
||||
|
||||
// Print the record's details
|
||||
if(record instanceof EscherContainerRecord) {
|
||||
EscherContainerRecord ecr = (EscherContainerRecord)record;
|
||||
System.out.println(ind + ecr.toString(false));
|
||||
walkEscherDDF((indent+3), pos + 8, (int)atomLen );
|
||||
} else {
|
||||
System.out.println(ind + record.toString());
|
||||
}
|
||||
|
||||
// Handle records that seem to lie
|
||||
if(atomType == 61451l) {
|
||||
// Normally claims a size of 8
|
||||
recordLen = (int)atomLen + 8;
|
||||
}
|
||||
if(atomType == 61453l) {
|
||||
// Returns EscherContainerRecord, but really msofbtClientTextbox
|
||||
recordLen = (int)atomLen + 8;
|
||||
record.fillFields( contents, 0, erf );
|
||||
if(! (record instanceof EscherTextboxRecord)) {
|
||||
System.out.println(ind + "** Really a msofbtClientTextbox !");
|
||||
}
|
||||
}
|
||||
|
||||
// Decide on what to do, based on how the lenghts match up
|
||||
if(recordLen == 8 && atomLen > 8 ) {
|
||||
// Assume it has children, rather than being corrupted
|
||||
walkEscherDDF((indent+3), pos + 8, (int)atomLen );
|
||||
|
||||
// Wind on our length + our header
|
||||
pos += atomLen;
|
||||
pos += 8;
|
||||
len -= atomLen;
|
||||
len -= 8;
|
||||
} else {
|
||||
// No children, wind on our real length
|
||||
pos += atomLen;
|
||||
pos += 8;
|
||||
len -= atomLen;
|
||||
len -= 8;
|
||||
}
|
||||
|
||||
// Move on to the next one, if we're not at the end yet
|
||||
if(len >= 8) {
|
||||
walkEscherDDF(indent, pos, len );
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Use the basic record format groking code to walk the Escher records
|
||||
*/
|
||||
public void walkEscherBasic(int indent, int pos, int len) {
|
||||
if(len < 8) { return; }
|
||||
|
||||
String ind = "";
|
||||
for(int i=0; i<indent; i++) { ind += " "; }
|
||||
|
||||
long type = LittleEndian.getUShort(_docstream,pos+2);
|
||||
long atomlen = LittleEndian.getUInt(_docstream,pos+4);
|
||||
String typeS = makeHex((int)type);
|
||||
|
||||
System.out.println(ind + "At position " + pos + " (" + makeHex(pos) + "):");
|
||||
System.out.println(ind + "Type is " + type + " (" + typeS + "), len is " + atomlen + " (" + makeHex((int)atomlen) + ")");
|
||||
|
||||
String typeName = RecordTypes.recordName((int)type);
|
||||
if(typeName != null) {
|
||||
System.out.println(ind + "That's an Escher Record: " + typeName);
|
||||
} else {
|
||||
System.out.println(ind + "(Unknown Escher Record)");
|
||||
}
|
||||
|
||||
|
||||
// Code to print the first 8 bytes
|
||||
// System.out.print(ind);
|
||||
// for(int i=0; i<8; i++) {
|
||||
// short bv = _docstream[i+pos];
|
||||
// if(bv < 0) { bv += 256; }
|
||||
// System.out.print(i + "=" + bv + " (" + makeHex(bv) + ") ");
|
||||
// }
|
||||
// System.out.println("");
|
||||
|
||||
// Record specific dumps
|
||||
if(type == 61453l) {
|
||||
// Text Box. Print out first 8 bytes of data, then 8 4 later
|
||||
System.out.print(ind);
|
||||
for(int i=8; i<16; i++) {
|
||||
short bv = _docstream[i+pos];
|
||||
if(bv < 0) { bv += 256; }
|
||||
System.out.print(i + "=" + bv + " (" + makeHex(bv) + ") ");
|
||||
}
|
||||
System.out.println("");
|
||||
System.out.print(ind);
|
||||
for(int i=20; i<28; i++) {
|
||||
short bv = _docstream[i+pos];
|
||||
if(bv < 0) { bv += 256; }
|
||||
System.out.print(i + "=" + bv + " (" + makeHex(bv) + ") ");
|
||||
}
|
||||
System.out.println("");
|
||||
}
|
||||
|
||||
|
||||
// Blank line before next entry
|
||||
System.out.println("");
|
||||
|
||||
// Look in children if we are a container
|
||||
if(type == 61443l || type == 61444l) {
|
||||
walkEscherBasic((indent+3), pos+8, (int)atomlen);
|
||||
}
|
||||
|
||||
// Keep going if not yet at end
|
||||
if(atomlen < len) {
|
||||
int atomleni = (int)atomlen;
|
||||
walkEscherBasic(indent, pos+atomleni+8, len-atomleni-8);
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user