Record level dumper of powerpoint files, and their constituent escher parts

Prints out in indented text what occurs where (Code has existed for a while, but the lookup table used to be just a cut'n'paste from something of unknown copyright status. Now converted to using Yegor's new RecordTypes class, so can be released) git-svn-id: https://svn.apache.org/repos/asf/jakarta/poi/trunk@353743 13f79535-47bb-0310-9956-ffa450edef68
2005-07-08 15:57:07 +00:00 · 2005-07-08 15:57:07 +00:00 · e00185760d
commit e00185760d
parent 3a1dcb4a85
1 changed files with 412 additions and 0 deletions
--- a/src/scratchpad/src/org/apache/poi/hslf/dev/SlideShowDumper.java
+++ b/src/scratchpad/src/org/apache/poi/hslf/dev/SlideShowDumper.java
@ -0,0 +1,412 @@
+
+/* ====================================================================
+   Copyright 2002-2004   Apache Software Foundation
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+        
+
+
+package org.apache.poi.hslf.dev;
+
+import java.util.*;
+import java.io.*;
+
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSDocument;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+
+import org.apache.poi.ddf.*;
+import org.apache.poi.hslf.record.RecordTypes;
+
+import org.apache.poi.util.LittleEndian;
+
+/**
+ * This class provides a way to "peek" inside a powerpoint file. It
+ *  will print out all the record types it finds, and for those it knows
+ *  aren't atoms, what they contain.
+ *
+ * To figure out what things are, and whether they are atoms or not, it
+ *  uses the list in hslf.record.RecordTypes
+ *
+ * To peek inside PPDrawings, which hold Escher drawings, we use the 
+ *  DDF package from POI (but we can fake it by using the Escher listings
+ *  from hslf.record.RecordTypes also)
+ * 
+ * @author Nick Burch
+ */
+
+public class SlideShowDumper
+{
+  /** Stream we were built from, if any; only set by the InputStream constructor and closed by close() */
+  private InputStream istream;
+  /** The OLE2 filesystem the document stream was read from; dropped by close() */
+  private POIFSFileSystem filesystem;
+
+  /** Raw bytes of the "PowerPoint Document" stream, which all the walk methods index into */
+  private byte[] _docstream;
+
+  /** Do we try to use DDF to understand the escher objects? */
+  private boolean ddfEscher = false;
+  /** Do we use our own built-in basic escher groker to understand the escher objects? */
+  private boolean basicEscher = false;
+
+  /**
+   *  right now this function takes one parameter: a ppt file, and outputs
+   *  a dump of what it contains
+   */
+  public static void main(String args[]) throws IOException
+  {
+	if(args.length == 0) {
+		System.err.println("Useage: SlideShowDumper [-escher|-basicescher] <filename>");
+		return;
+	}
+
+	String filename = args[0];
+	if(args.length > 1) {
+		filename = args[1];
+	}
+
+	SlideShowDumper foo = new SlideShowDumper(filename);
+
+	if(args.length > 1) {
+		if(args[0].equalsIgnoreCase("-escher")) {
+			foo.setDDFEscher(true);
+		} else {
+			foo.setBasicEscher(true);
+		}
+	}
+
+	foo.printDump();
+	foo.close();
+  }
+
+
+  /**
+   * Constructs a Powerpoint dump from fileName. Opens the file and
+   * hands the resulting stream on to the InputStream constructor; nothing
+   * is printed until printDump() is called.
+   *
+   * NOTE(review): if the delegated constructor throws, the FileInputStream
+   * opened here does not appear to be closed by anything - confirm.
+   *
+   * @param fileName The name of the file to read.
+   * @throws IOException if there is a problem while parsing the document.
+   */
+  public SlideShowDumper(String fileName) throws IOException
+  {
+  	this(new FileInputStream(fileName));
+  }
+  
+  /**
+   * Constructs a Powerpoint dump from an input stream. Wraps the stream
+   * in a POIFSFileSystem and hands that on to the POIFSFileSystem
+   * constructor; nothing is printed until printDump() is called.
+   *
+   * @param inputStream the source of the data
+   * @throws IOException if there is a problem while parsing the document.
+   */
+  public SlideShowDumper(InputStream inputStream) throws IOException
+  {
+	// Do the OLE2 stuff: build the filesystem and load the document stream from it
+	this(new POIFSFileSystem(inputStream));
+	// Hold on to the stream, so that close() is able to close it
+	istream = inputStream;
+  }
+
+  /**
+   * Constructs a Powerpoint dump from a POIFS Filesystem. Loads the whole
+   * of the "PowerPoint Document" stream into memory, ready for
+   * printDump() to walk over.
+   *
+   * @param filesystem the POIFS FileSystem to read from
+   * @throws IOException if there is a problem reading the document stream,
+   *  including it being shorter than its directory entry claims
+   */
+  public SlideShowDumper(POIFSFileSystem filesystem) throws IOException
+  {
+	this.filesystem = filesystem;
+
+	// Get the main document stream's directory entry, so we know its size
+	DocumentEntry docProps =
+		(DocumentEntry)filesystem.getRoot().getEntry("PowerPoint Document");
+
+	// Grab the document stream. A single read() call is allowed to return
+	//  fewer bytes than asked for, so keep going until the buffer is full
+	_docstream = new byte[docProps.getSize()];
+	InputStream docIn = filesystem.createDocumentInputStream("PowerPoint Document");
+	try {
+		int filled = 0;
+		while(filled < _docstream.length) {
+			int got = docIn.read(_docstream, filled, _docstream.length - filled);
+			if(got < 0) {
+				throw new EOFException(
+					"PowerPoint Document stream ended after " + filled +
+					" bytes, but " + _docstream.length + " were expected"
+				);
+			}
+			filled += got;
+		}
+	} finally {
+		docIn.close();
+	}
+  }
+
+  /**
+   * Selects whether Escher records get dumped via DDF. The two Escher
+   *  modes are kept mutually exclusive, so passing false here turns
+   *  the built in basic groker on instead.
+   *
+   * @param grok true to use DDF, false to use the basic groker
+   */
+  public void setDDFEscher(boolean grok) {
+	this.basicEscher = !grok;
+	this.ddfEscher = grok;
+  }
+
+  /**
+   * Selects whether Escher records get dumped via our built in basic
+   *  groker. The two Escher modes are kept mutually exclusive, so
+   *  passing false here turns DDF on instead.
+   *
+   * @param grok true to use the basic groker, false to use DDF
+   */
+  public void setBasicEscher(boolean grok) {
+	this.ddfEscher = !grok;
+	this.basicEscher = grok;
+  }
+
+  /**
+   * Shuts things down. Closes the input stream we were built from (if
+   *  there was one), then lets go of the filesystem.
+   *
+   * @throws IOException if closing the underlying stream fails
+   */
+  public void close() throws IOException
+  {
+	InputStream source = this.istream;
+	if(source != null) {
+		source.close();
+	}
+
+	// Only reached if the close above didn't throw
+	this.filesystem = null;
+  }
+
+
+  /**
+   * Prints a dump of the whole document stream to standard out, by
+   *  walking the record tree from offset 0 to the end of the stream.
+   */
+  public void printDump() {
+	// The format of a record in a powerpoint file is:
+	//   <little endian 2 byte "info">
+	//   <little endian 2 byte "type">
+	//   <little endian 4 byte "length">
+	// If it has a zero length, following it will be another record
+	//		<xx xx yy yy 00 00 00 00> <xx xx yy yy zz zz zz zz>
+	// If it has a length, depending on its type it may have children or data
+	// If it has children, these will follow straight away
+	//		<xx xx yy yy zz zz zz zz <xx xx yy yy zz zz zz zz>>
+	// If it has data, this will come straight after, and run for the length
+	//      <xx xx yy yy zz zz zz zz dd dd dd dd dd dd dd>
+	// All lengths given exclude the 8 byte record header
+	// (Data records are known as Atoms)
+
+	// Document should start with:
+	//   0F 00 E8 03 ## ## ## ##
+    //     (type 1000 = document, info 00 0f is normal, rest is document length)
+	//   01 00 E9 03 28 00 00 00
+	//     (type 1001 = document atom, info 00 01 normal, 28 bytes long)
+
+	// When parsing a document, look to see if you know about the type
+	//  of the current record. If you know it's a type that has children,
+	//  process the record's data area looking for more records
+	// If you know about the type and it doesn't have children, either do
+	//  something with the data (eg TextRun) or skip over it
+	// Otherwise, check the first byte. If you do a BINARY_AND on it with
+	//  0x0f (15) and get back 0x0f, you know it has children. Otherwise
+	//  it doesn't
+
+	// Start at the top level: no indent, from the first byte to the last
+	walkTree(0,0,_docstream.length);
+}
+
+/**
+ * Formats a short as upper case hex, padded with a leading zero so
+ *  that there are always at least two digits.
+ * The value is treated as an unsigned 16 bit quantity, so a negative
+ *  short gives four digits (eg FF80) rather than being sign extended
+ *  out to the eight digits of an int.
+ *
+ * @param s the value to format
+ * @return the value in hex, two to four digits long
+ */
+public String makeHex(short s) {
+	// Mask off the sign extension that widening to an int would add
+	String hex = Integer.toHexString(s & 0xFFFF).toUpperCase();
+	if(hex.length() == 1) { return "0" + hex; }
+	return hex;
+}
+/**
+ * Formats an int as upper case hex, padded with leading zeros so that
+ *  there are always at least four digits. Longer values are returned
+ *  at their natural length.
+ *
+ * @param i the value to format
+ * @return the value in hex, at least four digits long
+ */
+public String makeHex(int i) {
+	StringBuffer padded = new StringBuffer(Integer.toHexString(i).toUpperCase());
+	while(padded.length() < 4) {
+		padded.insert(0, '0');
+	}
+	return padded.toString();
+}
+
+/**
+ * Walks over a run of sibling records in the document stream, printing
+ *  the position, type and length of each one, and descending into
+ *  those that are containers.
+ *
+ * A record is taken to be a container if its type is known to
+ *  RecordTypes, is non zero, and the low 4 bits of its first byte are
+ *  all set. The two drawing containers (1035 and 1036) are handed to
+ *  the Escher walkers, if one has been enabled, rather than to this method.
+ *
+ * Lengths found in the file are never trusted to stay inside the
+ *  document stream: the walk stops at the end of the stream, and at
+ *  the first record whose length would take it past the end of this run.
+ *
+ * @param depth number of spaces to indent the output by
+ * @param startPos offset into the document stream of the first record
+ * @param maxLen number of bytes, from startPos, that the records occupy
+ */
+public void walkTree(int depth, int startPos, int maxLen) {
+	int indent = depth;
+
+	// The indent is the same for every record at this level, so build it once
+	StringBuffer pad = new StringBuffer();
+	for(int i=0; i<indent; i++) { pad.append(' '); }
+	String ind = pad.toString();
+
+	// Where this run stops. Done as a long so that a silly maxLen can't
+	//  overflow, and capped so that we never index past the stream
+	long limit = Math.min((long)startPos + (long)maxLen, (long)_docstream.length);
+
+	int pos = startPos;
+	while((long)pos + 8 <= limit) {
+		long type = LittleEndian.getUShort(_docstream,pos+2);
+		long len = LittleEndian.getUInt(_docstream,pos+4);
+		byte opt = _docstream[pos];
+
+		System.out.println(ind + "At position " + pos + " (" + makeHex(pos) + "):");
+		System.out.println(ind + "Type is " + type + " (" + makeHex((int)type) + "), len is " + len + " (" + makeHex((int)len) + ")");
+
+		// See if we know about the type of it
+		String recordName = RecordTypes.recordName((int)type);
+
+		// Jump over header, and think about going on more
+		pos += 8;
+
+		// len is an unsigned 32 bit value straight from the file. Never let
+		//  the data area we hand on run beyond the end of the stream
+		int dataLen = (int)Math.min(len, (long)(_docstream.length - pos));
+
+		if(recordName != null) {
+			System.out.println(ind + "That's a " + recordName);
+
+			// Now check if it's a container or not
+			int container = (int)opt & 0x0f;
+
+			if(type == 0L || (container != 0x0f)) {
+				System.out.println();
+			} else if (type == 1035l || type == 1036l) {
+				// Special Handling of 1035=PPDrawingGroup and 1036=PPDrawing
+				System.out.println();
+
+				// The escher walkers are started a further 8 bytes into the data
+				if(ddfEscher) {
+					walkEscherDDF((indent+3),pos+8,dataLen-8);
+				} else if(basicEscher) {
+					walkEscherBasic((indent+3),pos+8,dataLen-8);
+				}
+			} else {
+				// General container record handling code
+				System.out.println();
+				walkTree((indent+2),pos,dataLen);
+			}
+		} else {
+			System.out.println(ind + "** unknown record **");
+			System.out.println();
+		}
+
+		// Skip over the data. A record that claims to run past the end of
+		//  this run finishes the walk, rather than wrapping the position
+		long next = (long)pos + len;
+		if(next > limit) { break; }
+		pos = (int)next;
+	}
+}
+
+  /**
+   * Use the DDF code to walk the Escher records.
+   *
+   * Works through the run of sibling records that starts at pos and is
+   *  len bytes long. For each one, has DDF build a record from the
+   *  bytes, prints the header fields and DDF's description, and
+   *  descends into the children of containers.
+   *
+   * @param indent number of spaces to indent the output by
+   * @param pos offset into the document stream of the first record
+   * @param len number of bytes, from pos, that the records occupy
+   */
+  public void walkEscherDDF(int indent, int pos, int len) {
+	if(len < 8) { return; }
+
+	// The indent is the same for every record at this level, so build it once
+	StringBuffer pad = new StringBuffer();
+	for(int i=0; i<indent; i++) { pad.append(' '); }
+	String ind = pad.toString();
+
+	// One pass per sibling record, until what's left can't hold a header
+	while(len >= 8) {
+		byte[] contents = new byte[len];
+		System.arraycopy(_docstream,pos,contents,0,len);
+		DefaultEscherRecordFactory erf = new DefaultEscherRecordFactory();
+		EscherRecord record = erf.createRecord(contents,0);
+
+		// For now, try filling in the fields
+		record.fillFields(contents,0,erf);
+
+		long atomType = LittleEndian.getUShort(contents,2);
+		// This lacks the 8 byte header size. The length field is 4 bytes
+		//  wide, so must be read as such or long records get truncated
+		long atomLen = LittleEndian.getUInt(contents,4);
+		// This (should) include the 8 byte header size
+		int recordLen = record.getRecordSize();
+
+		System.out.println(ind + "At position " + pos + " (" + makeHex(pos) + "):");
+		System.out.println(ind + "Type is " + atomType + " (" + makeHex((int)atomType) + "), len is " + atomLen + " (" + makeHex((int)atomLen) + ") (" + (atomLen+8) + ") - record claims " + recordLen);
+
+		// Check for corrupt / lying ones
+		if(recordLen != 8 && (recordLen != (atomLen+8))) {
+			System.out.println(ind + "** Atom length of " + atomLen + " (" + (atomLen+8) + ") doesn't match record length of " + recordLen);
+		}
+
+		// Children can't occupy more than is left of this run after our header
+		int childLen = (int)Math.min(atomLen, (long)(len - 8));
+
+		// Print the record's details
+		if(record instanceof EscherContainerRecord) {
+			EscherContainerRecord ecr = (EscherContainerRecord)record;
+			System.out.println(ind + ecr.toString(false));
+			walkEscherDDF((indent+3), pos + 8, childLen );
+		} else {
+			System.out.println(ind + record.toString());
+		}
+
+		// Handle records that seem to lie
+		if(atomType == 61451l) {
+			// Normally claims a size of 8
+			recordLen = (int)atomLen + 8;
+		}
+		if(atomType == 61453l) {
+			// Returns EscherContainerRecord, but really msofbtClientTextbox
+			recordLen = (int)atomLen + 8;
+			record.fillFields( contents, 0, erf );
+			if(! (record instanceof EscherTextboxRecord)) {
+				System.out.println(ind + "** Really a msofbtClientTextbox !");
+			}
+		}
+
+		// Decide on what to do, based on how the lengths match up
+		if(recordLen == 8 && atomLen > 8 ) {
+			// Assume it has children, rather than being corrupted
+			walkEscherDDF((indent+3), pos + 8, childLen );
+		}
+
+		// Either way, wind on our length + our header. A record that claims
+		//  to run past the end of this run is the last one we can look at
+		long consumed = atomLen + 8;
+		if(consumed > len) { break; }
+		pos += (int)consumed;
+		len -= (int)consumed;
+	}
+  }
+
+  /**
+   * Use the basic record format groking code to walk the Escher records.
+   *
+   * Works through the run of sibling records that starts at pos and is
+   *  len bytes long. For each one, prints the position, type and length,
+   *  plus the name if RecordTypes knows it, and descends into the two
+   *  container types handled here (61443 and 61444).
+   *
+   * @param indent number of spaces to indent the output by
+   * @param pos offset into the document stream of the first record
+   * @param len number of bytes, from pos, that the records occupy
+   */
+  public void walkEscherBasic(int indent, int pos, int len) {
+	if(len < 8) { return; }
+
+	// The indent is the same for every record at this level, so build it once
+	StringBuffer pad = new StringBuffer();
+	for(int i=0; i<indent; i++) { pad.append(' '); }
+	String ind = pad.toString();
+
+	// One pass per sibling record, until what's left can't hold a header
+	while(len >= 8) {
+		long type = LittleEndian.getUShort(_docstream,pos+2);
+		long atomlen = LittleEndian.getUInt(_docstream,pos+4);
+		String typeS = makeHex((int)type);
+
+		System.out.println(ind + "At position " + pos + " (" + makeHex(pos) + "):");
+		System.out.println(ind + "Type is " + type + " (" + typeS + "), len is " + atomlen + " (" + makeHex((int)atomlen) + ")");
+
+		String typeName = RecordTypes.recordName((int)type);
+		if(typeName != null) {
+			System.out.println(ind + "That's an Escher Record: " + typeName);
+		} else {
+			System.out.println(ind + "(Unknown Escher Record)");
+		}
+
+		// (When debugging, the 8 header bytes themselves can be shown
+		//  with dumpRawBytes(ind, pos, 0, 8) )
+
+		// Record specific dumps
+		if(type == 61453l) {
+			// Text Box. Print out first 8 bytes of data, then 8 4 later
+			dumpRawBytes(ind, pos, 8, 16);
+			dumpRawBytes(ind, pos, 20, 28);
+		}
+
+		// Blank line before next entry
+		System.out.println("");
+
+		// Look in children if we are a container
+		if(type == 61443l || type == 61444l) {
+			// Children can't occupy more than is left of this run after our header
+			int childLen = (int)Math.min(atomlen, (long)(len - 8));
+			walkEscherBasic((indent+3), pos+8, childLen);
+		}
+
+		// Keep going if not yet at end
+		if(atomlen >= len) { return; }
+		int atomleni = (int)atomlen;
+		pos += atomleni + 8;
+		len -= atomleni + 8;
+	}
+  }
+
+  /**
+   * Prints, on one indented line, the bytes found from..to (exclusive)
+   *  after pos, each as "offset=value (hex)". Stops early, rather than
+   *  failing, if that would run off the end of the document stream.
+   */
+  private void dumpRawBytes(String ind, int pos, int from, int to) {
+	System.out.print(ind);
+	for(int i=from; i<to && (i+pos) < _docstream.length; i++) {
+		// Bytes are signed in java, we want to show 0-255
+		short bv = (short)(_docstream[i+pos] & 0xFF);
+		System.out.print(i + "=" + bv + " (" + makeHex(bv) + ")  ");
+	}
+	System.out.println("");
+  }
+}