Various bug fixes, and hpbf updates

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@690517 13f79535-47bb-0310-9956-ffa450edef68
2008-08-30 14:47:33 +00:00 · 2008-08-30 14:47:33 +00:00 · 4b40a5cb0c
commit 4b40a5cb0c
parent 988b6c869f
16 changed files with 242 additions and 24 deletions
--- a/src/documentation/content/xdocs/changes.xml
+++ b/src/documentation/content/xdocs/changes.xml
@ -37,6 +37,11 @@

 		<!-- Don't forget to update status.xml too! -->
        <release version="3.1.1-alpha1" date="2008-??-??">
+           <action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
+           <action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>
+           <action dev="POI-DEVELOPERS" type="fix">Avoid NPE in EscherBSERecord on older escher records</action>
+           <action dev="POI-DEVELOPERS" type="add">Basic text extraction support in HPBF</action>
+           <action dev="POI-DEVELOPERS" type="add">Initial, low level support for Publisher files, in the form of HPBF</action>
           <action dev="POI-DEVELOPERS" type="fix">45699 - Fix RowRecordsAggregate to tolerate intervening MERGEDCELLS records</action>
           <action dev="POI-DEVELOPERS" type="fix">45698 - Fix LinkTable to tolerate multiple EXTERNSHEET records</action>
           <action dev="POI-DEVELOPERS" type="fix">45682 - Fix for cloning of CFRecordsAggregate</action>
--- a/src/documentation/content/xdocs/hpbf/file-format.xml
+++ b/src/documentation/content/xdocs/hpbf/file-format.xml
@ -165,6 +165,12 @@ PL   62 1a 00 00 48 00 00 00 // PL   from: 1a62 (6754), len: 48 (72)

 (the text will then start)
 </source>
+		<p>We think that the first 4 bytes of text describe
+		 the function of the data at the offset. The first short is
+		 then the count of that type, e.g. the 2nd will have 1. We
+		 think that the second 4 bytes of text describe the format
+		 of the data block at the offset. The format of the text block
+		 is easy, but we're still trying to figure out the others.</p>
 		</section>
 	</body>
 </document>
--- a/src/documentation/content/xdocs/hpbf/index.xml
+++ b/src/documentation/content/xdocs/hpbf/index.xml
@ -33,11 +33,20 @@
            <title>Overview</title>

            <p>HPBF is the POI Project's pure Java implementation of the Visio file format.</p>
-            <p>Currently, HPBF is in the experimental stage, while we try
-              to figure out the file format. Our initial aim is to provide
-              a text extractor for the format, with low level code following
-              after that if demand and developer interest warrant it.</p>
-			<p>At this time, there is no <em>usermodel</em> api or similar.</p>
+            <p>Currently, HPBF is in an early stage, whilst we try to
+              figure out the file format. So far, we have basic text
+              extraction support, and are able to read some parts within
+              the file. Writing is not yet supported, as we are unable
+              to make sense of the Contents stream, which we think has
+              lots of offsets to other parts of the file.</p>
+            <p>Our initial aim is to provide a text extractor for the format
+              (now done), and be able to extract hyperlinks from within
+              the document (not yet supported). Additional low level
+              code to process the file format may follow, if
+              demand and developer interest warrant it.</p>
+			<p>At this time, there is no <em>usermodel</em> api or similar.
+              There is only low level support for certain parts of
+              the file, but by no means all of it.</p>
            <p>Our current understanding of the file format is documented
              <link href="file-format.html">here</link>.</p>
            <note> 
--- a/src/documentation/content/xdocs/index.xml
+++ b/src/documentation/content/xdocs/index.xml
@ -165,8 +165,8 @@
        </section>
        <section><title>HPBF for Publisher Documents</title>
 	<p>HPBF is our port of the Microsoft Publisher 98(-2007) file format to pure
-	  Java. At the moment, we are still figuring out the file format, but we hope
-      to have simple text extraction shortly. Please see <link
+	  Java. It currently only supports reading at a low level for around
+      half of the file parts, and simple text extraction.  Please see <link
 	    href="./hpbf/index.html">the HPBF project page for more
 	    information</link>.</p>
        </section>
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@ -34,6 +34,11 @@
 	<!-- Don't forget to update changes.xml too! -->
    <changes>
        <release version="3.1.1-alpha1" date="2008-??-??">
+           <action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
+           <action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>
+           <action dev="POI-DEVELOPERS" type="fix">Avoid NPE in EscherBSERecord on older escher records</action>
+           <action dev="POI-DEVELOPERS" type="add">Basic text extraction support in HPBF</action>
+           <action dev="POI-DEVELOPERS" type="add">Initial, low level support for Publisher files, in the form of HPBF</action>
           <action dev="POI-DEVELOPERS" type="fix">45699 - Fix RowRecordsAggregate to tolerate intervening MERGEDCELLS records</action>
           <action dev="POI-DEVELOPERS" type="fix">45698 - Fix LinkTable to tolerate multiple EXTERNSHEET records</action>
           <action dev="POI-DEVELOPERS" type="fix">45682 - Fix for cloning of CFRecordsAggregate</action>
--- a/src/java/org/apache/poi/ddf/EscherBSERecord.java
+++ b/src/java/org/apache/poi/ddf/EscherBSERecord.java
@ -87,9 +87,10 @@ public class EscherBSERecord
        field_10_unused2 = data[pos + 34];
        field_11_unused3 = data[pos + 35];
        bytesRemaining -= 36;
+        
        int bytesRead = 0;
-        if (bytesRemaining > 0)
-        {
+        if (bytesRemaining > 0) {
+        	// Some older escher formats skip this last record
            field_12_blipRecord = (EscherBlipRecord) recordFactory.createRecord( data, pos + 36 );
            bytesRead = field_12_blipRecord.fillFields( data, pos + 36, recordFactory );
        }
@ -168,7 +169,16 @@ public class EscherBSERecord
     */
    public int getRecordSize()
    {
-        return 8 + 1 + 1 + 16 + 2 + 4 + 4 + 4 + 1 + 1 + 1 + 1 + field_12_blipRecord.getRecordSize() + (remainingData == null ? 0 : remainingData.length);
+    	int field_12_size = 0;
+    	if(field_12_blipRecord != null) {
+    		field_12_size = field_12_blipRecord.getRecordSize(); 
+    	}
+    	int remaining_size = 0;
+    	if(remainingData != null) {
+    		remaining_size = remainingData.length;
+    	}
+        return 8 + 1 + 1 + 16 + 2 + 4 + 4 + 4 + 1 + 1 +
+            1 + 1 + field_12_size + remaining_size;
    }

    /**
--- a/src/java/org/apache/poi/hssf/usermodel/HeaderFooter.java
+++ b/src/java/org/apache/poi/hssf/usermodel/HeaderFooter.java
@ -247,6 +247,11 @@ public abstract class HeaderFooter {
    public static String stripFields(String text) {
    	int pos;
    	
+    	// Check we really got something to work on
+    	if(text == null || text.length() == 0) {
+    		return text;
+    	}
+    	
    	// Firstly, do the easy ones which are static
    	for(int i=0; i<Field.ALL_FIELDS.size(); i++) {
    		String seq = ((Field)Field.ALL_FIELDS.get(i)).sequence;
@ -257,6 +262,7 @@ public abstract class HeaderFooter {
    	}
    	
    	// Now do the tricky, dynamic ones
+    	// These are things like font sizes and font names
    	text = text.replaceAll("\\&\\d+", "");
    	text = text.replaceAll("\\&\".*?,.*?\"", "");
    	
@ -292,9 +298,9 @@ public abstract class HeaderFooter {
    public static final Field TIME_FIELD = new Field("&T");
    public static final Field NUM_PAGES_FIELD = new Field("&N");
    
-    public static final Field PICTURE_FIELD = new Field("&P");
+    public static final Field PICTURE_FIELD = new Field("&G");
    
-    public static final PairField BOLD_FIELD = new PairField("&B"); // PAID
+    public static final PairField BOLD_FIELD = new PairField("&B");
    public static final PairField ITALIC_FIELD = new PairField("&I");
    public static final PairField STRIKETHROUGH_FIELD = new PairField("&S");
    public static final PairField SUBSCRIPT_FIELD = new PairField("&Y");
--- a/src/scratchpad/src/org/apache/poi/hpbf/dev/PLCDumper.java
+++ b/src/scratchpad/src/org/apache/poi/hpbf/dev/PLCDumper.java
@ -0,0 +1,90 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hpbf.dev;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.ddf.DefaultEscherRecordFactory;
+import org.apache.poi.ddf.EscherRecord;
+import org.apache.poi.hpbf.HPBFDocument;
+import org.apache.poi.hpbf.model.QuillContents;
+import org.apache.poi.hpbf.model.qcbits.QCBit;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.HexDump;
+import org.apache.poi.util.LittleEndian;
+import org.apache.poi.util.StringUtil;
+
+/**
+ * For dumping out the PLC contents of QC Bits of a
+ *  HPBF (Publisher) file, while we try to figure out
+ *  what the format of them is.
+ */
+public class PLCDumper {
+	private HPBFDocument doc;
+	private QuillContents qc;
+	
+	public PLCDumper(HPBFDocument doc) {
+		this.doc = doc;
+		qc = doc.getQuillContents();
+	}
+	public PLCDumper(POIFSFileSystem fs) throws IOException {
+		this(new HPBFDocument(fs));
+	}
+	public PLCDumper(InputStream inp) throws IOException {
+		this(new POIFSFileSystem(inp));
+	}
+	
+	public static void main(String[] args) throws Exception {
+		if(args.length < 1) {
+			System.err.println("Use:");
+			System.err.println("  PLCDumper <filename>");
+			System.exit(1);
+		}
+		PLCDumper dump = new PLCDumper(
+				new FileInputStream(args[0])
+		);
+		
+		System.out.println("Dumping " + args[0]);
+		dump.dumpPLC();
+	}
+	
+	private void dumpPLC() {	
+		QuillContents qc = doc.getQuillContents();
+		QCBit[] bits = qc.getBits();
+		
+		for(int i=0; i<bits.length; i++) {
+			if(bits[i] == null) continue;
+			if(bits[i].getBitType().equals("PLC ")) {
+				dumpBit(bits[i], i);
+			}
+		}
+	}
+	
+	private void dumpBit(QCBit bit, int index) {
+		System.out.println("");
+		System.out.println("Dumping " + bit.getBitType() + " bit at " + index);
+		System.out.println("  Is a " + bit.getThingType() + ", number is " + bit.getOptA());
+		System.out.println("  Starts at " + bit.getDataOffset() + " (" + Integer.toHexString(bit.getDataOffset()) + ")");
+		System.out.println("  Runs for  " + bit.getLength() + " (" + Integer.toHexString(bit.getLength()) + ")");
+		
+		System.out.println(HexDump.dump(bit.getData(), 0, 0));
+	}
+}
--- a/src/scratchpad/src/org/apache/poi/hpbf/model/QuillContents.java
+++ b/src/scratchpad/src/org/apache/poi/hpbf/model/QuillContents.java
@ -70,6 +70,7 @@ public final class QuillContents extends HPBFPart {
 				bits[i].setOptA(optA);
 				bits[i].setOptB(optB);
 				bits[i].setOptC(optC);
+				bits[i].setDataOffset(from);
 			} else {
 				// Doesn't have data
 			}
--- a/src/scratchpad/src/org/apache/poi/hpbf/model/qcbits/QCBit.java
+++ b/src/scratchpad/src/org/apache/poi/hpbf/model/qcbits/QCBit.java
@ -28,6 +28,8 @@ public abstract class QCBit {
 	protected int optB;
 	protected int optC;
 	
+	protected int dataOffset;
+	
 	public QCBit(String thingType, String bitType, byte[] data) {
 		this.thingType = thingType;
 		this.bitType = bitType;
@ -66,4 +68,15 @@ public abstract class QCBit {
 	public void setOptC(int optC) {
 		this.optC = optC;
 	}
+
+	public int getDataOffset() {
+		return dataOffset;
+	}
+	public void setDataOffset(int offset) {
+		this.dataOffset = offset;
+	}
+	
+	public int getLength() {
+		return data.length;
+	}
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java
@ -167,6 +167,13 @@ public class HeaderStories {
 		if(stripFields) {
 			return Range.stripFields(text);
 		}
+		// If you create a header/footer, then remove it again, word
+		//  will leave \r\r. Turn these back into an empty string,
+		//  which is more what you'd expect
+		if(text.equals("\r\r")) {
+			return "";
+		}
+		
 		return text;
 	}
 	
--- a/src/scratchpad/testcases/org/apache/poi/hpbf/extractor/TextPublisherTextExtractor.java
+++ b/src/scratchpad/testcases/org/apache/poi/hpbf/extractor/TextPublisherTextExtractor.java
@ -59,11 +59,11 @@ public class TextPublisherTextExtractor extends TestCase {
 		
 		assertEquals(
 "This is some text on the first page\n" +
-"It’s in times new roman, font size 10, all normal\n" +
+"It\u2019s in times new roman, font size 10, all normal\n" +
 "" +
 "This is in bold and italic\n" +
-"It’s Arial, 20 point font\n" +
-"It’s in the second textbox on the first page\n" +
+"It\u2019s Arial, 20 point font\n" +
+"It\u2019s in the second textbox on the first page\n" +
 "" +
 "This is the second page\n\n" +
 "" +
@ -102,4 +102,36 @@ public class TextPublisherTextExtractor extends TestCase {
 				, text
 		);
 	}
+	
+	/**
+	 * We have the same file saved for Publisher 98, Publisher
+	 *  2000 and Publisher 2007. Check they all agree.
+	 * @throws Exception
+	 */
+	public void testMultipleVersions() throws Exception {
+		File f;
+		HPBFDocument doc;
+		
+		f = new File(dir, "Sample.pub");
+		doc = new HPBFDocument(
+				new FileInputStream(f)
+		);
+		String s2007 = (new PublisherTextExtractor(doc)).getText();
+		
+		f = new File(dir, "Sample2000.pub");
+		doc = new HPBFDocument(
+				new FileInputStream(f)
+		);
+		String s2000 = (new PublisherTextExtractor(doc)).getText();
+		
+		f = new File(dir, "Sample98.pub");
+		doc = new HPBFDocument(
+				new FileInputStream(f)
+		);
+		String s98 = (new PublisherTextExtractor(doc)).getText();
+		
+		// Check they all agree
+		assertEquals(s2007, s2000);
+		assertEquals(s2007, s98);
+	}
 }
--- a/src/scratchpad/testcases/org/apache/poi/hpbf/model/TestEscherParts.java
+++ b/src/scratchpad/testcases/org/apache/poi/hpbf/model/TestEscherParts.java
@ -47,4 +47,38 @@ public class TestEscherParts extends TestCase {
 		
 		// TODO - check the contents
 	}
+	
+	public void testComplex() throws Exception {
+		File f = new File(dir, "SampleBrochure.pub");
+		HPBFDocument doc = new HPBFDocument(
+				new FileInputStream(f)
+		);
+
+		EscherStm es = doc.getEscherStm();
+		EscherDelayStm eds = doc.getEscherDelayStm();
+		
+		assertNotNull(es);
+		assertNotNull(eds);
+		
+		assertEquals(30, es.getEscherRecords().length);
+		assertEquals(19, eds.getEscherRecords().length);
+		
+		// TODO - check contents
+		
+		
+		// Now do another complex file
+		f = new File(dir, "SampleNewsletter.pub");
+		doc = new HPBFDocument(
+				new FileInputStream(f)
+		);
+
+		es = doc.getEscherStm();
+		eds = doc.getEscherDelayStm();
+		
+		assertNotNull(es);
+		assertNotNull(eds);
+		
+		assertEquals(51, es.getEscherRecords().length);
+		assertEquals(92, eds.getEscherRecords().length);
+	}
 }
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
@ -183,7 +183,7 @@ public class TestWordExtractor extends TestCase {
    	extractor = new WordExtractor(doc);
    	
    	assertEquals(
-    			"\n\nThis is a simple header, with a \u20ac euro symbol in it.\n\n",
+    			"This is a simple header, with a \u20ac euro symbol in it.\n\n",
    			extractor.getHeaderText()
    	);
    	text = extractor.getText();
@ -217,7 +217,7 @@ public class TestWordExtractor extends TestCase {
    	extractor = new WordExtractor(doc);
    	
    	assertEquals(
-    			"\n\nThe footer, with Moli\u00e8re, has Unicode in it.\n",
+    			"The footer, with Moli\u00e8re, has Unicode in it.\n",
    			extractor.getFooterText()
    	);
    	text = extractor.getText();
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHeaderStories.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHeaderStories.java
@ -123,7 +123,7 @@ public class TestHeaderStories extends TestCase {
    	
 		assertEquals("", hs.getFirstHeader());
 		assertEquals("", hs.getEvenHeader());
-		assertEquals("\r\r", hs.getOddHeader());
+		assertEquals("", hs.getOddHeader()); // Was \r\r but gets emptied

 		
 		assertEquals("", hs.getFirstFooter());
@ -181,13 +181,13 @@ public class TestHeaderStories extends TestCase {
    public void testUnicode() throws Exception {
    	HeaderStories hs = new HeaderStories(unicode);
    	
-		assertEquals("\r\r", hs.getFirstHeader());
-		assertEquals("\r\r", hs.getEvenHeader());
+		assertEquals("", hs.getFirstHeader());
+		assertEquals("", hs.getEvenHeader());
 		assertEquals("This is a simple header, with a \u20ac euro symbol in it.\r\r\r", hs.getOddHeader());

 		
-		assertEquals("\r\r", hs.getFirstFooter());
-		assertEquals("\r\r", hs.getEvenFooter());
+		assertEquals("", hs.getFirstFooter());
+		assertEquals("", hs.getEvenFooter());
 		assertEquals("The footer, with Moli\u00e8re, has Unicode in it.\r\r", hs.getOddFooter());
    }
    
--- a/src/testcases/org/apache/poi/hssf/usermodel/TestHSSFHeaderFooter.java
+++ b/src/testcases/org/apache/poi/hssf/usermodel/TestHSSFHeaderFooter.java
@ -87,8 +87,8 @@ public final class TestHSSFHeaderFooter extends TestCase {
    	assertTrue(head.areFieldsStripped());
    	
    	// Now even more complex
-    	head.setCenter("HEADER TEXT &P&N&D&T&Z&F&F&A&G");
-    	assertEquals("HEADER TEXT &G", head.getCenter());
+    	head.setCenter("HEADER TEXT &P&N&D&T&Z&F&F&A&G&X END");
+    	assertEquals("HEADER TEXT  END", head.getCenter());
 	}

 	/**