Various bug fixes, and hpbf updates

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@690517 13f79535-47bb-0310-9956-ffa450edef68
2008-08-30 14:47:33 +00:00 · 2008-08-30 14:47:33 +00:00 · 4b40a5cb0c
commit 4b40a5cb0c
parent 988b6c869f
16 changed files with 242 additions and 24 deletions
--- a/src/documentation/content/xdocs/changes.xml
+++ b/src/documentation/content/xdocs/changes.xml
@ -37,6 +37,11 @@
 		<!-- Don't forget to update status.xml too! -->
        <release version="3.1.1-alpha1" date="2008-??-??">
           <action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
           <action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>
           <action dev="POI-DEVELOPERS" type="fix">Avoid NPE in EscherBSERecord on older escher records</action>
           <action dev="POI-DEVELOPERS" type="add">Basic text extraction support in HPBF</action>
           <action dev="POI-DEVELOPERS" type="add">Initial, low level support for Publisher files, in the form of HPBF</action>
           <action dev="POI-DEVELOPERS" type="fix">45699 - Fix RowRecordsAggregate to tolerate intervening MERGEDCELLS records</action>
           <action dev="POI-DEVELOPERS" type="fix">45698 - Fix LinkTable to tolerate multiple EXTERNSHEET records</action>
           <action dev="POI-DEVELOPERS" type="fix">45682 - Fix for cloning of CFRecordsAggregate</action>
--- a/src/documentation/content/xdocs/hpbf/file-format.xml
+++ b/src/documentation/content/xdocs/hpbf/file-format.xml
@ -165,6 +165,12 @@ PL   62 1a 00 00 48 00 00 00 // PL   from: 1a62 (6754), len: 48 (72)
 (the text will then start)
 </source>
 		<p>We think that the first 4 bytes of text describes the
 		 function of the data at the offset. The first short is
 		 then the count of that type, eg the 2nd will have 1. We
 		 think that the second 4 bytes of text describes the format
 		 of data block at the offset. The format of the text block
 		 is easy, but we're still trying to figure out the others.</p>
 		</section>
 	</body>
 </document>
--- a/src/documentation/content/xdocs/hpbf/index.xml
+++ b/src/documentation/content/xdocs/hpbf/index.xml
@ -33,11 +33,20 @@
            <title>Overview</title>
            <p>HPBF is the POI Project's pure Java implementation of the Publisher file format.</p>
-            <p>Currently, HPBF is in the experimental stage, while we try
+            <p>Currently, HPBF is in an early stage, whilst we try to
-              to figure out the file format. Our initial aim is to provide
+              figure out the file format. So far, we have basic text
-              a text extractor for the format, with low level code following
+              extraction support, and are able to read some parts within
-              after that if demand and developer interest warrant it.</p>
+              the file. Writing is not yet supported, as we are unable
-			<p>At this time, there is no <em>usermodel</em> api or similar.</p>
+              to make sense of the Contents stream, which we think has
              lots of offsets to other parts of the file.</p>
            <p>Our initial aim is to provide a text extractor for the format
              (now done), and be able to extract hyperlinks from within
              the document (not yet supported). Additional low level
              code to process the file format may follow, if demand
              and developer interest warrant it.</p>
 			<p>At this time, there is no <em>usermodel</em> api or similar.
              There is only low level support for certain parts of
              the file, but by no means all of it.</p>
            <p>Our current understanding of the file format is documented
              <link href="file-format.html">here</link>.</p>
            <note> 
--- a/src/documentation/content/xdocs/index.xml
+++ b/src/documentation/content/xdocs/index.xml
@ -165,8 +165,8 @@
        </section>
        <section><title>HPBF for Publisher Documents</title>
 	<p>HPBF is our port of the Microsoft Publisher 98(-2007) file format to pure
-	  Java. At the moment, we are still figuring out the file format, but we hope
+	  Java. It currently only supports reading at a low level for around
-      to have simple text extraction shortly. Please see <link
+      half of the file parts, and simple text extraction.  Please see <link
 	    href="./hpbf/index.html">the HPBF project page for more
 	    information</link>.</p>
        </section>
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@ -34,6 +34,11 @@
 	<!-- Don't forget to update changes.xml too! -->
    <changes>
        <release version="3.1.1-alpha1" date="2008-??-??">
           <action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
           <action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>
           <action dev="POI-DEVELOPERS" type="fix">Avoid NPE in EscherBSERecord on older escher records</action>
           <action dev="POI-DEVELOPERS" type="add">Basic text extraction support in HPBF</action>
           <action dev="POI-DEVELOPERS" type="add">Initial, low level support for Publisher files, in the form of HPBF</action>
           <action dev="POI-DEVELOPERS" type="fix">45699 - Fix RowRecordsAggregate to tolerate intervening MERGEDCELLS records</action>
           <action dev="POI-DEVELOPERS" type="fix">45698 - Fix LinkTable to tolerate multiple EXTERNSHEET records</action>
           <action dev="POI-DEVELOPERS" type="fix">45682 - Fix for cloning of CFRecordsAggregate</action>
--- a/src/java/org/apache/poi/ddf/EscherBSERecord.java
+++ b/src/java/org/apache/poi/ddf/EscherBSERecord.java
@ -87,9 +87,10 @@ public class EscherBSERecord
        field_10_unused2 = data[pos + 34];
        field_11_unused3 = data[pos + 35];
        bytesRemaining -= 36;
        int bytesRead = 0;
-        if (bytesRemaining > 0)
+        if (bytesRemaining > 0) {
-        {
+        	// Some older escher formats skip this last record
            field_12_blipRecord = (EscherBlipRecord) recordFactory.createRecord( data, pos + 36 );
            bytesRead = field_12_blipRecord.fillFields( data, pos + 36, recordFactory );
        }
@ -168,7 +169,16 @@ public class EscherBSERecord
     */
    public int getRecordSize()
    {
-        return 8 + 1 + 1 + 16 + 2 + 4 + 4 + 4 + 1 + 1 + 1 + 1 + field_12_blipRecord.getRecordSize() + (remainingData == null ? 0 : remainingData.length);
+    	int field_12_size = 0;
    	if(field_12_blipRecord != null) {
    		field_12_size = field_12_blipRecord.getRecordSize(); 
    	}
    	int remaining_size = 0;
    	if(remainingData != null) {
    		remaining_size = remainingData.length;
    	}
        return 8 + 1 + 1 + 16 + 2 + 4 + 4 + 4 + 1 + 1 +
            1 + 1 + field_12_size + remaining_size;
    }
    /**
--- a/src/java/org/apache/poi/hssf/usermodel/HeaderFooter.java
+++ b/src/java/org/apache/poi/hssf/usermodel/HeaderFooter.java
@ -247,6 +247,11 @@ public abstract class HeaderFooter {
    public static String stripFields(String text) {
    	int pos;
    	// Check we really got something to work on
    	if(text == null || text.length() == 0) {
    		return text;
    	}
    	// Firstly, do the easy ones which are static
    	for(int i=0; i<Field.ALL_FIELDS.size(); i++) {
    		String seq = ((Field)Field.ALL_FIELDS.get(i)).sequence;
@ -257,6 +262,7 @@ public abstract class HeaderFooter {
    	}
    	// Now do the tricky, dynamic ones
    	// These are things like font sizes and font names
    	text = text.replaceAll("\\&\\d+", "");
    	text = text.replaceAll("\\&\".*?,.*?\"", "");
@ -292,9 +298,9 @@ public abstract class HeaderFooter {
    public static final Field TIME_FIELD = new Field("&T");
    public static final Field NUM_PAGES_FIELD = new Field("&N");
-    public static final Field PICTURE_FIELD = new Field("&P");
+    public static final Field PICTURE_FIELD = new Field("&G");
-    public static final PairField BOLD_FIELD = new PairField("&B"); // PAID
+    public static final PairField BOLD_FIELD = new PairField("&B");
    public static final PairField ITALIC_FIELD = new PairField("&I");
    public static final PairField STRIKETHROUGH_FIELD = new PairField("&S");
    public static final PairField SUBSCRIPT_FIELD = new PairField("&Y");
--- a/src/scratchpad/src/org/apache/poi/hpbf/dev/PLCDumper.java
+++ b/src/scratchpad/src/org/apache/poi/hpbf/dev/PLCDumper.java
@ -0,0 +1,90 @@
 /* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
 ==================================================================== */
 package org.apache.poi.hpbf.dev;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import org.apache.poi.ddf.DefaultEscherRecordFactory;
 import org.apache.poi.ddf.EscherRecord;
 import org.apache.poi.hpbf.HPBFDocument;
 import org.apache.poi.hpbf.model.QuillContents;
 import org.apache.poi.hpbf.model.qcbits.QCBit;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.DocumentEntry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.util.HexDump;
 import org.apache.poi.util.LittleEndian;
 import org.apache.poi.util.StringUtil;
 /**
 * Developer tool for dumping out the PLC contents of QC Bits of a
 *  HPBF (Publisher) file, while we try to figure out
 *  what the format of them is.
 *
 * Every bit in the document's Quill Contents whose bit type is
 *  "PLC " is printed to standard out, as a short summary
 *  followed by a hex dump of its data.
 */
 public class PLCDumper {
 	private final HPBFDocument doc;
 	private final QuillContents qc;

 	/**
 	 * Creates a dumper for an already opened document.
 	 * @param doc the Publisher document to dump the PLC bits of
 	 */
 	public PLCDumper(HPBFDocument doc) {
 		this.doc = doc;
 		qc = doc.getQuillContents();
 	}

 	/**
 	 * Creates a dumper for the document held in the given filesystem.
 	 * @param fs the POIFS filesystem containing the Publisher document
 	 * @throws IOException if the document cannot be read
 	 */
 	public PLCDumper(POIFSFileSystem fs) throws IOException {
 		this(new HPBFDocument(fs));
 	}

 	/**
 	 * Creates a dumper for the document read from the given stream.
 	 * The stream is not closed by this constructor.
 	 * @param inp stream positioned at the start of a Publisher file
 	 * @throws IOException if the document cannot be read
 	 */
 	public PLCDumper(InputStream inp) throws IOException {
 		this(new POIFSFileSystem(inp));
 	}

 	/**
 	 * Command line entry point. Expects the name of the file to
 	 *  dump as the first argument; prints usage and exits with
 	 *  status 1 if it is missing.
 	 */
 	public static void main(String[] args) throws Exception {
 		if(args.length < 1) {
 			System.err.println("Use:");
 			System.err.println("  PLCDumper <filename>");
 			System.exit(1);
 		}

 		// The document is loaded by the constructor chain, so the
 		//  file can be released before any dumping is done
 		PLCDumper dump;
 		InputStream inp = new FileInputStream(args[0]);
 		try {
 			dump = new PLCDumper(inp);
 		} finally {
 			inp.close();
 		}

 		System.out.println("Dumping " + args[0]);
 		dump.dumpPLC();
 	}

 	/**
 	 * Walks all the bits of the Quill Contents, and dumps
 	 *  each one that has a bit type of "PLC ". Empty (null)
 	 *  slots are skipped.
 	 */
 	private void dumpPLC() {
 		QCBit[] bits = qc.getBits();

 		for(int i=0; i<bits.length; i++) {
 			if(bits[i] == null) continue;
 			if(bits[i].getBitType().equals("PLC ")) {
 				dumpBit(bits[i], i);
 			}
 		}
 	}

 	/**
 	 * Prints the type, offset and length of one bit (the last two
 	 *  in both decimal and hex), followed by a hex dump of its data.
 	 * @param bit the bit to print
 	 * @param index position of the bit within the Quill Contents bits
 	 */
 	private void dumpBit(QCBit bit, int index) {
 		System.out.println("");
 		System.out.println("Dumping " + bit.getBitType() + " bit at " + index);
 		System.out.println("  Is a " + bit.getThingType() + ", number is " + bit.getOptA());
 		System.out.println("  Starts at " + bit.getDataOffset() + " (" + Integer.toHexString(bit.getDataOffset()) + ")");
 		System.out.println("  Runs for  " + bit.getLength() + " (" + Integer.toHexString(bit.getLength()) + ")");

 		System.out.println(HexDump.dump(bit.getData(), 0, 0));
 	}
 }
--- a/src/scratchpad/src/org/apache/poi/hpbf/model/QuillContents.java
+++ b/src/scratchpad/src/org/apache/poi/hpbf/model/QuillContents.java
@ -70,6 +70,7 @@ public final class QuillContents extends HPBFPart {
 				bits[i].setOptA(optA);
 				bits[i].setOptB(optB);
 				bits[i].setOptC(optC);
 				bits[i].setDataOffset(from);
 			} else {
 				// Doesn't have data
 			}
--- a/src/scratchpad/src/org/apache/poi/hpbf/model/qcbits/QCBit.java
+++ b/src/scratchpad/src/org/apache/poi/hpbf/model/qcbits/QCBit.java
@ -28,6 +28,8 @@ public abstract class QCBit {
 	protected int optB;
 	protected int optC;
 	protected int dataOffset;
 	public QCBit(String thingType, String bitType, byte[] data) {
 		this.thingType = thingType;
 		this.bitType = bitType;
@ -66,4 +68,15 @@ public abstract class QCBit {
 	public void setOptC(int optC) {
 		this.optC = optC;
 	}
 	/**
 	 * Returns the data offset recorded for this bit, as last
 	 *  given to {@link #setDataOffset(int)}.
 	 */
 	public int getDataOffset() {
 		return dataOffset;
 	}
 	/**
 	 * Records the data offset for this bit.
 	 * NOTE(review): presumably the position of this bit's data
 	 *  within the containing stream - confirm against the caller.
 	 * @param offset the offset to record
 	 */
 	public void setDataOffset(int offset) {
 		this.dataOffset = offset;
 	}
 	/**
 	 * Returns the length, in bytes, of the data array held by this bit.
 	 */
 	public int getLength() {
 		return data.length;
 	}
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java
@ -167,6 +167,13 @@ public class HeaderStories {
 		if(stripFields) {
 			return Range.stripFields(text);
 		}
 		// If you create a header/footer, then remove it again, word
 		//  will leave \r\r. Turn these back into an empty string,
 		//  which is more what you'd expect
 		if(text.equals("\r\r")) {
 			return "";
 		}
 		return text;
 	}
--- a/src/scratchpad/testcases/org/apache/poi/hpbf/extractor/TextPublisherTextExtractor.java
+++ b/src/scratchpad/testcases/org/apache/poi/hpbf/extractor/TextPublisherTextExtractor.java
@ -59,11 +59,11 @@ public class TextPublisherTextExtractor extends TestCase {
 		assertEquals(
 "This is some text on the first page\n" +
-"It’s in times new roman, font size 10, all normal\n" +
+"It\u2019s in times new roman, font size 10, all normal\n" +
 "" +
 "This is in bold and italic\n" +
-"It’s Arial, 20 point font\n" +
+"It\u2019s Arial, 20 point font\n" +
-"It’s in the second textbox on the first page\n" +
+"It\u2019s in the second textbox on the first page\n" +
 "" +
 "This is the second page\n\n" +
 "" +
@ -102,4 +102,36 @@ public class TextPublisherTextExtractor extends TestCase {
 				, text
 		);
 	}
 	/**
 	 * We have the same file saved for Publisher 98, Publisher
 	 *  2000 and Publisher 2007. Check they all agree.
 	 * @throws Exception
 	 */
 	public void testMultipleVersions() throws Exception {
 		String s2007 = extractSampleText("Sample.pub");
 		String s2000 = extractSampleText("Sample2000.pub");
 		String s98 = extractSampleText("Sample98.pub");

 		// Check they all agree
 		assertEquals(s2007, s2000);
 		assertEquals(s2007, s98);
 	}

 	/**
 	 * Opens the named file from the test data directory, and
 	 *  returns the text the extractor finds in it. The file is
 	 *  always closed again, even if loading or extraction fails.
 	 * @param filename name of the file, relative to the test data directory
 	 */
 	private String extractSampleText(String filename) throws Exception {
 		FileInputStream inp = new FileInputStream(new File(dir, filename));
 		try {
 			HPBFDocument doc = new HPBFDocument(inp);
 			return (new PublisherTextExtractor(doc)).getText();
 		} finally {
 			inp.close();
 		}
 	}
 }
--- a/src/scratchpad/testcases/org/apache/poi/hpbf/model/TestEscherParts.java
+++ b/src/scratchpad/testcases/org/apache/poi/hpbf/model/TestEscherParts.java
@ -47,4 +47,38 @@ public class TestEscherParts extends TestCase {
 		// TODO - check the contents
 	}
 	/**
 	 * Checks that the escher streams of two more complex sample
 	 *  files can be loaded, and that each holds the expected
 	 *  number of escher records.
 	 */
 	public void testComplex() throws Exception {
 		assertEscherRecordCounts("SampleBrochure.pub", 30, 19);
 		// TODO - check contents

 		// Now do another complex file
 		assertEscherRecordCounts("SampleNewsletter.pub", 51, 92);
 	}

 	/**
 	 * Loads the named file from the test data directory and asserts
 	 *  that both escher streams are present with the given record
 	 *  counts. The file is closed again whatever the outcome.
 	 * @param filename name of the file, relative to the test data directory
 	 * @param expectedEscher expected record count of the EscherStm
 	 * @param expectedEscherDelay expected record count of the EscherDelayStm
 	 */
 	private void assertEscherRecordCounts(String filename,
 			int expectedEscher, int expectedEscherDelay) throws Exception {
 		FileInputStream inp = new FileInputStream(new File(dir, filename));
 		try {
 			HPBFDocument doc = new HPBFDocument(inp);

 			EscherStm es = doc.getEscherStm();
 			EscherDelayStm eds = doc.getEscherDelayStm();

 			assertNotNull(es);
 			assertNotNull(eds);

 			assertEquals(expectedEscher, es.getEscherRecords().length);
 			assertEquals(expectedEscherDelay, eds.getEscherRecords().length);
 		} finally {
 			inp.close();
 		}
 	}
 }
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
@ -183,7 +183,7 @@ public class TestWordExtractor extends TestCase {
    	extractor = new WordExtractor(doc);
    	assertEquals(
-    			"\n\nThis is a simple header, with a \u20ac euro symbol in it.\n\n",
+    			"This is a simple header, with a \u20ac euro symbol in it.\n\n",
    			extractor.getHeaderText()
    	);
    	text = extractor.getText();
@ -217,7 +217,7 @@ public class TestWordExtractor extends TestCase {
    	extractor = new WordExtractor(doc);
    	assertEquals(
-    			"\n\nThe footer, with Moli\u00e8re, has Unicode in it.\n",
+    			"The footer, with Moli\u00e8re, has Unicode in it.\n",
    			extractor.getFooterText()
    	);
    	text = extractor.getText();
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHeaderStories.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHeaderStories.java
@ -123,7 +123,7 @@ public class TestHeaderStories extends TestCase {
 		assertEquals("", hs.getFirstHeader());
 		assertEquals("", hs.getEvenHeader());
-		assertEquals("\r\r", hs.getOddHeader());
+		assertEquals("", hs.getOddHeader()); // Was \r\r but gets emptied
 		assertEquals("", hs.getFirstFooter());
@ -181,13 +181,13 @@ public class TestHeaderStories extends TestCase {
    public void testUnicode() throws Exception {
    	HeaderStories hs = new HeaderStories(unicode);
-		assertEquals("\r\r", hs.getFirstHeader());
+		assertEquals("", hs.getFirstHeader());
-		assertEquals("\r\r", hs.getEvenHeader());
+		assertEquals("", hs.getEvenHeader());
 		assertEquals("This is a simple header, with a \u20ac euro symbol in it.\r\r\r", hs.getOddHeader());
-		assertEquals("\r\r", hs.getFirstFooter());
+		assertEquals("", hs.getFirstFooter());
-		assertEquals("\r\r", hs.getEvenFooter());
+		assertEquals("", hs.getEvenFooter());
 		assertEquals("The footer, with Moli\u00e8re, has Unicode in it.\r\r", hs.getOddFooter());
    }
--- a/src/testcases/org/apache/poi/hssf/usermodel/TestHSSFHeaderFooter.java
+++ b/src/testcases/org/apache/poi/hssf/usermodel/TestHSSFHeaderFooter.java
@ -87,8 +87,8 @@ public final class TestHSSFHeaderFooter extends TestCase {
    	assertTrue(head.areFieldsStripped());
    	// Now even more complex
-    	head.setCenter("HEADER TEXT &P&N&D&T&Z&F&F&A&G");
+    	head.setCenter("HEADER TEXT &P&N&D&T&Z&F&F&A&G&X END");
-    	assertEquals("HEADER TEXT &G", head.getCenter());
+    	assertEquals("HEADER TEXT  END", head.getCenter());
 	}
 	/**