Various bug fixes, and hpbf updates

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@690517 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2008-08-30 14:47:33 +00:00
parent 988b6c869f
commit 4b40a5cb0c
16 changed files with 242 additions and 24 deletions

View File

@ -37,6 +37,11 @@
<!-- Don't forget to update status.xml too! --> <!-- Don't forget to update status.xml too! -->
<release version="3.1.1-alpha1" date="2008-??-??"> <release version="3.1.1-alpha1" date="2008-??-??">
<action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
<action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>
<action dev="POI-DEVELOPERS" type="fix">Avoid NPE in EscherBSERecord on older escher records</action>
<action dev="POI-DEVELOPERS" type="add">Basic text extraction support in HPBF</action>
<action dev="POI-DEVELOPERS" type="add">Initial, low level support for Publisher files, in the form of HPBF</action>
<action dev="POI-DEVELOPERS" type="fix">45699 - Fix RowRecordsAggregate to tolerate intervening MERGEDCELLS records</action> <action dev="POI-DEVELOPERS" type="fix">45699 - Fix RowRecordsAggregate to tolerate intervening MERGEDCELLS records</action>
<action dev="POI-DEVELOPERS" type="fix">45698 - Fix LinkTable to tolerate multiple EXTERNSHEET records</action> <action dev="POI-DEVELOPERS" type="fix">45698 - Fix LinkTable to tolerate multiple EXTERNSHEET records</action>
<action dev="POI-DEVELOPERS" type="fix">45682 - Fix for cloning of CFRecordsAggregate</action> <action dev="POI-DEVELOPERS" type="fix">45682 - Fix for cloning of CFRecordsAggregate</action>

View File

@ -165,6 +165,12 @@ PL 62 1a 00 00 48 00 00 00 // PL from: 1a62 (6754), len: 48 (72)
(the text will then start) (the text will then start)
</source> </source>
<p>We think that the first 4 bytes of text describe
the function of the data at the offset. The first short is
then the count of that type, eg the 2nd will have 1. We
think that the second 4 bytes of text describe the format
of the data block at the offset. The format of the text block
is easy, but we're still trying to figure out the others.</p>
</section> </section>
</body> </body>
</document> </document>

View File

@ -33,11 +33,20 @@
<title>Overview</title> <title>Overview</title>
<p>HPBF is the POI Project's pure Java implementation of the Publisher file format.</p> <p>HPBF is the POI Project's pure Java implementation of the Publisher file format.</p>
<p>Currently, HPBF is in the experimental stage, while we try <p>Currently, HPBF is in an early stage, whilst we try to
to figure out the file format. Our initial aim is to provide figure out the file format. So far, we have basic text
a text extractor for the format, with low level code following extraction support, and are able to read some parts within
after that if demand and developer interest warrant it.</p> the file. Writing is not yet supported, as we are unable
<p>At this time, there is no <em>usermodel</em> api or similar.</p> to make sense of the Contents stream, which we think has
lots of offsets to other parts of the file.</p>
<p>Our initial aim is to provide a text extractor for the format
(now done), and be able to extract hyperlinks from within
the document (not yet supported). Additional low level
code to process the file format may follow, if demand
and developer interest warrant it.</p>
<p>At this time, there is no <em>usermodel</em> api or similar.
There is only low level support for certain parts of
the file, but by no means all of it.</p>
<p>Our current understanding of the file format is documented <p>Our current understanding of the file format is documented
<link href="file-format.html">here</link>.</p> <link href="file-format.html">here</link>.</p>
<note> <note>

View File

@ -165,8 +165,8 @@
</section> </section>
<section><title>HPBF for Publisher Documents</title> <section><title>HPBF for Publisher Documents</title>
<p>HPBF is our port of the Microsoft Publisher 98(-2007) file format to pure <p>HPBF is our port of the Microsoft Publisher 98(-2007) file format to pure
Java. At the moment, we are still figuring out the file format, but we hope Java. It currently only supports reading at a low level for around
to have simple text extraction shortly. Please see <link half of the file parts, and simple text extraction. Please see <link
href="./hpbf/index.html">the HPBF project page for more href="./hpbf/index.html">the HPBF project page for more
information</link>.</p> information</link>.</p>
</section> </section>

View File

@ -34,6 +34,11 @@
<!-- Don't forget to update changes.xml too! --> <!-- Don't forget to update changes.xml too! -->
<changes> <changes>
<release version="3.1.1-alpha1" date="2008-??-??"> <release version="3.1.1-alpha1" date="2008-??-??">
<action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
<action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>
<action dev="POI-DEVELOPERS" type="fix">Avoid NPE in EscherBSERecord on older escher records</action>
<action dev="POI-DEVELOPERS" type="add">Basic text extraction support in HPBF</action>
<action dev="POI-DEVELOPERS" type="add">Initial, low level support for Publisher files, in the form of HPBF</action>
<action dev="POI-DEVELOPERS" type="fix">45699 - Fix RowRecordsAggregate to tolerate intervening MERGEDCELLS records</action> <action dev="POI-DEVELOPERS" type="fix">45699 - Fix RowRecordsAggregate to tolerate intervening MERGEDCELLS records</action>
<action dev="POI-DEVELOPERS" type="fix">45698 - Fix LinkTable to tolerate multiple EXTERNSHEET records</action> <action dev="POI-DEVELOPERS" type="fix">45698 - Fix LinkTable to tolerate multiple EXTERNSHEET records</action>
<action dev="POI-DEVELOPERS" type="fix">45682 - Fix for cloning of CFRecordsAggregate</action> <action dev="POI-DEVELOPERS" type="fix">45682 - Fix for cloning of CFRecordsAggregate</action>

View File

@ -87,9 +87,10 @@ public class EscherBSERecord
field_10_unused2 = data[pos + 34]; field_10_unused2 = data[pos + 34];
field_11_unused3 = data[pos + 35]; field_11_unused3 = data[pos + 35];
bytesRemaining -= 36; bytesRemaining -= 36;
int bytesRead = 0; int bytesRead = 0;
if (bytesRemaining > 0) if (bytesRemaining > 0) {
{ // Some older escher formats skip this last record
field_12_blipRecord = (EscherBlipRecord) recordFactory.createRecord( data, pos + 36 ); field_12_blipRecord = (EscherBlipRecord) recordFactory.createRecord( data, pos + 36 );
bytesRead = field_12_blipRecord.fillFields( data, pos + 36, recordFactory ); bytesRead = field_12_blipRecord.fillFields( data, pos + 36, recordFactory );
} }
@ -168,7 +169,16 @@ public class EscherBSERecord
*/ */
public int getRecordSize() public int getRecordSize()
{ {
return 8 + 1 + 1 + 16 + 2 + 4 + 4 + 4 + 1 + 1 + 1 + 1 + field_12_blipRecord.getRecordSize() + (remainingData == null ? 0 : remainingData.length); int field_12_size = 0;
if(field_12_blipRecord != null) {
field_12_size = field_12_blipRecord.getRecordSize();
}
int remaining_size = 0;
if(remainingData != null) {
remaining_size = remainingData.length;
}
return 8 + 1 + 1 + 16 + 2 + 4 + 4 + 4 + 1 + 1 +
1 + 1 + field_12_size + remaining_size;
} }
/** /**

View File

@ -247,6 +247,11 @@ public abstract class HeaderFooter {
public static String stripFields(String text) { public static String stripFields(String text) {
int pos; int pos;
// Check we really got something to work on
if(text == null || text.length() == 0) {
return text;
}
// Firstly, do the easy ones which are static // Firstly, do the easy ones which are static
for(int i=0; i<Field.ALL_FIELDS.size(); i++) { for(int i=0; i<Field.ALL_FIELDS.size(); i++) {
String seq = ((Field)Field.ALL_FIELDS.get(i)).sequence; String seq = ((Field)Field.ALL_FIELDS.get(i)).sequence;
@ -257,6 +262,7 @@ public abstract class HeaderFooter {
} }
// Now do the tricky, dynamic ones // Now do the tricky, dynamic ones
// These are things like font sizes and font names
text = text.replaceAll("\\&\\d+", ""); text = text.replaceAll("\\&\\d+", "");
text = text.replaceAll("\\&\".*?,.*?\"", ""); text = text.replaceAll("\\&\".*?,.*?\"", "");
@ -292,9 +298,9 @@ public abstract class HeaderFooter {
public static final Field TIME_FIELD = new Field("&T"); public static final Field TIME_FIELD = new Field("&T");
public static final Field NUM_PAGES_FIELD = new Field("&N"); public static final Field NUM_PAGES_FIELD = new Field("&N");
public static final Field PICTURE_FIELD = new Field("&P"); public static final Field PICTURE_FIELD = new Field("&G");
public static final PairField BOLD_FIELD = new PairField("&B"); // PAID public static final PairField BOLD_FIELD = new PairField("&B");
public static final PairField ITALIC_FIELD = new PairField("&I"); public static final PairField ITALIC_FIELD = new PairField("&I");
public static final PairField STRIKETHROUGH_FIELD = new PairField("&S"); public static final PairField STRIKETHROUGH_FIELD = new PairField("&S");
public static final PairField SUBSCRIPT_FIELD = new PairField("&Y"); public static final PairField SUBSCRIPT_FIELD = new PairField("&Y");

View File

@ -0,0 +1,90 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hpbf.dev;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.ddf.DefaultEscherRecordFactory;
import org.apache.poi.ddf.EscherRecord;
import org.apache.poi.hpbf.HPBFDocument;
import org.apache.poi.hpbf.model.QuillContents;
import org.apache.poi.hpbf.model.qcbits.QCBit;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.HexDump;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.StringUtil;
/**
 * Developer tool that dumps the PLC contents of the QC Bits of
 * a HPBF (Publisher) file to standard output, while we try to
 * figure out what the format of them is.
 */
public class PLCDumper {
    /** The document being inspected. */
    private final HPBFDocument doc;
    /** The Quill contents of the document, fetched once at construction. */
    private final QuillContents qc;

    /**
     * Creates a dumper for an already loaded document.
     *
     * @param doc the document whose Quill contents will be dumped
     */
    public PLCDumper(HPBFDocument doc) {
        this.doc = doc;
        this.qc = doc.getQuillContents();
    }

    /**
     * Creates a dumper for the document held in the given filesystem.
     *
     * @param fs the filesystem to build the document from
     * @throws IOException if the document cannot be read
     */
    public PLCDumper(POIFSFileSystem fs) throws IOException {
        this(new HPBFDocument(fs));
    }

    /**
     * Creates a dumper for the document read from the given stream.
     * The stream is not closed here; that is left to the caller.
     *
     * @param inp the stream to read the file from
     * @throws IOException if the file cannot be read
     */
    public PLCDumper(InputStream inp) throws IOException {
        this(new POIFSFileSystem(inp));
    }

    /**
     * Command line entry point. Expects the name of the file to
     * dump as the first argument; prints usage and exits with
     * status 1 if it is missing.
     *
     * @param args command line arguments
     * @throws Exception if the file cannot be opened or read
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Use:");
            System.err.println(" PLCDumper <filename>");
            System.exit(1);
        }

        // Keep the stream open for the whole dump and always release
        // the file handle afterwards, even if loading or dumping fails
        InputStream inp = new FileInputStream(args[0]);
        try {
            PLCDumper dump = new PLCDumper(inp);

            System.out.println("Dumping " + args[0]);
            dump.dumpPLC();
        } finally {
            inp.close();
        }
    }

    /**
     * Walks over every bit of the Quill contents, and dumps
     * those whose bit type is "PLC ". Empty slots are skipped.
     */
    private void dumpPLC() {
        QCBit[] bits = qc.getBits();

        for (int i = 0; i < bits.length; i++) {
            QCBit bit = bits[i];
            if (bit == null) {
                continue;
            }
            if (bit.getBitType().equals("PLC ")) {
                dumpBit(bit, i);
            }
        }
    }

    /**
     * Prints the details of a single bit, followed by a hex
     * dump of its data.
     *
     * @param bit the bit to print
     * @param index the position of the bit within the Quill contents
     */
    private void dumpBit(QCBit bit, int index) {
        int offset = bit.getDataOffset();
        int length = bit.getLength();

        System.out.println("");
        System.out.println("Dumping " + bit.getBitType() + " bit at " + index);
        System.out.println(" Is a " + bit.getThingType() + ", number is " + bit.getOptA());
        System.out.println(" Starts at " + offset + " (" + Integer.toHexString(offset) + ")");
        System.out.println(" Runs for " + length + " (" + Integer.toHexString(length) + ")");

        System.out.println(HexDump.dump(bit.getData(), 0, 0));
    }
}

View File

@ -70,6 +70,7 @@ public final class QuillContents extends HPBFPart {
bits[i].setOptA(optA); bits[i].setOptA(optA);
bits[i].setOptB(optB); bits[i].setOptB(optB);
bits[i].setOptC(optC); bits[i].setOptC(optC);
bits[i].setDataOffset(from);
} else { } else {
// Doesn't have data // Doesn't have data
} }

View File

@ -28,6 +28,8 @@ public abstract class QCBit {
protected int optB; protected int optB;
protected int optC; protected int optC;
protected int dataOffset;
public QCBit(String thingType, String bitType, byte[] data) { public QCBit(String thingType, String bitType, byte[] data) {
this.thingType = thingType; this.thingType = thingType;
this.bitType = bitType; this.bitType = bitType;
@ -66,4 +68,15 @@ public abstract class QCBit {
public void setOptC(int optC) { public void setOptC(int optC) {
this.optC = optC; this.optC = optC;
} }
/**
 * @return the data offset last stored through
 *  {@link #setDataOffset(int)}
 */
public int getDataOffset() {
    return this.dataOffset;
}
/**
 * Records the data offset for this bit.
 *
 * @param offset the offset to store
 */
public void setDataOffset(int offset) {
    dataOffset = offset;
}
/**
 * @return the number of bytes of data held by this bit
 */
public int getLength() {
    return this.data.length;
}
} }

View File

@ -167,6 +167,13 @@ public class HeaderStories {
if(stripFields) { if(stripFields) {
return Range.stripFields(text); return Range.stripFields(text);
} }
// If you create a header/footer, then remove it again, word
// will leave \r\r. Turn these back into an empty string,
// which is more what you'd expect
if(text.equals("\r\r")) {
return "";
}
return text; return text;
} }

View File

@ -59,11 +59,11 @@ public class TextPublisherTextExtractor extends TestCase {
assertEquals( assertEquals(
"This is some text on the first page\n" + "This is some text on the first page\n" +
"Its in times new roman, font size 10, all normal\n" + "It\u2019s in times new roman, font size 10, all normal\n" +
"" + "" +
"This is in bold and italic\n" + "This is in bold and italic\n" +
"Its Arial, 20 point font\n" + "It\u2019s Arial, 20 point font\n" +
"Its in the second textbox on the first page\n" + "It\u2019s in the second textbox on the first page\n" +
"" + "" +
"This is the second page\n\n" + "This is the second page\n\n" +
"" + "" +
@ -102,4 +102,36 @@ public class TextPublisherTextExtractor extends TestCase {
, text , text
); );
} }
/**
 * We have the same file saved for Publisher 98, Publisher
 *  2000 and Publisher 2007. Check that the text extracted
 *  from all of them agrees.
 * @throws Exception if a sample file cannot be opened or read
 */
public void testMultipleVersions() throws Exception {
    // Publisher 2007 first, as it is the baseline the others are compared to
    String[] filenames = { "Sample.pub", "Sample2000.pub", "Sample98.pub" };
    String[] texts = new String[filenames.length];

    for (int i = 0; i < filenames.length; i++) {
        FileInputStream inp = new FileInputStream(new File(dir, filenames[i]));
        try {
            HPBFDocument doc = new HPBFDocument(inp);
            texts[i] = (new PublisherTextExtractor(doc)).getText();
        } finally {
            // Don't leak the file handle, even if loading fails
            inp.close();
        }
    }

    // Check they all agree
    assertEquals(texts[0], texts[1]);
    assertEquals(texts[0], texts[2]);
}
} }

View File

@ -47,4 +47,38 @@ public class TestEscherParts extends TestCase {
// TODO - check the contents // TODO - check the contents
} }
/**
 * Checks that the escher streams of two more complex sample
 *  files are found, and hold the expected number of records.
 * @throws Exception if a sample file cannot be opened or read
 */
public void testComplex() throws Exception {
    String[] filenames = { "SampleBrochure.pub", "SampleNewsletter.pub" };
    // Expected record counts, in the same order as the filenames
    int[] escherCounts = { 30, 51 };
    int[] escherDelayCounts = { 19, 92 };

    for (int i = 0; i < filenames.length; i++) {
        FileInputStream inp = new FileInputStream(new File(dir, filenames[i]));
        try {
            HPBFDocument doc = new HPBFDocument(inp);

            EscherStm es = doc.getEscherStm();
            EscherDelayStm eds = doc.getEscherDelayStm();

            assertNotNull(es);
            assertNotNull(eds);

            assertEquals(escherCounts[i], es.getEscherRecords().length);
            assertEquals(escherDelayCounts[i], eds.getEscherRecords().length);

            // TODO - check contents
        } finally {
            // Don't leak the file handle, even if an assertion fails
            inp.close();
        }
    }
}
} }

View File

@ -183,7 +183,7 @@ public class TestWordExtractor extends TestCase {
extractor = new WordExtractor(doc); extractor = new WordExtractor(doc);
assertEquals( assertEquals(
"\n\nThis is a simple header, with a \u20ac euro symbol in it.\n\n", "This is a simple header, with a \u20ac euro symbol in it.\n\n",
extractor.getHeaderText() extractor.getHeaderText()
); );
text = extractor.getText(); text = extractor.getText();
@ -217,7 +217,7 @@ public class TestWordExtractor extends TestCase {
extractor = new WordExtractor(doc); extractor = new WordExtractor(doc);
assertEquals( assertEquals(
"\n\nThe footer, with Moli\u00e8re, has Unicode in it.\n", "The footer, with Moli\u00e8re, has Unicode in it.\n",
extractor.getFooterText() extractor.getFooterText()
); );
text = extractor.getText(); text = extractor.getText();

View File

@ -123,7 +123,7 @@ public class TestHeaderStories extends TestCase {
assertEquals("", hs.getFirstHeader()); assertEquals("", hs.getFirstHeader());
assertEquals("", hs.getEvenHeader()); assertEquals("", hs.getEvenHeader());
assertEquals("\r\r", hs.getOddHeader()); assertEquals("", hs.getOddHeader()); // Was \r\r but gets emptied
assertEquals("", hs.getFirstFooter()); assertEquals("", hs.getFirstFooter());
@ -181,13 +181,13 @@ public class TestHeaderStories extends TestCase {
public void testUnicode() throws Exception { public void testUnicode() throws Exception {
HeaderStories hs = new HeaderStories(unicode); HeaderStories hs = new HeaderStories(unicode);
assertEquals("\r\r", hs.getFirstHeader()); assertEquals("", hs.getFirstHeader());
assertEquals("\r\r", hs.getEvenHeader()); assertEquals("", hs.getEvenHeader());
assertEquals("This is a simple header, with a \u20ac euro symbol in it.\r\r\r", hs.getOddHeader()); assertEquals("This is a simple header, with a \u20ac euro symbol in it.\r\r\r", hs.getOddHeader());
assertEquals("\r\r", hs.getFirstFooter()); assertEquals("", hs.getFirstFooter());
assertEquals("\r\r", hs.getEvenFooter()); assertEquals("", hs.getEvenFooter());
assertEquals("The footer, with Moli\u00e8re, has Unicode in it.\r\r", hs.getOddFooter()); assertEquals("The footer, with Moli\u00e8re, has Unicode in it.\r\r", hs.getOddFooter());
} }

View File

@ -87,8 +87,8 @@ public final class TestHSSFHeaderFooter extends TestCase {
assertTrue(head.areFieldsStripped()); assertTrue(head.areFieldsStripped());
// Now even more complex // Now even more complex
head.setCenter("HEADER TEXT &P&N&D&T&Z&F&F&A&G"); head.setCenter("HEADER TEXT &P&N&D&T&Z&F&F&A&G&X END");
assertEquals("HEADER TEXT &G", head.getCenter()); assertEquals("HEADER TEXT END", head.getCenter());
} }
/** /**