Various bug fixes, and hpbf updates

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@690517 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2008-08-30 14:47:33 +00:00
parent 988b6c869f
commit 4b40a5cb0c
16 changed files with 242 additions and 24 deletions

View File

@ -37,6 +37,11 @@
<!-- Don't forget to update status.xml too! -->
<release version="3.1.1-alpha1" date="2008-??-??">
<action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
<action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>
<action dev="POI-DEVELOPERS" type="fix">Avoid NPE in EscherBSERecord on older escher records</action>
<action dev="POI-DEVELOPERS" type="add">Basic text extraction support in HPBF</action>
<action dev="POI-DEVELOPERS" type="add">Initial, low level support for Publisher files, in the form of HPBF</action>
<action dev="POI-DEVELOPERS" type="fix">45699 - Fix RowRecordsAggregate to tolerate intervening MERGEDCELLS records</action>
<action dev="POI-DEVELOPERS" type="fix">45698 - Fix LinkTable to tolerate multiple EXTERNSHEET records</action>
<action dev="POI-DEVELOPERS" type="fix">45682 - Fix for cloning of CFRecordsAggregate</action>

View File

@ -165,6 +165,12 @@ PL 62 1a 00 00 48 00 00 00 // PL from: 1a62 (6754), len: 48 (72)
(the text will then start)
</source>
<p>We think that the first 4 bytes of text describe the
function of the data at the offset. The first short is
then the count of that type, eg the 2nd will have 1. We
think that the second 4 bytes of text describe the format
of the data block at the offset. The format of the text block
is easy, but we're still trying to figure out the others.</p>
</section>
</body>
</document>

View File

@ -33,11 +33,20 @@
<title>Overview</title>
<p>HPBF is the POI Project's pure Java implementation of the Publisher file format.</p>
<p>Currently, HPBF is in the experimental stage, while we try
to figure out the file format. Our initial aim is to provide
a text extractor for the format, with low level code following
after that if demand and developer interest warrant it.</p>
<p>At this time, there is no <em>usermodel</em> api or similar.</p>
<p>Currently, HPBF is in an early stage, whilst we try to
figure out the file format. So far, we have basic text
extraction support, and are able to read some parts within
the file. Writing is not yet supported, as we are unable
to make sense of the Contents stream, which we think has
lots of offsets to other parts of the file.</p>
<p>Our initial aim is to provide a text extractor for the format
(now done), and be able to extract hyperlinks from within
the document (not yet supported). Additional low level
code to process the file format may follow, if demand
and developer interest warrant it.</p>
<p>At this time, there is no <em>usermodel</em> api or similar.
There is only low level support for certain parts of
the file, but by no means all of it.</p>
<p>Our current understanding of the file format is documented
<link href="file-format.html">here</link>.</p>
<note>

View File

@ -165,8 +165,8 @@
</section>
<section><title>HPBF for Publisher Documents</title>
<p>HPBF is our port of the Microsoft Publisher 98(-2007) file format to pure
Java. At the moment, we are still figuring out the file format, but we hope
to have simple text extraction shortly. Please see <link
Java. It currently only supports reading at a low level for around
half of the file parts, and simple text extraction. Please see <link
href="./hpbf/index.html">the HPBF project page for more
information</link>.</p>
</section>

View File

@ -34,6 +34,11 @@
<!-- Don't forget to update changes.xml too! -->
<changes>
<release version="3.1.1-alpha1" date="2008-??-??">
<action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
<action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>
<action dev="POI-DEVELOPERS" type="fix">Avoid NPE in EscherBSERecord on older escher records</action>
<action dev="POI-DEVELOPERS" type="add">Basic text extraction support in HPBF</action>
<action dev="POI-DEVELOPERS" type="add">Initial, low level support for Publisher files, in the form of HPBF</action>
<action dev="POI-DEVELOPERS" type="fix">45699 - Fix RowRecordsAggregate to tolerate intervening MERGEDCELLS records</action>
<action dev="POI-DEVELOPERS" type="fix">45698 - Fix LinkTable to tolerate multiple EXTERNSHEET records</action>
<action dev="POI-DEVELOPERS" type="fix">45682 - Fix for cloning of CFRecordsAggregate</action>

View File

@ -87,9 +87,10 @@ public class EscherBSERecord
field_10_unused2 = data[pos + 34];
field_11_unused3 = data[pos + 35];
bytesRemaining -= 36;
int bytesRead = 0;
if (bytesRemaining > 0)
{
if (bytesRemaining > 0) {
// Some older escher formats skip this last record
field_12_blipRecord = (EscherBlipRecord) recordFactory.createRecord( data, pos + 36 );
bytesRead = field_12_blipRecord.fillFields( data, pos + 36, recordFactory );
}
@ -168,7 +169,16 @@ public class EscherBSERecord
*/
public int getRecordSize()
{
return 8 + 1 + 1 + 16 + 2 + 4 + 4 + 4 + 1 + 1 + 1 + 1 + field_12_blipRecord.getRecordSize() + (remainingData == null ? 0 : remainingData.length);
int field_12_size = 0;
if(field_12_blipRecord != null) {
field_12_size = field_12_blipRecord.getRecordSize();
}
int remaining_size = 0;
if(remainingData != null) {
remaining_size = remainingData.length;
}
return 8 + 1 + 1 + 16 + 2 + 4 + 4 + 4 + 1 + 1 +
1 + 1 + field_12_size + remaining_size;
}
/**

View File

@ -247,6 +247,11 @@ public abstract class HeaderFooter {
public static String stripFields(String text) {
int pos;
// Check we really got something to work on
if(text == null || text.length() == 0) {
return text;
}
// Firstly, do the easy ones which are static
for(int i=0; i<Field.ALL_FIELDS.size(); i++) {
String seq = ((Field)Field.ALL_FIELDS.get(i)).sequence;
@ -257,6 +262,7 @@ public abstract class HeaderFooter {
}
// Now do the tricky, dynamic ones
// These are things like font sizes and font names
text = text.replaceAll("\\&\\d+", "");
text = text.replaceAll("\\&\".*?,.*?\"", "");
@ -292,9 +298,9 @@ public abstract class HeaderFooter {
public static final Field TIME_FIELD = new Field("&T");
public static final Field NUM_PAGES_FIELD = new Field("&N");
public static final Field PICTURE_FIELD = new Field("&P");
public static final Field PICTURE_FIELD = new Field("&G");
public static final PairField BOLD_FIELD = new PairField("&B"); // PAID
public static final PairField BOLD_FIELD = new PairField("&B");
public static final PairField ITALIC_FIELD = new PairField("&I");
public static final PairField STRIKETHROUGH_FIELD = new PairField("&S");
public static final PairField SUBSCRIPT_FIELD = new PairField("&Y");

View File

@ -0,0 +1,90 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hpbf.dev;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.ddf.DefaultEscherRecordFactory;
import org.apache.poi.ddf.EscherRecord;
import org.apache.poi.hpbf.HPBFDocument;
import org.apache.poi.hpbf.model.QuillContents;
import org.apache.poi.hpbf.model.qcbits.QCBit;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.HexDump;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.StringUtil;
/**
 * For dumping out the PLC contents of QC Bits of a
 *  HPBF (Publisher) file, while we try to figure out
 *  what the format of them is.
 * <p>Run it from the command line with the name of the
 *  file to dump as the only argument.</p>
 */
public class PLCDumper {
    /** The document whose PLC bits are dumped */
    private HPBFDocument doc;
    /** The Quill contents of {@link #doc}, fetched once at construction */
    private QuillContents qc;

    /**
     * Creates a dumper for an already opened document.
     * @param doc the document to dump the PLC bits of
     */
    public PLCDumper(HPBFDocument doc) {
        this.doc = doc;
        qc = doc.getQuillContents();
    }

    /**
     * Creates a dumper for the document held in the given filesystem.
     * @param fs the filesystem to build the HPBFDocument from
     * @throws IOException if the document cannot be read
     */
    public PLCDumper(POIFSFileSystem fs) throws IOException {
        this(new HPBFDocument(fs));
    }

    /**
     * Creates a dumper for the document read from the given stream.
     * The stream is not closed here; that is left to the caller.
     * @param inp the stream to build the POIFSFileSystem from
     * @throws IOException if the document cannot be read
     */
    public PLCDumper(InputStream inp) throws IOException {
        this(new POIFSFileSystem(inp));
    }

    /**
     * Dumps the PLC bits of the file named by the first argument
     *  to standard out. Prints a usage message and exits with
     *  status 1 if no filename is given.
     * @param args the command line arguments, args[0] being the filename
     * @throws Exception if the file cannot be opened or read
     */
    public static void main(String[] args) throws Exception {
        if(args.length < 1) {
            System.err.println("Use:");
            System.err.println(" PLCDumper <filename>");
            System.exit(1);
        }

        // Make sure the file handle is released, even if
        //  loading or dumping the document fails
        InputStream inp = new FileInputStream(args[0]);
        try {
            PLCDumper dump = new PLCDumper(inp);

            System.out.println("Dumping " + args[0]);
            dump.dumpPLC();
        } finally {
            inp.close();
        }
    }

    /**
     * Walks over all the bits of the Quill contents, and dumps
     *  each one whose bit type is "PLC ". Empty (null) slots
     *  in the bits array are skipped.
     */
    private void dumpPLC() {
        // Use the contents fetched at construction, rather than
        //  shadowing the field with a second lookup
        QCBit[] bits = qc.getBits();

        for(int i=0; i<bits.length; i++) {
            if(bits[i] == null) continue;
            if(bits[i].getBitType().equals("PLC ")) {
                dumpBit(bits[i], i);
            }
        }
    }

    /**
     * Prints the types, option A value, data offset and length
     *  (both in decimal and hex) of the bit, followed by a hex
     *  dump of its data.
     * @param bit the bit to dump
     * @param index the position of the bit in the bits array
     */
    private void dumpBit(QCBit bit, int index) {
        System.out.println("");
        System.out.println("Dumping " + bit.getBitType() + " bit at " + index);
        System.out.println(" Is a " + bit.getThingType() + ", number is " + bit.getOptA());
        System.out.println(" Starts at " + bit.getDataOffset() + " (" + Integer.toHexString(bit.getDataOffset()) + ")");
        System.out.println(" Runs for " + bit.getLength() + " (" + Integer.toHexString(bit.getLength()) + ")");

        System.out.println(HexDump.dump(bit.getData(), 0, 0));
    }
}

View File

@ -70,6 +70,7 @@ public final class QuillContents extends HPBFPart {
bits[i].setOptA(optA);
bits[i].setOptB(optB);
bits[i].setOptC(optC);
bits[i].setDataOffset(from);
} else {
// Doesn't have data
}

View File

@ -28,6 +28,8 @@ public abstract class QCBit {
protected int optB;
protected int optC;
protected int dataOffset;
public QCBit(String thingType, String bitType, byte[] data) {
this.thingType = thingType;
this.bitType = bitType;
@ -66,4 +68,15 @@ public abstract class QCBit {
public void setOptC(int optC) {
this.optC = optC;
}
/**
 * Returns the data offset last recorded for this bit
 *  with {@link #setDataOffset(int)}.
 * NOTE(review): presumably this is where the bit's data
 *  starts within the stream it was read from - confirm
 *  against the caller that sets it.
 * @return the stored data offset
 */
public int getDataOffset() {
return dataOffset;
}
/**
 * Records the data offset for this bit, as later
 *  returned by {@link #getDataOffset()}.
 * @param offset the data offset to store
 */
public void setDataOffset(int offset) {
this.dataOffset = offset;
}
/**
 * Returns the length of this bit's data, which is
 *  the size of the underlying data array.
 * @return the number of bytes of data held
 */
public int getLength() {
return data.length;
}
}

View File

@ -167,6 +167,13 @@ public class HeaderStories {
if(stripFields) {
return Range.stripFields(text);
}
// If you create a header/footer, then remove it again, word
// will leave \r\r. Turn these back into an empty string,
// which is more what you'd expect
if(text.equals("\r\r")) {
return "";
}
return text;
}

View File

@ -59,11 +59,11 @@ public class TextPublisherTextExtractor extends TestCase {
assertEquals(
"This is some text on the first page\n" +
"Its in times new roman, font size 10, all normal\n" +
"It\u2019s in times new roman, font size 10, all normal\n" +
"" +
"This is in bold and italic\n" +
"Its Arial, 20 point font\n" +
"Its in the second textbox on the first page\n" +
"It\u2019s Arial, 20 point font\n" +
"It\u2019s in the second textbox on the first page\n" +
"" +
"This is the second page\n\n" +
"" +
@ -102,4 +102,36 @@ public class TextPublisherTextExtractor extends TestCase {
, text
);
}
/**
 * We have the same file saved for Publisher 98, Publisher
 *  2000 and Publisher 2007. Check that the text extracted
 *  from all three versions is identical.
 * @throws Exception if any of the sample files cannot be read
 */
public void testMultipleVersions() throws Exception {
    String s2007 = extractTextFrom("Sample.pub");
    String s2000 = extractTextFrom("Sample2000.pub");
    String s98 = extractTextFrom("Sample98.pub");

    // Check they all agree
    assertEquals(s2007, s2000);
    assertEquals(s2007, s98);
}

/**
 * Loads the named sample file from the test data directory
 *  and returns the text the extractor produces for it. The
 *  file stream is always closed, even if loading fails.
 * @param filename name of the file within the test data directory
 * @return the text given by PublisherTextExtractor
 * @throws Exception if the file cannot be opened or parsed
 */
private String extractTextFrom(String filename) throws Exception {
    FileInputStream in = new FileInputStream(new File(dir, filename));
    try {
        HPBFDocument doc = new HPBFDocument(in);
        return (new PublisherTextExtractor(doc)).getText();
    } finally {
        // Don't leak a file handle per sample file
        in.close();
    }
}
}

View File

@ -47,4 +47,38 @@ public class TestEscherParts extends TestCase {
// TODO - check the contents
}
/**
 * Checks that the Escher and Escher Delay streams of two
 *  more complex sample files are found, and hold the
 *  expected number of escher records.
 * @throws Exception if a sample file cannot be read
 */
public void testComplex() throws Exception {
    // TODO - check contents
    assertEscherRecordCounts("SampleBrochure.pub", 30, 19);

    // Now do another complex file
    assertEscherRecordCounts("SampleNewsletter.pub", 51, 92);
}

/**
 * Loads the named sample file and asserts that both of its
 *  escher streams are present, with the given record counts.
 *  The file stream is always closed, even if an assertion fails.
 * @param filename name of the file within the test data directory
 * @param expectedEscher expected record count of the Escher stream
 * @param expectedDelay expected record count of the Escher Delay stream
 * @throws Exception if the file cannot be opened or parsed
 */
private void assertEscherRecordCounts(String filename, int expectedEscher, int expectedDelay) throws Exception {
    FileInputStream in = new FileInputStream(new File(dir, filename));
    try {
        HPBFDocument doc = new HPBFDocument(in);

        EscherStm es = doc.getEscherStm();
        EscherDelayStm eds = doc.getEscherDelayStm();

        // Name the file in the message, so a failure says which sample broke
        assertNotNull(filename, es);
        assertNotNull(filename, eds);

        assertEquals(filename, expectedEscher, es.getEscherRecords().length);
        assertEquals(filename, expectedDelay, eds.getEscherRecords().length);
    } finally {
        in.close();
    }
}
}

View File

@ -183,7 +183,7 @@ public class TestWordExtractor extends TestCase {
extractor = new WordExtractor(doc);
assertEquals(
"\n\nThis is a simple header, with a \u20ac euro symbol in it.\n\n",
"This is a simple header, with a \u20ac euro symbol in it.\n\n",
extractor.getHeaderText()
);
text = extractor.getText();
@ -217,7 +217,7 @@ public class TestWordExtractor extends TestCase {
extractor = new WordExtractor(doc);
assertEquals(
"\n\nThe footer, with Moli\u00e8re, has Unicode in it.\n",
"The footer, with Moli\u00e8re, has Unicode in it.\n",
extractor.getFooterText()
);
text = extractor.getText();

View File

@ -123,7 +123,7 @@ public class TestHeaderStories extends TestCase {
assertEquals("", hs.getFirstHeader());
assertEquals("", hs.getEvenHeader());
assertEquals("\r\r", hs.getOddHeader());
assertEquals("", hs.getOddHeader()); // Was \r\r but gets emptied
assertEquals("", hs.getFirstFooter());
@ -181,13 +181,13 @@ public class TestHeaderStories extends TestCase {
public void testUnicode() throws Exception {
HeaderStories hs = new HeaderStories(unicode);
assertEquals("\r\r", hs.getFirstHeader());
assertEquals("\r\r", hs.getEvenHeader());
assertEquals("", hs.getFirstHeader());
assertEquals("", hs.getEvenHeader());
assertEquals("This is a simple header, with a \u20ac euro symbol in it.\r\r\r", hs.getOddHeader());
assertEquals("\r\r", hs.getFirstFooter());
assertEquals("\r\r", hs.getEvenFooter());
assertEquals("", hs.getFirstFooter());
assertEquals("", hs.getEvenFooter());
assertEquals("The footer, with Moli\u00e8re, has Unicode in it.\r\r", hs.getOddFooter());
}

View File

@ -87,8 +87,8 @@ public final class TestHSSFHeaderFooter extends TestCase {
assertTrue(head.areFieldsStripped());
// Now even more complex
head.setCenter("HEADER TEXT &P&N&D&T&Z&F&F&A&G");
assertEquals("HEADER TEXT &G", head.getCenter());
head.setCenter("HEADER TEXT &P&N&D&T&Z&F&F&A&G&X END");
assertEquals("HEADER TEXT END", head.getCenter());
}
/**