diff --git a/src/documentation/content/xdocs/book.xml b/src/documentation/content/xdocs/book.xml index ab1af452e..39424524f 100644 --- a/src/documentation/content/xdocs/book.xml +++ b/src/documentation/content/xdocs/book.xml @@ -41,6 +41,7 @@ + diff --git a/src/documentation/content/xdocs/hpbf/file-format.xml b/src/documentation/content/xdocs/hpbf/file-format.xml index 591204951..97d5a33d7 100644 --- a/src/documentation/content/xdocs/hpbf/file-format.xml +++ b/src/documentation/content/xdocs/hpbf/file-format.xml @@ -38,19 +38,19 @@ Root Entry - Objects - (no children) - SummaryInformation <(0x05)SummaryInformation> - DocumentSummaryInformation <(0x05)DocumentSummaryInformation> + SummaryInformation <(0x05)SummaryInformation> + DocumentSummaryInformation <(0x05)DocumentSummaryInformation> Escher - EscherStm EscherDelayStm Quill - QuillSub - CONTENTS - CompObj <(0x01)CompObj> + CompObj <(0x01)CompObj> Envelope Contents - Internal <(0x03)Internal> - CompObj <(0x01)CompObj> + Internal <(0x03)Internal> + CompObj <(0x01)CompObj> VBA - (no children) @@ -69,7 +69,7 @@ Root Entry - then both Contents and CONTENTS streams change. There are no changes to the Escher streams.

If you set the background colour of a textbox, but make - no changes to the text, + no changes to the text, (to finish off)

Structure of CONTENTS

First we have "CHNKINK ", followed by 24 bytes.

@@ -162,6 +162,8 @@ PL 62 1a 00 00 48 00 00 00 // PL from: 1a62 (6754), len: 48 (72) 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 + +(the text will then start)
diff --git a/src/documentation/content/xdocs/hpbf/index.xml b/src/documentation/content/xdocs/hpbf/index.xml index 2601a4174..c74dc2362 100755 --- a/src/documentation/content/xdocs/hpbf/index.xml +++ b/src/documentation/content/xdocs/hpbf/index.xml @@ -39,7 +39,7 @@ after that if demand and developer interest warrant it.

At this time, there is no usermodel api or similar.

Our current understanding of the file format is documented - here.

+ here.

This code currently lives the scratchpad area diff --git a/src/documentation/content/xdocs/index.xml b/src/documentation/content/xdocs/index.xml index 3cf305954..17e4336d0 100644 --- a/src/documentation/content/xdocs/index.xml +++ b/src/documentation/content/xdocs/index.xml @@ -146,6 +146,16 @@ href="./hslf/index.html">the HSLF project page for more information.

+
HPSF for Document Properties +

HPSF is our port of the OLE 2 property set format to pure + Java. Property sets are mostly used to store a document's properties + (title, author, date of last modification etc.), but they can be used + for application-specific purposes as well.

+ +

HPSF supports both reading and writing of properties.

+

Please see the HPSF project + page for more information.

+
HDGF for Visio Documents

HDGF is our port of the Microsoft Viso 97(-2003) file format to pure Java. It currently only supports reading at a very low level, and @@ -153,19 +163,13 @@ href="./hdgf/index.html">the HDGF project page for more information.

-
HPSF for Document Properties -

HPSF is our port of the OLE 2 property set format to pure - Java. Property sets are mostly use to store a document's properties - (title, author, date of last modification etc.), but they can be used - for application-specific purposes as well.

- -

HPSF supports reading and writing of properties. However, you will - need to be using version 3.0 of POI to utilise the write support.

- -

Please see the HPSF project - page for more information.

+
HPBF for Publisher Documents +

HPBF is our port of the Microsoft Publisher 98(-2007) file format to pure + Java. At the moment, we are still figuring out the file format, but we hope + to have simple text extraction shortly. Please see the HPBF project page for more + information.

-
Contributing diff --git a/src/scratchpad/src/org/apache/poi/hpbf/dev/HPBFDumper.java b/src/scratchpad/src/org/apache/poi/hpbf/dev/HPBFDumper.java index ae3cb56d5..6c52bbb04 100644 --- a/src/scratchpad/src/org/apache/poi/hpbf/dev/HPBFDumper.java +++ b/src/scratchpad/src/org/apache/poi/hpbf/dev/HPBFDumper.java @@ -310,8 +310,30 @@ public class HPBFDumper { ); } } + + // Text System.out.println(""); + System.out.println("TEXT:"); System.out.println(text); + System.out.println(""); + + // All the others + for(int i=0; i<20; i++) { + if(startType[i] == null) { + continue; + } + int start = from[i]; + + System.out.println( + startType[i] + " -> " + endType[i] + + " @ " + Integer.toHexString(start) + + " (" + start + ")" + ); + System.out.println("\t" + dumpBytes(data, start, 4)); + System.out.println("\t" + dumpBytes(data, start+4, 4)); + System.out.println("\t" + dumpBytes(data, start+8, 4)); + System.out.println("\t(etc)"); + } } protected void dump001CompObj(DirectoryNode dir) { diff --git a/src/scratchpad/testcases/org/apache/poi/hpbf/data/Sample2.pub b/src/scratchpad/testcases/org/apache/poi/hpbf/data/Sample2.pub new file mode 100755 index 000000000..610362c47 Binary files /dev/null and b/src/scratchpad/testcases/org/apache/poi/hpbf/data/Sample2.pub differ diff --git a/src/scratchpad/testcases/org/apache/poi/hpbf/data/Sample2.txt b/src/scratchpad/testcases/org/apache/poi/hpbf/data/Sample2.txt new file mode 100644 index 000000000..f8a68bb64 --- /dev/null +++ b/src/scratchpad/testcases/org/apache/poi/hpbf/data/Sample2.txt @@ -0,0 +1,34 @@ +This is some text on the first page +It’s in times new roman, font size 10, all normal + +We’ve added some more text in here, to push all the offsets about a bit. + + + +This is in bold and italic +It’s Arial, 20 point font +It’s in the second textbox on the first page + +Ditto with more text in here. 
+ + +This is the second page + +It is also times new roman, 10 point + + +Table on page 2 Top right +P2 table left P2 table right +Bottom Left Bottom Right + + +This text is on page two +This is a link to Apache POI +More normal text +Link to a file + + +More text, more hyperlinks +email link +Final hyperlink +Within doc to page 1 diff --git a/src/scratchpad/testcases/org/apache/poi/hpbf/data/Sample3.pub b/src/scratchpad/testcases/org/apache/poi/hpbf/data/Sample3.pub new file mode 100755 index 000000000..4f19bec93 Binary files /dev/null and b/src/scratchpad/testcases/org/apache/poi/hpbf/data/Sample3.pub differ diff --git a/src/scratchpad/testcases/org/apache/poi/hpbf/data/Sample3.txt b/src/scratchpad/testcases/org/apache/poi/hpbf/data/Sample3.txt new file mode 100644 index 000000000..c2d791b9a --- /dev/null +++ b/src/scratchpad/testcases/org/apache/poi/hpbf/data/Sample3.txt @@ -0,0 +1,29 @@ +This is some text on the first page +It’s in times new roman, font size 10, all normal + + +This is in bold and italic +It’s Arial, 20 point font +It’s in the second textbox on the first page + + +This is the second page12345678 + +It is also times new roman, 10 point + + +Table on page 2 Top right +P2 table left P2 table right +Bottom Left Bottom Right + + +This text is on page two +This is a link to Apache POI +More normal text +Link to a file + + +More text, more hyperlinks +email link +Final hyperlink +Within doc to page 1 diff --git a/src/scratchpad/testcases/org/apache/poi/hpbf/data/Sample4.pub b/src/scratchpad/testcases/org/apache/poi/hpbf/data/Sample4.pub new file mode 100755 index 000000000..445df85f0 Binary files /dev/null and b/src/scratchpad/testcases/org/apache/poi/hpbf/data/Sample4.pub differ diff --git a/src/scratchpad/testcases/org/apache/poi/hpbf/data/Sample4.txt b/src/scratchpad/testcases/org/apache/poi/hpbf/data/Sample4.txt new file mode 100644 index 000000000..279395e5d --- /dev/null +++ b/src/scratchpad/testcases/org/apache/poi/hpbf/data/Sample4.txt @@ -0,0 
+1,29 @@ +This is some text on the first page +It’s in times new roman, font size 10, all normal + + +This is in bold and italic +It’s Arial, 20 point font +It’s in the second textbox on the first page + + +This is the second page + +It is also times new roman, 10 point + + +Table on page 2 Top right +P2 table left P2 table right +Bottom Left Bottom Right + + +This text is on page two +This is a link to Apache POI +More normal text +Link to a file + + +More text, more hyperlinks +email link +Final hyperlink +Within doc to page 1