Further HPBF documentation, and some more sample files used

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@686640 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2008-08-17 20:15:51 +00:00
parent d35d590c07
commit b185a26d00
11 changed files with 140 additions and 19 deletions

View File

@ -41,6 +41,7 @@
<menu-item label="HSLF" href="hslf/index.html"/>
<menu-item label="HSMF" href="hsmf/index.html"/>
<menu-item label="HDGF" href="hdgf/index.html"/>
<menu-item label="HPBF" href="hpbf/index.html"/>
<menu-item label="POI-Ruby" href="poi-ruby.html"/>
<menu-item label="POI-Utils" href="utils/index.html"/>
<menu-item label="Text Extraction" href="text-extraction.html"/>

View File

@ -38,19 +38,19 @@
Root Entry -
Objects -
(no children)
SummaryInformation <(0x05)SummaryInformation>
DocumentSummaryInformation <(0x05)DocumentSummaryInformation>
SummaryInformation &lt;(0x05)SummaryInformation&gt;
DocumentSummaryInformation &lt;(0x05)DocumentSummaryInformation&gt;
Escher -
EscherStm
EscherDelayStm
Quill -
QuillSub -
CONTENTS
CompObj <(0x01)CompObj>
CompObj &lt;(0x01)CompObj&gt;
Envelope
Contents
Internal <(0x03)Internal>
CompObj <(0x01)CompObj>
Internal &lt;(0x03)Internal&gt;
CompObj &lt;(0x01)CompObj&gt;
VBA -
(no children)
</source>
@ -69,7 +69,7 @@ Root Entry -
then both <em>Contents</em> and <em>CONTENTS</em> streams
change. There are no changes to the Escher streams.</p>
<p>If you set the background colour of a textbox, but make
no changes to the text,
no changes to the text, (to finish off)</p>
</section>
<section><title>Structure of CONTENTS</title>
<p>First we have "CHNKINK ", followed by 24 bytes.</p>
@ -162,6 +162,8 @@ PL 62 1a 00 00 48 00 00 00 // PL from: 1a62 (6754), len: 48 (72)
00 00 00 00 00 00
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
(the text will then start)
</source>
</section>
</body>

View File

@ -39,7 +39,7 @@
after that if demand and developer interest warrant it.</p>
<p>At this time, there is no <em>usermodel</em> api or similar.</p>
<p>Our current understanding of the file format is documented
<link href="file-format.html">here</a>.</p>
<link href="file-format.html">here</link>.</p>
<note>
This code currently lives in the
<link href="http://svn.apache.org/viewcvs.cgi/poi/trunk/src/scratchpad/">scratchpad area</link>

View File

@ -146,6 +146,16 @@
href="./hslf/index.html">the HSLF project page for more
information</link>.</p>
</section>
<section><title>HPSF for Document Properties</title>
<p>HPSF is our port of the OLE 2 property set format to pure
Java. Property sets are mostly used to store a document's properties
(title, author, date of last modification etc.), but they can be used
for application-specific purposes as well.</p>
<p>HPSF supports both reading and writing of properties.</p>
<p>Please see <link href="./hpsf/index.html">the HPSF project
page</link> for more information.</p>
</section>
<section><title>HDGF for Visio Documents</title>
<p>HDGF is our port of the Microsoft Visio 97(-2003) file format to pure
Java. It currently only supports reading at a very low level, and
@ -153,19 +163,13 @@
href="./hdgf/index.html">the HDGF project page for more
information</link>.</p>
</section>
<section><title>HPSF for Document Properties</title>
<p>HPSF is our port of the OLE 2 property set format to pure
Java. Property sets are mostly used to store a document's properties
(title, author, date of last modification etc.), but they can be used
for application-specific purposes as well.</p>
<p>HPSF supports reading and writing of properties. However, you will
need to be using version 3.0 of POI to utilise the write support.</p>
<p>Please see <link href="./hpsf/index.html">the HPSF project
page</link> for more information.</p>
<section><title>HPBF for Publisher Documents</title>
<p>HPBF is our port of the Microsoft Publisher 98(-2007) file format to pure
Java. At the moment, we are still figuring out the file format, but we hope
to have simple text extraction shortly. Please see <link
href="./hpbf/index.html">the HPBF project page for more
information</link>.</p>
</section>
</section>
<section><title>Contributing </title>

View File

@ -310,8 +310,30 @@ public class HPBFDumper {
);
}
}
// Text
System.out.println("");
System.out.println("TEXT:");
System.out.println(text);
System.out.println("");
// All the others
for(int i=0; i<20; i++) {
if(startType[i] == null) {
continue;
}
int start = from[i];
System.out.println(
startType[i] + " -> " + endType[i] +
" @ " + Integer.toHexString(start) +
" (" + start + ")"
);
System.out.println("\t" + dumpBytes(data, start, 4));
System.out.println("\t" + dumpBytes(data, start+4, 4));
System.out.println("\t" + dumpBytes(data, start+8, 4));
System.out.println("\t(etc)");
}
}
protected void dump001CompObj(DirectoryNode dir) {

View File

@ -0,0 +1,34 @@
This is some text on the first page
Its in times new roman, font size 10, all normal
Weve added some more text in here, to push all the offsets about a bit.
This is in bold and italic
Its Arial, 20 point font
Its in the second textbox on the first page
Ditto with more text in here.
This is the second page
It is also times new roman, 10 point
Table on page 2 Top right
P2 table left P2 table right
Bottom Left Bottom Right
This text is on page two
This is a link to Apache POI
More normal text
Link to a file
More text, more hyperlinks
email link
Final hyperlink
Within doc to page 1

View File

@ -0,0 +1,29 @@
This is some text on the first page
Its in times new roman, font size 10, all normal
This is in bold and italic
Its Arial, 20 point font
Its in the second textbox on the first page
This is the second page12345678
It is also times new roman, 10 point
Table on page 2 Top right
P2 table left P2 table right
Bottom Left Bottom Right
This text is on page two
This is a link to Apache POI
More normal text
Link to a file
More text, more hyperlinks
email link
Final hyperlink
Within doc to page 1

View File

@ -0,0 +1,29 @@
This is some text on the first page
Its in times new roman, font size 10, all normal
This is in bold and italic
Its Arial, 20 point font
Its in the second textbox on the first page
This is the second page
It is also times new roman, 10 point
Table on page 2 Top right
P2 table left P2 table right
Bottom Left Bottom Right
This text is on page two
This is a link to Apache POI
More normal text
Link to a file
More text, more hyperlinks
email link
Final hyperlink
Within doc to page 1