More work understanding hpbf

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@686624 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2008-08-17 17:39:10 +00:00
parent 78907b6420
commit eab2286d49
3 changed files with 204 additions and 2 deletions

View File

@ -0,0 +1,75 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
====================================================================
-->
<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V1.1//EN" "../dtd/document-v11.dtd">
<document>
<header>
<title>POI-HPBF - A Guide to the Publisher File Format</title>
<subtitle>Overview</subtitle>
<authors>
<person name="Nick Burch" email="nick at torchbox dot com"/>
</authors>
</header>
<body>
<section><title>Document Streams</title>
<p>
The file is made up of a number of POIFS streams. A typical
file will be made up as follows:
</p>
<source>
Root Entry -
Objects -
(no children)
SummaryInformation &lt;(0x05)SummaryInformation&gt;
DocumentSummaryInformation &lt;(0x05)DocumentSummaryInformation&gt;
Escher -
EscherStm
EscherDelayStm
Quill -
QuillSub -
CONTENTS
CompObj &lt;(0x01)CompObj&gt;
Envelope
Contents
Internal &lt;(0x03)Internal&gt;
CompObj &lt;(0x01)CompObj&gt;
VBA -
(no children)
</source>
</section>
<section><title>Changing Text</title>
<p>If you make a change to the text of a file, but do not change
how much text there is, then the <em>CONTENTS</em> stream
will undergo a small change, and the <em>Contents</em> stream
will undergo a large change.</p>
<p>If you make a change to the text of a file, and change the
amount of text there is, then both the <em>Contents</em> and
the <em>CONTENTS</em> streams change.</p>
</section>
<section><title>Changing Shapes</title>
<p>If you alter the size of a textbox, but make no text changes,
then both <em>Contents</em> and <em>CONTENTS</em> streams
change. There are no changes to the Escher streams.</p>
<p>If you set the background colour of a textbox, but make
no changes to the text, the resulting changes to the streams
have not been documented yet.</p>
</section>
</body>
</document>

View File

@ -0,0 +1,53 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
====================================================================
-->
<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V1.1//EN" "../dtd/document-v11.dtd">
<document>
<header>
<title>POI-HPBF - Java API To Access Microsoft Publisher Format Files</title>
<subtitle>Overview</subtitle>
<authors>
<person name="Nick Burch" email="nick at apache dot org"/>
</authors>
</header>
<body>
<section>
<title>Overview</title>
<p>HPBF is the POI Project's pure Java implementation of the Publisher file format.</p>
<p>Currently, HPBF is in the experimental stage, while we try
to figure out the file format. Our initial aim is to provide
a text extractor for the format, with low level code following
after that if demand and developer interest warrant it.</p>
<p>At this time, there is no <em>usermodel</em> api or similar.</p>
<p>Our current understanding of the file format is documented
<link href="file-format.html">here</link>.</p>
<note>
This code currently lives in the
<link href="http://svn.apache.org/viewcvs.cgi/poi/trunk/src/scratchpad/">scratchpad area</link>
of the POI SVN repository.
Ensure that you have the scratchpad jar or the scratchpad
build area in your
classpath before experimenting with this code.
</note>
</section>
</body>
</document>

View File

@ -25,6 +25,8 @@ import org.apache.poi.ddf.EscherRecord;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.StringUtil;
/**
* For dumping out the contents of HPBF (Publisher)
@ -52,6 +54,26 @@ public class HPBFDumper {
return d;
}
/**
 * Formats a run of bytes as lowercase hex text.
 * Each byte is written as exactly two hex characters
 * followed by a single space, so the result always ends
 * with a trailing space when at least one byte is dumped.
 *
 * @param data the array to read the bytes from
 * @param offset index in data of the first byte to dump
 * @param len how many bytes to dump; zero or less gives an empty string
 * @return the hex text for the requested bytes
 */
private String dumpBytes(byte[] data, int offset, int len) {
    StringBuffer hex = new StringBuffer();
    int pos = offset;
    int remaining = len;
    while(remaining > 0) {
        // Mask to get the unsigned value of the (signed) byte
        int unsigned = data[pos] & 0xFF;
        // Single digit values need a leading zero to stay two chars wide
        if(unsigned < 0x10) {
            hex.append('0');
        }
        hex.append(Integer.toHexString(unsigned)).append(' ');
        pos++;
        remaining--;
    }
    return hex.toString();
}
public static void main(String[] args) throws Exception {
if(args.length < 1) {
System.err.println("Use:");
@ -159,9 +181,61 @@ public class HPBFDumper {
System.out.println("");
System.out.println("CONTENTS - " + data.length + " bytes long:");
// Dump out up to 0x200
// Between the start and 0x200 we have
// CHNKINK(space) + 24 bytes + 0x1800
// TEXT + 6 bytes
// TEXT + 8 bytes + 0x1800
// STSH + 6 bytes
// STSH + 8 bytes + 0x1800
// STSH + 6 bytes
// STSH + 8 bytes + 0x1800
// but towards 0x200 the pattern may
// break down a little bit
// Text from 0x200 onwards for a bit
// After the second of a given type,
// it seems to be 4 bytes giving the start,
// then 4 bytes giving the length, then
// 18 00
System.out.println(
new String(data, 0, 8) +
dumpBytes(data, 8, 0x22-8)
);
int pos = 0x22;
boolean sixNotTen = true;
while(pos < 0x200) {
String text = new String(data, pos, 4);
int blen = 10;
if(sixNotTen)
blen = 6;
System.out.println(
text + " " + dumpBytes(data, pos+4, blen)
);
pos += 4 + blen;
sixNotTen = ! sixNotTen;
}
// Text from 0x200 onwards until we get
// to \r(00)\n(00)(00)(00)
int textStop = -1;
for(int i=0x200; i<data.length-2 && textStop == -1; i++) {
if(data[i] == 0 && data[i+1] == 0 && data[i+2] == 0) {
textStop = i;
}
}
if(textStop > 0) {
int len = (textStop - 0x200) / 2;
System.out.println("");
System.out.println(
StringUtil.getFromUnicodeLE(data, 0x200, len)
);
}
// The font list comes slightly later
// The hyperlinks may come before the fonts,
// or slightly in front
}
protected void dump001CompObj(DirectoryNode dir) {