More work understanding hpbf
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@686624 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
78907b6420
commit
eab2286d49
75
src/documentation/content/xdocs/hpbf/file-format.xml
Normal file
75
src/documentation/content/xdocs/hpbf/file-format.xml
Normal file
@ -0,0 +1,75 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!--
|
||||
====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
====================================================================
|
||||
-->
|
||||
<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V1.1//EN" "../dtd/document-v11.dtd">
|
||||
|
||||
<document>
|
||||
<header>
|
||||
<title>POI-HPBF - A Guide to the Publisher File Format</title>
|
||||
<subtitle>Overview</subtitle>
|
||||
<authors>
|
||||
<person name="Nick Burch" email="nick at torchbox dot com"/>
|
||||
</authors>
|
||||
</header>
|
||||
|
||||
<body>
|
||||
<section><title>Document Streams</title>
|
||||
<p>
|
||||
The file is made up of a number of POIFS streams. A typical
|
||||
file will be made up as follows:
|
||||
</p>
|
||||
<source>
|
||||
Root Entry -
|
||||
Objects -
|
||||
(no children)
|
||||
SummaryInformation <(0x05)SummaryInformation>
|
||||
DocumentSummaryInformation <(0x05)DocumentSummaryInformation>
|
||||
Escher -
|
||||
EscherStm
|
||||
EscherDelayStm
|
||||
Quill -
|
||||
QuillSub -
|
||||
CONTENTS
|
||||
CompObj <(0x01)CompObj>
|
||||
Envelope
|
||||
Contents
|
||||
Internal <(0x03)Internal>
|
||||
CompObj <(0x01)CompObj>
|
||||
VBA -
|
||||
(no children)
|
||||
</source>
|
||||
</section>
|
||||
<section><title>Changing Text</title>
|
||||
<p>If you make a change to the text of a file, but do not change
|
||||
how much text there is, then the <em>CONTENTS</em> stream
|
||||
will undergo a small change, and the <em>Contents</em> stream
|
||||
will undergo a large change.</p>
|
||||
<p>If you make a change to the text of a file, and change the
|
||||
amount of text there is, then both the <em>Contents</em> and
|
||||
the <em>CONTENTS</em> streams change.</p>
|
||||
</section>
|
||||
<section><title>Changing Shapes</title>
|
||||
<p>If you alter the size of a textbox, but make no text changes,
|
||||
then both <em>Contents</em> and <em>CONTENTS</em> streams
|
||||
change. There are no changes to the Escher streams.</p>
|
||||
<p>If you set the background colour of a textbox, but make
|
||||
no changes to the text, the resulting stream changes are not yet documented.</p>
|
||||
</section>
|
||||
</body>
|
||||
</document>
|
53
src/documentation/content/xdocs/hpbf/index.xml
Executable file
53
src/documentation/content/xdocs/hpbf/index.xml
Executable file
@ -0,0 +1,53 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!--
|
||||
====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
====================================================================
|
||||
-->
|
||||
<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V1.1//EN" "../dtd/document-v11.dtd">
|
||||
|
||||
<document>
|
||||
<header>
|
||||
<title>POI-HPBF - Java API To Access Microsoft Publisher Format Files</title>
|
||||
<subtitle>Overview</subtitle>
|
||||
<authors>
|
||||
<person name="Nick Burch" email="nick at apache dot org"/>
|
||||
</authors>
|
||||
</header>
|
||||
|
||||
<body>
|
||||
<section>
|
||||
<title>Overview</title>
|
||||
|
||||
<p>HPBF is the POI Project's pure Java implementation of the Publisher file format.</p>
|
||||
<p>Currently, HPBF is in the experimental stage, while we try
|
||||
to figure out the file format. Our initial aim is to provide
|
||||
a text extractor for the format, with low level code following
|
||||
after that if demand and developer interest warrant it.</p>
|
||||
<p>At this time, there is no <em>usermodel</em> api or similar.</p>
|
||||
<p>Our current understanding of the file format is documented
|
||||
<link href="file-format.html">here</link>.</p>
|
||||
<note>
|
||||
This code currently lives in the
|
||||
<link href="http://svn.apache.org/viewcvs.cgi/poi/trunk/src/scratchpad/">scratchpad area</link>
|
||||
of the POI SVN repository.
|
||||
Ensure that you have the scratchpad jar or the scratchpad
|
||||
build area in your
|
||||
classpath before experimenting with this code.
|
||||
</note>
|
||||
</section>
|
||||
</body>
|
||||
</document>
|
@ -25,6 +25,8 @@ import org.apache.poi.ddf.EscherRecord;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.DocumentEntry;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.util.LittleEndian;
|
||||
import org.apache.poi.util.StringUtil;
|
||||
|
||||
/**
|
||||
* For dumping out the contents of HPBF (Publisher)
|
||||
@ -52,6 +54,26 @@ public class HPBFDumper {
|
||||
return d;
|
||||
}
|
||||
|
||||
/**
|
||||
* Dumps out the given number of bytes as hex,
|
||||
* two chars per byte, each followed by a space
|
||||
*/
|
||||
private String dumpBytes(byte[] data, int offset, int len) {
|
||||
StringBuffer ret = new StringBuffer();
|
||||
for(int i=0; i<len; i++) {
|
||||
int j = i + offset;
|
||||
int b = data[j];
|
||||
if(b < 0) { b += 256; }
|
||||
|
||||
String bs = Integer.toHexString(b);
|
||||
if(bs.length() == 1)
|
||||
ret.append('0');
|
||||
ret.append(bs);
|
||||
ret.append(' ');
|
||||
}
|
||||
return ret.toString();
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
if(args.length < 1) {
|
||||
System.err.println("Use:");
|
||||
@ -159,9 +181,61 @@ public class HPBFDumper {
|
||||
System.out.println("");
|
||||
System.out.println("CONTENTS - " + data.length + " bytes long:");
|
||||
|
||||
// Dump out up to 0x200
|
||||
// Between the start and 0x200 we have
|
||||
// CHNKINK(space) + 24 bytes + 0x1800
|
||||
// TEXT + 6 bytes
|
||||
// TEXT + 8 bytes + 0x1800
|
||||
// STSH + 6 bytes
|
||||
// STSH + 8 bytes + 0x1800
|
||||
// STSH + 6 bytes
|
||||
// STSH + 8 bytes + 0x1800
|
||||
// but towards 0x200 the pattern may
|
||||
// break down a little bit
|
||||
|
||||
// Text from 0x200 onwards for a bit
|
||||
// After the second of a given type,
|
||||
// it seems to be 4 bytes giving the start,
|
||||
// then 4 bytes giving the length, then
|
||||
// 18 00
|
||||
System.out.println(
|
||||
new String(data, 0, 8) +
|
||||
dumpBytes(data, 8, 0x22-8)
|
||||
);
|
||||
|
||||
int pos = 0x22;
|
||||
boolean sixNotTen = true;
|
||||
while(pos < 0x200) {
|
||||
String text = new String(data, pos, 4);
|
||||
int blen = 10;
|
||||
if(sixNotTen)
|
||||
blen = 6;
|
||||
System.out.println(
|
||||
text + " " + dumpBytes(data, pos+4, blen)
|
||||
);
|
||||
|
||||
pos += 4 + blen;
|
||||
sixNotTen = ! sixNotTen;
|
||||
}
|
||||
|
||||
// Text from 0x200 onwards until we get
|
||||
// to \r(00)\n(00)(00)(00)
|
||||
int textStop = -1;
|
||||
for(int i=0x200; i<data.length-2 && textStop == -1; i++) {
|
||||
if(data[i] == 0 && data[i+1] == 0 && data[i+2] == 0) {
|
||||
textStop = i;
|
||||
}
|
||||
}
|
||||
if(textStop > 0) {
|
||||
int len = (textStop - 0x200) / 2;
|
||||
System.out.println("");
|
||||
System.out.println(
|
||||
StringUtil.getFromUnicodeLE(data, 0x200, len)
|
||||
);
|
||||
}
|
||||
|
||||
// The font list comes slightly later
|
||||
|
||||
// The hyperlinks may come before the fonts,
|
||||
// or slightly in front
|
||||
}
|
||||
|
||||
protected void dump001CompObj(DirectoryNode dir) {
|
||||
|
Loading…
Reference in New Issue
Block a user