Tag as 3.0.1-RC3
git-svn-id: https://svn.apache.org/repos/asf/poi/tags/REL_3_0_1_RC3@551531 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
cd3e480727
commit
2949f09894
13
legal/NOTICE
13
legal/NOTICE
|
@ -1,5 +1,16 @@
|
|||
Apache Jakarta POI
|
||||
Apache POI
|
||||
Copyright 2001-2007 The Apache Software Foundation
|
||||
|
||||
This product includes software developed by
|
||||
The Apache Software Foundation (http://www.apache.org/).
|
||||
|
||||
|
||||
Unit testing support is provided by JUnit, under the
|
||||
Common Public License Version 1.0:
|
||||
http://www.opensource.org/licenses/cpl.php
|
||||
See http://www.junit.org/
|
||||
|
||||
Small parts of the POI component HDGF are based on VSDump,
|
||||
and are under the GNU General Public Licence version 3 (GPL v3):
|
||||
http://gplv3.fsf.org/
|
||||
See http://www.gnome.ru/projects/vsdump_en.html
|
||||
|
|
|
@ -39,6 +39,7 @@
|
|||
<menu-item label="HWPF" href="hwpf/index.html"/>
|
||||
<menu-item label="HPSF" href="hpsf/index.html"/>
|
||||
<menu-item label="HSLF" href="hslf/index.html"/>
|
||||
<menu-item label="HDGF" href="hdgf/index.html"/>
|
||||
<menu-item label="POI-Ruby" href="poi-ruby.html"/>
|
||||
<menu-item label="POI-Utils" href="utils/index.html"/>
|
||||
<menu-item label="Download" href="ext:download"/>
|
||||
|
|
|
@ -35,7 +35,7 @@
|
|||
<person id="YK" name="Yegor Kozlov" email="yegor@apache.org"/>
|
||||
</devs>
|
||||
|
||||
<release version="3.0.1-FINAL" date="2007-06-15">
|
||||
<release version="3.0.1-FINAL" date="2007-07-05">
|
||||
<action dev="POI-DEVELOPERS" type="fix">Administrative updates to the Maven POMs, and the release artifact build process</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">23951 - [PATCH] Fix for HSSF setSheetOrder and tab names</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">42524 - [PATCH] Better HSLF support for problem shape groups</action>
|
||||
|
@ -44,6 +44,9 @@
|
|||
<action dev="POI-DEVELOPERS" type="add">Additional HSLF support for Title and Slide Master Sheets</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">42474 - [PATCH] Improved HSLF note to slide matching, and a NPE</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">42481 - [PATCH] Tweak some HSLF exceptions, to make it clearer what you're catching</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">42667 - [PATCH] Fix for HSLF writing of files with tables</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">Improved way of detecting HSSF cells that contain dates, isADateFormat</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">Initial, read-only support for Visio documents, as HDGF</action>
|
||||
</release>
|
||||
|
||||
<release version="3.0-FINAL" date="2007-05-18">
|
||||
|
|
|
@ -0,0 +1,34 @@
|
|||
<?xml version="1.0"?>
|
||||
<!--
|
||||
====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
====================================================================
|
||||
-->
|
||||
<!DOCTYPE book PUBLIC "-//APACHE//DTD Cocoon Documentation Book V1.0//EN" "../dtd/book-cocoon-v10.dtd">
|
||||
|
||||
<book software="POI Project"
|
||||
title="HDGF"
|
||||
copyright="@year@ POI Project">
|
||||
|
||||
<menu label="Apache POI">
|
||||
<menu-item label="Top" href="../index.html"/>
|
||||
</menu>
|
||||
|
||||
<menu label="HDGF">
|
||||
<menu-item label="Overview" href="index.html"/>
|
||||
</menu>
|
||||
|
||||
</book>
|
|
@ -0,0 +1,98 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!--
|
||||
====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
====================================================================
|
||||
-->
|
||||
<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V1.1//EN" "../dtd/document-v11.dtd">
|
||||
|
||||
<document>
|
||||
<header>
|
||||
<title>POI-HDGF - Java API To Access Microsoft Visio Format Files</title>
|
||||
<subtitle>Overview</subtitle>
|
||||
<authors>
|
||||
<person name="Nick Burch" email="nick at apache dot org"/>
|
||||
</authors>
|
||||
</header>
|
||||
|
||||
<body>
|
||||
<section>
|
||||
<title>Overview</title>
|
||||
|
||||
<p>HDGF is the POI Project's pure Java implementation of the Visio file format.</p>
|
||||
<p>Currently, HDGF provides a low-level, read-only api for
|
||||
accessing Visio documents. It also provides a
|
||||
<link href="http://svn.apache.org/repos/asf/poi/trunk/src/scratchpad/src/org/apache/poi/hdgf/extractor/">way</link>
|
||||
to extract the textual content from a file.
|
||||
</p>
|
||||
<p>At this time, there is no <em>usermodel</em> api or similar,
|
||||
only low level access to the streams, chunks and chunk commands.
|
||||
Users are advised to check the unit tests to see how everything
|
||||
works. They are also well advised to read the documentation
|
||||
supplied with
|
||||
<link href="http://www.gnome.ru/projects/vsdump_en.html">vsdump</link>
|
||||
to get a feel for how Visio files are structured.</p>
|
||||
<p>To get a feel for the contents of a file, and to track down
|
||||
where data of interest is stored, HDGF comes with
|
||||
<link href="http://svn.apache.org/repos/asf/poi/trunk/src/scratchpad/src/org/apache/poi/hdgf/dev/">VSDDumper</link>
|
||||
to print out the contents of the file. Users should also make
|
||||
use of
|
||||
<link href="http://www.gnome.ru/projects/vsdump_en.html">vsdump</link>
|
||||
to probe the structure of files.</p>
|
||||
<note>
|
||||
This code currently lives in the
|
||||
<link href="http://svn.apache.org/viewcvs.cgi/poi/trunk/src/scratchpad/">scratchpad area</link>
|
||||
of the POI SVN repository.
|
||||
Ensure that you have the scratchpad jar or the scratchpad
|
||||
build area in your
|
||||
classpath before experimenting with this code.
|
||||
</note>
|
||||
|
||||
<section>
|
||||
<title>Steps required for write support</title>
|
||||
<p>Currently, HDGF is only able to read Visio files; it is
|
||||
not able to write them back out again. We believe the
|
||||
following are the steps that would need to be taken to
|
||||
implement it.</p>
|
||||
<ol>
|
||||
<li>Re-write the decompression support in LZW4HDGF to be
|
||||
less opaque, and also under the ASL.</li>
|
||||
<li>Add compression support to the new LZW4HDGF.</li>
|
||||
<li>Have HDGF just write back the raw bytes it read in, and
|
||||
have a test to ensure the file is un-changed.</li>
|
||||
<li>Have HDGF generate the bytes to write out from the
|
||||
Stream stores, using the compressed data as appropriate,
|
||||
without re-compressing. Plus test to ensure file is
|
||||
un-changed.</li>
|
||||
<li>Have HDGF generate the bytes to write out from the
|
||||
Stream stores, re-compressing any streams that were
|
||||
decompressed. Plus test to ensure file is un-changed.</li>
|
||||
<li>Have HDGF re-generate the offsets in pointers for the
|
||||
locations of the streams. Plus test to ensure file is
|
||||
un-changed.</li>
|
||||
<li>Have HDGF re-generate the bytes for all the chunks, from
|
||||
the chunk commands. Tests to ensure the chunks are
|
||||
serialized properly, and then that the file is un-changed</li>
|
||||
<li>Alter the data of one command, but keep it the same
|
||||
length, and check visio can open the file when written
|
||||
out.</li>
|
||||
<li>Alter the data of one command, to a new length, and
|
||||
check that visio can open the file when written out.</li>
|
||||
</ol>
|
||||
</section>
|
||||
</section>
|
||||
</body>
|
||||
</document>
|
|
@ -20,7 +20,7 @@
|
|||
<!DOCTYPE book PUBLIC "-//APACHE//DTD Cocoon Documentation Book V1.0//EN" "../dtd/book-cocoon-v10.dtd">
|
||||
|
||||
<book software="POI Project"
|
||||
title="HSSF"
|
||||
title="HSLF"
|
||||
copyright="@year@ POI Project">
|
||||
|
||||
<menu label="Apache POI">
|
||||
|
|
|
@ -34,12 +34,12 @@
|
|||
<title>Overview</title>
|
||||
|
||||
<p>HSLF is the POI Project's pure Java implementation of the Powerpoint file format.</p>
|
||||
<p>HSSF provides a way to read powerpoint presentations, and extract text from it.
|
||||
<p>HSLF provides a way to read powerpoint presentations, and extract text from it.
|
||||
It also provides some (currently limited) edit capabilities.
|
||||
</p>
|
||||
<note>
|
||||
This code currently lives in the
|
||||
<link href="http://svn.apache.org/viewcvs.cgi/jakarta/poi/trunk/src/scratchpad/">scratchpad area</link>
|
||||
<link href="http://svn.apache.org/viewcvs.cgi/poi/trunk/src/scratchpad/">scratchpad area</link>
|
||||
of the POI SVN repository.
|
||||
Ensure that you have the scratchpad jar or the scratchpad
|
||||
build area in your
|
||||
|
|
|
@ -460,7 +460,7 @@ some of the rows or cells. It can be found at
|
|||
<code>/src/scratchpad/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java</code>,
|
||||
and may be called on the command line, or from within your own code.
|
||||
The latest version is always available from
|
||||
<link href="http://svn.apache.org/repos/asf/jakarta/poi/trunk/src/scratchpad/examples/src/org/apache/poi/hssf/eventusermodel/examples/">subversion</link>.
|
||||
<link href="http://svn.apache.org/repos/asf/poi/trunk/src/scratchpad/examples/src/org/apache/poi/hssf/eventusermodel/examples/">subversion</link>.
|
||||
</p>
|
||||
<p>
|
||||
<em>This code is currently in the scratchpad section, so you will either
|
||||
|
|
|
@ -38,7 +38,7 @@
|
|||
to pure Java.</p>
|
||||
|
||||
<p>HWPF is still in early development. It is in the <link
|
||||
href="http://svn.apache.org/viewcvs.cgi/jakarta/poi/trunk/src/scratchpad/">
|
||||
href="http://svn.apache.org/viewcvs.cgi/poi/trunk/src/scratchpad/">
|
||||
scratchpad section of the SVN.</link> You will need to ensure you
|
||||
either have a recent SVN checkout, or a recent SVN nightly build
|
||||
(including the scratchpad jar!)</p>
|
||||
|
|
|
@ -30,7 +30,7 @@
|
|||
|
||||
<body>
|
||||
<p>HWPF is still in early development. It is in the <link
|
||||
href="http://svn.apache.org/viewcvs.cgi/jakarta/poi/trunk/src/scratchpad/">
|
||||
href="http://svn.apache.org/viewcvs.cgi/poi/trunk/src/scratchpad/">
|
||||
scratchpad section of the SVN.</link> You will need to ensure you
|
||||
either have a recent SVN checkout, or a recent SVN nightly build
|
||||
(including the scratchpad jar!)</p>
|
||||
|
@ -68,7 +68,7 @@ can then get text and other properties.
|
|||
<section><title>Further Examples</title>
|
||||
<p>For now, the best source of additional examples is in the unit
|
||||
tests. <link
|
||||
href="http://svn.apache.org/viewvc/jakarta/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/">
|
||||
href="http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hwpf/">
|
||||
Browse the HWPF unit tests.</link>
|
||||
</p>
|
||||
</section>
|
||||
|
|
|
@ -38,6 +38,10 @@
|
|||
<link href="http://www.apache.org/dyn/closer.cgi/poi/release/">download</link>
|
||||
the source and binaries from your
|
||||
<link href="http://www.apache.org/dyn/closer.cgi/poi/release/">local mirror</link>.</p>
|
||||
<p>We would also like to confirm that version 3.0 of Apache POI does
|
||||
<em>not</em> contain any viruses. Users of broken virus checkers
|
||||
which do detect a 94 byte file, sci_cec.db, as containing one are
|
||||
advised to contact their vendor for a fix.</p>
|
||||
</section>
|
||||
|
||||
<section><title>Purpose</title>
|
||||
|
@ -107,12 +111,19 @@
|
|||
development. Jump in!</p>
|
||||
</section>
|
||||
<section><title>HSLF for PowerPoint Documents</title>
|
||||
<p>HWSL is our port of the Microsoft PowerPoint 97(-2003) file format to pure
|
||||
<p>HSLF is our port of the Microsoft PowerPoint 97(-2003) file format to pure
|
||||
Java. It supports read and write capabilities of some, but not yet all
|
||||
of the core records. Please see <link
|
||||
href="./hslf/index.html">the HSLF project page for more
|
||||
information</link>.</p>
|
||||
</section>
|
||||
<section><title>HDGF for Visio Documents</title>
|
||||
<p>HDGF is our port of the Microsoft Visio 97(-2003) file format to pure
|
||||
Java. It currently only supports reading at a very low level, and
|
||||
simple text extraction. Please see <link
|
||||
href="./hdgf/index.html">the HDGF project page for more
|
||||
information</link>.</p>
|
||||
</section>
|
||||
<section><title>HPSF for Document Properties</title>
|
||||
<p>HPSF is our port of the OLE 2 property set format to pure
|
||||
Java. Property sets are mostly used to store a document's properties
|
||||
|
|
|
@ -32,7 +32,7 @@
|
|||
</developers>
|
||||
|
||||
<changes>
|
||||
<release version="3.0.1-FINAL" date="2007-06-15">
|
||||
<release version="3.0.1-FINAL" date="2007-07-05">
|
||||
<action dev="POI-DEVELOPERS" type="fix">Administrative updates to the Maven POMs, and the release artifact build process</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">23951 - [PATCH] Fix for HSSF setSheetOrder and tab names</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">42524 - [PATCH] Better HSLF support for problem shape groups</action>
|
||||
|
@ -41,6 +41,9 @@
|
|||
<action dev="POI-DEVELOPERS" type="add">Additional HSLF support for Title and Slide Master Sheets</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">42474 - [PATCH] Improved HSLF note to slide matching, and a NPE</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">42481 - [PATCH] Tweak some HSLF exceptions, to make it clearer what you're catching</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">42667 - [PATCH] Fix for HSLF writing of files with tables</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">Improved way of detecting HSSF cells that contain dates, isADateFormat</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">Initial, read-only support for Visio documents, as HDGF</action>
|
||||
</release>
|
||||
|
||||
<release version="3.0-FINAL" date="2007-05-18">
|
||||
|
|
|
@ -20,6 +20,9 @@ import java.util.ArrayList;
|
|||
|
||||
import org.apache.poi.hdgf.chunks.ChunkFactory.CommandDefinition;
|
||||
import org.apache.poi.util.LittleEndian;
|
||||
import org.apache.poi.util.POILogFactory;
|
||||
import org.apache.poi.util.POILogger;
|
||||
import org.apache.poi.util.StringUtil;
|
||||
|
||||
/**
|
||||
* Base of all chunks, which hold data, flags etc
|
||||
|
@ -44,6 +47,9 @@ public class Chunk {
|
|||
/** The name of the chunk, as found from the commandDefinitions */
|
||||
private String name;
|
||||
|
||||
/** For logging warnings about the structure of the file */
|
||||
private POILogger logger = POILogFactory.getLogger(Chunk.class);
|
||||
|
||||
public Chunk(ChunkHeader header, ChunkTrailer trailer, ChunkSeparator separator, byte[] contents) {
|
||||
this.header = header;
|
||||
this.trailer = trailer;
|
||||
|
@ -148,7 +154,9 @@ public class Chunk {
|
|||
|
||||
// Check we seem to have enough data
|
||||
if(offset >= contents.length) {
|
||||
System.err.println("Command offset " + offset + " past end of data at " + contents.length);
|
||||
logger.log(POILogger.WARN,
|
||||
"Command offset " + offset + " past end of data at " + contents.length
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -167,9 +175,27 @@ public class Chunk {
|
|||
LittleEndian.getDouble(contents, offset)
|
||||
);
|
||||
break;
|
||||
case 12:
|
||||
// A Little Endian String
|
||||
// Starts 8 bytes into the data segment
|
||||
// Ends at end of data, or 00 00
|
||||
int startsAt = 8;
|
||||
int endsAt = startsAt;
|
||||
for(int j=startsAt; j<contents.length-1 && endsAt == startsAt; j++) {
|
||||
if(contents[j] == 0 && contents[j+1] == 0) {
|
||||
endsAt = j;
|
||||
}
|
||||
}
|
||||
if(endsAt == startsAt) {
|
||||
endsAt = contents.length;
|
||||
}
|
||||
|
||||
int strLen = (endsAt-startsAt) / 2;
|
||||
command.value = StringUtil.getFromUnicodeLE(contents, startsAt, strLen);
|
||||
break;
|
||||
case 25:
|
||||
command.value = new Short(
|
||||
LittleEndian.getShort(contents, offset)
|
||||
LittleEndian.getShort(contents, offset)
|
||||
);
|
||||
break;
|
||||
case 26:
|
||||
|
@ -188,7 +214,8 @@ public class Chunk {
|
|||
break;
|
||||
|
||||
default:
|
||||
//System.err.println("Warning - Command of type " + type + " not processed!");
|
||||
logger.log(POILogger.INFO,
|
||||
"Command of type " + type + " not processed!");
|
||||
}
|
||||
|
||||
// Add to the array
|
||||
|
|
|
@ -24,6 +24,9 @@ import java.util.ArrayList;
|
|||
import java.util.Hashtable;
|
||||
import java.util.StringTokenizer;
|
||||
|
||||
import org.apache.poi.util.POILogFactory;
|
||||
import org.apache.poi.util.POILogger;
|
||||
|
||||
/**
|
||||
* Factory class to create the appropriate chunks, which
|
||||
* needs the version of the file to process the chunk header
|
||||
|
@ -42,6 +45,9 @@ public class ChunkFactory {
|
|||
private static String chunkTableName =
|
||||
"/org/apache/poi/hdgf/chunks/chunks_parse_cmds.tbl";
|
||||
|
||||
/** For logging problems we spot with the file */
|
||||
private POILogger logger = POILogFactory.getLogger(ChunkFactory.class);
|
||||
|
||||
public ChunkFactory(int version) throws IOException {
|
||||
this.version = version;
|
||||
|
||||
|
@ -107,7 +113,8 @@ public class ChunkFactory {
|
|||
// Check we have enough data, and tweak the header size
|
||||
// as required
|
||||
if(endOfDataPos > data.length) {
|
||||
System.err.println("Header called for " + header.getLength() +" bytes, but that would take us passed the end of the data!");
|
||||
logger.log(POILogger.WARN,
|
||||
"Header called for " + header.getLength() +" bytes, but that would take us passed the end of the data!");
|
||||
|
||||
endOfDataPos = data.length;
|
||||
header.length = data.length - offset - header.getSizeInBytes();
|
||||
|
|
|
@ -24,6 +24,10 @@ public class ChunkHeaderV11 extends ChunkHeaderV6 {
|
|||
* Does the chunk have a separator?
|
||||
*/
|
||||
public boolean hasSeparator() {
|
||||
// For some reason, there are two types that don't have a
|
||||
// separator despite the flags that indicate they do
|
||||
if(type == 0x1f || type == 0xc9) { return false; }
|
||||
|
||||
// If there's a trailer, there's a separator
|
||||
if(hasTrailer()) { return true; }
|
||||
|
||||
|
|
|
@ -27,4 +27,8 @@ public class ChunkSeparator {
|
|||
separatorData = new byte[4];
|
||||
System.arraycopy(data, offset, separatorData, 0, 4);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "<ChunkSeparator of length " + separatorData.length + ">";
|
||||
}
|
||||
}
|
||||
|
|
|
@ -26,4 +26,8 @@ public class ChunkTrailer {
|
|||
trailerData = new byte[8];
|
||||
System.arraycopy(data, offset, trailerData, 0, 8);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "<ChunkTrailer of length " + trailerData.length + ">";
|
||||
}
|
||||
}
|
||||
|
|
|
@ -70,6 +70,11 @@ public class VSDDumper {
|
|||
" - " + Integer.toHexString(ptr.getFormat()));
|
||||
System.out.println(ind + " Length is\t" + ptr.getLength() +
|
||||
" - " + Integer.toHexString(ptr.getLength()));
|
||||
if(ptr.destinationCompressed()) {
|
||||
int decompLen = stream._getContentsLength();
|
||||
System.out.println(ind + " DC.Length is\t" + decompLen +
|
||||
" - " + Integer.toHexString(decompLen));
|
||||
}
|
||||
System.out.println(ind + " Compressed is\t" + ptr.destinationCompressed());
|
||||
System.out.println(ind + " Stream is\t" + stream.getClass().getName());
|
||||
|
||||
|
@ -100,6 +105,9 @@ public class VSDDumper {
|
|||
for(int i=0; i<cs.getChunks().length; i++) {
|
||||
Chunk chunk = cs.getChunks()[i];
|
||||
System.out.println(ind2 + "" + chunk.getName());
|
||||
System.out.println(ind2 + " Length is " + chunk._getContents().length + " (" + Integer.toHexString(chunk._getContents().length) + ")");
|
||||
System.out.println(ind2 + " OD Size is " + chunk.getOnDiskSize() + " (" + Integer.toHexString(chunk.getOnDiskSize()) + ")");
|
||||
System.out.println(ind2 + " T / S is " + chunk.getTrailer() + " / " + chunk.getSeparator());
|
||||
System.out.println(ind2 + " Holds " + chunk.getCommands().length + " commands");
|
||||
for(int j=0; j<chunk.getCommands().length; j++) {
|
||||
Command command = chunk.getCommands()[j];
|
||||
|
|
|
@ -0,0 +1,114 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.hdgf.extractor;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import org.apache.poi.hdgf.HDGFDiagram;
|
||||
import org.apache.poi.hdgf.chunks.Chunk.Command;
|
||||
import org.apache.poi.hdgf.streams.ChunkStream;
|
||||
import org.apache.poi.hdgf.streams.PointerContainingStream;
|
||||
import org.apache.poi.hdgf.streams.Stream;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
/**
|
||||
* Class to find all the text in a Visio file, and return it.
|
||||
* Can operate on the command line (outputs to stdout), or
|
||||
* can return the text for you (eg for use with Lucene).
|
||||
*/
|
||||
public class VisioTextExtractor {
|
||||
private HDGFDiagram hdgf;
|
||||
private POIFSFileSystem fs;
|
||||
|
||||
public VisioTextExtractor(HDGFDiagram hdgf) {
|
||||
this.hdgf = hdgf;
|
||||
}
|
||||
public VisioTextExtractor(POIFSFileSystem fs) throws IOException {
|
||||
this(new HDGFDiagram(fs));
|
||||
this.fs = fs;
|
||||
}
|
||||
public VisioTextExtractor(InputStream inp) throws IOException {
|
||||
this(new POIFSFileSystem(inp));
|
||||
}
|
||||
|
||||
/**
|
||||
* Locates all the text entries in the file, and returns their
|
||||
* contents.
|
||||
*/
|
||||
public String[] getAllText() {
|
||||
ArrayList text = new ArrayList();
|
||||
for(int i=0; i<hdgf.getTopLevelStreams().length; i++) {
|
||||
findText(hdgf.getTopLevelStreams()[i], text);
|
||||
}
|
||||
return (String[])text.toArray( new String[text.size()] );
|
||||
}
|
||||
private void findText(Stream stream, ArrayList text) {
|
||||
if(stream instanceof PointerContainingStream) {
|
||||
PointerContainingStream ps = (PointerContainingStream)stream;
|
||||
for(int i=0; i<ps.getPointedToStreams().length; i++) {
|
||||
findText(ps.getPointedToStreams()[i], text);
|
||||
}
|
||||
}
|
||||
if(stream instanceof ChunkStream) {
|
||||
ChunkStream cs = (ChunkStream)stream;
|
||||
for(int i=0; i<cs.getChunks().length; i++) {
|
||||
if(cs.getChunks()[i] != null &&
|
||||
cs.getChunks()[i].getName() != null &&
|
||||
cs.getChunks()[i].getName().equals("Text")) {
|
||||
// First command
|
||||
Command cmd = cs.getChunks()[i].getCommands()[0];
|
||||
if(cmd != null && cmd.getValue() != null) {
|
||||
text.add( cmd.getValue().toString() );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the textual contents of the file.
|
||||
*/
|
||||
public String getText() {
|
||||
StringBuffer text = new StringBuffer();
|
||||
String[] allText = getAllText();
|
||||
for(int i=0; i<allText.length; i++) {
|
||||
text.append(allText[i]);
|
||||
if(!allText[i].endsWith("\r") &&
|
||||
!allText[i].endsWith("\n")) {
|
||||
text.append("\n");
|
||||
}
|
||||
}
|
||||
return text.toString();
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
if(args.length == 0) {
|
||||
System.err.println("Use:");
|
||||
System.err.println(" VisioTextExtractor <file.vsd>");
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
VisioTextExtractor extractor =
|
||||
new VisioTextExtractor(new FileInputStream(args[0]));
|
||||
|
||||
// Print not PrintLn as already has \n added to it
|
||||
System.out.print(extractor.getText());
|
||||
}
|
||||
}
|
|
@ -43,6 +43,11 @@ public class ChunkStream extends Stream {
|
|||
public void findChunks() {
|
||||
ArrayList chunksA = new ArrayList();
|
||||
|
||||
if(getPointer().getOffset() == 0x64b3) {
|
||||
int i = 0;
|
||||
i++;
|
||||
}
|
||||
|
||||
int pos = 0;
|
||||
byte[] contents = getStore().getContents();
|
||||
while(pos < contents.length) {
|
||||
|
|
|
@ -83,7 +83,7 @@ public abstract class Stream {
|
|||
return new ChunkStream(pointer, store, chunkFactory);
|
||||
}
|
||||
else if(pointer.destinationHasStrings()) {
|
||||
return new StringsStream(pointer, store);
|
||||
return new StringsStream(pointer, store, chunkFactory);
|
||||
}
|
||||
|
||||
// Give up and return a generic one
|
||||
|
|
|
@ -16,13 +16,16 @@ limitations under the License.
|
|||
==================================================================== */
|
||||
package org.apache.poi.hdgf.streams;
|
||||
|
||||
import org.apache.poi.hdgf.chunks.ChunkFactory;
|
||||
import org.apache.poi.hdgf.pointers.Pointer;
|
||||
|
||||
/**
|
||||
* A Stream which holds Strings
|
||||
* A Stream which holds Strings. This is just another kind
|
||||
* of ChunkStream, it seems
|
||||
*/
|
||||
public class StringsStream extends Stream {
|
||||
protected StringsStream(Pointer pointer, StreamStore store) {
|
||||
protected StringsStream(Pointer pointer, StreamStore store, ChunkFactory chunkFactory) {
|
||||
super(pointer, store);
|
||||
// super(pointer, store, chunkFactory);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,107 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.hdgf.extractor;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.PrintStream;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.poi.hdgf.HDGFDiagram;
|
||||
import org.apache.poi.hdgf.chunks.Chunk;
|
||||
import org.apache.poi.hdgf.chunks.ChunkFactory;
|
||||
import org.apache.poi.hdgf.pointers.Pointer;
|
||||
import org.apache.poi.hdgf.pointers.PointerFactory;
|
||||
import org.apache.poi.hssf.record.formula.eval.StringOperationEval;
|
||||
import org.apache.poi.poifs.filesystem.DocumentEntry;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
public class TestVisioExtractor extends TestCase {
|
||||
private String filename;
|
||||
protected void setUp() throws Exception {
|
||||
String dirname = System.getProperty("HDGF.testdata.path");
|
||||
filename = dirname + "/Test_Visio-Some_Random_Text.vsd";
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the 3 different ways of creating one
|
||||
*/
|
||||
public void testCreation() throws Exception {
|
||||
VisioTextExtractor extractor;
|
||||
|
||||
extractor = new VisioTextExtractor(new FileInputStream(filename));
|
||||
assertNotNull(extractor);
|
||||
assertNotNull(extractor.getAllText());
|
||||
assertEquals(3, extractor.getAllText().length);
|
||||
|
||||
extractor = new VisioTextExtractor(
|
||||
new POIFSFileSystem(
|
||||
new FileInputStream(filename)
|
||||
)
|
||||
);
|
||||
assertNotNull(extractor);
|
||||
assertNotNull(extractor.getAllText());
|
||||
assertEquals(3, extractor.getAllText().length);
|
||||
|
||||
extractor = new VisioTextExtractor(
|
||||
new HDGFDiagram(
|
||||
new POIFSFileSystem(
|
||||
new FileInputStream(filename)
|
||||
)
|
||||
)
|
||||
);
|
||||
assertNotNull(extractor);
|
||||
assertNotNull(extractor.getAllText());
|
||||
assertEquals(3, extractor.getAllText().length);
|
||||
}
|
||||
|
||||
public void testExtraction() throws Exception {
|
||||
VisioTextExtractor extractor =
|
||||
new VisioTextExtractor(new FileInputStream(filename));
|
||||
|
||||
// Check the array fetch
|
||||
String[] text = extractor.getAllText();
|
||||
assertNotNull(text);
|
||||
assertEquals(3, text.length);
|
||||
|
||||
assertEquals("Test View\n", text[0]);
|
||||
assertEquals("I am a test view\n", text[1]);
|
||||
assertEquals("Some random text, on a page\n", text[2]);
|
||||
|
||||
// And the all-in fetch
|
||||
String textS = extractor.getText();
|
||||
assertEquals("Test View\nI am a test view\nSome random text, on a page\n", textS);
|
||||
}
|
||||
|
||||
public void testMain() throws Exception {
|
||||
PrintStream oldOut = System.out;
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
PrintStream capture = new PrintStream(baos);
|
||||
System.setOut(capture);
|
||||
|
||||
VisioTextExtractor.main(new String[] {filename});
|
||||
|
||||
// Put things back
|
||||
System.setOut(oldOut);
|
||||
|
||||
// Check
|
||||
capture.flush();
|
||||
String text = baos.toString();
|
||||
assertEquals("Test View\nI am a test view\nSome random text, on a page\n", text);
|
||||
}
|
||||
}
|
|
@ -18,6 +18,7 @@ package org.apache.poi.hdgf.streams;
|
|||
|
||||
import java.io.FileInputStream;
|
||||
|
||||
import org.apache.poi.hdgf.chunks.Chunk;
|
||||
import org.apache.poi.hdgf.chunks.ChunkFactory;
|
||||
import org.apache.poi.hdgf.pointers.Pointer;
|
||||
import org.apache.poi.hdgf.pointers.PointerFactory;
|
||||
|
@ -202,4 +203,63 @@ public class TestStreamComplex extends StreamTest {
|
|||
assertTrue(s8451.getPointedToStreams()[0] instanceof StringsStream);
|
||||
assertTrue(s8451.getPointedToStreams()[1] instanceof StringsStream);
|
||||
}
|
||||
|
||||
public void testChunkWithText() throws Exception {
|
||||
// Parent ChunkStream is at 0x7194
|
||||
// This is one of the last children of the trailer
|
||||
Pointer trailerPtr = ptrFactory.createPointer(contents, trailerPointerAt);
|
||||
TrailerStream ts = (TrailerStream)
|
||||
Stream.createStream(trailerPtr, contents, chunkFactory, ptrFactory);
|
||||
|
||||
ts.findChildren(contents);
|
||||
|
||||
assertNotNull(ts.getChildPointers());
|
||||
assertNotNull(ts.getPointedToStreams());
|
||||
assertEquals(20, ts.getChildPointers().length);
|
||||
assertEquals(20, ts.getPointedToStreams().length);
|
||||
|
||||
assertEquals(0x7194, ts.getChildPointers()[13].getOffset());
|
||||
assertEquals(0x7194, ts.getPointedToStreams()[13].getPointer().getOffset());
|
||||
|
||||
PointerContainingStream ps7194 = (PointerContainingStream)
|
||||
ts.getPointedToStreams()[13];
|
||||
|
||||
// First child is at 0x64b3
|
||||
assertEquals(0x64b3, ps7194.getChildPointers()[0].getOffset());
|
||||
assertEquals(0x64b3, ps7194.getPointedToStreams()[0].getPointer().getOffset());
|
||||
|
||||
ChunkStream cs = (ChunkStream)ps7194.getPointedToStreams()[0];
|
||||
|
||||
// Should be 26bc bytes un-compressed
|
||||
assertEquals(0x26bc, cs.getStore().getContents().length);
|
||||
// And should have lots of children
|
||||
assertEquals(131, cs.getChunks().length);
|
||||
|
||||
// One of which is Text
|
||||
boolean hasText = false;
|
||||
for(int i=0; i<cs.getChunks().length; i++) {
|
||||
if(cs.getChunks()[i].getName().equals("Text")) {
|
||||
hasText = true;
|
||||
}
|
||||
}
|
||||
assertTrue(hasText);
|
||||
// Which is the 72nd command
|
||||
assertEquals("Text", cs.getChunks()[72].getName());
|
||||
|
||||
Chunk text = cs.getChunks()[72];
|
||||
assertEquals("Text", text.getName());
|
||||
|
||||
// Which contains our text
|
||||
assertEquals(1, text.getCommands().length);
|
||||
assertEquals("Test View\n", text.getCommands()[0].getValue());
|
||||
|
||||
|
||||
// Almost at the end is some more text
|
||||
assertEquals("Text", cs.getChunks()[128].getName());
|
||||
text = cs.getChunks()[128];
|
||||
assertEquals("Text", text.getName());
|
||||
|
||||
assertEquals(1, text.getCommands().length);
|
||||
assertEquals("Some random text, on a page\n", text.getCommands()[0].getValue());
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue