Add HPBF hyperlinks support to the extractor

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@690729 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2008-08-31 16:58:29 +00:00
parent e786480817
commit 7ba82ba657
5 changed files with 77 additions and 0 deletions

View File

@ -37,6 +37,7 @@
<!-- Don't forget to update status.xml too! -->
<release version="3.1.1-alpha1" date="2008-??-??">
<action dev="POI-DEVELOPERS" type="add">Support for HPBF Publisher hyperlinks, including during text extraction</action>
<action dev="POI-DEVELOPERS" type="fix">26321 and 44958 - preserve position of ArrayRecords and TableRecords among cell value records</action>
<action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
<action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>

View File

@ -34,6 +34,7 @@
<!-- Don't forget to update changes.xml too! -->
<changes>
<release version="3.1.1-alpha1" date="2008-??-??">
<action dev="POI-DEVELOPERS" type="add">Support for HPBF Publisher hyperlinks, including during text extraction</action>
<action dev="POI-DEVELOPERS" type="fix">26321 and 44958 - preserve position of ArrayRecords and TableRecords among cell value records</action>
<action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
<action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>

View File

@ -24,6 +24,7 @@ import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hpbf.HPBFDocument;
import org.apache.poi.hpbf.model.qcbits.QCBit;
import org.apache.poi.hpbf.model.qcbits.QCTextBit;
import org.apache.poi.hpbf.model.qcbits.QCPLCBit.Type12;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
@ -31,6 +32,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
*/
public class PublisherTextExtractor extends POIOLE2TextExtractor {
private HPBFDocument doc;
private boolean hyperlinksByDefault = false;
public PublisherTextExtractor(HPBFDocument doc) {
super(doc);
@ -43,6 +45,16 @@ public class PublisherTextExtractor extends POIOLE2TextExtractor {
this(new POIFSFileSystem(is));
}
/**
 * Controls whether a call to {@link #getText()} also outputs
 * the document's hyperlinks.
 * When enabled, the hyperlinks are not placed inline at their
 * position in the text; instead each one is appended after the
 * text, wrapped in angle brackets and on a line of its own.
 * Default is no
 * @param hyperlinksByDefault true to include the hyperlinks in
 *  the extracted text, false to leave them out
 */
public void setHyperlinksByDefault(boolean hyperlinksByDefault) {
this.hyperlinksByDefault = hyperlinksByDefault;
}
public String getText() {
StringBuffer text = new StringBuffer();
@ -55,6 +67,24 @@ public class PublisherTextExtractor extends POIOLE2TextExtractor {
}
}
// If requested, add in the hyperlinks
// Ideally, we'd do these inline, but the hyperlink
// positions are relative to the text area the
// hyperlink is in, and we have yet to figure out
// how to tie that together.
if(hyperlinksByDefault) {
for(int i=0; i<bits.length; i++) {
if(bits[i] != null && bits[i] instanceof Type12) {
Type12 hyperlinks = (Type12)bits[i];
for(int j=0; j<hyperlinks.getNumberOfHyperlinks(); j++) {
text.append("<");
text.append(hyperlinks.getHyperlink(j));
text.append(">\n");
}
}
}
}
// Get more text
// TODO

View File

@ -167,6 +167,10 @@ public class QCPLCBit extends QCBit {
/**
* Type 12 holds hyperlinks, and is very complex.
* There is normally one of these for each text
* area that contains at least one hyperlink.
* The character offsets are relative to the start
* of the text area that this applies to.
*/
public static class Type12 extends QCPLCBit {
private String[] hyperlinks;
@ -249,6 +253,8 @@ public class QCPLCBit extends QCBit {
* Returns where in the text (in characters) the
* hyperlink at the given index starts
* applying to.
* This position is relative to the text area that this
* PLCBit applies to.
* @param number The hyperlink number, zero based
*/
public int getTextStartAt(int number) {
@ -258,6 +264,8 @@ public class QCPLCBit extends QCBit {
* Returns where in the text that this block
* of hyperlinks stops applying to. Normally,
* but not always the end of the text.
* This position is relative to the text area that this
* PLCBit applies to.
*/
public int getAllTextEndAt() {
return preData[numberOfPLCs+1];

View File

@ -134,4 +134,41 @@ public class TextPublisherTextExtractor extends TestCase {
assertEquals(s2007, s2000);
assertEquals(s2007, s98);
}
/**
 * Test that the hyperlink extraction stuff works as well
 * as we can hope it to.
 * Checks that hyperlinks are left out by default, and that
 * once enabled they are appended after the text, each one
 * wrapped in angle brackets on its own line.
 * Each input stream is closed once the extractor reading
 * from it is finished with, so the test doesn't leak file
 * handles.
 * @throws Exception if a sample file can't be opened or parsed
 */
public void testWithHyperlinks() throws Exception {
    // A simple document: some text with a single link
    FileInputStream simpleIn = new FileInputStream(
            new File(dir, "LinkAt10.pub")
    );
    try {
        HPBFDocument doc = new HPBFDocument(simpleIn);

        PublisherTextExtractor ext =
            new PublisherTextExtractor(doc);
        // Result deliberately discarded; the assertions below
        // all work on later calls
        ext.getText();

        // Default is no hyperlinks
        assertEquals("1234567890LINK\n", ext.getText());

        // Turn on
        ext.setHyperlinksByDefault(true);
        assertEquals("1234567890LINK\n<http://poi.apache.org/>\n", ext.getText());
    } finally {
        simpleIn.close();
    }

    // Now a much more complex document
    FileInputStream complexIn = new FileInputStream(
            new File(dir, "Sample.pub")
    );
    try {
        PublisherTextExtractor ext = new PublisherTextExtractor(complexIn);
        ext.setHyperlinksByDefault(true);
        String text = ext.getText();

        // The hyperlinks come after all of the text, so they
        // must form the tail of the output
        assertTrue(text.endsWith(
                "<http://poi.apache.org/>\n" +
                "<C:\\Documents and Settings\\Nick\\My Documents\\Booleans.xlsx>\n" +
                "<>\n" +
                "<mailto:dev@poi.apache.org?subject=HPBF>\n" +
                "<mailto:dev@poi.apache.org?subject=HPBF>\n"
        ));
    } finally {
        complexIn.close();
    }
}
}