diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml index 7165203f6..d4ad48c82 100644 --- a/src/documentation/content/xdocs/changes.xml +++ b/src/documentation/content/xdocs/changes.xml @@ -37,6 +37,7 @@ + Support for HPBF Publisher hyperlinks, including during text extraction 26321 and 44958 - preserve position of ArrayRecords and TableRecords among cell value records Impove empty header or footer handling in HWPF HeaderStories Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 8fe08bfec..e1f136174 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,7 @@ + Support for HPBF Publisher hyperlinks, including during text extraction 26321 and 44958 - preserve position of ArrayRecords and TableRecords among cell value records Impove empty header or footer handling in HWPF HeaderStories Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out diff --git a/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java b/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java index 2257283a0..a28f16b7e 100644 --- a/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hpbf/extractor/PublisherTextExtractor.java @@ -24,6 +24,7 @@ import org.apache.poi.POIOLE2TextExtractor; import org.apache.poi.hpbf.HPBFDocument; import org.apache.poi.hpbf.model.qcbits.QCBit; import org.apache.poi.hpbf.model.qcbits.QCTextBit; +import org.apache.poi.hpbf.model.qcbits.QCPLCBit.Type12; import org.apache.poi.poifs.filesystem.POIFSFileSystem; /** @@ -31,6 +32,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; */ public class PublisherTextExtractor extends POIOLE2TextExtractor { private HPBFDocument doc; + private boolean hyperlinksByDefault = 
false; public PublisherTextExtractor(HPBFDocument doc) { super(doc); @@ -43,6 +45,16 @@ public class PublisherTextExtractor extends POIOLE2TextExtractor { this(new POIFSFileSystem(is)); } + /** + * Should a call to getText() return hyperlinks inline + * with the text? + * Default is no + */ + public void setHyperlinksByDefault(boolean hyperlinksByDefault) { + this.hyperlinksByDefault = hyperlinksByDefault; + } + + public String getText() { StringBuffer text = new StringBuffer(); @@ -55,6 +67,24 @@ public class PublisherTextExtractor extends POIOLE2TextExtractor { } } + // If requested, add in the hyperlinks + // Ideally, we'd do these inline, but the hyperlink + // positions are relative to the text area the + // hyperlink is in, and we have yet to figure out + // how to tie that together. + if(hyperlinksByDefault) { + for(int i=0; i\n"); + } + } + } + } + // Get more text // TODO diff --git a/src/scratchpad/src/org/apache/poi/hpbf/model/qcbits/QCPLCBit.java b/src/scratchpad/src/org/apache/poi/hpbf/model/qcbits/QCPLCBit.java index a2eadc52d..4bd57d514 100644 --- a/src/scratchpad/src/org/apache/poi/hpbf/model/qcbits/QCPLCBit.java +++ b/src/scratchpad/src/org/apache/poi/hpbf/model/qcbits/QCPLCBit.java @@ -167,6 +167,10 @@ public class QCPLCBit extends QCBit { /** * Type 12 holds hyperlinks, and is very complex. + * There is normally one of these for each text + * area that contains at least one hyperlink. + * The character offsets are relative to the start + * of the text area that this applies to. */ public static class Type12 extends QCPLCBit { private String[] hyperlinks; @@ -249,6 +253,8 @@ public class QCPLCBit extends QCBit { * Returns where in the text (in characters) the * hyperlink at the given index starts * applying to. + * This position is relative to the text area that this + * PLCBit applies to. 
* @param number The hyperlink number, zero based */ public int getTextStartAt(int number) { @@ -258,6 +264,8 @@ public class QCPLCBit extends QCBit { * Returns where in the text that this block * of hyperlinks stops applying to. Normally, * but not always the end of the text. + * This position is relative to the text area that this + * PLCBit applies to. */ public int getAllTextEndAt() { return preData[numberOfPLCs+1]; diff --git a/src/scratchpad/testcases/org/apache/poi/hpbf/extractor/TextPublisherTextExtractor.java b/src/scratchpad/testcases/org/apache/poi/hpbf/extractor/TextPublisherTextExtractor.java index d5b471222..128988225 100644 --- a/src/scratchpad/testcases/org/apache/poi/hpbf/extractor/TextPublisherTextExtractor.java +++ b/src/scratchpad/testcases/org/apache/poi/hpbf/extractor/TextPublisherTextExtractor.java @@ -134,4 +134,41 @@ public class TextPublisherTextExtractor extends TestCase { assertEquals(s2007, s2000); assertEquals(s2007, s98); } + + /** + * Test that the hyperlink extraction stuff works as well + * as we can hope it to. + */ + public void testWithHyperlinks() throws Exception { + File f = new File(dir, "LinkAt10.pub"); + HPBFDocument doc = new HPBFDocument( + new FileInputStream(f) + ); + + PublisherTextExtractor ext = + new PublisherTextExtractor(doc); + ext.getText(); + + // Default is no hyperlinks + assertEquals("1234567890LINK\n", ext.getText()); + + // Turn on + ext.setHyperlinksByDefault(true); + assertEquals("1234567890LINK\n\n", ext.getText()); + + + // Now a much more complex document + f = new File(dir, "Sample.pub"); + ext = new PublisherTextExtractor(new FileInputStream(f)); + ext.setHyperlinksByDefault(true); + String text = ext.getText(); + + assertTrue(text.endsWith( + "\n" + + "\n" + + "<>\n" + + "\n" + + "\n" + )); + } }