Add HPBF hyperlinks support to the extractor
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@690729 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
e786480817
commit
7ba82ba657
@ -37,6 +37,7 @@
|
||||
|
||||
<!-- Don't forget to update status.xml too! -->
|
||||
<release version="3.1.1-alpha1" date="2008-??-??">
|
||||
<action dev="POI-DEVELOPERS" type="add">Support for HPBF Publisher hyperlinks, including during text extraction</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">26321 and 44958 - preserve position of ArrayRecords and TableRecords among cell value records</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>
|
||||
|
@ -34,6 +34,7 @@
|
||||
<!-- Don't forget to update changes.xml too! -->
|
||||
<changes>
|
||||
<release version="3.1.1-alpha1" date="2008-??-??">
|
||||
<action dev="POI-DEVELOPERS" type="add">Support for HPBF Publisher hyperlinks, including during text extraction</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">26321 and 44958 - preserve position of ArrayRecords and TableRecords among cell value records</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">Improve empty header or footer handling in HWPF HeaderStories</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">Avoid NPE in hssf.usermodel.HeaderFooter when stripping fields out</action>
|
||||
|
@ -24,6 +24,7 @@ import org.apache.poi.POIOLE2TextExtractor;
|
||||
import org.apache.poi.hpbf.HPBFDocument;
|
||||
import org.apache.poi.hpbf.model.qcbits.QCBit;
|
||||
import org.apache.poi.hpbf.model.qcbits.QCTextBit;
|
||||
import org.apache.poi.hpbf.model.qcbits.QCPLCBit.Type12;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
/**
|
||||
@ -31,6 +32,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
*/
|
||||
public class PublisherTextExtractor extends POIOLE2TextExtractor {
|
||||
private HPBFDocument doc;
|
||||
private boolean hyperlinksByDefault = false;
|
||||
|
||||
public PublisherTextExtractor(HPBFDocument doc) {
|
||||
super(doc);
|
||||
@ -43,6 +45,16 @@ public class PublisherTextExtractor extends POIOLE2TextExtractor {
|
||||
this(new POIFSFileSystem(is));
|
||||
}
|
||||
|
||||
/**
|
||||
* Should a call to getText() also return the hyperlinks
|
||||
* found in the document, appended after the text?
|
||||
* Default is no
|
||||
*/
|
||||
public void setHyperlinksByDefault(boolean hyperlinksByDefault) {
|
||||
this.hyperlinksByDefault = hyperlinksByDefault;
|
||||
}
|
||||
|
||||
|
||||
public String getText() {
|
||||
StringBuffer text = new StringBuffer();
|
||||
|
||||
@ -55,6 +67,24 @@ public class PublisherTextExtractor extends POIOLE2TextExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
// If requested, add in the hyperlinks
|
||||
// Ideally, we'd do these inline, but the hyperlink
|
||||
// positions are relative to the text area the
|
||||
// hyperlink is in, and we have yet to figure out
|
||||
// how to tie that together.
|
||||
if(hyperlinksByDefault) {
|
||||
for(int i=0; i<bits.length; i++) {
|
||||
if(bits[i] != null && bits[i] instanceof Type12) {
|
||||
Type12 hyperlinks = (Type12)bits[i];
|
||||
for(int j=0; j<hyperlinks.getNumberOfHyperlinks(); j++) {
|
||||
text.append("<");
|
||||
text.append(hyperlinks.getHyperlink(j));
|
||||
text.append(">\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Get more text
|
||||
// TODO
|
||||
|
||||
|
@ -167,6 +167,10 @@ public class QCPLCBit extends QCBit {
|
||||
|
||||
/**
|
||||
* Type 12 holds hyperlinks, and is very complex.
|
||||
* There is normally one of these for each text
|
||||
* area that contains at least one hyperlink.
|
||||
* The character offsets are relative to the start
|
||||
* of the text area that this applies to.
|
||||
*/
|
||||
public static class Type12 extends QCPLCBit {
|
||||
private String[] hyperlinks;
|
||||
@ -249,6 +253,8 @@ public class QCPLCBit extends QCBit {
|
||||
* Returns where in the text (in characters) the
|
||||
* hyperlink at the given index starts
|
||||
* applying to.
|
||||
* This position is relative to the text area that this
|
||||
* PLCBit applies to.
|
||||
* @param number The hyperlink number, zero based
|
||||
*/
|
||||
public int getTextStartAt(int number) {
|
||||
@ -258,6 +264,8 @@ public class QCPLCBit extends QCBit {
|
||||
* Returns where in the text that this block
|
||||
* of hyperlinks stops applying to. Normally,
|
||||
* but not always, the end of the text.
|
||||
* This position is relative to the text area that this
|
||||
* PLCBit applies to.
|
||||
*/
|
||||
public int getAllTextEndAt() {
|
||||
return preData[numberOfPLCs+1];
|
||||
|
@ -134,4 +134,41 @@ public class TextPublisherTextExtractor extends TestCase {
|
||||
assertEquals(s2007, s2000);
|
||||
assertEquals(s2007, s98);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test that the hyperlink extraction stuff works as well
|
||||
* as we can hope it to.
|
||||
*/
|
||||
public void testWithHyperlinks() throws Exception {
|
||||
File f = new File(dir, "LinkAt10.pub");
|
||||
HPBFDocument doc = new HPBFDocument(
|
||||
new FileInputStream(f)
|
||||
);
|
||||
|
||||
PublisherTextExtractor ext =
|
||||
new PublisherTextExtractor(doc);
|
||||
ext.getText();
|
||||
|
||||
// Default is no hyperlinks
|
||||
assertEquals("1234567890LINK\n", ext.getText());
|
||||
|
||||
// Turn on
|
||||
ext.setHyperlinksByDefault(true);
|
||||
assertEquals("1234567890LINK\n<http://poi.apache.org/>\n", ext.getText());
|
||||
|
||||
|
||||
// Now a much more complex document
|
||||
f = new File(dir, "Sample.pub");
|
||||
ext = new PublisherTextExtractor(new FileInputStream(f));
|
||||
ext.setHyperlinksByDefault(true);
|
||||
String text = ext.getText();
|
||||
|
||||
assertTrue(text.endsWith(
|
||||
"<http://poi.apache.org/>\n" +
|
||||
"<C:\\Documents and Settings\\Nick\\My Documents\\Booleans.xlsx>\n" +
|
||||
"<>\n" +
|
||||
"<mailto:dev@poi.apache.org?subject=HPBF>\n" +
|
||||
"<mailto:dev@poi.apache.org?subject=HPBF>\n"
|
||||
));
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user