Add HWPF support for stripping out fields (e.g. macros), and make it possible to optionally have this always applied to header and footer text
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@685283 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d60c98c37b
commit
b362207a4a
@ -230,4 +230,12 @@ public class WordExtractor extends POIOLE2TextExtractor {
|
|||||||
|
|
||||||
return ret.toString();
|
return ret.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Removes any fields (e.g. macros, page markers etc.)
|
||||||
|
* from the string, by delegating to {@link Range#stripFields(String)}.
|
||||||
|
* @param text the text to remove fields from
|
||||||
|
* @return whatever {@link Range#stripFields(String)} returns for the given text
|
||||||
|
*/
|
||||||
|
public static String stripFields(String text) {
|
||||||
|
return Range.stripFields(text);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -35,6 +35,8 @@ public class HeaderStories {
|
|||||||
private Range headerStories;
|
private Range headerStories;
|
||||||
private PlexOfCps plcfHdd;
|
private PlexOfCps plcfHdd;
|
||||||
|
|
||||||
|
private boolean stripFields = false;
|
||||||
|
|
||||||
public HeaderStories(HWPFDocument doc) {
|
public HeaderStories(HWPFDocument doc) {
|
||||||
this.headerStories = doc.getHeaderStoryRange();
|
this.headerStories = doc.getHeaderStoryRange();
|
||||||
FileInformationBlock fib = doc.getFileInformationBlock();
|
FileInformationBlock fib = doc.getFileInformationBlock();
|
||||||
@ -157,8 +159,15 @@ public class HeaderStories {
|
|||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return the contents
|
// Grab the contents
|
||||||
return headerStories.text().substring(prop.getStart(), prop.getEnd());
|
String text =
|
||||||
|
headerStories.text().substring(prop.getStart(), prop.getEnd());
|
||||||
|
|
||||||
|
// Strip off fields and macros if requested
|
||||||
|
if(stripFields) {
|
||||||
|
return Range.stripFields(text);
|
||||||
|
}
|
||||||
|
return text;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Range getRange() {
|
public Range getRange() {
|
||||||
@ -167,4 +176,22 @@ public class HeaderStories {
|
|||||||
protected PlexOfCps getPlcfHdd() {
|
protected PlexOfCps getPlcfHdd() {
|
||||||
return plcfHdd;
|
return plcfHdd;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Are fields currently being stripped from
|
||||||
|
* the text that this {@link HeaderStories} returns?
|
||||||
|
* Default is false, but can be changed with
|
||||||
|
* {@link #setAreFieldsStripped(boolean)}.
|
||||||
|
* @return the current value of the strip-fields flag
|
||||||
|
*/
|
||||||
|
public boolean areFieldsStripped() {
|
||||||
|
return stripFields;
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Should fields (eg macros) be stripped from
|
||||||
|
* the text that this class returns?
|
||||||
|
* Default is not to strip.
|
||||||
|
* @param stripFields true to pass returned text through
|
||||||
|
*  {@link Range#stripFields(String)}, false to return it as-is
|
||||||
|
*/
|
||||||
|
public void setAreFieldsStripped(boolean stripFields) {
|
||||||
|
this.stripFields = stripFields;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -300,6 +300,63 @@ public class Range
|
|||||||
return sb.toString();
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Removes any fields (eg macros, page markers etc)
|
||||||
|
* from the string.
|
||||||
|
* Normally used to make some text suitable for showing
|
||||||
|
* to humans, and the resultant text should not normally
|
||||||
|
* be saved back into the document!
|
||||||
|
*/
|
||||||
|
public static String stripFields(String text) {
|
||||||
|
// First up, fields can be nested...
|
||||||
|
// A field can be 0x13 [contents] 0x15
|
||||||
|
// Or it can be 0x13 [contents] 0x14 [real text] 0x15
|
||||||
|
|
||||||
|
// If there are no fields, all easy
|
||||||
|
if(text.indexOf('\u0013') == -1) return text;
|
||||||
|
|
||||||
|
// Loop over until they're all gone
|
||||||
|
// That's when we're out of both 0x13s and 0x15s
|
||||||
|
while( text.indexOf('\u0013') > -1 &&
|
||||||
|
text.indexOf('\u0015') > -1) {
|
||||||
|
int first13 = text.indexOf('\u0013');
|
||||||
|
int next13 = text.indexOf('\u0013', first13+1);
|
||||||
|
int first14 = text.indexOf('\u0014', first13+1);
|
||||||
|
int last15 = text.lastIndexOf('\u0015');
|
||||||
|
|
||||||
|
// If they're the wrong way around, give up
|
||||||
|
if(last15 < first13) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If no more 13s and 14s, just zap
|
||||||
|
if(next13 == -1 && first14 == -1) {
|
||||||
|
text = text.substring(0, first13) +
|
||||||
|
text.substring(last15+1);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If a 14 comes before the next 13, then
|
||||||
|
// zap from the 13 to the 14, and remove
|
||||||
|
// the 15
|
||||||
|
if(first14 != -1 && (first14 < next13 || next13 == -1)) {
|
||||||
|
text = text.substring(0, first13) +
|
||||||
|
text.substring(first14+1, last15) +
|
||||||
|
text.substring(last15+1);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Another 13 comes before the next 14.
|
||||||
|
// This means there's nested stuff, so we
|
||||||
|
// can just zap the lot
|
||||||
|
text = text.substring(0, first13) +
|
||||||
|
text.substring(last15+1);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
return text;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Used to get the number of sections in a range. If this range is smaller
|
* Used to get the number of sections in a range. If this range is smaller
|
||||||
* than a section, it will return 1 for its containing section.
|
* than a section, it will return 1 for its containing section.
|
||||||
|
Binary file not shown.
@ -35,6 +35,7 @@ public class TestHeaderStories extends TestCase {
|
|||||||
private HWPFDocument oddEven;
|
private HWPFDocument oddEven;
|
||||||
private HWPFDocument diffFirst;
|
private HWPFDocument diffFirst;
|
||||||
private HWPFDocument unicode;
|
private HWPFDocument unicode;
|
||||||
|
private HWPFDocument withFields;
|
||||||
|
|
||||||
protected void setUp() throws Exception {
|
protected void setUp() throws Exception {
|
||||||
String dirname = System.getProperty("HWPF.testdata.path");
|
String dirname = System.getProperty("HWPF.testdata.path");
|
||||||
@ -60,6 +61,9 @@ public class TestHeaderStories extends TestCase {
|
|||||||
unicode = new HWPFDocument(
|
unicode = new HWPFDocument(
|
||||||
new FileInputStream(new File(dirname, "HeaderFooterUnicode.doc"))
|
new FileInputStream(new File(dirname, "HeaderFooterUnicode.doc"))
|
||||||
);
|
);
|
||||||
|
withFields = new HWPFDocument(
|
||||||
|
new FileInputStream(new File(dirname, "HeaderWithMacros.doc"))
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testNone() throws Exception {
|
public void testNone() throws Exception {
|
||||||
@ -186,4 +190,15 @@ public class TestHeaderStories extends TestCase {
|
|||||||
assertEquals("\r\r", hs.getEvenFooter());
|
assertEquals("\r\r", hs.getEvenFooter());
|
||||||
assertEquals("The footer, with Moli\u00e8re, has Unicode in it.\r\r", hs.getOddFooter());
|
assertEquals("The footer, with Moli\u00e8re, has Unicode in it.\r\r", hs.getOddFooter());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Checks the odd header text of the HeaderWithMacros.doc test file,
 *  first with field stripping off (the default), and then
 *  with it turned on
 */
public void testWithFields() throws Exception {
|
||||||
|
HeaderStories hs = new HeaderStories(withFields);
|
||||||
|
assertFalse(hs.areFieldsStripped());
|
||||||
|
|
||||||
|
// With stripping off, the 0x13 / 0x14 / 0x15 field markers and
//  the field's instructions are still in the text
assertEquals("HEADER GOES HERE. 8/12/2008 \u0013 AUTHOR \\* MERGEFORMAT \u0014Eric Roch\u0015\r\r\r", hs.getOddHeader());
|
||||||
|
|
||||||
|
// Now turn on stripping
|
||||||
|
hs.setAreFieldsStripped(true);
|
||||||
|
// Only the field's result text (after the 0x14) should remain
assertEquals("HEADER GOES HERE. 8/12/2008 Eric Roch\r\r\r", hs.getOddHeader());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -18,7 +18,6 @@ package org.apache.poi.hwpf.usermodel;
|
|||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.FileOutputStream;
|
|
||||||
|
|
||||||
import junit.framework.TestCase;
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
@ -0,0 +1,53 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.poi.hwpf.usermodel;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests for Range which aren't around deletion, insertion,
|
||||||
|
* text replacement or textual contents
|
||||||
|
*/
|
||||||
|
public class TestRange extends TestCase {
|
||||||
|
/**
 * Checks that {@link Range#stripFields(String)} turns each of the
 *  well formed sample strings into the same plain text, and that
 *  it returns the oddly formed ones unchanged.
 * NOTE(review): none of the samples has two separate fields one
 *  after the other in the same string, so that case isn't covered
 */
public void testFieldStripping() throws Exception {
|
||||||
|
String exp = "This is some text.";
|
||||||
|
|
||||||
|
// One field, with no 0x14 in it
String single = "This is some \u0013Blah!\u0015text.";
|
||||||
|
// One field, with text between the 0x14 and the 0x15
String with14 = "This is \u0013Blah!\u0014some\u0015 text.";
|
||||||
|
// A field within a field, neither has a 0x14
String withNested =
|
||||||
|
"This is \u0013Blah!\u0013Blah!\u0015\u0015some text.";
|
||||||
|
// A field within a field, only the inner one has a 0x14
String withNested14 =
|
||||||
|
"This is \u0013Blah!\u0013Blah!\u0014don't see me\u0015 blah!\u0015some text.";
|
||||||
|
// A field that comes after the 0x14 of the outer field
String withNestedIn14 =
|
||||||
|
"This is \u0013Blah!\u0014some\u0013Blah!\u0015 \u0015text.";
|
||||||
|
|
||||||
|
// Check all comes out right
|
||||||
|
assertEquals(exp, Range.stripFields(exp));
|
||||||
|
assertEquals(exp, Range.stripFields(single));
|
||||||
|
assertEquals(exp, Range.stripFields(with14));
|
||||||
|
assertEquals(exp, Range.stripFields(withNested));
|
||||||
|
assertEquals(exp, Range.stripFields(withNested14));
|
||||||
|
assertEquals(exp, Range.stripFields(withNestedIn14));
|
||||||
|
|
||||||
|
// Ones that are odd and we won't change
|
||||||
|
// (in both, the only 0x15 comes before the 0x13)
String odd1 = "This\u0015 is \u0013 odd";
|
||||||
|
String odd2 = "This\u0015 is \u0014 also \u0013 odd";
|
||||||
|
|
||||||
|
assertEquals(odd1, Range.stripFields(odd1));
|
||||||
|
assertEquals(odd2, Range.stripFields(odd2));
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user