Add HWPF support for stripping out fields (e.g. macros), and optionally always apply this stripping to header and footer text
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@685283 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d60c98c37b
commit
b362207a4a
@ -230,4 +230,12 @@ public class WordExtractor extends POIOLE2TextExtractor {
|
||||
|
||||
return ret.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes any fields (eg macros, page markers etc)
|
||||
* from the string.
|
||||
*/
|
||||
public static String stripFields(String text) {
|
||||
return Range.stripFields(text);
|
||||
}
|
||||
}
|
||||
|
@ -35,6 +35,8 @@ public class HeaderStories {
|
||||
private Range headerStories;
|
||||
private PlexOfCps plcfHdd;
|
||||
|
||||
private boolean stripFields = false;
|
||||
|
||||
public HeaderStories(HWPFDocument doc) {
|
||||
this.headerStories = doc.getHeaderStoryRange();
|
||||
FileInformationBlock fib = doc.getFileInformationBlock();
|
||||
@ -157,8 +159,15 @@ public class HeaderStories {
|
||||
return "";
|
||||
}
|
||||
|
||||
// Return the contents
|
||||
return headerStories.text().substring(prop.getStart(), prop.getEnd());
|
||||
// Grab the contents
|
||||
String text =
|
||||
headerStories.text().substring(prop.getStart(), prop.getEnd());
|
||||
|
||||
// Strip off fields and macros if requested
|
||||
if(stripFields) {
|
||||
return Range.stripFields(text);
|
||||
}
|
||||
return text;
|
||||
}
|
||||
|
||||
public Range getRange() {
|
||||
@ -167,4 +176,22 @@ public class HeaderStories {
|
||||
protected PlexOfCps getPlcfHdd() {
|
||||
return plcfHdd;
|
||||
}
|
||||
|
||||
/**
|
||||
* Are fields currently being stripped from
|
||||
* the text that this {@link HeaderStories} returns?
|
||||
* Default is false, but can be changed
|
||||
*/
|
||||
public boolean areFieldsStripped() {
|
||||
return stripFields;
|
||||
}
|
||||
/**
 * Should fields (eg macros) be stripped from
 *  the text that this class returns?
 * Default is not to strip.
 * @param stripFields true to have fields stripped from the
 *  returned text, false to have the text returned as-is
 */
|
||||
public void setAreFieldsStripped(boolean stripFields) {
|
||||
this.stripFields = stripFields;
|
||||
}
|
||||
}
|
||||
|
@ -300,6 +300,63 @@ public class Range
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes any fields (eg macros, page markers etc)
|
||||
* from the string.
|
||||
* Normally used to make some text suitable for showing
|
||||
* to humans, and the resultant text should not normally
|
||||
* be saved back into the document!
|
||||
*/
|
||||
public static String stripFields(String text) {
|
||||
// First up, fields can be nested...
|
||||
// A field can be 0x13 [contents] 0x15
|
||||
// Or it can be 0x13 [contents] 0x14 [real text] 0x15
|
||||
|
||||
// If there are no fields, all easy
|
||||
if(text.indexOf('\u0013') == -1) return text;
|
||||
|
||||
// Loop over until they're all gone
|
||||
// That's when we're out of both 0x13s and 0x15s
|
||||
while( text.indexOf('\u0013') > -1 &&
|
||||
text.indexOf('\u0015') > -1) {
|
||||
int first13 = text.indexOf('\u0013');
|
||||
int next13 = text.indexOf('\u0013', first13+1);
|
||||
int first14 = text.indexOf('\u0014', first13+1);
|
||||
int last15 = text.lastIndexOf('\u0015');
|
||||
|
||||
// If they're the wrong way around, give up
|
||||
if(last15 < first13) {
|
||||
break;
|
||||
}
|
||||
|
||||
// If no more 13s and 14s, just zap
|
||||
if(next13 == -1 && first14 == -1) {
|
||||
text = text.substring(0, first13) +
|
||||
text.substring(last15+1);
|
||||
break;
|
||||
}
|
||||
|
||||
// If a 14 comes before the next 13, then
|
||||
// zap from the 13 to the 14, and remove
|
||||
// the 15
|
||||
if(first14 != -1 && (first14 < next13 || next13 == -1)) {
|
||||
text = text.substring(0, first13) +
|
||||
text.substring(first14+1, last15) +
|
||||
text.substring(last15+1);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Another 13 comes before the next 14.
|
||||
// This means there's nested stuff, so we
|
||||
// can just zap the lot
|
||||
text = text.substring(0, first13) +
|
||||
text.substring(last15+1);
|
||||
continue;
|
||||
}
|
||||
|
||||
return text;
|
||||
}
|
||||
|
||||
/**
|
||||
* Used to get the number of sections in a range. If this range is smaller
|
||||
* than a section, it will return 1 for its containing section.
|
||||
|
Binary file not shown.
@ -35,6 +35,7 @@ public class TestHeaderStories extends TestCase {
|
||||
private HWPFDocument oddEven;
|
||||
private HWPFDocument diffFirst;
|
||||
private HWPFDocument unicode;
|
||||
private HWPFDocument withFields;
|
||||
|
||||
protected void setUp() throws Exception {
|
||||
String dirname = System.getProperty("HWPF.testdata.path");
|
||||
@ -60,6 +61,9 @@ public class TestHeaderStories extends TestCase {
|
||||
unicode = new HWPFDocument(
|
||||
new FileInputStream(new File(dirname, "HeaderFooterUnicode.doc"))
|
||||
);
|
||||
withFields = new HWPFDocument(
|
||||
new FileInputStream(new File(dirname, "HeaderWithMacros.doc"))
|
||||
);
|
||||
}
|
||||
|
||||
public void testNone() throws Exception {
|
||||
@ -186,4 +190,15 @@ public class TestHeaderStories extends TestCase {
|
||||
assertEquals("\r\r", hs.getEvenFooter());
|
||||
assertEquals("The footer, with Moli\u00e8re, has Unicode in it.\r\r", hs.getOddFooter());
|
||||
}
|
||||
|
||||
public void testWithFields() throws Exception {
|
||||
HeaderStories hs = new HeaderStories(withFields);
|
||||
assertFalse(hs.areFieldsStripped());
|
||||
|
||||
assertEquals("HEADER GOES HERE. 8/12/2008 \u0013 AUTHOR \\* MERGEFORMAT \u0014Eric Roch\u0015\r\r\r", hs.getOddHeader());
|
||||
|
||||
// Now turn on stripping
|
||||
hs.setAreFieldsStripped(true);
|
||||
assertEquals("HEADER GOES HERE. 8/12/2008 Eric Roch\r\r\r", hs.getOddHeader());
|
||||
}
|
||||
}
|
||||
|
@ -18,7 +18,6 @@ package org.apache.poi.hwpf.usermodel;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
|
@ -0,0 +1,53 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.poi.hwpf.usermodel;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
/**
|
||||
* Tests for Range which aren't around deletion, insertion,
|
||||
* text replacement or textual contents
|
||||
*/
|
||||
public class TestRange extends TestCase {
|
||||
public void testFieldStripping() throws Exception {
|
||||
String exp = "This is some text.";
|
||||
|
||||
String single = "This is some \u0013Blah!\u0015text.";
|
||||
String with14 = "This is \u0013Blah!\u0014some\u0015 text.";
|
||||
String withNested =
|
||||
"This is \u0013Blah!\u0013Blah!\u0015\u0015some text.";
|
||||
String withNested14 =
|
||||
"This is \u0013Blah!\u0013Blah!\u0014don't see me\u0015 blah!\u0015some text.";
|
||||
String withNestedIn14 =
|
||||
"This is \u0013Blah!\u0014some\u0013Blah!\u0015 \u0015text.";
|
||||
|
||||
// Check all comes out right
|
||||
assertEquals(exp, Range.stripFields(exp));
|
||||
assertEquals(exp, Range.stripFields(single));
|
||||
assertEquals(exp, Range.stripFields(with14));
|
||||
assertEquals(exp, Range.stripFields(withNested));
|
||||
assertEquals(exp, Range.stripFields(withNested14));
|
||||
assertEquals(exp, Range.stripFields(withNestedIn14));
|
||||
|
||||
// Ones that are odd and we won't change
|
||||
String odd1 = "This\u0015 is \u0013 odd";
|
||||
String odd2 = "This\u0015 is \u0014 also \u0013 odd";
|
||||
|
||||
assertEquals(odd1, Range.stripFields(odd1));
|
||||
assertEquals(odd2, Range.stripFields(odd2));
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user