diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml index e9164fb46..020c6c960 100644 --- a/src/documentation/content/xdocs/changes.xml +++ b/src/documentation/content/xdocs/changes.xml @@ -58,6 +58,8 @@ Created a common interface for handling Excel files, irrespective of if they are .xls or .xlsx + 45622 - Support stripping HWPF fields (eg macros) out of text, via Range.stripFields(text) + New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor Properly update the array of Slide's text runs in HSLF when new text shapes are added 45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007 Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF diff --git a/src/documentation/content/xdocs/hpsf/how-to.xml b/src/documentation/content/xdocs/hpsf/how-to.xml index 0073126c9..964005bf2 100644 --- a/src/documentation/content/xdocs/hpsf/how-to.xml +++ b/src/documentation/content/xdocs/hpsf/how-to.xml @@ -92,6 +92,12 @@ properties. Chances are that you will find here what you need and don't have to read the other sections. +

If all you are interested in is getting the textual content of + all the document properties, such as for full text indexing, then + take a look at + org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor. However, + if you want full access to the properties, please read on!

+

The first thing you should understand is that a Microsoft Office file is not one large bunch of bytes but has an internal filesystem structure with files and directories. You can access these files and directories using diff --git a/src/documentation/content/xdocs/hwpf/quick-guide.xml b/src/documentation/content/xdocs/hwpf/quick-guide.xml index bf046258e..d717b0ef0 100644 --- a/src/documentation/content/xdocs/hwpf/quick-guide.xml +++ b/src/documentation/content/xdocs/hwpf/quick-guide.xml @@ -55,13 +55,25 @@ can then get text and other properties.

+
Headers and Footers +

To get at the headers and footers of a Word document, first create a +org.apache.poi.hwpf.HWPFDocument. Next, you need to create a +org.apache.poi.hwpf.usermodel.HeaderStories, passing it your +HWPFDocument. Finally, the HeaderStories gives you access to the headers and +footers, including first / even / odd page ones if defined in your +document. Additionally, HeaderStories provides a method for removing +any macros in the text, which is helpful as many headers and footers +do end up with macros in them.

+
+
Changing Text

It is possible to change the text via insertBefore() and insertAfter() on a Range object (either a Range, Paragraph or CharacterRun). - It is also possible to delete a Range, but this - code is know to have bugs in it. + It is also possible to delete a Range. + This code will work in many, but not all, cases, and patches to + improve it are gratefully received!

diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 5c4ffadb1..998263d8d 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -55,6 +55,8 @@ Created a common interface for handling Excel files, irrespective of if they are .xls or .xlsx
+ 45622 - Support stripping HWPF fields (eg macros) out of text, via Range.stripFields(text) + New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor Properly update the array of Slide's text runs in HSLF when new text shapes are added 45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007 Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF diff --git a/src/java/org/apache/poi/POIOLE2TextExtractor.java b/src/java/org/apache/poi/POIOLE2TextExtractor.java index f5aee4cc6..d46c7e4aa 100644 --- a/src/java/org/apache/poi/POIOLE2TextExtractor.java +++ b/src/java/org/apache/poi/POIOLE2TextExtractor.java @@ -18,6 +18,7 @@ package org.apache.poi; import org.apache.poi.hpsf.DocumentSummaryInformation; import org.apache.poi.hpsf.SummaryInformation; +import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor; /** * Common Parent for OLE2 based Text Extractors @@ -50,4 +51,12 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor { public SummaryInformation getSummaryInformation() { return document.getSummaryInformation(); } + + /** + * Returns an HPSF powered text extractor for the + * document properties metadata, such as title and author. + */ + public POITextExtractor getMetadataTextExtractor() { + return new HPSFPropertiesExtractor(this); + } } diff --git a/src/java/org/apache/poi/POITextExtractor.java b/src/java/org/apache/poi/POITextExtractor.java index 3ba71880e..0b69894d0 100644 --- a/src/java/org/apache/poi/POITextExtractor.java +++ b/src/java/org/apache/poi/POITextExtractor.java @@ -37,6 +37,14 @@ public abstract class POITextExtractor { public POITextExtractor(POIDocument document) { this.document = document; } + /** + * Creates a new text extractor, using the same + * document as another text extractor. Normally + * only used by properties extractors. 
+ */ + protected POITextExtractor(POITextExtractor otherExtractor) { + this.document = otherExtractor.document; + } /** * Retrieves all the text from the document. @@ -46,4 +54,11 @@ public abstract class POITextExtractor { * @return All the text from the document */ public abstract String getText(); + + /** + * Returns another text extractor, which is able to + * output the textual content of the document + * metadata / properties, such as author and title. + */ + public abstract POITextExtractor getMetadataTextExtractor(); } diff --git a/src/java/org/apache/poi/hpsf/CustomProperties.java b/src/java/org/apache/poi/hpsf/CustomProperties.java index 24b19e5d0..420fc2f9b 100644 --- a/src/java/org/apache/poi/hpsf/CustomProperties.java +++ b/src/java/org/apache/poi/hpsf/CustomProperties.java @@ -21,6 +21,7 @@ import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.Map; +import java.util.Set; import org.apache.poi.hpsf.wellknown.PropertyIDMap; @@ -293,8 +294,18 @@ public class CustomProperties extends HashMap final CustomProperty cp = new CustomProperty(p, name); return put(cp); } - + /** + * Returns a set of all the names of our + * custom properties + */ + public Set keySet() { + return dictionaryNameToID.keySet(); + } + + + + /** *

Sets the codepage.

* * @param codepage the codepage diff --git a/src/java/org/apache/poi/hpsf/DocumentSummaryInformation.java b/src/java/org/apache/poi/hpsf/DocumentSummaryInformation.java index b7a7c9ae6..62c6127ee 100644 --- a/src/java/org/apache/poi/hpsf/DocumentSummaryInformation.java +++ b/src/java/org/apache/poi/hpsf/DocumentSummaryInformation.java @@ -45,6 +45,9 @@ public class DocumentSummaryInformation extends SpecialPropertySet public static final String DEFAULT_STREAM_NAME = "\005DocumentSummaryInformation"; + public PropertyIDMap getPropertySetIDMap() { + return PropertyIDMap.getDocumentSummaryInformationProperties(); + } /** diff --git a/src/java/org/apache/poi/hpsf/SpecialPropertySet.java b/src/java/org/apache/poi/hpsf/SpecialPropertySet.java index 6a02bbc18..f415bd5d1 100644 --- a/src/java/org/apache/poi/hpsf/SpecialPropertySet.java +++ b/src/java/org/apache/poi/hpsf/SpecialPropertySet.java @@ -22,6 +22,7 @@ import java.io.InputStream; import java.io.OutputStream; import java.util.List; +import org.apache.poi.hpsf.wellknown.PropertyIDMap; import org.apache.poi.poifs.filesystem.DirectoryEntry; /** @@ -57,6 +58,11 @@ import org.apache.poi.poifs.filesystem.DirectoryEntry; */ public abstract class SpecialPropertySet extends MutablePropertySet { + /** + * The id to name mapping of the properties + * in this set. + */ + public abstract PropertyIDMap getPropertySetIDMap(); /** *

The "real" property set SpecialPropertySet diff --git a/src/java/org/apache/poi/hpsf/SummaryInformation.java b/src/java/org/apache/poi/hpsf/SummaryInformation.java index 66d9ce093..a143e2bad 100644 --- a/src/java/org/apache/poi/hpsf/SummaryInformation.java +++ b/src/java/org/apache/poi/hpsf/SummaryInformation.java @@ -40,6 +40,9 @@ public class SummaryInformation extends SpecialPropertySet */ public static final String DEFAULT_STREAM_NAME = "\005SummaryInformation"; + public PropertyIDMap getPropertySetIDMap() { + return PropertyIDMap.getSummaryInformationProperties(); + } /** diff --git a/src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java b/src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java new file mode 100644 index 000000000..ecad5c05b --- /dev/null +++ b/src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java @@ -0,0 +1,151 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.hpsf.extractor; + +import java.io.IOException; +import java.io.OutputStream; +import java.util.Iterator; + +import org.apache.poi.POIDocument; +import org.apache.poi.POITextExtractor; +import org.apache.poi.hpsf.CustomProperties; +import org.apache.poi.hpsf.DocumentSummaryInformation; +import org.apache.poi.hpsf.Property; +import org.apache.poi.hpsf.SpecialPropertySet; +import org.apache.poi.hpsf.SummaryInformation; +import org.apache.poi.hpsf.wellknown.PropertyIDMap; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.util.LittleEndian; + +/** + * Extracts all of the HPSF properties, both + * build in and custom, returning them in + * textual form. + */ +public class HPSFPropertiesExtractor extends POITextExtractor { + public HPSFPropertiesExtractor(POITextExtractor mainExtractor) { + super(mainExtractor); + } + public HPSFPropertiesExtractor(POIDocument doc) { + super(doc); + } + public HPSFPropertiesExtractor(POIFSFileSystem fs) { + super(new PropertiesOnlyDocument(fs)); + } + + public String getDocumentSummaryInformationText() { + DocumentSummaryInformation dsi = document.getDocumentSummaryInformation(); + StringBuffer text = new StringBuffer(); + + // Normal properties + text.append( getPropertiesText(dsi) ); + + // Now custom ones + CustomProperties cps = dsi.getCustomProperties(); + Iterator keys = cps.keySet().iterator(); + while(keys.hasNext()) { + String key = (String)keys.next(); + String val = getPropertyValueText( cps.get(key) ); + text.append(key + " = " + val + "\n"); + } + + // All done + return text.toString(); + } + public String getSummaryInformationText() { + SummaryInformation si = document.getSummaryInformation(); + + // Just normal properties + return getPropertiesText(si); + } + + private static String getPropertiesText(SpecialPropertySet ps) { + if(ps == null) { + // Not defined, oh well + return ""; + } + + 
StringBuffer text = new StringBuffer(); + + PropertyIDMap idMap = ps.getPropertySetIDMap(); + Property[] props = ps.getProperties(); + for(int i=0; i -1 && + text.indexOf('\u0015') > -1) { + int first13 = text.indexOf('\u0013'); + int next13 = text.indexOf('\u0013', first13+1); + int first14 = text.indexOf('\u0014', first13+1); + int last15 = text.lastIndexOf('\u0015'); + + // If they're the wrong way around, give up + if(last15 < first13) { + break; + } + + // If no more 13s and 14s, just zap + if(next13 == -1 && first14 == -1) { + text = text.substring(0, first13) + + text.substring(last15+1); + break; + } + + // If a 14 comes before the next 13, then + // zap from the 13 to the 14, and remove + // the 15 + if(first14 != -1 && (first14 < next13 || next13 == -1)) { + text = text.substring(0, first13) + + text.substring(first14+1, last15) + + text.substring(last15+1); + continue; + } + + // Another 13 comes before the next 14. + // This means there's nested stuff, so we + // can just zap the lot + text = text.substring(0, first13) + + text.substring(last15+1); + continue; + } + + return text; + } /** * Used to get the number of sections in a range. 
If this range is smaller diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/data/HeaderWithMacros.doc b/src/scratchpad/testcases/org/apache/poi/hwpf/data/HeaderWithMacros.doc new file mode 100644 index 000000000..934970f58 Binary files /dev/null and b/src/scratchpad/testcases/org/apache/poi/hwpf/data/HeaderWithMacros.doc differ diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHeaderStories.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHeaderStories.java index d4d2517f9..404f6e47a 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHeaderStories.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHeaderStories.java @@ -35,6 +35,7 @@ public class TestHeaderStories extends TestCase { private HWPFDocument oddEven; private HWPFDocument diffFirst; private HWPFDocument unicode; + private HWPFDocument withFields; protected void setUp() throws Exception { String dirname = System.getProperty("HWPF.testdata.path"); @@ -60,6 +61,9 @@ public class TestHeaderStories extends TestCase { unicode = new HWPFDocument( new FileInputStream(new File(dirname, "HeaderFooterUnicode.doc")) ); + withFields = new HWPFDocument( + new FileInputStream(new File(dirname, "HeaderWithMacros.doc")) + ); } public void testNone() throws Exception { @@ -186,4 +190,15 @@ public class TestHeaderStories extends TestCase { assertEquals("\r\r", hs.getEvenFooter()); assertEquals("The footer, with Moli\u00e8re, has Unicode in it.\r\r", hs.getOddFooter()); } + + public void testWithFields() throws Exception { + HeaderStories hs = new HeaderStories(withFields); + assertFalse(hs.areFieldsStripped()); + + assertEquals("HEADER GOES HERE. 8/12/2008 \u0013 AUTHOR \\* MERGEFORMAT \u0014Eric Roch\u0015\r\r\r", hs.getOddHeader()); + + // Now turn on stripping + hs.setAreFieldsStripped(true); + assertEquals("HEADER GOES HERE. 
8/12/2008 Eric Roch\r\r\r", hs.getOddHeader()); + } } diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestProblems.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestProblems.java index db28cbd45..6b4200a63 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestProblems.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestProblems.java @@ -18,7 +18,6 @@ package org.apache.poi.hwpf.usermodel; import java.io.File; import java.io.FileInputStream; -import java.io.FileOutputStream; import junit.framework.TestCase; diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestRange.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestRange.java new file mode 100644 index 000000000..bcb03996a --- /dev/null +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestRange.java @@ -0,0 +1,53 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ +package org.apache.poi.hwpf.usermodel; + +import junit.framework.TestCase; + +/** + * Tests for Range which aren't around deletion, insertion, + * text replacement or textual contents + */ +public class TestRange extends TestCase { + public void testFieldStripping() throws Exception { + String exp = "This is some text."; + + String single = "This is some \u0013Blah!\u0015text."; + String with14 = "This is \u0013Blah!\u0014some\u0015 text."; + String withNested = + "This is \u0013Blah!\u0013Blah!\u0015\u0015some text."; + String withNested14 = + "This is \u0013Blah!\u0013Blah!\u0014don't see me\u0015 blah!\u0015some text."; + String withNestedIn14 = + "This is \u0013Blah!\u0014some\u0013Blah!\u0015 \u0015text."; + + // Check all comes out right + assertEquals(exp, Range.stripFields(exp)); + assertEquals(exp, Range.stripFields(single)); + assertEquals(exp, Range.stripFields(with14)); + assertEquals(exp, Range.stripFields(withNested)); + assertEquals(exp, Range.stripFields(withNested14)); + assertEquals(exp, Range.stripFields(withNestedIn14)); + + // Ones that are odd and we won't change + String odd1 = "This\u0015 is \u0013 odd"; + String odd2 = "This\u0015 is \u0014 also \u0013 odd"; + + assertEquals(odd1, Range.stripFields(odd1)); + assertEquals(odd2, Range.stripFields(odd2)); + } +} diff --git a/src/testcases/org/apache/poi/hpsf/extractor/TestHPSFPropertiesExtractor.java b/src/testcases/org/apache/poi/hpsf/extractor/TestHPSFPropertiesExtractor.java new file mode 100644 index 000000000..3a189353d --- /dev/null +++ b/src/testcases/org/apache/poi/hpsf/extractor/TestHPSFPropertiesExtractor.java @@ -0,0 +1,115 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hpsf.extractor; + +import java.io.File; +import java.io.FileInputStream; + +import org.apache.poi.hssf.extractor.ExcelExtractor; +import org.apache.poi.hssf.usermodel.HSSFWorkbook; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; + +import junit.framework.TestCase; + +public class TestHPSFPropertiesExtractor extends TestCase { + private String dir; + + protected void setUp() throws Exception { + dir = System.getProperty("HPSF.testdata.path"); + assertNotNull("HPSF.testdata.path not set", dir); + } + + public void testNormalProperties() throws Exception { + POIFSFileSystem fs = new POIFSFileSystem( + new FileInputStream(new File(dir, "TestMickey.doc")) + ); + HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs); + ext.getText(); + + // Check each bit in turn + String sinfText = ext.getSummaryInformationText(); + String dinfText = ext.getDocumentSummaryInformationText(); + + assertTrue(sinfText.indexOf("TEMPLATE = Normal") > -1); + assertTrue(sinfText.indexOf("SUBJECT = sample subject") > -1); + assertTrue(dinfText.indexOf("MANAGER = sample manager") > -1); + assertTrue(dinfText.indexOf("COMPANY = sample company") > -1); + + // Now overall + String text = ext.getText(); + assertTrue(text.indexOf("TEMPLATE = Normal") > -1); + assertTrue(text.indexOf("SUBJECT = sample subject") > -1); + 
assertTrue(text.indexOf("MANAGER = sample manager") > -1); + assertTrue(text.indexOf("COMPANY = sample company") > -1); + } + public void testNormalUnicodeProperties() throws Exception { + POIFSFileSystem fs = new POIFSFileSystem( + new FileInputStream(new File(dir, "TestUnicode.xls")) + ); + HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs); + ext.getText(); + + // Check each bit in turn + String sinfText = ext.getSummaryInformationText(); + String dinfText = ext.getDocumentSummaryInformationText(); + + assertTrue(sinfText.indexOf("AUTHOR = marshall") > -1); + assertTrue(sinfText.indexOf("TITLE = Titel: \u00c4h") > -1); + assertTrue(dinfText.indexOf("COMPANY = Schreiner") > -1); + assertTrue(dinfText.indexOf("SCALE = false") > -1); + + // Now overall + String text = ext.getText(); + assertTrue(text.indexOf("AUTHOR = marshall") > -1); + assertTrue(text.indexOf("TITLE = Titel: \u00c4h") > -1); + assertTrue(text.indexOf("COMPANY = Schreiner") > -1); + assertTrue(text.indexOf("SCALE = false") > -1); + } + public void testCustomProperties() throws Exception { + POIFSFileSystem fs = new POIFSFileSystem( + new FileInputStream(new File(dir, "TestMickey.doc")) + ); + HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs); + + // Custom properties are part of the document info stream + String dinfText = ext.getDocumentSummaryInformationText(); + assertTrue(dinfText.indexOf("Client = sample client") > -1); + assertTrue(dinfText.indexOf("Division = sample division") > -1); + + String text = ext.getText(); + assertTrue(text.indexOf("Client = sample client") > -1); + assertTrue(text.indexOf("Division = sample division") > -1); + } + + public void testConstructors() throws Exception { + POIFSFileSystem fs = new POIFSFileSystem( + new FileInputStream(new File(dir, "TestUnicode.xls")) + ); + HSSFWorkbook wb = new HSSFWorkbook(fs); + ExcelExtractor excelExt = new ExcelExtractor(wb); + + String fsText = (new HPSFPropertiesExtractor(fs)).getText(); + String hwText 
= (new HPSFPropertiesExtractor(wb)).getText(); + String eeText = (new HPSFPropertiesExtractor(excelExt)).getText(); + + assertEquals(fsText, hwText); + assertEquals(fsText, eeText); + + assertTrue(fsText.indexOf("AUTHOR = marshall") > -1); + assertTrue(fsText.indexOf("TITLE = Titel: \u00c4h") > -1); + } +}