From 0f5af26377036bd9d99adf04ee192b5b7bee39fa Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Tue, 12 Aug 2008 18:44:50 +0000 Subject: [PATCH] New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPFSPropertiesExtractor git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@685260 13f79535-47bb-0310-9956-ffa450edef68 --- src/documentation/content/xdocs/changes.xml | 1 + src/documentation/content/xdocs/status.xml | 1 + src/java/org/apache/poi/POITextExtractor.java | 8 + .../org/apache/poi/hpsf/CustomProperties.java | 13 +- .../poi/hpsf/DocumentSummaryInformation.java | 3 + .../apache/poi/hpsf/SpecialPropertySet.java | 6 + .../apache/poi/hpsf/SummaryInformation.java | 3 + .../extractor/HPFSPropertiesExtractor.java | 144 ++++++++++++++++++ .../TestHPFSPropertiesExtractor.java | 115 ++++++++++++++ 9 files changed, 293 insertions(+), 1 deletion(-) create mode 100644 src/java/org/apache/poi/hpsf/extractor/HPFSPropertiesExtractor.java create mode 100644 src/testcases/org/apache/poi/hpsf/extractor/TestHPFSPropertiesExtractor.java diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml index e290d73df..6a0cae267 100644 --- a/src/documentation/content/xdocs/changes.xml +++ b/src/documentation/content/xdocs/changes.xml @@ -37,6 +37,7 @@ + New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPFSPropertiesExtractor Properly update the array of Slide's text runs in HSLF when new text shapes are added 45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007 Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 9a6810660..0146f55e7 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,7 @@ + New HPSF based TextExtractor for document metadata, 
org.apache.poi.hpsf.extractor.HPFSPropertiesExtractor Properly update the array of Slide's text runs in HSLF when new text shapes are added 45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007 Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF diff --git a/src/java/org/apache/poi/POITextExtractor.java b/src/java/org/apache/poi/POITextExtractor.java index 3ba71880e..a7ffd4419 100644 --- a/src/java/org/apache/poi/POITextExtractor.java +++ b/src/java/org/apache/poi/POITextExtractor.java @@ -37,6 +37,14 @@ public abstract class POITextExtractor { public POITextExtractor(POIDocument document) { this.document = document; } + /** + * Creates a new text extractor, using the same + * document as another text extractor. Normally + * only used by properties extractors. + */ + protected POITextExtractor(POITextExtractor otherExtractor) { + this.document = otherExtractor.document; + } /** * Retrieves all the text from the document. diff --git a/src/java/org/apache/poi/hpsf/CustomProperties.java b/src/java/org/apache/poi/hpsf/CustomProperties.java index 24b19e5d0..420fc2f9b 100644 --- a/src/java/org/apache/poi/hpsf/CustomProperties.java +++ b/src/java/org/apache/poi/hpsf/CustomProperties.java @@ -21,6 +21,7 @@ import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.Map; +import java.util.Set; import org.apache.poi.hpsf.wellknown.PropertyIDMap; @@ -293,8 +294,18 @@ public class CustomProperties extends HashMap final CustomProperty cp = new CustomProperty(p, name); return put(cp); } - + /** + * Returns a set of all the names of our + * custom properties + */ + public Set keySet() { + return dictionaryNameToID.keySet(); + } + + + + /** *

Sets the codepage.

* * @param codepage the codepage diff --git a/src/java/org/apache/poi/hpsf/DocumentSummaryInformation.java b/src/java/org/apache/poi/hpsf/DocumentSummaryInformation.java index b7a7c9ae6..62c6127ee 100644 --- a/src/java/org/apache/poi/hpsf/DocumentSummaryInformation.java +++ b/src/java/org/apache/poi/hpsf/DocumentSummaryInformation.java @@ -45,6 +45,9 @@ public class DocumentSummaryInformation extends SpecialPropertySet public static final String DEFAULT_STREAM_NAME = "\005DocumentSummaryInformation"; + public PropertyIDMap getPropertySetIDMap() { + return PropertyIDMap.getDocumentSummaryInformationProperties(); + } /** diff --git a/src/java/org/apache/poi/hpsf/SpecialPropertySet.java b/src/java/org/apache/poi/hpsf/SpecialPropertySet.java index 6a02bbc18..f415bd5d1 100644 --- a/src/java/org/apache/poi/hpsf/SpecialPropertySet.java +++ b/src/java/org/apache/poi/hpsf/SpecialPropertySet.java @@ -22,6 +22,7 @@ import java.io.InputStream; import java.io.OutputStream; import java.util.List; +import org.apache.poi.hpsf.wellknown.PropertyIDMap; import org.apache.poi.poifs.filesystem.DirectoryEntry; /** @@ -57,6 +58,11 @@ import org.apache.poi.poifs.filesystem.DirectoryEntry; */ public abstract class SpecialPropertySet extends MutablePropertySet { + /** + * The id to name mapping of the properties + * in this set. + */ + public abstract PropertyIDMap getPropertySetIDMap(); /** *

The "real" property set SpecialPropertySet diff --git a/src/java/org/apache/poi/hpsf/SummaryInformation.java b/src/java/org/apache/poi/hpsf/SummaryInformation.java index 66d9ce093..a143e2bad 100644 --- a/src/java/org/apache/poi/hpsf/SummaryInformation.java +++ b/src/java/org/apache/poi/hpsf/SummaryInformation.java @@ -40,6 +40,9 @@ public class SummaryInformation extends SpecialPropertySet */ public static final String DEFAULT_STREAM_NAME = "\005SummaryInformation"; + public PropertyIDMap getPropertySetIDMap() { + return PropertyIDMap.getSummaryInformationProperties(); + } /** diff --git a/src/java/org/apache/poi/hpsf/extractor/HPFSPropertiesExtractor.java b/src/java/org/apache/poi/hpsf/extractor/HPFSPropertiesExtractor.java new file mode 100644 index 000000000..c85f1bb04 --- /dev/null +++ b/src/java/org/apache/poi/hpsf/extractor/HPFSPropertiesExtractor.java @@ -0,0 +1,144 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.hpsf.extractor; + +import java.io.IOException; +import java.io.OutputStream; +import java.util.Iterator; + +import org.apache.poi.POIDocument; +import org.apache.poi.POITextExtractor; +import org.apache.poi.hpsf.CustomProperties; +import org.apache.poi.hpsf.DocumentSummaryInformation; +import org.apache.poi.hpsf.Property; +import org.apache.poi.hpsf.SpecialPropertySet; +import org.apache.poi.hpsf.SummaryInformation; +import org.apache.poi.hpsf.wellknown.PropertyIDMap; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.util.LittleEndian; + +/** + * Extracts all of the HPSF properties, both + * built in and custom, returning them in + * textual form. + */ +public class HPFSPropertiesExtractor extends POITextExtractor { + public HPFSPropertiesExtractor(POITextExtractor mainExtractor) { + super(mainExtractor); + } + public HPFSPropertiesExtractor(POIDocument doc) { + super(doc); + } + public HPFSPropertiesExtractor(POIFSFileSystem fs) { + super(new PropertiesOnlyDocument(fs)); + } + + /** + * Returns the document summary information as text, + * one "name = value" entry per line: the normal + * properties first, followed by the custom ones. + * If the document has no document summary information, + * or no custom properties, those parts are simply + * left out rather than causing a failure. + * + * @return the properties text, possibly empty + */ + public String getDocumentSummaryInformationText() { + DocumentSummaryInformation dsi = document.getDocumentSummaryInformation(); + StringBuffer text = new StringBuffer(); + + // Normal properties (getPropertiesText copes with a null set) + text.append( getPropertiesText(dsi) ); + + // Now custom ones. dsi may be null here, as the null check + // in getPropertiesText acknowledges, so don't dereference it + // blindly. The custom properties are presumably also null + // when the document defines none - TODO confirm + CustomProperties cps = (dsi == null) ? null : dsi.getCustomProperties(); + if(cps != null) { + Iterator keys = cps.keySet().iterator(); + while(keys.hasNext()) { + String key = (String)keys.next(); + String val = getPropertyValueText( cps.get(key) ); + text.append(key + " = " + val + "\n"); + } + } + + // All done + return text.toString(); + } + public String getSummaryInformationText() { + SummaryInformation si = document.getSummaryInformation(); + + // Just normal properties + return getPropertiesText(si); + } + + private static String getPropertiesText(SpecialPropertySet ps) { + if(ps == null) { + // Not defined, oh well + return ""; + } + + 
StringBuffer text = new StringBuffer(); + + PropertyIDMap idMap = ps.getPropertySetIDMap(); + Property[] props = ps.getProperties(); + for(int i=0; i -1); + assertTrue(sinfText.indexOf("SUBJECT = sample subject") > -1); + assertTrue(dinfText.indexOf("MANAGER = sample manager") > -1); + assertTrue(dinfText.indexOf("COMPANY = sample company") > -1); + + // Now overall + String text = ext.getText(); + assertTrue(text.indexOf("TEMPLATE = Normal") > -1); + assertTrue(text.indexOf("SUBJECT = sample subject") > -1); + assertTrue(text.indexOf("MANAGER = sample manager") > -1); + assertTrue(text.indexOf("COMPANY = sample company") > -1); + } + public void testNormalUnicodeProperties() throws Exception { + POIFSFileSystem fs = new POIFSFileSystem( + new FileInputStream(new File(dir, "TestUnicode.xls")) + ); + HPFSPropertiesExtractor ext = new HPFSPropertiesExtractor(fs); + ext.getText(); + + // Check each bit in turn + String sinfText = ext.getSummaryInformationText(); + String dinfText = ext.getDocumentSummaryInformationText(); + + assertTrue(sinfText.indexOf("AUTHOR = marshall") > -1); + assertTrue(sinfText.indexOf("TITLE = Titel: \u00c4h") > -1); + assertTrue(dinfText.indexOf("COMPANY = Schreiner") > -1); + assertTrue(dinfText.indexOf("SCALE = false") > -1); + + // Now overall + String text = ext.getText(); + assertTrue(text.indexOf("AUTHOR = marshall") > -1); + assertTrue(text.indexOf("TITLE = Titel: \u00c4h") > -1); + assertTrue(text.indexOf("COMPANY = Schreiner") > -1); + assertTrue(text.indexOf("SCALE = false") > -1); + } + public void testCustomProperties() throws Exception { + POIFSFileSystem fs = new POIFSFileSystem( + new FileInputStream(new File(dir, "TestMickey.doc")) + ); + HPFSPropertiesExtractor ext = new HPFSPropertiesExtractor(fs); + + // Custom properties are part of the document info stream + String dinfText = ext.getDocumentSummaryInformationText(); + assertTrue(dinfText.indexOf("Client = sample client") > -1); + assertTrue(dinfText.indexOf("Division = 
sample division") > -1); + + String text = ext.getText(); + assertTrue(text.indexOf("Client = sample client") > -1); + assertTrue(text.indexOf("Division = sample division") > -1); + } + + public void testConstructors() throws Exception { + POIFSFileSystem fs = new POIFSFileSystem( + new FileInputStream(new File(dir, "TestUnicode.xls")) + ); + HSSFWorkbook wb = new HSSFWorkbook(fs); + ExcelExtractor excelExt = new ExcelExtractor(wb); + + String fsText = (new HPFSPropertiesExtractor(fs)).getText(); + String hwText = (new HPFSPropertiesExtractor(wb)).getText(); + String eeText = (new HPFSPropertiesExtractor(excelExt)).getText(); + + assertEquals(fsText, hwText); + assertEquals(fsText, eeText); + + assertTrue(fsText.indexOf("AUTHOR = marshall") > -1); + assertTrue(fsText.indexOf("TITLE = Titel: \u00c4h") > -1); + } +}