New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPFSPropertiesExtractor
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@685260 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2e33f41e90
commit
0f5af26377
@ -37,6 +37,7 @@
|
|||||||
|
|
||||||
<!-- Don't forget to update status.xml too! -->
|
<!-- Don't forget to update status.xml too! -->
|
||||||
<release version="3.1.1-alpha1" date="2008-??-??">
|
<release version="3.1.1-alpha1" date="2008-??-??">
|
||||||
|
<action dev="POI-DEVELOPERS" type="add">New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPFSPropertiesExtractor</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">Properly update the array of Slide's text runs in HSLF when new text shapes are added</action>
|
<action dev="POI-DEVELOPERS" type="fix">Properly update the array of Slide's text runs in HSLF when new text shapes are added</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007</action>
|
<action dev="POI-DEVELOPERS" type="fix">45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF</action>
|
<action dev="POI-DEVELOPERS" type="fix">Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF</action>
|
||||||
|
@ -34,6 +34,7 @@
|
|||||||
<!-- Don't forget to update changes.xml too! -->
|
<!-- Don't forget to update changes.xml too! -->
|
||||||
<changes>
|
<changes>
|
||||||
<release version="3.1.1-alpha1" date="2008-??-??">
|
<release version="3.1.1-alpha1" date="2008-??-??">
|
||||||
|
<action dev="POI-DEVELOPERS" type="add">New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPFSPropertiesExtractor</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">Properly update the array of Slide's text runs in HSLF when new text shapes are added</action>
|
<action dev="POI-DEVELOPERS" type="fix">Properly update the array of Slide's text runs in HSLF when new text shapes are added</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007</action>
|
<action dev="POI-DEVELOPERS" type="fix">45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF</action>
|
<action dev="POI-DEVELOPERS" type="fix">Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF</action>
|
||||||
|
@ -37,6 +37,14 @@ public abstract class POITextExtractor {
|
|||||||
public POITextExtractor(POIDocument document) {
|
public POITextExtractor(POIDocument document) {
|
||||||
this.document = document;
|
this.document = document;
|
||||||
}
|
}
|
||||||
|
/**
 * Builds a text extractor that works on the very same
 *  document instance as an already existing extractor.
 * Intended mainly for properties extractors, which sit
 *  alongside a "main" extractor for the document.
 *
 * @param otherExtractor the extractor whose document is to be shared
 */
protected POITextExtractor(POITextExtractor otherExtractor) {
    // Hand the shared document to the primary constructor, which stores it
    this(otherExtractor.document);
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Retrieves all the text from the document.
|
* Retrieves all the text from the document.
|
||||||
|
@ -21,6 +21,7 @@ import java.util.Date;
|
|||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.poi.hpsf.wellknown.PropertyIDMap;
|
import org.apache.poi.hpsf.wellknown.PropertyIDMap;
|
||||||
|
|
||||||
@ -294,6 +295,16 @@ public class CustomProperties extends HashMap
|
|||||||
return put(cp);
|
return put(cp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a set of all the names of our
|
||||||
|
* custom properties
|
||||||
|
*/
|
||||||
|
public Set keySet() {
|
||||||
|
return dictionaryNameToID.keySet();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <p>Sets the codepage.</p>
|
* <p>Sets the codepage.</p>
|
||||||
*
|
*
|
||||||
|
@ -45,6 +45,9 @@ public class DocumentSummaryInformation extends SpecialPropertySet
|
|||||||
public static final String DEFAULT_STREAM_NAME =
|
public static final String DEFAULT_STREAM_NAME =
|
||||||
"\005DocumentSummaryInformation";
|
"\005DocumentSummaryInformation";
|
||||||
|
|
||||||
|
public PropertyIDMap getPropertySetIDMap() {
|
||||||
|
return PropertyIDMap.getDocumentSummaryInformationProperties();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -22,6 +22,7 @@ import java.io.InputStream;
|
|||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.poi.hpsf.wellknown.PropertyIDMap;
|
||||||
import org.apache.poi.poifs.filesystem.DirectoryEntry;
|
import org.apache.poi.poifs.filesystem.DirectoryEntry;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -57,6 +58,11 @@ import org.apache.poi.poifs.filesystem.DirectoryEntry;
|
|||||||
*/
|
*/
|
||||||
public abstract class SpecialPropertySet extends MutablePropertySet
|
public abstract class SpecialPropertySet extends MutablePropertySet
|
||||||
{
|
{
|
||||||
|
/**
 * Returns the id to name mapping of the properties
 *  in this set. Each concrete property set supplies
 *  the mapping appropriate to its own kind of stream.
 *
 * @return the map used to turn a property id into its name
 */
|
||||||
|
public abstract PropertyIDMap getPropertySetIDMap();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <p>The "real" property set <code>SpecialPropertySet</code>
|
* <p>The "real" property set <code>SpecialPropertySet</code>
|
||||||
|
@ -40,6 +40,9 @@ public class SummaryInformation extends SpecialPropertySet
|
|||||||
*/
|
*/
|
||||||
public static final String DEFAULT_STREAM_NAME = "\005SummaryInformation";
|
public static final String DEFAULT_STREAM_NAME = "\005SummaryInformation";
|
||||||
|
|
||||||
|
public PropertyIDMap getPropertySetIDMap() {
|
||||||
|
return PropertyIDMap.getSummaryInformationProperties();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -0,0 +1,144 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
package org.apache.poi.hpsf.extractor;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
import org.apache.poi.POIDocument;
|
||||||
|
import org.apache.poi.POITextExtractor;
|
||||||
|
import org.apache.poi.hpsf.CustomProperties;
|
||||||
|
import org.apache.poi.hpsf.DocumentSummaryInformation;
|
||||||
|
import org.apache.poi.hpsf.Property;
|
||||||
|
import org.apache.poi.hpsf.SpecialPropertySet;
|
||||||
|
import org.apache.poi.hpsf.SummaryInformation;
|
||||||
|
import org.apache.poi.hpsf.wellknown.PropertyIDMap;
|
||||||
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
import org.apache.poi.util.LittleEndian;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extracts all of the HPSF properties, both
|
||||||
|
* build in and custom, returning them in
|
||||||
|
* textual form.
|
||||||
|
*/
|
||||||
|
public class HPFSPropertiesExtractor extends POITextExtractor {
|
||||||
|
public HPFSPropertiesExtractor(POITextExtractor mainExtractor) {
|
||||||
|
super(mainExtractor);
|
||||||
|
}
|
||||||
|
public HPFSPropertiesExtractor(POIDocument doc) {
|
||||||
|
super(doc);
|
||||||
|
}
|
||||||
|
public HPFSPropertiesExtractor(POIFSFileSystem fs) {
|
||||||
|
super(new PropertiesOnlyDocument(fs));
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getDocumentSummaryInformationText() {
|
||||||
|
DocumentSummaryInformation dsi = document.getDocumentSummaryInformation();
|
||||||
|
StringBuffer text = new StringBuffer();
|
||||||
|
|
||||||
|
// Normal properties
|
||||||
|
text.append( getPropertiesText(dsi) );
|
||||||
|
|
||||||
|
// Now custom ones
|
||||||
|
CustomProperties cps = dsi.getCustomProperties();
|
||||||
|
Iterator keys = cps.keySet().iterator();
|
||||||
|
while(keys.hasNext()) {
|
||||||
|
String key = (String)keys.next();
|
||||||
|
String val = getPropertyValueText( cps.get(key) );
|
||||||
|
text.append(key + " = " + val + "\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
// All done
|
||||||
|
return text.toString();
|
||||||
|
}
|
||||||
|
public String getSummaryInformationText() {
|
||||||
|
SummaryInformation si = document.getSummaryInformation();
|
||||||
|
|
||||||
|
// Just normal properties
|
||||||
|
return getPropertiesText(si);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String getPropertiesText(SpecialPropertySet ps) {
|
||||||
|
if(ps == null) {
|
||||||
|
// Not defined, oh well
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
StringBuffer text = new StringBuffer();
|
||||||
|
|
||||||
|
PropertyIDMap idMap = ps.getPropertySetIDMap();
|
||||||
|
Property[] props = ps.getProperties();
|
||||||
|
for(int i=0; i<props.length; i++) {
|
||||||
|
String type = Long.toString( props[i].getID() );
|
||||||
|
Object typeObj = idMap.get(props[i].getID());
|
||||||
|
if(typeObj != null) {
|
||||||
|
type = typeObj.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
String val = getPropertyValueText( props[i].getValue() );
|
||||||
|
text.append(type + " = " + val + "\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
return text.toString();
|
||||||
|
}
|
||||||
|
private static String getPropertyValueText(Object val) {
|
||||||
|
if(val == null) {
|
||||||
|
return "(not set)";
|
||||||
|
}
|
||||||
|
if(val instanceof byte[]) {
|
||||||
|
byte[] b = (byte[])val;
|
||||||
|
if(b.length == 0) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
if(b.length == 1) {
|
||||||
|
return Byte.toString(b[0]);
|
||||||
|
}
|
||||||
|
if(b.length == 2) {
|
||||||
|
return Integer.toString( LittleEndian.getUShort(b) );
|
||||||
|
}
|
||||||
|
if(b.length == 4) {
|
||||||
|
return Long.toString( LittleEndian.getUInt(b) );
|
||||||
|
}
|
||||||
|
// Maybe it's a string? who knows!
|
||||||
|
return new String(b);
|
||||||
|
}
|
||||||
|
return val.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the text of all the properties defined in
|
||||||
|
* the document.
|
||||||
|
*/
|
||||||
|
public String getText() {
|
||||||
|
return getSummaryInformationText() + getDocumentSummaryInformationText();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* So we can get at the properties of any
|
||||||
|
* random OLE2 document.
|
||||||
|
*/
|
||||||
|
private static class PropertiesOnlyDocument extends POIDocument {
|
||||||
|
private PropertiesOnlyDocument(POIFSFileSystem fs) {
|
||||||
|
super(fs);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void write(OutputStream out) throws IOException {
|
||||||
|
throw new IllegalStateException("Unable to write, only for properties!");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,115 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
package org.apache.poi.hpsf.extractor;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
|
||||||
|
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||||
|
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
||||||
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
public class TestHPFSPropertiesExtractor extends TestCase {
|
||||||
|
private String dir;
|
||||||
|
|
||||||
|
protected void setUp() throws Exception {
|
||||||
|
dir = System.getProperty("HPSF.testdata.path");
|
||||||
|
assertNotNull("HPSF.testdata.path not set", dir);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testNormalProperties() throws Exception {
|
||||||
|
POIFSFileSystem fs = new POIFSFileSystem(
|
||||||
|
new FileInputStream(new File(dir, "TestMickey.doc"))
|
||||||
|
);
|
||||||
|
HPFSPropertiesExtractor ext = new HPFSPropertiesExtractor(fs);
|
||||||
|
ext.getText();
|
||||||
|
|
||||||
|
// Check each bit in turn
|
||||||
|
String sinfText = ext.getSummaryInformationText();
|
||||||
|
String dinfText = ext.getDocumentSummaryInformationText();
|
||||||
|
|
||||||
|
assertTrue(sinfText.indexOf("TEMPLATE = Normal") > -1);
|
||||||
|
assertTrue(sinfText.indexOf("SUBJECT = sample subject") > -1);
|
||||||
|
assertTrue(dinfText.indexOf("MANAGER = sample manager") > -1);
|
||||||
|
assertTrue(dinfText.indexOf("COMPANY = sample company") > -1);
|
||||||
|
|
||||||
|
// Now overall
|
||||||
|
String text = ext.getText();
|
||||||
|
assertTrue(text.indexOf("TEMPLATE = Normal") > -1);
|
||||||
|
assertTrue(text.indexOf("SUBJECT = sample subject") > -1);
|
||||||
|
assertTrue(text.indexOf("MANAGER = sample manager") > -1);
|
||||||
|
assertTrue(text.indexOf("COMPANY = sample company") > -1);
|
||||||
|
}
|
||||||
|
public void testNormalUnicodeProperties() throws Exception {
|
||||||
|
POIFSFileSystem fs = new POIFSFileSystem(
|
||||||
|
new FileInputStream(new File(dir, "TestUnicode.xls"))
|
||||||
|
);
|
||||||
|
HPFSPropertiesExtractor ext = new HPFSPropertiesExtractor(fs);
|
||||||
|
ext.getText();
|
||||||
|
|
||||||
|
// Check each bit in turn
|
||||||
|
String sinfText = ext.getSummaryInformationText();
|
||||||
|
String dinfText = ext.getDocumentSummaryInformationText();
|
||||||
|
|
||||||
|
assertTrue(sinfText.indexOf("AUTHOR = marshall") > -1);
|
||||||
|
assertTrue(sinfText.indexOf("TITLE = Titel: \u00c4h") > -1);
|
||||||
|
assertTrue(dinfText.indexOf("COMPANY = Schreiner") > -1);
|
||||||
|
assertTrue(dinfText.indexOf("SCALE = false") > -1);
|
||||||
|
|
||||||
|
// Now overall
|
||||||
|
String text = ext.getText();
|
||||||
|
assertTrue(text.indexOf("AUTHOR = marshall") > -1);
|
||||||
|
assertTrue(text.indexOf("TITLE = Titel: \u00c4h") > -1);
|
||||||
|
assertTrue(text.indexOf("COMPANY = Schreiner") > -1);
|
||||||
|
assertTrue(text.indexOf("SCALE = false") > -1);
|
||||||
|
}
|
||||||
|
public void testCustomProperties() throws Exception {
|
||||||
|
POIFSFileSystem fs = new POIFSFileSystem(
|
||||||
|
new FileInputStream(new File(dir, "TestMickey.doc"))
|
||||||
|
);
|
||||||
|
HPFSPropertiesExtractor ext = new HPFSPropertiesExtractor(fs);
|
||||||
|
|
||||||
|
// Custom properties are part of the document info stream
|
||||||
|
String dinfText = ext.getDocumentSummaryInformationText();
|
||||||
|
assertTrue(dinfText.indexOf("Client = sample client") > -1);
|
||||||
|
assertTrue(dinfText.indexOf("Division = sample division") > -1);
|
||||||
|
|
||||||
|
String text = ext.getText();
|
||||||
|
assertTrue(text.indexOf("Client = sample client") > -1);
|
||||||
|
assertTrue(text.indexOf("Division = sample division") > -1);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testConstructors() throws Exception {
|
||||||
|
POIFSFileSystem fs = new POIFSFileSystem(
|
||||||
|
new FileInputStream(new File(dir, "TestUnicode.xls"))
|
||||||
|
);
|
||||||
|
HSSFWorkbook wb = new HSSFWorkbook(fs);
|
||||||
|
ExcelExtractor excelExt = new ExcelExtractor(wb);
|
||||||
|
|
||||||
|
String fsText = (new HPFSPropertiesExtractor(fs)).getText();
|
||||||
|
String hwText = (new HPFSPropertiesExtractor(wb)).getText();
|
||||||
|
String eeText = (new HPFSPropertiesExtractor(excelExt)).getText();
|
||||||
|
|
||||||
|
assertEquals(fsText, hwText);
|
||||||
|
assertEquals(fsText, eeText);
|
||||||
|
|
||||||
|
assertTrue(fsText.indexOf("AUTHOR = marshall") > -1);
|
||||||
|
assertTrue(fsText.indexOf("TITLE = Titel: \u00c4h") > -1);
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user