Merged revisions 638786-638802,638805-638811,638813-638814,638816-639230,639233-639241,639243-639253,639255-639486,639488-639601,639603-639835,639837-639917,639919-640056,640058-640710,640712-641156,641158-641184,641186-641795,641797-641798,641800-641933,641935-641963,641965-641966,641968-641995,641997-642230,642232-642562,642564-642565,642568-642570,642572-642573,642576-642736,642739-642877,642879,642881-642890,642892-642903,642905-642945,642947-643624,643626-643653,643655-643669,643671,643673-643830,643832-643833,643835-644342,644344-644472,644474-644508,644510-645347,645349-645351,645353-645559,645561-645565,645568-645951,645953-646193,646195-646311,646313-646404,646406-646665,646667-646853,646855-646869,646871-647151,647153-647185,647187-647277,647279-647566,647568-647573,647575,647578-647711,647714-647737,647739-647823,647825-648155,648157-648202,648204-648273,648275,648277-648302,648304-648333,648335-648588,648590-648622,648625-648673,648675-649141,649144,649146-649556,649558-649795,649799,649801-649910,649912-649913,649915-650128,650131-650132,650134-650137,650140-650914,650916-651991,651993-652284,652286-652287,652289,652291,652293-652297,652299-652328,652330-652425,652427-652445,652447-652560,652562-652933,652935,652937-652993,652995-653116,653118-653124,653126-653483,653487-653519,653522-653550,653552-653607,653609-653667,653669-653674,653676-653814,653817-653830,653832-653891,653893-653944,653946-654055,654057-654355,654357-654365,654367-654648,654651-655215,655217-655277,655279-655281,655283-655911,655913-656212,656214,656216-656251,656253-656698,656700-656756,656758-656892,656894-657135,657137-657165,657168-657179,657181-657354,657356-657357,657359-657701,657703-657874,657876-658032,658034-658284,658286,658288-658301,658303-658307,658309-658321,658323-658335,658337-658348,658351,658353-658832,658834-658983,658985,658987-659066,659068-659402,659404-659428,659430-659451,659453-659454,659456-659461,659463-659477,659479-659524,659526-659571,659574,659576-66
0255,660257-660262,660264-660279,660281-660343,660345-660473,660475-660827,660829-660833,660835-660888,660890-663321,663323-663435,663437-663764,663766-663854,663856-664219,664221-664489,664494-664514,664516-668013,668015-668142,668144-668152,668154,668156-668256,668258,668260-669139,669141-669455,669457-669657,669659-669808,669810-670189,670191-671321,671323-672229,672231-672549,672551-672552,672554-672561,672563-672566,672568,672571-673049,673051-673852,673854-673862,673864-673986,673988-673996,673998-674347,674349-674890,674892-674910,674912-674936,674938-674952,674954-675078,675080-675085,675087-675217,675219-675660,675662-675670,675672-675716,675718-675726,675728-675733,675735-675775,675777-675782,675784,675786-675791,675794-675852,675854-676200,676202,676204,676206-676220,676222-676309,676311-676456,676458-676994,676996-677027,677030-677040,677042-677056,677058-677375,677377-677968,677970-677971,677973,677975-677994,677996-678286,678288-678538,678540-680393,680395-680469,680471-680529,680531-680852,680854-681529,681531-681571,681573-682224,682226,682228,682231-682281,682283-682335,682337-682507,682509,682512-682517,682519-682532,682534-682619,682622-682777,682779-682998,683000-683019,683021-683022,683024-683080,683082-683092,683094-683095,683097-683127,683129-683131,683133-683166,683168-683698,683700-683705,683707-683757,683759-683787,683789-683870,683872-683879,683881-683900,683902-684066,684068-684074,684076-684222,684224-684254,684257-684281,684283-684286,684288-684292,684294-684298,684300-684301,684303-684308,684310-684317,684320,684323-684335,684337-684348,684350-684354,684356-684361,684363-684369,684371-684453,684455-684883,684885-684937,684940-684958,684960-684970,684972-684985,684987-685053,685055-685063,685065-685284 via svnmerge from

https://svn.apache.org/repos/asf/poi/trunk

........
  r685260 | nick | 2008-08-12 19:44:50 +0100 (Tue, 12 Aug 2008) | 1 line
  
  New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPFSPropertiesExtractor
........
  r685263 | nick | 2008-08-12 19:55:47 +0100 (Tue, 12 Aug 2008) | 1 line
  
  Few documentation updates for recent new code
........
  r685267 | nick | 2008-08-12 20:02:41 +0100 (Tue, 12 Aug 2008) | 1 line
  
  Fix a typo in the file name, and add a generic method to POITextExtractor to get the appropriate metadata text extractor
........
  r685283 | nick | 2008-08-12 20:57:04 +0100 (Tue, 12 Aug 2008) | 1 line
  
  Add HWPF support for stripping out fields (eg macros), and make this optionally happen always for headers and footers
........
  r685284 | nick | 2008-08-12 20:59:35 +0100 (Tue, 12 Aug 2008) | 1 line
  
  Update changelog
........


git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@685288 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2008-08-12 20:08:14 +00:00
parent 54d3f11aeb
commit 1de175a1eb
20 changed files with 510 additions and 7 deletions

View File

@ -58,6 +58,8 @@
<action dev="POI-DEVELOPERS" type="add">Created a common interface for handling Excel files, irrespective of if they are .xls or .xlsx</action>
</release>
<release version="3.1.1-alpha1" date="2008-??-??">
<action dev="POI-DEVELOPERS" type="fix">45622 - Support stripping HWPF fields (eg macros) out of text, via Range.stripFields(text)</action>
<action dev="POI-DEVELOPERS" type="add">New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor</action>
<action dev="POI-DEVELOPERS" type="fix">Properly update the array of Slide's text runs in HSLF when new text shapes are added</action>
<action dev="POI-DEVELOPERS" type="fix">45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007</action>
<action dev="POI-DEVELOPERS" type="fix">Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF</action>

View File

@ -92,6 +92,12 @@
properties. Chances are that you will find here what you need and don't
have to read the other sections.</note>
<p>If all you are interested in is getting the textual content of
all the document properties, such as for full text indexing, then
take a look at
<code>org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor</code>. However,
if you want full access to the properties, please read on!</p>
<p>The first thing you should understand is that a Microsoft Office file is
not one large bunch of bytes but has an internal filesystem structure with
files and directories. You can access these files and directories using

View File

@ -55,13 +55,25 @@ can then get text and other properties.
</p>
</section>
<section><title>Headers and Footers</title>
<p>To get at the headers and footers of a word document, first create a
<code>org.apache.poi.hwpf.HWPFDocument</code>. Next, you need to create a
<code>org.apache.poi.hwpf.usermodel.HeaderStories</code>, passing it your
HWPFDocument. Finally, the HeaderStories gives you access to the headers and
footers, including first / even / odd page ones if defined in your
document. Additionally, HeaderStories provides a method for removing
any macros in the text, which is helpful as many headers and footers
do end up with macros in them.</p>
</section>
<section><title>Changing Text</title>
<p>It is possible to change the text via
<code>insertBefore()</code> and <code>insertAfter()</code>
on a <code>Range</code> object (either a <code>Range</code>,
<code>Paragraph</code> or <code>CharacterRun</code>).
It is also possible to delete a <code>Range</code>, but this
code is know to have bugs in it.
It is also possible to delete a <code>Range</code>.
This code will work in many, but not all cases, and patches to
improve it are gratefully received!
</p>
</section>

View File

@ -55,6 +55,8 @@
<action dev="POI-DEVELOPERS" type="add">Created a common interface for handling Excel files, irrespective of if they are .xls or .xlsx</action>
</release>
<release version="3.1.1-alpha1" date="2008-??-??">
<action dev="POI-DEVELOPERS" type="fix">45622 - Support stripping HWPF fields (eg macros) out of text, via Range.stripFields(text)</action>
<action dev="POI-DEVELOPERS" type="add">New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor</action>
<action dev="POI-DEVELOPERS" type="fix">Properly update the array of Slide's text runs in HSLF when new text shapes are added</action>
<action dev="POI-DEVELOPERS" type="fix">45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007</action>
<action dev="POI-DEVELOPERS" type="fix">Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF</action>

View File

@ -18,6 +18,7 @@ package org.apache.poi;
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
/**
* Common Parent for OLE2 based Text Extractors
@ -50,4 +51,12 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
public SummaryInformation getSummaryInformation() {
// Simply hands back whatever the underlying document reports
return document.getSummaryInformation();
}
/**
* Returns an HPSF powered text extractor for the
* document properties metadata, such as title and author.
*
* @return a newly created {@link HPSFPropertiesExtractor}, built
* from this extractor
*/
public POITextExtractor getMetadataTextExtractor() {
// A fresh extractor is created on every call, nothing is cached
return new HPSFPropertiesExtractor(this);
}
}

View File

@ -37,6 +37,14 @@ public abstract class POITextExtractor {
public POITextExtractor(POIDocument document) {
// Keep hold of the supplied document; it is stored as given
this.document = document;
}
/**
* Creates a new text extractor, using the same
* document as another text extractor. Normally
* only used by properties extractors.
*
* @param otherExtractor the extractor whose document reference
* is to be shared; must not be null
*/
protected POITextExtractor(POITextExtractor otherExtractor) {
// Only the reference is copied, so both extractors
// end up pointing at the same document object
this.document = otherExtractor.document;
}
/**
* Retrieves all the text from the document.
@ -46,4 +54,11 @@ public abstract class POITextExtractor {
* @return All the text from the document
*/
public abstract String getText();
/**
* Returns another text extractor, which is able to
* output the textual content of the document
* metadata / properties, such as author and title.
*
* @return a text extractor for the document metadata
*/
public abstract POITextExtractor getMetadataTextExtractor();
}

View File

@ -21,6 +21,7 @@ import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import org.apache.poi.hpsf.wellknown.PropertyIDMap;
@ -293,8 +294,18 @@ public class CustomProperties extends HashMap
final CustomProperty cp = new CustomProperty(p, name);
return put(cp);
}
/**
* Returns a set of all the names of our
* custom properties
*
* @return the key set of the name to ID dictionary, passed
* back directly without any copy being taken here
*/
public Set keySet() {
// The names are the keys of the name -> ID dictionary
return dictionaryNameToID.keySet();
}
/**
* <p>Sets the codepage.</p>
*
* @param codepage the codepage

View File

@ -45,6 +45,9 @@ public class DocumentSummaryInformation extends SpecialPropertySet
public static final String DEFAULT_STREAM_NAME =
"\005DocumentSummaryInformation";
/**
* Returns the id to name mapping to use for the properties
* of a document summary information property set.
*
* @return the map supplied by
* {@link PropertyIDMap#getDocumentSummaryInformationProperties()}
*/
public PropertyIDMap getPropertySetIDMap() {
return PropertyIDMap.getDocumentSummaryInformationProperties();
}
/**

View File

@ -22,6 +22,7 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.util.List;
import org.apache.poi.hpsf.wellknown.PropertyIDMap;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
/**
@ -57,6 +58,11 @@ import org.apache.poi.poifs.filesystem.DirectoryEntry;
*/
public abstract class SpecialPropertySet extends MutablePropertySet
{
/**
* The id to name mapping of the properties
* in this set.
*
* @return the {@link PropertyIDMap} which names the property
* ids used by this kind of property set
*/
public abstract PropertyIDMap getPropertySetIDMap();
/**
* <p>The "real" property set <code>SpecialPropertySet</code>

View File

@ -40,6 +40,9 @@ public class SummaryInformation extends SpecialPropertySet
*/
public static final String DEFAULT_STREAM_NAME = "\005SummaryInformation";
/**
* Returns the id to name mapping to use for the properties
* of a summary information property set.
*
* @return the map supplied by
* {@link PropertyIDMap#getSummaryInformationProperties()}
*/
public PropertyIDMap getPropertySetIDMap() {
return PropertyIDMap.getSummaryInformationProperties();
}
/**

View File

@ -0,0 +1,151 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hpsf.extractor;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Iterator;
import org.apache.poi.POIDocument;
import org.apache.poi.POITextExtractor;
import org.apache.poi.hpsf.CustomProperties;
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hpsf.Property;
import org.apache.poi.hpsf.SpecialPropertySet;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hpsf.wellknown.PropertyIDMap;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.LittleEndian;
/**
 * Extracts all of the HPSF properties, both
 *  built in and custom, returning them in
 *  textual form.
 * Each property is rendered on its own line,
 *  in the form <code>NAME = value</code>.
 */
public class HPSFPropertiesExtractor extends POITextExtractor {
    /**
     * Creates a properties extractor which works on the
     *  same document as the given (main) text extractor.
     *
     * @param mainExtractor the extractor whose document is to be used
     */
    public HPSFPropertiesExtractor(POITextExtractor mainExtractor) {
        super(mainExtractor);
    }

    /**
     * Creates a properties extractor for the given document.
     *
     * @param doc the document to read the properties of
     */
    public HPSFPropertiesExtractor(POIDocument doc) {
        super(doc);
    }

    /**
     * Creates a properties extractor for any OLE2 file, by
     *  wrapping the filesystem in a properties-only document.
     *
     * @param fs the filesystem of the OLE2 file
     */
    public HPSFPropertiesExtractor(POIFSFileSystem fs) {
        super(new PropertiesOnlyDocument(fs));
    }

    /**
     * Returns the text of the document summary information
     *  properties, followed by any custom properties.
     *
     * @return one line per property, or an empty string if the
     *  document has no document summary information
     */
    public String getDocumentSummaryInformationText() {
        DocumentSummaryInformation dsi = document.getDocumentSummaryInformation();
        StringBuffer text = new StringBuffer();

        // Normal properties (copes with dsi being null)
        text.append( getPropertiesText(dsi) );

        // Now custom ones. The summary stream may be missing entirely,
        //  and a present stream need not carry any custom properties,
        //  so guard against both rather than failing with a NPE
        CustomProperties cps = (dsi == null) ? null : dsi.getCustomProperties();
        if(cps != null) {
            Iterator keys = cps.keySet().iterator();
            while(keys.hasNext()) {
                String key = (String)keys.next();
                String val = getPropertyValueText( cps.get(key) );
                text.append(key).append(" = ").append(val).append("\n");
            }
        }

        // All done
        return text.toString();
    }

    /**
     * Returns the text of the summary information properties.
     *
     * @return one line per property, or an empty string if the
     *  document has no summary information
     */
    public String getSummaryInformationText() {
        SummaryInformation si = document.getSummaryInformation();

        // Just normal properties
        return getPropertiesText(si);
    }

    /**
     * Renders every property of the given set as a
     *  <code>NAME = value</code> line. The name comes from the
     *  set's id map, falling back to the numeric id when the
     *  map has no entry for it.
     *
     * @param ps the property set, may be null
     * @return the rendered lines, empty if ps is null
     */
    private static String getPropertiesText(SpecialPropertySet ps) {
        if(ps == null) {
            // Not defined, oh well
            return "";
        }

        StringBuffer text = new StringBuffer();

        PropertyIDMap idMap = ps.getPropertySetIDMap();
        Property[] props = ps.getProperties();
        for(int i=0; i<props.length; i++) {
            // Default to the raw id, then prefer the mapped name
            String type = Long.toString( props[i].getID() );
            Object typeObj = idMap.get(props[i].getID());
            if(typeObj != null) {
                type = typeObj.toString();
            }

            String val = getPropertyValueText( props[i].getValue() );
            text.append(type).append(" = ").append(val).append("\n");
        }

        return text.toString();
    }

    /**
     * Turns a property value into text. Byte arrays of length
     *  1, 2 and 4 are shown as numbers, other byte arrays are
     *  treated as string data, and everything else uses
     *  <code>toString()</code>.
     *
     * @param val the property value, may be null
     * @return the textual form, "(not set)" for null
     */
    private static String getPropertyValueText(Object val) {
        if(val == null) {
            return "(not set)";
        }
        if(val instanceof byte[]) {
            byte[] b = (byte[])val;
            if(b.length == 0) {
                return "";
            }
            if(b.length == 1) {
                return Byte.toString(b[0]);
            }
            if(b.length == 2) {
                return Integer.toString( LittleEndian.getUShort(b) );
            }
            if(b.length == 4) {
                return Long.toString( LittleEndian.getUInt(b) );
            }
            // Maybe it's a string? who knows!
            // NOTE(review): this decodes using the platform default
            //  charset, so the result can vary between machines
            return new String(b);
        }
        return val.toString();
    }

    /**
     * Return the text of all the properties defined in
     *  the document.
     *
     * @return the summary information text followed by the
     *  document summary information text
     */
    public String getText() {
        return getSummaryInformationText() + getDocumentSummaryInformationText();
    }

    /**
     * Prevent recursion!
     *
     * @return never returns normally
     * @throws IllegalStateException always, as this already is
     *  the metadata text extractor
     */
    public POITextExtractor getMetadataTextExtractor() {
        throw new IllegalStateException("You already have the Metadata Text Extractor, not recursing!");
    }

    /**
     * So we can get at the properties of any
     *  random OLE2 document.
     */
    private static class PropertiesOnlyDocument extends POIDocument {
        private PropertiesOnlyDocument(POIFSFileSystem fs) {
            super(fs);
        }

        /**
         * Writing is not supported, this document type
         *  exists purely for reading properties.
         *
         * @throws IllegalStateException always
         */
        public void write(OutputStream out) throws IOException {
            throw new IllegalStateException("Unable to write, only for properties!");
        }
    }
}

View File

@ -30,7 +30,7 @@ public abstract class POIXMLTextExtractor extends POITextExtractor {
* Creates a new text extractor for the given document
*/
public POIXMLTextExtractor(POIXMLDocument document) {
// The parent is given no POIDocument. The (POIDocument) cast states
// which single-argument parent constructor is meant, as the parent
// also has one taking a POITextExtractor
super(null);
super((POIDocument)null);
this.document = document;
}
@ -54,4 +54,13 @@ public abstract class POIXMLTextExtractor extends POITextExtractor {
public POIXMLDocument getDocument(){
// Hands back the document field as it stands
return document;
}
/**
* Intended to return an OOXML properties text extractor for the
* document properties metadata, such as title and author.
* This is not implemented yet, so every call currently fails.
*
* @return never returns normally at present
* @throws RuntimeException always, as metadata extraction is
* not yet supported for OOXML
*/
public POITextExtractor getMetadataTextExtractor() {
throw new RuntimeException("Not yet supported for OOXML!");
}
}

View File

@ -230,4 +230,12 @@ public class WordExtractor extends POIOLE2TextExtractor {
return ret.toString();
}
/**
* Removes any fields (eg macros, page markers etc)
* from the string.
*
* @param text the text to remove the fields from
* @return whatever {@link Range#stripFields(String)} returns for the text
*/
public static String stripFields(String text) {
// Convenience wrapper, all of the work is done by Range
return Range.stripFields(text);
}
}

View File

@ -35,6 +35,8 @@ public class HeaderStories {
private Range headerStories;
private PlexOfCps plcfHdd;
private boolean stripFields = false;
public HeaderStories(HWPFDocument doc) {
this.headerStories = doc.getHeaderStoryRange();
FileInformationBlock fib = doc.getFileInformationBlock();
@ -157,8 +159,15 @@ public class HeaderStories {
return "";
}
// Return the contents
return headerStories.text().substring(prop.getStart(), prop.getEnd());
// Grab the contents
String text =
headerStories.text().substring(prop.getStart(), prop.getEnd());
// Strip off fields and macros if requested
if(stripFields) {
return Range.stripFields(text);
}
return text;
}
public Range getRange() {
@ -167,4 +176,22 @@ public class HeaderStories {
protected PlexOfCps getPlcfHdd() {
// Exposes the plcfHdd field as-is
return plcfHdd;
}
/**
* Are fields currently being stripped from
* the text that this {@link HeaderStories} returns?
* Default is false, but can be changed
*
* @return true if fields are being stripped, false otherwise
* @see #setAreFieldsStripped(boolean)
*/
public boolean areFieldsStripped() {
return stripFields;
}
/**
* Should fields (eg macros) be stripped from
* the text that this class returns?
* Default is not to strip.
* @param stripFields true to strip fields from the returned
* text, false to return the text unaltered
*/
public void setAreFieldsStripped(boolean stripFields) {
this.stripFields = stripFields;
}
}

View File

@ -299,6 +299,63 @@ public class Range
}
return sb.toString();
}
/**
 * Removes any fields (eg macros, page markers etc)
 *  from the string.
 * Normally used to make some text suitable for showing
 *  to humans, and the resultant text should not normally
 *  be saved back into the document!
 *
 * <p>A field is either <code>0x13 [contents] 0x15</code>, which
 *  is removed entirely, or
 *  <code>0x13 [contents] 0x14 [real text] 0x15</code>, of which
 *  only the real text is kept. Fields may be nested, and several
 *  fields may follow one another in the same string. Text inside
 *  the real text of a field is only kept if every field around
 *  it is showing its real text too.</p>
 *
 * <p>Markers that do not form a field (a 0x13 which is never
 *  closed, a 0x15 with nothing to close, or a 0x14 outside of
 *  the contents part of a field) are left in the text as they
 *  are.</p>
 *
 * @param text the text to strip the fields from, must not be null
 * @return the text without its fields; the very same string is
 *  returned when it holds no 0x13 at all
 * @throws NullPointerException if text is null
 */
public static String stripFields(String text) {
    // If there are no fields, all easy
    if(text.indexOf('\u0013') == -1) return text;

    final int length = text.length();

    // First pass - pair each 0x13 up with the 0x15 which
    //  closes it. Pairing innermost first is what makes
    //  nested and side by side fields both come out right,
    //  rather than tying the first 0x13 to the very last 0x15
    boolean[] paired = new boolean[length];
    int[] openStarts = new int[length];
    int openCount = 0;
    for(int i=0; i<length; i++) {
        char c = text.charAt(i);
        if(c == '\u0013') {
            openStarts[openCount++] = i;
        } else if(c == '\u0015' && openCount > 0) {
            paired[ openStarts[--openCount] ] = true;
            paired[i] = true;
        }
    }

    // Second pass - copy over only what should be seen.
    // showingResult holds, for each field we're currently
    //  inside, whether its 0x14 has been passed yet, and
    //  hidden counts how many of them haven't got there
    StringBuffer stripped = new StringBuffer(length);
    boolean[] showingResult = new boolean[length];
    int depth = 0;
    int hidden = 0;
    for(int i=0; i<length; i++) {
        char c = text.charAt(i);
        if(c == '\u0013' && paired[i]) {
            // Start of a field, its contents aren't shown
            showingResult[depth++] = false;
            hidden++;
        } else if(c == '\u0015' && paired[i]) {
            // End of a field. If it never had a 0x14, then
            //  it was still hiding things until now
            if(! showingResult[--depth]) {
                hidden--;
            }
        } else if(c == '\u0014' && depth > 0 && !showingResult[depth-1]) {
            // Switch the innermost field over to its real text
            showingResult[depth-1] = true;
            hidden--;
        } else if(hidden == 0) {
            // Normal text, or a marker that isn't part of
            //  a field, and nothing around it is hiding it
            stripped.append(c);
        }
    }

    return stripped.toString();
}
/**
* Used to get the number of sections in a range. If this range is smaller

View File

@ -35,6 +35,7 @@ public class TestHeaderStories extends TestCase {
private HWPFDocument oddEven;
private HWPFDocument diffFirst;
private HWPFDocument unicode;
private HWPFDocument withFields;
protected void setUp() throws Exception {
String dirname = System.getProperty("HWPF.testdata.path");
@ -60,6 +61,9 @@ public class TestHeaderStories extends TestCase {
unicode = new HWPFDocument(
new FileInputStream(new File(dirname, "HeaderFooterUnicode.doc"))
);
withFields = new HWPFDocument(
new FileInputStream(new File(dirname, "HeaderWithMacros.doc"))
);
}
public void testNone() throws Exception {
@ -186,4 +190,15 @@ public class TestHeaderStories extends TestCase {
assertEquals("\r\r", hs.getEvenFooter());
assertEquals("The footer, with Moli\u00e8re, has Unicode in it.\r\r", hs.getOddFooter());
}
/**
* Checks the odd header of the test document which has a field
* in its header, first with field stripping off (the default)
* and then with it turned on.
*/
public void testWithFields() throws Exception {
HeaderStories hs = new HeaderStories(withFields);
// Stripping is off to start with, so the field markers
// and the field's instructions all show up in the text
assertFalse(hs.areFieldsStripped());
assertEquals("HEADER GOES HERE. 8/12/2008 \u0013 AUTHOR \\* MERGEFORMAT \u0014Eric Roch\u0015\r\r\r", hs.getOddHeader());
// Now turn on stripping
hs.setAreFieldsStripped(true);
// Only the field's displayed text should be left behind
assertEquals("HEADER GOES HERE. 8/12/2008 Eric Roch\r\r\r", hs.getOddHeader());
}
}

View File

@ -18,7 +18,6 @@ package org.apache.poi.hwpf.usermodel;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import junit.framework.TestCase;

View File

@ -0,0 +1,53 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.poi.hwpf.usermodel;
import junit.framework.TestCase;
/**
 * Checks on the parts of Range which have nothing to do
 *  with deletion, insertion, text replacement or the
 *  textual contents
 */
public class TestRange extends TestCase {
    /**
     * Text holding fields in a variety of arrangements should
     *  always strip down to the same plain sentence, while text
     *  with its markers in an odd order should come back untouched.
     */
    public void testFieldStripping() throws Exception {
        final String plain = "This is some text.";

        // No fields, a single field, a field with a 14, nested
        //  fields, a nested field with a 14, and a field nested
        //  in the text after a 14
        String[] strippable = new String[] {
            plain,
            "This is some \u0013Blah!\u0015text.",
            "This is \u0013Blah!\u0014some\u0015 text.",
            "This is \u0013Blah!\u0013Blah!\u0015\u0015some text.",
            "This is \u0013Blah!\u0013Blah!\u0014don't see me\u0015 blah!\u0015some text.",
            "This is \u0013Blah!\u0014some\u0013Blah!\u0015 \u0015text."
        };
        for(int i=0; i<strippable.length; i++) {
            assertEquals(plain, Range.stripFields(strippable[i]));
        }

        // Markers the wrong way around, these get left alone
        String[] untouched = new String[] {
            "This\u0015 is \u0013 odd",
            "This\u0015 is \u0014 also \u0013 odd"
        };
        for(int i=0; i<untouched.length; i++) {
            assertEquals(untouched[i], Range.stripFields(untouched[i]));
        }
    }
}

View File

@ -0,0 +1,115 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hpsf.extractor;
import java.io.File;
import java.io.FileInputStream;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import junit.framework.TestCase;
/**
 * Tests for {@link HPSFPropertiesExtractor}, run against the
 *  sample files found in the directory named by the
 *  <code>HPSF.testdata.path</code> system property.
 */
public class TestHPSFPropertiesExtractor extends TestCase {
    private String dir;

    protected void setUp() throws Exception {
        dir = System.getProperty("HPSF.testdata.path");
        assertNotNull("HPSF.testdata.path not set", dir);
    }

    /**
     * Opens the named test file as a POIFS filesystem, making
     *  sure the underlying file stream is closed again once the
     *  filesystem has been built (or has failed to build).
     */
    private POIFSFileSystem openFileSystem(String fileName) throws Exception {
        FileInputStream in = new FileInputStream(new File(dir, fileName));
        try {
            return new POIFSFileSystem(in);
        } finally {
            in.close();
        }
    }

    /**
     * Asserts that the text holds the expected snippet, and
     *  reports the full text if it doesn't.
     */
    private static void assertContains(String text, String expected) {
        assertTrue(
                "Expected to find '" + expected + "' in:\n" + text,
                text.indexOf(expected) > -1
        );
    }

    public void testNormalProperties() throws Exception {
        POIFSFileSystem fs = openFileSystem("TestMickey.doc");
        HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs);
        // Presumably a check that the whole extraction runs without error
        ext.getText();

        // Check each bit in turn
        String sinfText = ext.getSummaryInformationText();
        String dinfText = ext.getDocumentSummaryInformationText();

        assertContains(sinfText, "TEMPLATE = Normal");
        assertContains(sinfText, "SUBJECT = sample subject");
        assertContains(dinfText, "MANAGER = sample manager");
        assertContains(dinfText, "COMPANY = sample company");

        // Now overall
        String text = ext.getText();
        assertContains(text, "TEMPLATE = Normal");
        assertContains(text, "SUBJECT = sample subject");
        assertContains(text, "MANAGER = sample manager");
        assertContains(text, "COMPANY = sample company");
    }

    public void testNormalUnicodeProperties() throws Exception {
        POIFSFileSystem fs = openFileSystem("TestUnicode.xls");
        HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs);
        // Presumably a check that the whole extraction runs without error
        ext.getText();

        // Check each bit in turn
        String sinfText = ext.getSummaryInformationText();
        String dinfText = ext.getDocumentSummaryInformationText();

        assertContains(sinfText, "AUTHOR = marshall");
        assertContains(sinfText, "TITLE = Titel: \u00c4h");
        assertContains(dinfText, "COMPANY = Schreiner");
        assertContains(dinfText, "SCALE = false");

        // Now overall
        String text = ext.getText();
        assertContains(text, "AUTHOR = marshall");
        assertContains(text, "TITLE = Titel: \u00c4h");
        assertContains(text, "COMPANY = Schreiner");
        assertContains(text, "SCALE = false");
    }

    public void testCustomProperties() throws Exception {
        POIFSFileSystem fs = openFileSystem("TestMickey.doc");
        HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs);

        // Custom properties are part of the document info stream
        String dinfText = ext.getDocumentSummaryInformationText();
        assertContains(dinfText, "Client = sample client");
        assertContains(dinfText, "Division = sample division");

        String text = ext.getText();
        assertContains(text, "Client = sample client");
        assertContains(text, "Division = sample division");
    }

    /**
     * All three ways of building the extractor (from the
     *  filesystem, from the document, and from another text
     *  extractor) should give the same text.
     */
    public void testConstructors() throws Exception {
        POIFSFileSystem fs = openFileSystem("TestUnicode.xls");
        HSSFWorkbook wb = new HSSFWorkbook(fs);
        ExcelExtractor excelExt = new ExcelExtractor(wb);

        String fsText = (new HPSFPropertiesExtractor(fs)).getText();
        String hwText = (new HPSFPropertiesExtractor(wb)).getText();
        String eeText = (new HPSFPropertiesExtractor(excelExt)).getText();

        assertEquals(fsText, hwText);
        assertEquals(fsText, eeText);

        assertContains(fsText, "AUTHOR = marshall");
        assertContains(fsText, "TITLE = Titel: \u00c4h");
    }
}