From 1de175a1eb99721a8e6e1aab3a4208bd081db20a Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Tue, 12 Aug 2008 20:08:14 +0000 Subject: [PATCH] Merged revisions 638786-638802,638805-638811,638813-638814,638816-639230,639233-639241,639243-639253,639255-639486,639488-639601,639603-639835,639837-639917,639919-640056,640058-640710,640712-641156,641158-641184,641186-641795,641797-641798,641800-641933,641935-641963,641965-641966,641968-641995,641997-642230,642232-642562,642564-642565,642568-642570,642572-642573,642576-642736,642739-642877,642879,642881-642890,642892-642903,642905-642945,642947-643624,643626-643653,643655-643669,643671,643673-643830,643832-643833,643835-644342,644344-644472,644474-644508,644510-645347,645349-645351,645353-645559,645561-645565,645568-645951,645953-646193,646195-646311,646313-646404,646406-646665,646667-646853,646855-646869,646871-647151,647153-647185,647187-647277,647279-647566,647568-647573,647575,647578-647711,647714-647737,647739-647823,647825-648155,648157-648202,648204-648273,648275,648277-648302,648304-648333,648335-648588,648590-648622,648625-648673,648675-649141,649144,649146-649556,649558-649795,649799,649801-649910,649912-649913,649915-650128,650131-650132,650134-650137,650140-650914,650916-651991,651993-652284,652286-652287,652289,652291,652293-652297,652299-652328,652330-652425,652427-652445,652447-652560,652562-652933,652935,652937-652993,652995-653116,653118-653124,653126-653483,653487-653519,653522-653550,653552-653607,653609-653667,653669-653674,653676-653814,653817-653830,653832-653891,653893-653944,653946-654055,654057-654355,654357-654365,654367-654648,654651-655215,655217-655277,655279-655281,655283-655911,655913-656212,656214,656216-656251,656253-656698,656700-656756,656758-656892,656894-657135,657137-657165,657168-657179,657181-657354,657356-657357,657359-657701,657703-657874,657876-658032,658034-658284,658286,658288-658301,658303-658307,658309-658321,658323-658335,658337-658348,658351,658353-658832,658834-658983,658985
,658987-659066,659068-659402,659404-659428,659430-659451,659453-659454,659456-659461,659463-659477,659479-659524,659526-659571,659574,659576-660255,660257-660262,660264-660279,660281-660343,660345-660473,660475-660827,660829-660833,660835-660888,660890-663321,663323-663435,663437-663764,663766-663854,663856-664219,664221-664489,664494-664514,664516-668013,668015-668142,668144-668152,668154,668156-668256,668258,668260-669139,669141-669455,669457-669657,669659-669808,669810-670189,670191-671321,671323-672229,672231-672549,672551-672552,672554-672561,672563-672566,672568,672571-673049,673051-673852,673854-673862,673864-673986,673988-673996,673998-674347,674349-674890,674892-674910,674912-674936,674938-674952,674954-675078,675080-675085,675087-675217,675219-675660,675662-675670,675672-675716,675718-675726,675728-675733,675735-675775,675777-675782,675784,675786-675791,675794-675852,675854-676200,676202,676204,676206-676220,676222-676309,676311-676456,676458-676994,676996-677027,677030-677040,677042-677056,677058-677375,677377-677968,677970-677971,677973,677975-677994,677996-678286,678288-678538,678540-680393,680395-680469,680471-680529,680531-680852,680854-681529,681531-681571,681573-682224,682226,682228,682231-682281,682283-682335,682337-682507,682509,682512-682517,682519-682532,682534-682619,682622-682777,682779-682998,683000-683019,683021-683022,683024-683080,683082-683092,683094-683095,683097-683127,683129-683131,683133-683166,683168-683698,683700-683705,683707-683757,683759-683787,683789-683870,683872-683879,683881-683900,683902-684066,684068-684074,684076-684222,684224-684254,684257-684281,684283-684286,684288-684292,684294-684298,684300-684301,684303-684308,684310-684317,684320,684323-684335,684337-684348,684350-684354,684356-684361,684363-684369,684371-684453,684455-684883,684885-684937,684940-684958,684960-684970,684972-684985,684987-685053,685055-685063,685065-685284 via svnmerge from https://svn.apache.org/repos/asf/poi/trunk ........ 
r685260 | nick | 2008-08-12 19:44:50 +0100 (Tue, 12 Aug 2008) | 1 line New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPFSPropertiesExtractor ........ r685263 | nick | 2008-08-12 19:55:47 +0100 (Tue, 12 Aug 2008) | 1 line Few documentation updates for recent new code ........ r685267 | nick | 2008-08-12 20:02:41 +0100 (Tue, 12 Aug 2008) | 1 line Fix a typo in the file name, and add a generic method to POITextExtractor to get the appropriate metadata text extractor ........ r685283 | nick | 2008-08-12 20:57:04 +0100 (Tue, 12 Aug 2008) | 1 line Add HWPF support for stripping out fields (eg macros), and make this optionally happen always for headers and footers ........ r685284 | nick | 2008-08-12 20:59:35 +0100 (Tue, 12 Aug 2008) | 1 line Update changelog ........ git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@685288 13f79535-47bb-0310-9956-ffa450edef68 --- src/documentation/content/xdocs/changes.xml | 2 + .../content/xdocs/hpsf/how-to.xml | 6 + .../content/xdocs/hwpf/quick-guide.xml | 16 +- src/documentation/content/xdocs/status.xml | 2 + .../org/apache/poi/POIOLE2TextExtractor.java | 9 ++ src/java/org/apache/poi/POITextExtractor.java | 15 ++ .../org/apache/poi/hpsf/CustomProperties.java | 13 +- .../poi/hpsf/DocumentSummaryInformation.java | 3 + .../apache/poi/hpsf/SpecialPropertySet.java | 6 + .../apache/poi/hpsf/SummaryInformation.java | 3 + .../extractor/HPSFPropertiesExtractor.java | 151 ++++++++++++++++++ .../org/apache/poi/POIXMLTextExtractor.java | 11 +- .../poi/hwpf/extractor/WordExtractor.java | 8 + .../poi/hwpf/usermodel/HeaderStories.java | 31 +++- .../org/apache/poi/hwpf/usermodel/Range.java | 57 +++++++ .../apache/poi/hwpf/data/HeaderWithMacros.doc | Bin 0 -> 22528 bytes .../poi/hwpf/usermodel/TestHeaderStories.java | 15 ++ .../poi/hwpf/usermodel/TestProblems.java | 1 - .../apache/poi/hwpf/usermodel/TestRange.java | 53 ++++++ .../TestHPSFPropertiesExtractor.java | 115 +++++++++++++ 20 files changed, 
510 insertions(+), 7 deletions(-) create mode 100644 src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java create mode 100644 src/scratchpad/testcases/org/apache/poi/hwpf/data/HeaderWithMacros.doc create mode 100644 src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestRange.java create mode 100644 src/testcases/org/apache/poi/hpsf/extractor/TestHPSFPropertiesExtractor.java diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml index e9164fb46..020c6c960 100644 --- a/src/documentation/content/xdocs/changes.xml +++ b/src/documentation/content/xdocs/changes.xml @@ -58,6 +58,8 @@ Created a common interface for handling Excel files, irrespective of if they are .xls or .xlsx + 45622 - Support stripping HWPF fields (eg macros) out of text, via Range.stripFields(text) + New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor Properly update the array of Slide's text runs in HSLF when new text shapes are added 45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007 Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF diff --git a/src/documentation/content/xdocs/hpsf/how-to.xml b/src/documentation/content/xdocs/hpsf/how-to.xml index 0073126c9..964005bf2 100644 --- a/src/documentation/content/xdocs/hpsf/how-to.xml +++ b/src/documentation/content/xdocs/hpsf/how-to.xml @@ -92,6 +92,12 @@ properties. Chances are that you will find here what you need and don't have to read the other sections. +

If all you are interested in is getting the textual content of + all the document properties, such as for full text indexing, then + take a look at + org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor. However, + if you want full access to the properties, please read on!

+

The first thing you should understand is that a Microsoft Office file is not one large bunch of bytes but has an internal filesystem structure with files and directories. You can access these files and directories using diff --git a/src/documentation/content/xdocs/hwpf/quick-guide.xml b/src/documentation/content/xdocs/hwpf/quick-guide.xml index bf046258e..d717b0ef0 100644 --- a/src/documentation/content/xdocs/hwpf/quick-guide.xml +++ b/src/documentation/content/xdocs/hwpf/quick-guide.xml @@ -55,13 +55,25 @@ can then get text and other properties.

+
Headers and Footers +

To get at the headers and footers of a Word document, first create a +org.apache.poi.hwpf.HWPFDocument. Next, you need to create a +org.apache.poi.hwpf.usermodel.HeaderStories, passing it your +HWPFDocument. Finally, the HeaderStories gives you access to the headers and +footers, including first / even / odd page ones if defined in your +document. Additionally, HeaderStories provides a method for removing +any macros in the text, which is helpful as many headers and footers +do end up with macros in them.

+
+
Changing Text

It is possible to change the text via insertBefore() and insertAfter() on a Range object (either a Range, Paragraph or CharacterRun). - It is also possible to delete a Range, but this - code is know to have bugs in it. + It is also possible to delete a Range. + This code will work in many, but not all, cases, and patches to + improve it are gratefully received!

diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 5c4ffadb1..998263d8d 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -55,6 +55,8 @@ Created a common interface for handling Excel files, irrespective of if they are .xls or .xlsx
+ 45622 - Support stripping HWPF fields (eg macros) out of text, via Range.stripFields(text) + New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor Properly update the array of Slide's text runs in HSLF when new text shapes are added 45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007 Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF diff --git a/src/java/org/apache/poi/POIOLE2TextExtractor.java b/src/java/org/apache/poi/POIOLE2TextExtractor.java index f5aee4cc6..d46c7e4aa 100644 --- a/src/java/org/apache/poi/POIOLE2TextExtractor.java +++ b/src/java/org/apache/poi/POIOLE2TextExtractor.java @@ -18,6 +18,7 @@ package org.apache.poi; import org.apache.poi.hpsf.DocumentSummaryInformation; import org.apache.poi.hpsf.SummaryInformation; +import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor; /** * Common Parent for OLE2 based Text Extractors @@ -50,4 +51,12 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor { public SummaryInformation getSummaryInformation() { return document.getSummaryInformation(); } + + /** + * Returns an HPSF powered text extractor for the + * document properties metadata, such as title and author. + */ + public POITextExtractor getMetadataTextExtractor() { + return new HPSFPropertiesExtractor(this); + } } diff --git a/src/java/org/apache/poi/POITextExtractor.java b/src/java/org/apache/poi/POITextExtractor.java index 3ba71880e..0b69894d0 100644 --- a/src/java/org/apache/poi/POITextExtractor.java +++ b/src/java/org/apache/poi/POITextExtractor.java @@ -37,6 +37,14 @@ public abstract class POITextExtractor { public POITextExtractor(POIDocument document) { this.document = document; } + /** + * Creates a new text extractor, using the same + * document as another text extractor. Normally + * only used by properties extractors. 
+ */ + protected POITextExtractor(POITextExtractor otherExtractor) { + this.document = otherExtractor.document; + } /** * Retrieves all the text from the document. @@ -46,4 +54,11 @@ public abstract class POITextExtractor { * @return All the text from the document */ public abstract String getText(); + + /** + * Returns another text extractor, which is able to + * output the textual content of the document + * metadata / properties, such as author and title. + */ + public abstract POITextExtractor getMetadataTextExtractor(); } diff --git a/src/java/org/apache/poi/hpsf/CustomProperties.java b/src/java/org/apache/poi/hpsf/CustomProperties.java index 24b19e5d0..420fc2f9b 100644 --- a/src/java/org/apache/poi/hpsf/CustomProperties.java +++ b/src/java/org/apache/poi/hpsf/CustomProperties.java @@ -21,6 +21,7 @@ import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.Map; +import java.util.Set; import org.apache.poi.hpsf.wellknown.PropertyIDMap; @@ -293,8 +294,18 @@ public class CustomProperties extends HashMap final CustomProperty cp = new CustomProperty(p, name); return put(cp); } - + /** + * Returns a set of all the names of our + * custom properties + */ + public Set keySet() { + return dictionaryNameToID.keySet(); + } + + + + /** *

Sets the codepage.

* * @param codepage the codepage diff --git a/src/java/org/apache/poi/hpsf/DocumentSummaryInformation.java b/src/java/org/apache/poi/hpsf/DocumentSummaryInformation.java index b7a7c9ae6..62c6127ee 100644 --- a/src/java/org/apache/poi/hpsf/DocumentSummaryInformation.java +++ b/src/java/org/apache/poi/hpsf/DocumentSummaryInformation.java @@ -45,6 +45,9 @@ public class DocumentSummaryInformation extends SpecialPropertySet public static final String DEFAULT_STREAM_NAME = "\005DocumentSummaryInformation"; + public PropertyIDMap getPropertySetIDMap() { + return PropertyIDMap.getDocumentSummaryInformationProperties(); + } /** diff --git a/src/java/org/apache/poi/hpsf/SpecialPropertySet.java b/src/java/org/apache/poi/hpsf/SpecialPropertySet.java index 6a02bbc18..f415bd5d1 100644 --- a/src/java/org/apache/poi/hpsf/SpecialPropertySet.java +++ b/src/java/org/apache/poi/hpsf/SpecialPropertySet.java @@ -22,6 +22,7 @@ import java.io.InputStream; import java.io.OutputStream; import java.util.List; +import org.apache.poi.hpsf.wellknown.PropertyIDMap; import org.apache.poi.poifs.filesystem.DirectoryEntry; /** @@ -57,6 +58,11 @@ import org.apache.poi.poifs.filesystem.DirectoryEntry; */ public abstract class SpecialPropertySet extends MutablePropertySet { + /** + * The id to name mapping of the properties + * in this set. + */ + public abstract PropertyIDMap getPropertySetIDMap(); /** *

The "real" property set SpecialPropertySet diff --git a/src/java/org/apache/poi/hpsf/SummaryInformation.java b/src/java/org/apache/poi/hpsf/SummaryInformation.java index 66d9ce093..a143e2bad 100644 --- a/src/java/org/apache/poi/hpsf/SummaryInformation.java +++ b/src/java/org/apache/poi/hpsf/SummaryInformation.java @@ -40,6 +40,9 @@ public class SummaryInformation extends SpecialPropertySet */ public static final String DEFAULT_STREAM_NAME = "\005SummaryInformation"; + public PropertyIDMap getPropertySetIDMap() { + return PropertyIDMap.getSummaryInformationProperties(); + } /** diff --git a/src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java b/src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java new file mode 100644 index 000000000..ecad5c05b --- /dev/null +++ b/src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java @@ -0,0 +1,151 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.hpsf.extractor; + +import java.io.IOException; +import java.io.OutputStream; +import java.util.Iterator; + +import org.apache.poi.POIDocument; +import org.apache.poi.POITextExtractor; +import org.apache.poi.hpsf.CustomProperties; +import org.apache.poi.hpsf.DocumentSummaryInformation; +import org.apache.poi.hpsf.Property; +import org.apache.poi.hpsf.SpecialPropertySet; +import org.apache.poi.hpsf.SummaryInformation; +import org.apache.poi.hpsf.wellknown.PropertyIDMap; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.util.LittleEndian; + +/** + * Extracts all of the HPSF properties, both + * build in and custom, returning them in + * textual form. + */ +public class HPSFPropertiesExtractor extends POITextExtractor { + public HPSFPropertiesExtractor(POITextExtractor mainExtractor) { + super(mainExtractor); + } + public HPSFPropertiesExtractor(POIDocument doc) { + super(doc); + } + public HPSFPropertiesExtractor(POIFSFileSystem fs) { + super(new PropertiesOnlyDocument(fs)); + } + + public String getDocumentSummaryInformationText() { + DocumentSummaryInformation dsi = document.getDocumentSummaryInformation(); + StringBuffer text = new StringBuffer(); + + // Normal properties + text.append( getPropertiesText(dsi) ); + + // Now custom ones + CustomProperties cps = dsi.getCustomProperties(); + Iterator keys = cps.keySet().iterator(); + while(keys.hasNext()) { + String key = (String)keys.next(); + String val = getPropertyValueText( cps.get(key) ); + text.append(key + " = " + val + "\n"); + } + + // All done + return text.toString(); + } + public String getSummaryInformationText() { + SummaryInformation si = document.getSummaryInformation(); + + // Just normal properties + return getPropertiesText(si); + } + + private static String getPropertiesText(SpecialPropertySet ps) { + if(ps == null) { + // Not defined, oh well + return ""; + } + + 
StringBuffer text = new StringBuffer(); + + PropertyIDMap idMap = ps.getPropertySetIDMap(); + Property[] props = ps.getProperties(); + for(int i=0; i -1 && + text.indexOf('\u0015') > -1) { + int first13 = text.indexOf('\u0013'); + int next13 = text.indexOf('\u0013', first13+1); + int first14 = text.indexOf('\u0014', first13+1); + int last15 = text.lastIndexOf('\u0015'); + + // If they're the wrong way around, give up + if(last15 < first13) { + break; + } + + // If no more 13s and 14s, just zap + if(next13 == -1 && first14 == -1) { + text = text.substring(0, first13) + + text.substring(last15+1); + break; + } + + // If a 14 comes before the next 13, then + // zap from the 13 to the 14, and remove + // the 15 + if(first14 != -1 && (first14 < next13 || next13 == -1)) { + text = text.substring(0, first13) + + text.substring(first14+1, last15) + + text.substring(last15+1); + continue; + } + + // Another 13 comes before the next 14. + // This means there's nested stuff, so we + // can just zap the lot + text = text.substring(0, first13) + + text.substring(last15+1); + continue; + } + + return text; + } /** * Used to get the number of sections in a range. 
If this range is smaller diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/data/HeaderWithMacros.doc b/src/scratchpad/testcases/org/apache/poi/hwpf/data/HeaderWithMacros.doc new file mode 100644 index 0000000000000000000000000000000000000000..934970f58b3d0d2de6caa560b182f2879acb4d3c GIT binary patch literal 22528 zcmeHP2V7Ih)}MsXI|3r0To4r-RS{hgP!LclA{Oj{1PF!%LqIGjDp+uRVsEUnh+PC% zv0+0Lupw4ZcNMWKC{|-DA~#B5EX(D7QR;J|sY&-!qNO-p z;D97ztSU_-4Kb9)|A@+pNDAU0U?f_`l&3Dh-wb>$pnGJL&c6cP2#0BXR8|iu_Uwu* z-JI?=f=md_X*^V70zW}!mT#rKO$f<^inmM&*(IwlQ-Nwe1=gyS zTuRJM5K|FCZY&ka6>)8~wUxBBS@f@umxrfU5ak`0%Wt@Af% zA?)!C0Y(60Kx;r7fGNNn&=z0;pa52YegHSXC_o4x6c7gB0>S}P0n-4}0l0r`c;8T4 zE&cbn^4H2Ml?@jgE><`E4GjCA=%R{o( z)(x`=iQwSSOI&vYN8*qc$>oY26V`Ubs@KmX#&tn zD?_J75&oZ8Ac|!~3}7l~!YomRGavj%2n&}5vIel>Pv8s(M1O{Om@$8bmSpsUx-I4X zZ}7K4e`&esh|%$u=)rF&)4XmZ?2ml_&G~=FbS_Lc?6)HT4nD{UNS6UB0kr^_ibM}U z0qg8UGw~!^GS5N}&Z#^JmxP1tIFK%e zmIgvx1c;FQ#*j$h4TKg5AQwXr;gQ!(dx2`TmCCkP(IUZ6BNQZXq4bySH7Bj#!8Gl0 zU4<|-l=#w!GH}?)W)e*hVnZfTZ<-2#5?RaaRa7N%C^7`}#Y%KDC4Gk-HnDJLkXJyp zz)bk2wpURhKA@Wo=RaI%H@uCBF3~PhBTOHIG;0R)^Q#^5Y{^I6Ou}H;f;?SWea$cG zQ)B@28|{FhAE7{>0R#0JeA0(D5kk9Yefoc*&r+gK|C2uX`d{h`B5MhQI5NP(+J>_7 zTBJX;hY$Zys{pJV44M-pcF`iO*;Y4tnA%wev>AqN>VKqwG%1>y<8Q)9TX<7^Y6`MZ{$ ziz>9&^QuevIJaznh3K%&=8Dox-zkaRF6)}y-_t#6NpcVMGk2y`9}GOK)?M6QFSK;+ z^x?ag+{@50S^hfs$?)-!DV)T6+jhSB@Vs+=c^8L3rMaDUy-{C2zPxF`}^;*_RYcxmgEOug#&b+L5va(yQr@d;_%*Rf5_Lqjo>P`#RZnr z@v#%%x;?dfq%x*Lch`RJCsku-ow97~ek*8_?ef@9>m*vEc&JllfQq-KIox z*Jf6vmDw$`XSV{1RW$O|lCxgaZFl{+dr|$}6^|(6bvslFs|L88i@Y1}dVJQLGbveH zJE)CGh?%!(`mMYPM*=Tx>U+V#Y_rdP`;`W|3vL7)3EJJCbLouUZV$Xzf?g|z@a~8Hyv^?3u%l_) zxI3=4E6Y1rf4%Xm9i23FmGNBYU@E{IAdcilbM54Q{n!F_1XqNUL+iYsMj@r=emC*w zDQP!)h2`6Lh5mAGcm6vvAl=lXIOFohTjdY#&Fxis$MD+v!P+6+(p9xnxfg5~9H^bq zZL8U~n?B~JyYcT98$EoIU~*>P;#PNjOJav6w!fB^lAZS(bv>`^HOm*>ej9(tI?8pC z_sGXr>N;0>J?q+0gX!}1#;?lDu37~z#y~h)#!Ze~v!z{dz-Cv61PE_A8y?rKm_mt# zVV&YlRpU}C_N?#jXLd?3za4Mks*1o&_shPCW;gF8F$xc}=NnZ-&u5RBZCtVWLysGM z_-)H_a=n9=ogOuQnR7*Ic0X%_`gi;_=lZ+ 
zCkoDc{b+O~HKw!Pd$q?!eBPlEW5%UA`1M^;X;O0AbjkLBvcTnUoJCcxDme?6O*;`0 zn&I0jBBek@HEwa*up3-v+tBOpk^)XFnRox#E4{*?0?%7J*KRzpZ*rV^PQ;=$qjUG~ z+bx`V<1pVNCYPU(8WU`vU1yfvb&PVN(!{8C<i@0jRLc@)4d+mE-no} zkpHw&yQFI5quR!<+1KVR>XotJ(*21#*==vWh`6d*vSx41p!cRH%A?xMm^UZVCUfv! zlh>ZzQ}ayj+{=WQ0B6)C)^^^YqKf%8&#MxwZLVw8_9@uh=7;;|4q0B%^qo)^oOe`r zm{uEyO?6M}>vA@@c@^t!XzTX=$@{y7_f*|R_b=TMou2$)xZUx_>Wjq|o7^_ceEsTD z>uU)iMNPjKb~@hBFtw-N<%o`@i|jTwysJL5D5qazW4^aht>0{K?)$VN&&>7r)oNaw z7&?uZ+@VwZ$;XTe)vmlsyoBeB#JKT zC+IC)m}!(gC9{I@TvXa;Z18Bd_vZzj-|%v5LsFVX&GVqA6>+P7pEJ1Mh{v{GZHILAbIfYrXDe(#~yzN(zGC zbH@gBsTKQ}cppvj3CVl7iX;y9zf%ynr^|20&TZMV$S}2C@TDWOd)u`?@ZO`>$;qQr z+Ll>&33E$R)yp)gnLJ$i&I7@`$5U#R{EZKP(CP4x{D2&l1H8uzdT8poI_ckB9k}Bs z4f8AaUz~N=6u2Z>YwXzG^O*0eo)sG{eY@Yd%CBPi4oCI}&-!u6KZJ<$$FknkUjuahjnsH{_ii;85_wBu@!5vwry}z?_chQmD0quqrsYIPE(dMn= zsDJP*o9}V7G+oDca_FG;3Fn_4o~x-ryf#Is@Pt;wBP-oLh}p`tm^nd<=4`xEXVvJX8fVL+ouLk zWnDay_4=J>pI@@l)4O?iN9SA{E0|VN%)4b)xYoGcJ;NimaTn)@sI?w&%)q%iz%P9H z11I5r(dl2Uw-xqG$@`Fg(!Z@wO7NIb1D$)E4>o3YyOh4G;hdlSGq27Q+c);i8Wl2k z;`{O}^26cX9qu{29Dd`*X8q_NKCD|A)NiP#R%WHz;tJ2|u$bdI!^=8l80_vi#H-9M zX33W9=q_ta50 zdun=cM_s(Gm)_GY>p0vKmHtH=KWyNZ*tlU3m#Hx2Wolo{YFnY|Mhdss%9-f@t`ccn$ z1(}wE?jBQp;+ESq^sdtfZhzsn0mpRnj0)zf_-UO~$^3m;bl)`FwD=g-;^l4k6=eB_ zTnW!!zJK(wkO?tc$JDyA`Q2}XG$i_mwZ5?ST6>)_fcb^Xi}7;8KkqlRFQskV z4>@M=lE$kk@^3Qf-b2EXdl}32B7Pbt?G2Ob=1fr&3a=d2TiWrr1!dV+m6GrG)OL2W z9$Hb*Kb>h6=6Ks7a)+r}pC>CPuPcZ?Sp5(8AY=c9H-6ycPMq@( zVXqpah^2#V{5o0(+1NU)5$SXqv^>2mHMl-$x67s_Bko&m+Tu}?U*=O6mzeQ#J!8gc z>!YE1)31>>s{+?G?QL{1T^M0!VLa~RP>8_i>ND*_bq!(R>noma!hrJ>^^T2U8+&urln3@bjfI83(;zrGcrwZm1Uw zVaj-fz+ zAhr+N)hbdfj_G4(7aIm!MRu&MFor9Dyl|l?nk|N;D8h~-Vow3V(R}>6=4=h=4fyVfZmgSuTz;z@Y{wkAM>mhuVNv`r@XQNpRd<>jEjzQHw`P zOsENS37pQ)z_dGQ_Y%$_U{!%1(nOMyl1LriUC|}X+GW7fBWkr!S6UHHFm#Dhtv#f5 z39A-WGl+UE3TPgfL_<=KKpku{md9oq$oLH4qfv$t&06@lLykks^vF4EA8CSLHNsLt z)3_-yB)au$v4SKUC|)B;1hC+|76Ybn6OX{|BHGA+S4Yr0K`j;lOF$SP9FPEz%3*@6 zmVwU!>PJgF0AN7Pcrt+>r@TjWBls!G(%Fw_s|1qO=2-+kHsNO|=^4-GwqafDN0U2~ 
zHXP28iP(R490|-+CBxweU=lz`rob<9Q8^u?rLZ)Pt8Db-;y5$B+ogdhCR0U8nWfBB zV$JVDC}1biawLj?5e&yK(Qwod3)%iarvMECy?7iYfc%~yhe?!`8BB(%3R78Xacu49 zkc81;WGd7Ng_?XwIoV3IYpSrA7+Tcs0Y@-A$-xI9y`*(?m9c^nll6j0GNw~#6=~Qg zqXQ)6d7umHLJL3Xg08Y8y4b*$>nTH06OXVWWj!OrLuwzEGE`ELX?sRV9wtd?d2QG) zX*((62U5f!2R;bGq;ODMU_xr~gHWVI$pmq6$M~?4fha;n@}%R_jz=caiOeEOXc$S( zXRfq;FgcZoEJk5AIC|JG~nSv z3&Cd;W~kRT=`x6QAR+f*1%ZoCvs!}WgCvx3{Hi=lTz{GcJy_Qy?*j=R7c&GMG9as| z$v~FuBsosRV^#{@D?(TT<0MZ#9{N)7%=093nu2-#`H&S?1Y83+K*OO9eHa=G2dvVd zPP`<~MHrO8H6sk_>%)O_v_zf=Rs@Q81D`KBwiUorgomm4p85-gkFapU>h@19VgRQy zW#g;*fJnWaOVvo%PJ3_Qyvl$K_@*pb#nXUd%{2gg&`SU~Qda>uG>-we7{38XR!*pk z3$P(je81TP#Ubho6yGvqf#Nm*7ZW@v7X!t;LlRJUsgngj^?>3Ug3HGmpr{`w7rxF^N5Y*ZHU~Cv6l~I9t0ELb+*VSao>f>E^?tgt1aaia=gY^j ztUqqs__iFOI9gH`zNZtJ(Gj#jY;z}>DNEJb00fhw=7Eo9~ zVF85&6c$ieKw$xe1r!!gSU_QcmMri`{y+2Dg=^`yW_rt4ga6;HemDC6df3JWMKps;|#0tyTKAFu%W!|30l4~>2{`tj)J;`e&|f8FpqJ${=< z{~djE^waU1Jo^3U%cGx<-|*3&$8X^1k7GUb`|(>h-b91{$coT6(b_^B^DTf<084-s zpgq7E&;fw?jR5r1F-`-bitrz{0A5_dt1T2=eAi&;@u&GQNY{-&V>M{pP3+VM_Glyv25WOGPy(ME~zeBcb;qxS-<+ zRQ_7fx5&XVrl236A=(DtT93X$s0>#J@iG!#I7Hj~!YJT%Bk5nnis32|{yk*N!3OlB z#qrtw*8by}+h_gFd;Wdxago{od-{)OX|#R@{4FY8w~QeHa6u{xbG|DW!0JWo{?`7Z zg72H -1); + assertTrue(sinfText.indexOf("SUBJECT = sample subject") > -1); + assertTrue(dinfText.indexOf("MANAGER = sample manager") > -1); + assertTrue(dinfText.indexOf("COMPANY = sample company") > -1); + + // Now overall + String text = ext.getText(); + assertTrue(text.indexOf("TEMPLATE = Normal") > -1); + assertTrue(text.indexOf("SUBJECT = sample subject") > -1); + assertTrue(text.indexOf("MANAGER = sample manager") > -1); + assertTrue(text.indexOf("COMPANY = sample company") > -1); + } + public void testNormalUnicodeProperties() throws Exception { + POIFSFileSystem fs = new POIFSFileSystem( + new FileInputStream(new File(dir, "TestUnicode.xls")) + ); + HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs); + ext.getText(); + + // Check each bit in turn + String sinfText = 
ext.getSummaryInformationText(); + String dinfText = ext.getDocumentSummaryInformationText(); + + assertTrue(sinfText.indexOf("AUTHOR = marshall") > -1); + assertTrue(sinfText.indexOf("TITLE = Titel: \u00c4h") > -1); + assertTrue(dinfText.indexOf("COMPANY = Schreiner") > -1); + assertTrue(dinfText.indexOf("SCALE = false") > -1); + + // Now overall + String text = ext.getText(); + assertTrue(text.indexOf("AUTHOR = marshall") > -1); + assertTrue(text.indexOf("TITLE = Titel: \u00c4h") > -1); + assertTrue(text.indexOf("COMPANY = Schreiner") > -1); + assertTrue(text.indexOf("SCALE = false") > -1); + } + public void testCustomProperties() throws Exception { + POIFSFileSystem fs = new POIFSFileSystem( + new FileInputStream(new File(dir, "TestMickey.doc")) + ); + HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs); + + // Custom properties are part of the document info stream + String dinfText = ext.getDocumentSummaryInformationText(); + assertTrue(dinfText.indexOf("Client = sample client") > -1); + assertTrue(dinfText.indexOf("Division = sample division") > -1); + + String text = ext.getText(); + assertTrue(text.indexOf("Client = sample client") > -1); + assertTrue(text.indexOf("Division = sample division") > -1); + } + + public void testConstructors() throws Exception { + POIFSFileSystem fs = new POIFSFileSystem( + new FileInputStream(new File(dir, "TestUnicode.xls")) + ); + HSSFWorkbook wb = new HSSFWorkbook(fs); + ExcelExtractor excelExt = new ExcelExtractor(wb); + + String fsText = (new HPSFPropertiesExtractor(fs)).getText(); + String hwText = (new HPSFPropertiesExtractor(wb)).getText(); + String eeText = (new HPSFPropertiesExtractor(excelExt)).getText(); + + assertEquals(fsText, hwText); + assertEquals(fsText, eeText); + + assertTrue(fsText.indexOf("AUTHOR = marshall") > -1); + assertTrue(fsText.indexOf("TITLE = Titel: \u00c4h") > -1); + } +}