Fix a typo in the file name, and add a generic method to POITextExtractor to get the appropriate metadata text extractor

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@685267 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2008-08-12 19:02:41 +00:00
parent f2d371df00
commit d60c98c37b
7 changed files with 37 additions and 14 deletions

View File

@ -37,7 +37,7 @@
<!-- Don't forget to update status.xml too! --> <!-- Don't forget to update status.xml too! -->
<release version="3.1.1-alpha1" date="2008-??-??"> <release version="3.1.1-alpha1" date="2008-??-??">
<action dev="POI-DEVELOPERS" type="add">New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPFSPropertiesExtractor</action> <action dev="POI-DEVELOPERS" type="add">New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor</action>
<action dev="POI-DEVELOPERS" type="fix">Properly update the array of Slide's text runs in HSLF when new text shapes are added</action> <action dev="POI-DEVELOPERS" type="fix">Properly update the array of Slide's text runs in HSLF when new text shapes are added</action>
<action dev="POI-DEVELOPERS" type="fix">45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007</action> <action dev="POI-DEVELOPERS" type="fix">45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007</action>
<action dev="POI-DEVELOPERS" type="fix">Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF</action> <action dev="POI-DEVELOPERS" type="fix">Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF</action>

View File

@ -95,7 +95,7 @@
<p>If all you are interested in is getting the textual content of <p>If all you are interested in is getting the textual content of
all the document properties, such as for full text indexing, then all the document properties, such as for full text indexing, then
take a look at take a look at
<code>org.apache.poi.hpsf.extractor.HPFSPropertiesExtractor</code>. However, <code>org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor</code>. However,
if you want full access to the properties, please read on!</p> if you want full access to the properties, please read on!</p>
<p>The first thing you should understand is that a Microsoft Office file is <p>The first thing you should understand is that a Microsoft Office file is

View File

@ -34,7 +34,7 @@
<!-- Don't forget to update changes.xml too! --> <!-- Don't forget to update changes.xml too! -->
<changes> <changes>
<release version="3.1.1-alpha1" date="2008-??-??"> <release version="3.1.1-alpha1" date="2008-??-??">
<action dev="POI-DEVELOPERS" type="add">New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPFSPropertiesExtractor</action> <action dev="POI-DEVELOPERS" type="add">New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor</action>
<action dev="POI-DEVELOPERS" type="fix">Properly update the array of Slide's text runs in HSLF when new text shapes are added</action> <action dev="POI-DEVELOPERS" type="fix">Properly update the array of Slide's text runs in HSLF when new text shapes are added</action>
<action dev="POI-DEVELOPERS" type="fix">45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007</action> <action dev="POI-DEVELOPERS" type="fix">45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007</action>
<action dev="POI-DEVELOPERS" type="fix">Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF</action> <action dev="POI-DEVELOPERS" type="fix">Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF</action>

View File

@ -18,6 +18,7 @@ package org.apache.poi;
import org.apache.poi.hpsf.DocumentSummaryInformation; import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hpsf.SummaryInformation; import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
/** /**
* Common Parent for OLE2 based Text Extractors * Common Parent for OLE2 based Text Extractors
@ -50,4 +51,12 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
public SummaryInformation getSummaryInformation() { public SummaryInformation getSummaryInformation() {
return document.getSummaryInformation(); return document.getSummaryInformation();
} }
/**
 * Provides a text extractor for the document properties
 * metadata (for example title and author), powered by HPSF.
 *
 * @return a new {@link HPSFPropertiesExtractor} constructed from this extractor
 */
public POITextExtractor getMetadataTextExtractor() {
    final POITextExtractor metadataExtractor = new HPSFPropertiesExtractor(this);
    return metadataExtractor;
}
} }

View File

@ -54,4 +54,11 @@ public abstract class POITextExtractor {
* @return All the text from the document * @return All the text from the document
*/ */
public abstract String getText(); public abstract String getText();
/**
 * Returns another text extractor, which is able to
 * output the textual content of the document
 * metadata / properties, such as author and title.
 * <p>
 * Not every implementation hands one out: HPSFPropertiesExtractor,
 * which is itself the metadata extractor, throws an
 * IllegalStateException instead of recursing.
 *
 * @return a text extractor for the document's metadata / properties
 */
public abstract POITextExtractor getMetadataTextExtractor();
} }

View File

@ -36,14 +36,14 @@ import org.apache.poi.util.LittleEndian;
* built-in and custom, returning them in * built-in and custom, returning them in
* textual form. * textual form.
*/ */
public class HPFSPropertiesExtractor extends POITextExtractor { public class HPSFPropertiesExtractor extends POITextExtractor {
public HPFSPropertiesExtractor(POITextExtractor mainExtractor) { public HPSFPropertiesExtractor(POITextExtractor mainExtractor) {
super(mainExtractor); super(mainExtractor);
} }
public HPFSPropertiesExtractor(POIDocument doc) { public HPSFPropertiesExtractor(POIDocument doc) {
super(doc); super(doc);
} }
public HPFSPropertiesExtractor(POIFSFileSystem fs) { public HPSFPropertiesExtractor(POIFSFileSystem fs) {
super(new PropertiesOnlyDocument(fs)); super(new PropertiesOnlyDocument(fs));
} }
@ -127,6 +127,13 @@ public class HPFSPropertiesExtractor extends POITextExtractor {
public String getText() { public String getText() {
return getSummaryInformationText() + getDocumentSummaryInformationText(); return getSummaryInformationText() + getDocumentSummaryInformationText();
} }
/**
 * This extractor already is the metadata text extractor, so it
 * refuses to hand out another one rather than recursing.
 *
 * @return never returns normally
 * @throws IllegalStateException always
 */
public POITextExtractor getMetadataTextExtractor() {
    final String message = "You already have the Metadata Text Extractor, not recursing!";
    throw new IllegalStateException(message);
}
/** /**
* So we can get at the properties of any * So we can get at the properties of any

View File

@ -25,7 +25,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import junit.framework.TestCase; import junit.framework.TestCase;
public class TestHPFSPropertiesExtractor extends TestCase { public class TestHPSFPropertiesExtractor extends TestCase {
private String dir; private String dir;
protected void setUp() throws Exception { protected void setUp() throws Exception {
@ -37,7 +37,7 @@ public class TestHPFSPropertiesExtractor extends TestCase {
POIFSFileSystem fs = new POIFSFileSystem( POIFSFileSystem fs = new POIFSFileSystem(
new FileInputStream(new File(dir, "TestMickey.doc")) new FileInputStream(new File(dir, "TestMickey.doc"))
); );
HPFSPropertiesExtractor ext = new HPFSPropertiesExtractor(fs); HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs);
ext.getText(); ext.getText();
// Check each bit in turn // Check each bit in turn
@ -60,7 +60,7 @@ public class TestHPFSPropertiesExtractor extends TestCase {
POIFSFileSystem fs = new POIFSFileSystem( POIFSFileSystem fs = new POIFSFileSystem(
new FileInputStream(new File(dir, "TestUnicode.xls")) new FileInputStream(new File(dir, "TestUnicode.xls"))
); );
HPFSPropertiesExtractor ext = new HPFSPropertiesExtractor(fs); HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs);
ext.getText(); ext.getText();
// Check each bit in turn // Check each bit in turn
@ -83,7 +83,7 @@ public class TestHPFSPropertiesExtractor extends TestCase {
POIFSFileSystem fs = new POIFSFileSystem( POIFSFileSystem fs = new POIFSFileSystem(
new FileInputStream(new File(dir, "TestMickey.doc")) new FileInputStream(new File(dir, "TestMickey.doc"))
); );
HPFSPropertiesExtractor ext = new HPFSPropertiesExtractor(fs); HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs);
// Custom properties are part of the document info stream // Custom properties are part of the document info stream
String dinfText = ext.getDocumentSummaryInformationText(); String dinfText = ext.getDocumentSummaryInformationText();
@ -102,9 +102,9 @@ public class TestHPFSPropertiesExtractor extends TestCase {
HSSFWorkbook wb = new HSSFWorkbook(fs); HSSFWorkbook wb = new HSSFWorkbook(fs);
ExcelExtractor excelExt = new ExcelExtractor(wb); ExcelExtractor excelExt = new ExcelExtractor(wb);
String fsText = (new HPFSPropertiesExtractor(fs)).getText(); String fsText = (new HPSFPropertiesExtractor(fs)).getText();
String hwText = (new HPFSPropertiesExtractor(wb)).getText(); String hwText = (new HPSFPropertiesExtractor(wb)).getText();
String eeText = (new HPFSPropertiesExtractor(excelExt)).getText(); String eeText = (new HPSFPropertiesExtractor(excelExt)).getText();
assertEquals(fsText, hwText); assertEquals(fsText, hwText);
assertEquals(fsText, eeText); assertEquals(fsText, eeText);