More ExtractorFactory support and tests

git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@645870 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2008-04-08 12:03:05 +00:00
parent dfad3bbafb
commit fdc07f33f8
2 changed files with 161 additions and 9 deletions

View File

@ -32,7 +32,9 @@ import org.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.POITextExtractor; import org.apache.poi.POITextExtractor;
import org.apache.poi.POIXMLDocument; import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.Entry; import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xslf.XSLFSlideShow; import org.apache.poi.xslf.XSLFSlideShow;
@ -51,20 +53,21 @@ public class ExtractorFactory {
public static final String CORE_DOCUMENT_REL = public static final String CORE_DOCUMENT_REL =
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"; "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
public POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
FileInputStream finp = new FileInputStream(f); InputStream inp = new PushbackInputStream(
new FileInputStream(f), 8);
if(POIFSFileSystem.hasPOIFSHeader(finp)) { if(POIFSFileSystem.hasPOIFSHeader(inp)) {
return createExtractor(new POIFSFileSystem(finp)); return createExtractor(new POIFSFileSystem(inp));
} }
if(POIXMLDocument.hasOOXMLHeader(finp)) { if(POIXMLDocument.hasOOXMLHeader(inp)) {
finp.close(); inp.close();
return createExtractor(Package.open(f.toString())); return createExtractor(Package.open(f.toString()));
} }
throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file"); throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file");
} }
public POITextExtractor createExtractor(InputStream inp) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { public static POITextExtractor createExtractor(InputStream inp) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
// Figure out the kind of stream // Figure out the kind of stream
// If clearly doesn't do mark/reset, wrap up // If clearly doesn't do mark/reset, wrap up
if(! inp.markSupported()) { if(! inp.markSupported()) {
@ -80,7 +83,7 @@ public class ExtractorFactory {
throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream"); throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream");
} }
public POIXMLTextExtractor createExtractor(Package pkg) throws IOException, OpenXML4JException, XmlException { public static POIXMLTextExtractor createExtractor(Package pkg) throws IOException, OpenXML4JException, XmlException {
PackageRelationshipCollection core = PackageRelationshipCollection core =
pkg.getRelationshipsByType(CORE_DOCUMENT_REL); pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
if(core.size() != 1) { if(core.size() != 1) {
@ -100,14 +103,23 @@ public class ExtractorFactory {
throw new IllegalArgumentException("No supported documents found in the OOXML package"); throw new IllegalArgumentException("No supported documents found in the OOXML package");
} }
public POITextExtractor createExtractor(POIFSFileSystem fs) throws IOException { public static POITextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
// Look for certain entries in the stream, to figure it // Look for certain entries in the stream, to figure it
// out from // out from
for(Iterator entries = fs.getRoot().getEntries(); entries.hasNext(); ) { for(Iterator entries = fs.getRoot().getEntries(); entries.hasNext(); ) {
Entry entry = (Entry)entries.next(); Entry entry = (Entry)entries.next();
System.err.println(entry.getName());
if(entry.getName().equals("Workbook")) { if(entry.getName().equals("Workbook")) {
return new ExcelExtractor(fs); return new ExcelExtractor(fs);
} }
if(entry.getName().equals("WordDocument")) {
return new WordExtractor(fs);
}
if(entry.getName().equals("PowerPoint Document")) {
return new PowerPointExtractor(fs);
}
// TODO - visio
} }
throw new IllegalArgumentException("No supported documents found in the OLE2 stream"); throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
} }

View File

@ -0,0 +1,140 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.extractor;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;

import junit.framework.TestCase;

import org.apache.poi.POITextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.openxml4j.opc.Package;
/**
 * Test that the extractor factory plays nicely: each entry point
 *  (File, InputStream, POIFSFileSystem and Package) must hand back
 *  the right kind of extractor for the sample Excel, Word and
 *  PowerPoint documents, and that extractor must produce some text.
 *
 * The sample files are located through the HSSF.testdata.path,
 *  HWPF.testdata.path and HSLF.testdata.path system properties.
 */
public class TestExtractorFactory extends TestCase {
    /** Extracted text must be longer than this, for the Excel samples */
    private static final int MIN_EXCEL_TEXT = 200;
    /** Extracted text must be longer than this, for the Word samples */
    private static final int MIN_WORD_TEXT = 120;
    /** Extracted text must be longer than this, for the PowerPoint samples */
    private static final int MIN_POWERPOINT_TEXT = 120;

    private String excel_dir;
    private String word_dir;
    private String powerpoint_dir;

    private File txt;

    private File xls;
    private File xlsx;

    private File doc;
    private File docx;

    private File ppt;
    private File pptx;

    /**
     * Resolves the sample files against the test data directories
     *  given by the system properties.
     */
    protected void setUp() throws Exception {
        super.setUp();

        excel_dir = System.getProperty("HSSF.testdata.path");
        word_dir = System.getProperty("HWPF.testdata.path");
        powerpoint_dir = System.getProperty("HSLF.testdata.path");

        txt = new File(excel_dir, "SampleSS.txt");

        xls = new File(excel_dir, "SampleSS.xls");
        xlsx = new File(excel_dir, "SampleSS.xlsx");

        doc = new File(word_dir, "SampleDoc.doc");
        docx = new File(word_dir, "SampleDoc.docx");

        ppt = new File(powerpoint_dir, "SampleShow.ppt");
        pptx = new File(powerpoint_dir, "SampleShow.pptx");
    }

    /**
     * Checks that the extractor is of the expected type, and that it
     *  returns more than the given number of characters of text.
     * The extractor is only built once by the caller, so each sample
     *  is parsed a single time per check.
     */
    private static void assertExtractor(String label, POITextExtractor extractor,
            Class<?> expectedType, int minTextLength) {
        assertNotNull(label + ": no extractor was returned", extractor);
        assertTrue(
                label + ": expected a " + expectedType.getName() +
                " but got a " + extractor.getClass().getName(),
                expectedType.isInstance(extractor)
        );

        String text = extractor.getText();
        assertTrue(
                label + ": expected more than " + minTextLength +
                " characters of text but got " + text.length(),
                text.length() > minTextLength
        );
    }

    /**
     * Runs the File based factory method against a sample file.
     */
    private static void assertFromFile(File file, Class<?> expectedType,
            int minTextLength) throws Exception {
        assertExtractor(
                file.getName() + " (File)",
                ExtractorFactory.createExtractor(file),
                expectedType, minTextLength
        );
    }

    /**
     * Runs the InputStream based factory method against a sample file,
     *  closing the stream once the text has been checked.
     */
    private static void assertFromStream(File file, Class<?> expectedType,
            int minTextLength) throws Exception {
        InputStream inp = new FileInputStream(file);
        try {
            assertExtractor(
                    file.getName() + " (InputStream)",
                    ExtractorFactory.createExtractor(inp),
                    expectedType, minTextLength
            );
        } finally {
            inp.close();
        }
    }

    /**
     * Runs the POIFSFileSystem based factory method against a sample
     *  OLE2 file.
     */
    private static void assertFromPOIFS(File file, Class<?> expectedType,
            int minTextLength) throws Exception {
        InputStream inp = new FileInputStream(file);
        try {
            assertExtractor(
                    file.getName() + " (POIFSFileSystem)",
                    ExtractorFactory.createExtractor(new POIFSFileSystem(inp)),
                    expectedType, minTextLength
            );
        } finally {
            inp.close();
        }
    }

    /**
     * Runs the Package based factory method against a sample OOXML
     *  file, opening the package by file name.
     */
    private static void assertFromPackage(File file, Class<?> expectedType,
            int minTextLength) throws Exception {
        assertExtractor(
                file.getName() + " (Package)",
                ExtractorFactory.createExtractor(Package.open(file.toString())),
                expectedType, minTextLength
        );
    }

    public void testFile() throws Exception {
        // Excel
        assertFromFile(xls, ExcelExtractor.class, MIN_EXCEL_TEXT);
        assertFromFile(xlsx, XSSFExcelExtractor.class, MIN_EXCEL_TEXT);

        // Word
        assertFromFile(doc, WordExtractor.class, MIN_WORD_TEXT);
        assertFromFile(docx, XWPFWordExtractor.class, MIN_WORD_TEXT);

        // PowerPoint
        assertFromFile(ppt, PowerPointExtractor.class, MIN_POWERPOINT_TEXT);
        assertFromFile(pptx, XSLFPowerPointExtractor.class, MIN_POWERPOINT_TEXT);

        // Visio
        // TODO

        // Text
        try {
            ExtractorFactory.createExtractor(txt);
            fail("A plain text file should have been rejected");
        } catch(IllegalArgumentException expected) {
            // Good
        }
    }

    public void testInputStream() throws Exception {
        // Excel
        assertFromStream(xls, ExcelExtractor.class, MIN_EXCEL_TEXT);
        assertFromStream(xlsx, XSSFExcelExtractor.class, MIN_EXCEL_TEXT);

        // Word
        assertFromStream(doc, WordExtractor.class, MIN_WORD_TEXT);
        assertFromStream(docx, XWPFWordExtractor.class, MIN_WORD_TEXT);

        // PowerPoint
        assertFromStream(ppt, PowerPointExtractor.class, MIN_POWERPOINT_TEXT);
        assertFromStream(pptx, XSLFPowerPointExtractor.class, MIN_POWERPOINT_TEXT);

        // Visio
        // TODO

        // Text
        InputStream inp = new FileInputStream(txt);
        try {
            ExtractorFactory.createExtractor(inp);
            fail("A plain text stream should have been rejected");
        } catch(IllegalArgumentException expected) {
            // Good
        } finally {
            inp.close();
        }
    }

    public void testPOIFS() throws Exception {
        // Only the OLE2 based formats can come in as a POIFSFileSystem

        // Excel
        assertFromPOIFS(xls, ExcelExtractor.class, MIN_EXCEL_TEXT);

        // Word
        assertFromPOIFS(doc, WordExtractor.class, MIN_WORD_TEXT);

        // PowerPoint
        assertFromPOIFS(ppt, PowerPointExtractor.class, MIN_POWERPOINT_TEXT);

        // Visio
        // TODO
    }

    public void testPackage() throws Exception {
        // Only the OOXML based formats can come in as a Package

        // Excel
        assertFromPackage(xlsx, XSSFExcelExtractor.class, MIN_EXCEL_TEXT);

        // Word
        assertFromPackage(docx, XWPFWordExtractor.class, MIN_WORD_TEXT);

        // PowerPoint
        assertFromPackage(pptx, XSLFPowerPointExtractor.class, MIN_POWERPOINT_TEXT);
    }
}