2008-04-08 07:43:37 -04:00
|
|
|
/* ====================================================================
|
|
|
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
|
|
|
contributor license agreements. See the NOTICE file distributed with
|
|
|
|
this work for additional information regarding copyright ownership.
|
|
|
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
|
|
|
(the "License"); you may not use this file except in compliance with
|
|
|
|
the License. You may obtain a copy of the License at
|
|
|
|
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
See the License for the specific language governing permissions and
|
|
|
|
limitations under the License.
|
|
|
|
==================================================================== */
|
|
|
|
package org.apache.poi.extractor;
|
|
|
|
|
|
|
|
import java.io.File;
|
|
|
|
import java.io.FileInputStream;
|
|
|
|
import java.io.IOException;
|
|
|
|
import java.io.InputStream;
|
|
|
|
import java.io.PushbackInputStream;
|
|
|
|
import java.util.Iterator;
|
|
|
|
|
Merged revisions 638786-638802,638805-638811,638813-638814,638816-639230,639233-639241,639243-639253,639255-639486,639488-639601,639603-639835,639837-639917,639919-640056,640058-640710,640712-641156,641158-641184,641186-641795,641797-641798,641800-641933,641935-641963,641965-641966,641968-641995,641997-642230,642232-642562,642564-642565,642568-642570,642572-642573,642576-642736,642739-642877,642879,642881-642890,642892-642903,642905-642945,642947-643624,643626-643653,643655-643669,643671,643673-643830,643832-643833,643835-644342,644344-644472,644474-644508,644510-645347,645349-645351,645353-645559,645561-645565,645568-645951,645953-646193,646195-646313 via svnmerge from
https://svn.apache.org:443/repos/asf/poi/trunk
........
r646312 | nick | 2008-04-09 13:46:42 +0100 (Wed, 09 Apr 2008) | 1 line
Provide a common ole2 implementation of POITextExtractor, which gives access to the document metadata
........
git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@646818 13f79535-47bb-0310-9956-ffa450edef68
2008-04-10 10:26:36 -04:00
|
|
|
import org.apache.poi.POIOLE2TextExtractor;
|
2008-04-08 07:43:37 -04:00
|
|
|
import org.apache.poi.POITextExtractor;
|
|
|
|
import org.apache.poi.POIXMLDocument;
|
|
|
|
import org.apache.poi.POIXMLTextExtractor;
|
2008-04-08 08:17:18 -04:00
|
|
|
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
|
2008-04-08 08:03:05 -04:00
|
|
|
import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
2008-04-08 07:43:37 -04:00
|
|
|
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
2008-04-08 08:03:05 -04:00
|
|
|
import org.apache.poi.hwpf.extractor.WordExtractor;
|
2008-04-08 07:43:37 -04:00
|
|
|
import org.apache.poi.poifs.filesystem.Entry;
|
|
|
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
|
|
|
import org.apache.poi.xslf.XSLFSlideShow;
|
|
|
|
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
|
|
|
|
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
|
2008-08-11 15:14:03 -04:00
|
|
|
import org.apache.poi.xssf.usermodel.XSSFRelation;
|
2008-04-08 07:43:37 -04:00
|
|
|
import org.apache.poi.xwpf.XWPFDocument;
|
|
|
|
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
|
|
|
|
import org.apache.xmlbeans.XmlException;
|
2008-08-11 15:14:03 -04:00
|
|
|
import org.openxml4j.exceptions.InvalidFormatException;
|
|
|
|
import org.openxml4j.exceptions.OpenXML4JException;
|
|
|
|
import org.openxml4j.opc.Package;
|
|
|
|
import org.openxml4j.opc.PackagePart;
|
|
|
|
import org.openxml4j.opc.PackageRelationshipCollection;
|
2008-04-08 07:43:37 -04:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Figures out the correct POITextExtractor for your supplied
|
|
|
|
* document, and returns it.
|
|
|
|
*/
|
|
|
|
public class ExtractorFactory {
|
|
|
|
public static final String CORE_DOCUMENT_REL =
|
|
|
|
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
|
|
|
|
|
2008-04-08 08:03:05 -04:00
|
|
|
public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
|
|
|
InputStream inp = new PushbackInputStream(
|
|
|
|
new FileInputStream(f), 8);
|
2008-04-08 07:43:37 -04:00
|
|
|
|
2008-04-08 08:03:05 -04:00
|
|
|
if(POIFSFileSystem.hasPOIFSHeader(inp)) {
|
|
|
|
return createExtractor(new POIFSFileSystem(inp));
|
2008-04-08 07:43:37 -04:00
|
|
|
}
|
2008-04-08 08:03:05 -04:00
|
|
|
if(POIXMLDocument.hasOOXMLHeader(inp)) {
|
|
|
|
inp.close();
|
2008-04-08 07:43:37 -04:00
|
|
|
return createExtractor(Package.open(f.toString()));
|
|
|
|
}
|
|
|
|
throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file");
|
|
|
|
}
|
|
|
|
|
2008-04-08 08:03:05 -04:00
|
|
|
public static POITextExtractor createExtractor(InputStream inp) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
2008-04-08 07:43:37 -04:00
|
|
|
// Figure out the kind of stream
|
|
|
|
// If clearly doesn't do mark/reset, wrap up
|
|
|
|
if(! inp.markSupported()) {
|
|
|
|
inp = new PushbackInputStream(inp, 8);
|
|
|
|
}
|
|
|
|
|
|
|
|
if(POIFSFileSystem.hasPOIFSHeader(inp)) {
|
|
|
|
return createExtractor(new POIFSFileSystem(inp));
|
|
|
|
}
|
|
|
|
if(POIXMLDocument.hasOOXMLHeader(inp)) {
|
|
|
|
return createExtractor(Package.open(inp));
|
|
|
|
}
|
|
|
|
throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream");
|
|
|
|
}
|
|
|
|
|
2008-04-08 08:03:05 -04:00
|
|
|
public static POIXMLTextExtractor createExtractor(Package pkg) throws IOException, OpenXML4JException, XmlException {
|
2008-04-08 07:43:37 -04:00
|
|
|
PackageRelationshipCollection core =
|
|
|
|
pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
|
|
|
|
if(core.size() != 1) {
|
|
|
|
throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
|
|
|
|
}
|
|
|
|
|
|
|
|
PackagePart corePart = pkg.getPart(core.getRelationship(0));
|
2008-08-11 15:14:03 -04:00
|
|
|
if(corePart.getContentType().equals(XSSFRelation.WORKBOOK.getContentType())) {
|
2008-04-08 07:43:37 -04:00
|
|
|
return new XSSFExcelExtractor(pkg);
|
|
|
|
}
|
|
|
|
if(corePart.getContentType().equals(XWPFDocument.MAIN_CONTENT_TYPE)) {
|
|
|
|
return new XWPFWordExtractor(pkg);
|
|
|
|
}
|
|
|
|
if(corePart.getContentType().equals(XSLFSlideShow.MAIN_CONTENT_TYPE)) {
|
|
|
|
return new XSLFPowerPointExtractor(pkg);
|
|
|
|
}
|
|
|
|
throw new IllegalArgumentException("No supported documents found in the OOXML package");
|
|
|
|
}
|
|
|
|
|
Merged revisions 638786-638802,638805-638811,638813-638814,638816-639230,639233-639241,639243-639253,639255-639486,639488-639601,639603-639835,639837-639917,639919-640056,640058-640710,640712-641156,641158-641184,641186-641795,641797-641798,641800-641933,641935-641963,641965-641966,641968-641995,641997-642230,642232-642562,642564-642565,642568-642570,642572-642573,642576-642736,642739-642877,642879,642881-642890,642892-642903,642905-642945,642947-643624,643626-643653,643655-643669,643671,643673-643830,643832-643833,643835-644342,644344-644472,644474-644508,644510-645347,645349-645351,645353-645559,645561-645565,645568-645951,645953-646193,646195-646313 via svnmerge from
https://svn.apache.org:443/repos/asf/poi/trunk
........
r646312 | nick | 2008-04-09 13:46:42 +0100 (Wed, 09 Apr 2008) | 1 line
Provide a common ole2 implementation of POITextExtractor, which gives access to the document metadata
........
git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@646818 13f79535-47bb-0310-9956-ffa450edef68
2008-04-10 10:26:36 -04:00
|
|
|
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
|
2008-04-08 07:43:37 -04:00
|
|
|
// Look for certain entries in the stream, to figure it
|
|
|
|
// out from
|
|
|
|
for(Iterator entries = fs.getRoot().getEntries(); entries.hasNext(); ) {
|
|
|
|
Entry entry = (Entry)entries.next();
|
2008-04-08 08:03:05 -04:00
|
|
|
|
2008-04-08 07:43:37 -04:00
|
|
|
if(entry.getName().equals("Workbook")) {
|
|
|
|
return new ExcelExtractor(fs);
|
|
|
|
}
|
2008-04-08 08:03:05 -04:00
|
|
|
if(entry.getName().equals("WordDocument")) {
|
|
|
|
return new WordExtractor(fs);
|
|
|
|
}
|
|
|
|
if(entry.getName().equals("PowerPoint Document")) {
|
|
|
|
return new PowerPointExtractor(fs);
|
|
|
|
}
|
2008-04-08 08:17:18 -04:00
|
|
|
if(entry.getName().equals("VisioDocument")) {
|
|
|
|
return new VisioTextExtractor(fs);
|
|
|
|
}
|
2008-04-08 07:43:37 -04:00
|
|
|
}
|
|
|
|
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
|
|
|
|
}
|
|
|
|
}
|