Pull Scratchpad Extractor logic to own class

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1752225 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2016-07-11 22:27:02 +00:00
parent ed7940dee5
commit a5fb04f45f
2 changed files with 154 additions and 7 deletions

View File

@ -27,14 +27,12 @@ import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.POITextExtractor;
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.xmlbeans.XmlException;
/**
* Figures out the correct POIOLE2TextExtractor for your supplied
@ -104,15 +102,15 @@ public class OLE2ExtractorFactory {
return threadPreferEventExtractors.get();
}
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
/**
 * Builds a text extractor for the document held at the root of the
 *  supplied filesystem, by delegating to the directory-based
 *  {@code createExtractor} overload.
 *
 * @param fs the filesystem whose root directory is inspected
 * @return the extractor produced for the root, cast to {@link POIOLE2TextExtractor}
 * @throws IOException as declared by the directory-based overload
 */
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
    // Only ever an OLE2 one from the root of the FS, hence the cast
    final POITextExtractor rootExtractor = createExtractor(fs.getRoot());
    return (POIOLE2TextExtractor) rootExtractor;
}
public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
/**
 * Builds a text extractor for the document held at the root of the
 *  supplied filesystem, by delegating to the directory-based
 *  {@code createExtractor} overload.
 *
 * @param fs the filesystem whose root directory is inspected
 * @return the extractor produced for the root, cast to {@link POIOLE2TextExtractor}
 * @throws IOException as declared by the directory-based overload
 */
public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException {
    // Only ever an OLE2 one from the root of the FS, hence the cast
    final POITextExtractor rootExtractor = createExtractor(fs.getRoot());
    return (POIOLE2TextExtractor) rootExtractor;
}
public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
/**
 * Builds a text extractor for the document held at the root of the
 *  supplied filesystem, by delegating to the directory-based
 *  {@code createExtractor} overload.
 *
 * @param fs the filesystem whose root directory is inspected
 * @return the extractor produced for the root, cast to {@link POIOLE2TextExtractor}
 * @throws IOException as declared by the directory-based overload
 */
public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException {
    // Only ever an OLE2 one from the root of the FS, hence the cast
    final POITextExtractor rootExtractor = createExtractor(fs.getRoot());
    return (POIOLE2TextExtractor) rootExtractor;
}
@ -128,7 +126,7 @@ public class OLE2ExtractorFactory {
* {@link org.apache.poi.extractor.ExtractorFactory} for that.
*/
public static POITextExtractor createExtractor(DirectoryNode poifsDir)
throws IOException, OpenXML4JException, XmlException
throws IOException
{
// Look for certain entries in the stream, to figure it
// out from
@ -154,7 +152,7 @@ public class OLE2ExtractorFactory {
* {@link POITextExtractor} for each embedded file.
*/
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext)
throws IOException, OpenXML4JException, XmlException
throws IOException
{
// All the embedded directories we spotted
ArrayList<Entry> dirs = new ArrayList<Entry>();

View File

@ -0,0 +1,149 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.extractor;
import java.io.ByteArrayInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.POITextExtractor;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hsmf.MAPIMessage;
import org.apache.poi.hsmf.datatypes.AttachmentChunks;
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.hwpf.extractor.Word6Extractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.xmlbeans.XmlException;
/**
 * Scratchpad-specific logic for {@link OLE2ExtractorFactory} and
 *  {@link ExtractorFactory}, which permits the other two to run with
 *  no Scratchpad jar (though without functionality!)
 * <p>Note - should not be used standalone, always use via the other
 *  two classes</p>
 */
@SuppressWarnings("WeakerAccess")
public class OLE2ScrachpadExtractorFactory {
    /**
     * Entry names whose presence in a directory marks it as an Outlook message.
     * <p>The first short (0x1000, 0x0047, 0x0037) refers to the Property ID
     *  (see [MS-OXPROPS].pdf); the second short (0x001e, 0x001f, 0x0102) refers
     *  to the type of data stored in this entry, see
     *  https://msdn.microsoft.com/en-us/library/cc433490(v=exchg.80).aspx</p>
     *
     * @see org.apache.poi.hsmf.datatypes.Types.MAPIType
     */
    private static final String[] OUTLOOK_ENTRY_NAMES = {
        // message bodies, saved as plain text (PtypString)
        "__substg1.0_1000001E", //PidTagBody ASCII
        "__substg1.0_1000001F", //PidTagBody Unicode
        "__substg1.0_0047001E", //PidTagMessageSubmissionId ASCII
        "__substg1.0_0047001F", //PidTagMessageSubmissionId Unicode
        "__substg1.0_0037001E", //PidTagSubject ASCII
        "__substg1.0_0037001F", //PidTagSubject Unicode
    };

    /**
     * Looks for certain entries in the directory, to figure out
     *  which format it holds, and builds the matching extractor.
     * <p>Formats are probed in this order: Word ("WordDocument"),
     *  PowerPoint ("PowerPoint Document"), Visio ("VisioDocument"),
     *  Publisher ("Quill"), then Outlook (any of the message property entries).</p>
     * Note - doesn't check for core-supported formats!
     * Note - doesn't check for OOXML-supported formats
     *
     * @param poifsDir the directory to inspect and hand to the extractor
     * @return the extractor for the first format whose marker entry is present
     * @throws IllegalArgumentException if none of the marker entries is present
     * @throws IOException part of the declared signature; presumably raised
     *  by the extractor constructors on read failure - TODO confirm
     * @throws OpenXML4JException part of the declared signature, kept so that
     *  existing callers keep compiling
     * @throws XmlException part of the declared signature, kept so that
     *  existing callers keep compiling
     */
    public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException,
            OpenXML4JException, XmlException
    {
        if (poifsDir.hasEntry("WordDocument")) {
            // Old or new style word document?
            try {
                return new WordExtractor(poifsDir);
            } catch (OldWordFileFormatException e) {
                // The newer extractor rejected it as an old format, fall back
                return new Word6Extractor(poifsDir);
            }
        }

        if (poifsDir.hasEntry("PowerPoint Document")) {
            return new PowerPointExtractor(poifsDir);
        }

        if (poifsDir.hasEntry("VisioDocument")) {
            return new VisioTextExtractor(poifsDir);
        }

        if (poifsDir.hasEntry("Quill")) {
            return new PublisherTextExtractor(poifsDir);
        }

        for (String entryName : OUTLOOK_ENTRY_NAMES) {
            if (poifsDir.hasEntry(entryName)) {
                return new OutlookTextExtactor(poifsDir);
            }
        }

        throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
    }

    /**
     * Collects the embedded resources of the document behind the given
     *  extractor into the two supplied lists. Nothing is returned; the
     *  lists are only ever appended to.
     * <ul>
     *  <li>For a {@link WordExtractor}, every entry under the root's
     *   "ObjectPool" directory whose name starts with "_" is added
     *   to <code>dirs</code>.</li>
     *  <li>For an {@link OutlookTextExtactor}, each attachment with data
     *   is added to <code>nonPOIFS</code> as a stream over that data;
     *   otherwise, if it has an attachment directory, that directory is
     *   added to <code>dirs</code>.</li>
     *  <li>For any other extractor, both lists are left untouched.</li>
     * </ul>
     *
     * @param ext the extractor whose source document is searched
     * @param dirs receives the embedded directories that were found
     * @param nonPOIFS receives streams over embedded data that is not
     *  held as a directory
     * @throws IllegalStateException if the extractor has no root directory
     * @throws IOException part of the declared signature
     */
    public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, ArrayList<Entry> dirs, ArrayList<InputStream> nonPOIFS) throws IOException {
        // Find all the embedded directories
        DirectoryEntry root = ext.getRoot();
        if (root == null) {
            throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
        }

        if (ext instanceof WordExtractor) {
            // These are in ObjectPool -> _... under the root
            try {
                Entry pool = root.getEntry("ObjectPool");
                // Guard the cast: an "ObjectPool" entry that isn't a directory
                //  has no children to offer, and must not blow up the caller
                if (pool instanceof DirectoryEntry) {
                    Iterator<Entry> it = ((DirectoryEntry) pool).getEntries();
                    while (it.hasNext()) {
                        Entry entry = it.next();
                        if (entry.getName().startsWith("_")) {
                            dirs.add(entry);
                        }
                    }
                }
            } catch (FileNotFoundException ignored) {
                // No ObjectPool means no embedded documents, which is fine
            }
        } else if (ext instanceof OutlookTextExtactor) {
            // Stored in the Attachment blocks
            MAPIMessage msg = ((OutlookTextExtactor) ext).getMAPIMessage();
            for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
                if (attachment.attachData != null) {
                    byte[] data = attachment.attachData.getValue();
                    nonPOIFS.add(new ByteArrayInputStream(data));
                } else if (attachment.attachmentDirectory != null) {
                    dirs.add(attachment.attachmentDirectory.getDirectory());
                }
            }
        }
        // TODO PowerPointExtractor - tricky, not stored directly in poifs
    }
}