Refactor to make it easier to tell which content types each POIXMLTextExtractor handles

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@980414 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2010-07-29 11:57:08 +00:00
parent 507d4dd3a5
commit 443dd75e04
7 changed files with 171 additions and 51 deletions

View File

@ -34,6 +34,7 @@
<changes> <changes>
<release version="3.7-beta2" date="2010-??-??"> <release version="3.7-beta2" date="2010-??-??">
<action dev="POI-DEVELOPERS" type="add">Make it easier to tell which content types each POIXMLTextExtractor handles</action>
<action dev="POI-DEVELOPERS" type="fix">49649 - Added clone support for UserSView* and Feat* families of records</action> <action dev="POI-DEVELOPERS" type="fix">49649 - Added clone support for UserSView* and Feat* families of records</action>
<action dev="POI-DEVELOPERS" type="fix">49653 - Support for escaped unicode characters in Shared String Table</action> <action dev="POI-DEVELOPERS" type="fix">49653 - Support for escaped unicode characters in Shared String Table</action>
<action dev="POI-DEVELOPERS" type="fix">49579 - prevent ArrayIndexOutOfBoundException in UnknowEscherRecord</action> <action dev="POI-DEVELOPERS" type="fix">49579 - prevent ArrayIndexOutOfBoundException in UnknowEscherRecord</action>

View File

@ -52,6 +52,7 @@ import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xslf.XSLFSlideShow; import org.apache.poi.xslf.XSLFSlideShow;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor; import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xssf.usermodel.XSSFRelation; import org.apache.poi.xssf.usermodel.XSSFRelation;
@ -155,42 +156,40 @@ public class ExtractorFactory {
} }
public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException { public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
PackageRelationshipCollection core = PackageRelationshipCollection core =
pkg.getRelationshipsByType(CORE_DOCUMENT_REL); pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
if(core.size() != 1) { if(core.size() != 1) {
throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size()); throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
} }
PackagePart corePart = pkg.getPart(core.getRelationship(0)); PackagePart corePart = pkg.getPart(core.getRelationship(0));
if (corePart.getContentType().equals(XSSFRelation.WORKBOOK.getContentType()) ||
corePart.getContentType().equals(XSSFRelation.MACRO_TEMPLATE_WORKBOOK.getContentType()) || // Is it XSSF?
corePart.getContentType().equals(XSSFRelation.MACRO_ADDIN_WORKBOOK.getContentType()) || for(XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
corePart.getContentType().equals(XSSFRelation.TEMPLATE_WORKBOOK.getContentType()) || if(corePart.getContentType().equals(rel.getContentType())) {
corePart.getContentType().equals(XSSFRelation.MACROS_WORKBOOK.getContentType())) { if(getPreferEventExtractor()) {
if(getPreferEventExtractor()) { return new XSSFEventBasedExcelExtractor(pkg);
return new XSSFEventBasedExcelExtractor(pkg); } else {
} else { return new XSSFExcelExtractor(pkg);
return new XSSFExcelExtractor(pkg); }
} }
} }
if(corePart.getContentType().equals(XWPFRelation.DOCUMENT.getContentType()) || // Is it XWPF?
corePart.getContentType().equals(XWPFRelation.TEMPLATE.getContentType()) || for(XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
corePart.getContentType().equals(XWPFRelation.MACRO_DOCUMENT.getContentType()) || if(corePart.getContentType().equals(rel.getContentType())) {
corePart.getContentType().equals(XWPFRelation.MACRO_TEMPLATE_DOCUMENT.getContentType()) ) { return new XWPFWordExtractor(pkg);
return new XWPFWordExtractor(pkg); }
} }
if(corePart.getContentType().equals(XSLFSlideShow.MAIN_CONTENT_TYPE) || // Is it XSLF?
corePart.getContentType().equals(XSLFSlideShow.MACRO_CONTENT_TYPE) || for(XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
corePart.getContentType().equals(XSLFSlideShow.MACRO_TEMPLATE_CONTENT_TYPE) || if(corePart.getContentType().equals(rel.getContentType())) {
corePart.getContentType().equals(XSLFSlideShow.PRESENTATIONML_CONTENT_TYPE) || return new XSLFPowerPointExtractor(pkg);
corePart.getContentType().equals(XSLFSlideShow.PRESENTATIONML_TEMPLATE_CONTENT_TYPE) || }
corePart.getContentType().equals(XSLFSlideShow.PRESENTATION_MACRO_CONTENT_TYPE)) { }
return new XSLFPowerPointExtractor(pkg);
} throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
} }
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException { public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException {

View File

@ -22,6 +22,7 @@ import java.util.List;
import org.apache.poi.POIXMLDocument; import org.apache.poi.POIXMLDocument;
import org.apache.poi.util.Internal; import org.apache.poi.util.Internal;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.OPCPackage;
@ -29,7 +30,6 @@ import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationship; import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.xmlbeans.XmlException; import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.drawingml.x2006.main.ThemeDocument;
import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentList; import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentList;
import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide; import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
import org.openxmlformats.schemas.presentationml.x2006.main.CTPresentation; import org.openxmlformats.schemas.presentationml.x2006.main.CTPresentation;
@ -57,18 +57,6 @@ import org.openxmlformats.schemas.presentationml.x2006.main.SldMasterDocument;
* WARNING - APIs expected to change rapidly * WARNING - APIs expected to change rapidly
*/ */
public class XSLFSlideShow extends POIXMLDocument { public class XSLFSlideShow extends POIXMLDocument {
public static final String MAIN_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml";
public static final String MACRO_CONTENT_TYPE = "application/vnd.ms-powerpoint.slideshow.macroEnabled.main+xml";
public static final String MACRO_TEMPLATE_CONTENT_TYPE = "application/vnd.ms-powerpoint.template.macroEnabled.main+xml";
public static final String PRESENTATIONML_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.slideshow.main+xml";
public static final String PRESENTATIONML_TEMPLATE_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.template.main+xml";
public static final String PRESENTATION_MACRO_CONTENT_TYPE = "application/vnd.ms-powerpoint.presentation.macroEnabled.main+xml";
public static final String THEME_MANAGER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.themeManager+xml";
public static final String NOTES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.notesSlide+xml";
public static final String SLIDE_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.slide+xml";
public static final String SLIDE_LAYOUT_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/slideLayout";
public static final String NOTES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide";
public static final String COMMENT_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments";
private PresentationDocument presentationDoc; private PresentationDocument presentationDoc;
/** /**
@ -79,7 +67,7 @@ public class XSLFSlideShow extends POIXMLDocument {
public XSLFSlideShow(OPCPackage container) throws OpenXML4JException, IOException, XmlException { public XSLFSlideShow(OPCPackage container) throws OpenXML4JException, IOException, XmlException {
super(container); super(container);
if(getCorePart().getContentType().equals(THEME_MANAGER_CONTENT_TYPE)) { if(getCorePart().getContentType().equals(XSLFRelation.THEME_MANAGER.getContentType())) {
rebase(getPackage()); rebase(getPackage());
} }
@ -187,7 +175,7 @@ public class XSLFSlideShow extends POIXMLDocument {
PackagePart slidePart = getSlidePart(parentSlide); PackagePart slidePart = getSlidePart(parentSlide);
try { try {
notes = slidePart.getRelationshipsByType(NOTES_RELATION_TYPE); notes = slidePart.getRelationshipsByType(XSLFRelation.NOTES.getRelation());
} catch(InvalidFormatException e) { } catch(InvalidFormatException e) {
throw new IllegalStateException(e); throw new IllegalStateException(e);
} }
@ -231,7 +219,7 @@ public class XSLFSlideShow extends POIXMLDocument {
PackagePart slidePart = getSlidePart(slide); PackagePart slidePart = getSlidePart(slide);
try { try {
commentRels = slidePart.getRelationshipsByType(COMMENT_RELATION_TYPE); commentRels = slidePart.getRelationshipsByType(XSLFRelation.COMMENTS.getRelation());
} catch(InvalidFormatException e) { } catch(InvalidFormatException e) {
throw new IllegalStateException(e); throw new IllegalStateException(e);
} }

View File

@ -23,6 +23,7 @@ import org.apache.poi.xslf.XSLFSlideShow;
import org.apache.poi.xslf.usermodel.DrawingParagraph; import org.apache.poi.xslf.usermodel.DrawingParagraph;
import org.apache.poi.xslf.usermodel.XMLSlideShow; import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFCommonSlideData; import org.apache.poi.xslf.usermodel.XSLFCommonSlideData;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.poi.xslf.usermodel.XSLFSlide; import org.apache.poi.xslf.usermodel.XSLFSlide;
import org.apache.xmlbeans.XmlException; import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.presentationml.x2006.main.*; import org.openxmlformats.schemas.presentationml.x2006.main.*;
@ -30,6 +31,12 @@ import org.openxmlformats.schemas.presentationml.x2006.main.*;
import java.io.IOException; import java.io.IOException;
public class XSLFPowerPointExtractor extends POIXMLTextExtractor { public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
/**
 * The relations whose content types this extractor handles.
 * ExtractorFactory compares the content type of a package's core part
 * against each entry, in order, to decide whether to use this extractor.
 */
public static final XSLFRelation[] SUPPORTED_TYPES = {
    XSLFRelation.MAIN,
    XSLFRelation.MACRO,
    XSLFRelation.MACRO_TEMPLATE,
    XSLFRelation.PRESENTATIONML,
    XSLFRelation.PRESENTATIONML_TEMPLATE,
    XSLFRelation.PRESENTATION_MACRO,
};
private XMLSlideShow slideshow; private XMLSlideShow slideshow;
private boolean slidesByDefault = true; private boolean slidesByDefault = true;
private boolean notesByDefault = false; private boolean notesByDefault = false;

View File

@ -0,0 +1,111 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xslf.usermodel;
import java.util.HashMap;
import java.util.Map;
import org.apache.poi.POIXMLDocumentPart;
import org.apache.poi.POIXMLRelation;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
/**
 * Catalogue of the content types and relationship types used by the
 * parts of a PresentationML (XSLF) package.
 * <p>
 * Each constant bundles a content type with an optional relation type.
 * None of the constants declared here supplies a default part name or a
 * {@link POIXMLDocumentPart} class.
 */
public class XSLFRelation extends POIXMLRelation {

    private static POILogger log = POILogFactory.getLogger(XSLFRelation.class);

    /**
     * Lookup table from relation type to its XSLFRelation.
     * <p>
     * Must stay declared ahead of the constants below: the constructor may
     * access this map while those constants are being initialised.
     */
    protected static Map<String, XSLFRelation> _table = new HashMap<String, XSLFRelation>();

    /** Content type <code>...presentationml.presentation.main+xml</code>. */
    public static final XSLFRelation MAIN = new XSLFRelation(
            "application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml",
            null,
            null,
            null);

    /** Content type <code>...ms-powerpoint.slideshow.macroEnabled.main+xml</code>. */
    public static final XSLFRelation MACRO = new XSLFRelation(
            "application/vnd.ms-powerpoint.slideshow.macroEnabled.main+xml",
            null,
            null,
            null);

    /** Content type <code>...ms-powerpoint.template.macroEnabled.main+xml</code>. */
    public static final XSLFRelation MACRO_TEMPLATE = new XSLFRelation(
            "application/vnd.ms-powerpoint.template.macroEnabled.main+xml",
            null,
            null,
            null);

    /** Content type <code>...presentationml.slideshow.main+xml</code>. */
    public static final XSLFRelation PRESENTATIONML = new XSLFRelation(
            "application/vnd.openxmlformats-officedocument.presentationml.slideshow.main+xml",
            null,
            null,
            null);

    /** Content type <code>...presentationml.template.main+xml</code>. */
    public static final XSLFRelation PRESENTATIONML_TEMPLATE = new XSLFRelation(
            "application/vnd.openxmlformats-officedocument.presentationml.template.main+xml",
            null,
            null,
            null);

    /** Content type <code>...ms-powerpoint.presentation.macroEnabled.main+xml</code>. */
    public static final XSLFRelation PRESENTATION_MACRO = new XSLFRelation(
            "application/vnd.ms-powerpoint.presentation.macroEnabled.main+xml",
            null,
            null,
            null);

    /** Content type <code>...officedocument.themeManager+xml</code>. */
    public static final XSLFRelation THEME_MANAGER = new XSLFRelation(
            "application/vnd.openxmlformats-officedocument.themeManager+xml",
            null,
            null,
            null);

    /** Notes slide: content type plus the <code>.../relationships/notesSlide</code> relation. */
    public static final XSLFRelation NOTES = new XSLFRelation(
            "application/vnd.openxmlformats-officedocument.presentationml.notesSlide+xml",
            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide",
            null,
            null);

    /** Content type <code>...presentationml.slide+xml</code>. */
    public static final XSLFRelation SLIDE = new XSLFRelation(
            "application/vnd.openxmlformats-officedocument.presentationml.slide+xml",
            null,
            null,
            null);

    /** Slide layout: content type plus the <code>.../relationships/slideLayout</code> relation. */
    public static final XSLFRelation SLIDE_LAYOUT = new XSLFRelation(
            "application/vnd.openxmlformats-officedocument.presentationml.slideLayout+xml",
            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/slideLayout",
            null,
            null);

    /** Comments: content type plus the <code>.../relationships/comments</code> relation. */
    public static final XSLFRelation COMMENTS = new XSLFRelation(
            "application/vnd.openxmlformats-officedocument.presentationml.comments+xml",
            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments",
            null,
            null);

    /**
     * Creates a relation and, where appropriate, registers it in the lookup table.
     * All four arguments are handed unchanged to the POIXMLRelation constructor.
     *
     * @param type        the content type
     * @param rel         the relation type, used as the lookup key; may be null
     * @param defaultName the default part name; may be null
     * @param cls         the document part class; when null the relation is
     *                    not added to the lookup table
     */
    private XSLFRelation(String type, String rel, String defaultName, Class<? extends POIXMLDocumentPart> cls) {
        super(type, rel, defaultName, cls);

        // Only relations backed by a part class are registered, and the
        // first relation registered for a given relation type wins.
        if (cls == null || _table.containsKey(rel)) {
            return;
        }
        _table.put(rel, this);
    }

    /**
     * Looks up the XSLFRelation registered for a relation type.
     *
     * @param rel relation type, for example,
     *            <code>http://schemas.openxmlformats.org/officeDocument/2006/relationships/image</code>
     * @return the registered XSLFRelation, or null if none is registered for <code>rel</code>
     */
    public static XSLFRelation getInstance(String rel) {
        XSLFRelation registered = _table.get(rel);
        return registered;
    }
}

View File

@ -28,6 +28,7 @@ import org.apache.poi.ss.usermodel.Comment;
import org.apache.poi.ss.usermodel.HeaderFooter; import org.apache.poi.ss.usermodel.HeaderFooter;
import org.apache.poi.ss.usermodel.Row; import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFCell; import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xssf.usermodel.XSSFSheet; import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.xmlbeans.XmlException; import org.apache.xmlbeans.XmlException;
@ -36,6 +37,12 @@ import org.apache.xmlbeans.XmlException;
* Helper class to extract text from an OOXML Excel file * Helper class to extract text from an OOXML Excel file
*/ */
public class XSSFExcelExtractor extends POIXMLTextExtractor implements org.apache.poi.ss.extractor.ExcelExtractor { public class XSSFExcelExtractor extends POIXMLTextExtractor implements org.apache.poi.ss.extractor.ExcelExtractor {
/**
 * The relations whose content types this extractor handles.
 * ExtractorFactory compares the content type of a package's core part
 * against each entry, in order, to decide whether to use an Excel extractor.
 */
public static final XSSFRelation[] SUPPORTED_TYPES = {
    XSSFRelation.WORKBOOK,
    XSSFRelation.MACRO_TEMPLATE_WORKBOOK,
    XSSFRelation.MACRO_ADDIN_WORKBOOK,
    XSSFRelation.TEMPLATE_WORKBOOK,
    XSSFRelation.MACROS_WORKBOOK,
};
private XSSFWorkbook workbook; private XSSFWorkbook workbook;
private boolean includeSheetNames = true; private boolean includeSheetNames = true;
private boolean formulasNotResults = false; private boolean formulasNotResults = false;

View File

@ -30,6 +30,7 @@ import org.apache.poi.xwpf.model.XWPFHyperlinkDecorator;
import org.apache.poi.xwpf.model.XWPFParagraphDecorator; import org.apache.poi.xwpf.model.XWPFParagraphDecorator;
import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph; import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.poi.xwpf.usermodel.XWPFTable; import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.xmlbeans.XmlException; import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
@ -38,6 +39,12 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
* Helper class to extract text from an OOXML Word file * Helper class to extract text from an OOXML Word file
*/ */
public class XWPFWordExtractor extends POIXMLTextExtractor { public class XWPFWordExtractor extends POIXMLTextExtractor {
/**
 * The relations whose content types this extractor handles.
 * ExtractorFactory compares the content type of a package's core part
 * against each entry, in order, to decide whether to use this extractor.
 */
public static final XWPFRelation[] SUPPORTED_TYPES = {
    XWPFRelation.DOCUMENT,
    XWPFRelation.TEMPLATE,
    XWPFRelation.MACRO_DOCUMENT,
    XWPFRelation.MACRO_TEMPLATE_DOCUMENT,
};
private XWPFDocument document; private XWPFDocument document;
private boolean fetchHyperlinks = false; private boolean fetchHyperlinks = false;