diff --git a/src/ooxml/java/org/apache/poi/POIXMLDocument.java b/src/ooxml/java/org/apache/poi/POIXMLDocument.java index bcc8c0911..9fa4789db 100644 --- a/src/ooxml/java/org/apache/poi/POIXMLDocument.java +++ b/src/ooxml/java/org/apache/poi/POIXMLDocument.java @@ -32,9 +32,6 @@ import org.openxml4j.opc.PackageRelationship; import org.openxml4j.opc.PackageRelationshipCollection; import org.openxml4j.opc.PackageRelationshipTypes; import org.openxml4j.opc.PackagingURIHelper; -import org.openxml4j.opc.internal.PackagePropertiesPart; -import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties; -import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.PropertiesDocument; public abstract class POIXMLDocument { @@ -48,6 +45,12 @@ public abstract class POIXMLDocument { /** The OPC core Package Part */ private PackagePart corePart; + /** + * The properties of the OPC package, opened as needed + */ + private POIXMLProperties properties; + + protected POIXMLDocument() {} protected POIXMLDocument(Package pkg) throws IOException { @@ -178,28 +181,13 @@ public abstract class POIXMLDocument { } /** - * Get the core document properties (core ooxml properties). - * TODO: Replace with nice usermodel wrapper - * @deprecated To be replaced with a proper user-model style view of the properties + * Get the document properties. This gives you access to the + * core ooxml properties, and the extended ooxml properties. 
*/ - public PackagePropertiesPart getCoreProperties() throws OpenXML4JException, IOException { - PackagePart propsPart = getSinglePartByRelationType(CORE_PROPERTIES_REL_TYPE); - if(propsPart == null) { - return null; + public POIXMLProperties getProperties() throws OpenXML4JException, IOException, XmlException { + if(properties == null) { + properties = new POIXMLProperties(pkg); } - return (PackagePropertiesPart)propsPart; - } - - /** - * Get the extended document properties (extended ooxml properties) - * TODO: Replace with nice usermodel wrapper - * @deprecated To be replaced with a proper user-model style view of the properties - */ - public CTProperties getExtendedProperties() throws OpenXML4JException, XmlException, IOException { - PackagePart propsPart = getSinglePartByRelationType(EXTENDED_PROPERTIES_REL_TYPE); - - PropertiesDocument props = PropertiesDocument.Factory.parse( - propsPart.getInputStream()); - return props.getProperties(); + return properties; } } diff --git a/src/ooxml/java/org/apache/poi/POIXMLProperties.java b/src/ooxml/java/org/apache/poi/POIXMLProperties.java new file mode 100644 index 000000000..7806c9b78 --- /dev/null +++ b/src/ooxml/java/org/apache/poi/POIXMLProperties.java @@ -0,0 +1,124 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi; + +import java.io.IOException; + +import org.apache.xmlbeans.XmlException; +import org.openxml4j.exceptions.OpenXML4JException; +import org.openxml4j.opc.Package; +import org.openxml4j.opc.PackageRelationshipCollection; +import org.openxml4j.opc.internal.PackagePropertiesPart; +import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties; +import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.PropertiesDocument; + +/** + * Wrapper around the two different kinds of OOXML properties + * a document can have + */ +public class POIXMLProperties { + private Package pkg; + private CoreProperties core; + private ExtendedProperties ext; + + public POIXMLProperties(Package docPackage) throws IOException, OpenXML4JException, XmlException { + this.pkg = docPackage; + + // Core properties + PackageRelationshipCollection coreRel = + pkg.getRelationshipsByType(POIXMLDocument.CORE_PROPERTIES_REL_TYPE); + if(coreRel.size() == 1) { + core = new CoreProperties( (PackagePropertiesPart) + pkg.getPart(coreRel.getRelationship(0)) ); + } else { + throw new IllegalArgumentException("A document must always have core properties defined!"); + } + + // Extended properties + PackageRelationshipCollection extRel = + pkg.getRelationshipsByType(POIXMLDocument.EXTENDED_PROPERTIES_REL_TYPE); + if(extRel.size() == 1) { + PropertiesDocument props = PropertiesDocument.Factory.parse( + pkg.getPart( extRel.getRelationship(0) ).getInputStream() + ); + ext = new ExtendedProperties(props); + } else { + ext = new ExtendedProperties(PropertiesDocument.Factory.newInstance()); + } + } + + /** + * Returns the core document properties + */ + public CoreProperties getCoreProperties() { + return core; + } + + /** + * Returns the extended document properties + */ + public 
ExtendedProperties getExtendedProperties() { + return ext; + } + + /** + * Writes out the ooxml properties into the supplied, + * new Package + */ + public void write(Package pkg) { + // TODO + } + + /** + * The core document properties + */ + public class CoreProperties { + private PackagePropertiesPart part; + private CoreProperties(PackagePropertiesPart part) { + this.part = part; + } + + public void setTitle(String title) { + part.setTitleProperty(title); + } + public String getTitle() { + return part.getTitleProperty().getValue(); + } + + public PackagePropertiesPart getUnderlyingProperties() { + return part; + } + } + + /** + * Extended document properties + */ + public class ExtendedProperties { + private PropertiesDocument props; + private ExtendedProperties(PropertiesDocument props) { + this.props = props; + + if(props.getProperties() == null) { + props.addNewProperties(); + } + } + + public CTProperties getUnderlyingProperties() { + return props.getProperties(); + } + } +} diff --git a/src/ooxml/java/org/apache/poi/POIXMLTextExtractor.java b/src/ooxml/java/org/apache/poi/POIXMLTextExtractor.java index c28eba49d..ae8514c27 100644 --- a/src/ooxml/java/org/apache/poi/POIXMLTextExtractor.java +++ b/src/ooxml/java/org/apache/poi/POIXMLTextExtractor.java @@ -16,6 +16,12 @@ ==================================================================== */ package org.apache.poi; +import java.io.IOException; + +import org.apache.poi.POIXMLProperties.*; +import org.apache.xmlbeans.XmlException; +import org.openxml4j.exceptions.OpenXML4JException; + public abstract class POIXMLTextExtractor extends POITextExtractor { /** The POIXMLDocument that's open */ protected POIXMLDocument document; @@ -28,4 +34,17 @@ public abstract class POIXMLTextExtractor extends POITextExtractor { this.document = document; } + + /** + * Returns the core document properties + */ + public CoreProperties getCoreProperties() throws IOException, OpenXML4JException, XmlException { + return 
document.getProperties().getCoreProperties(); + } + /** + * Returns the extended document properties + */ + public ExtendedProperties getExtendedProperties() throws IOException, OpenXML4JException, XmlException { + return document.getProperties().getExtendedProperties(); + } } diff --git a/src/ooxml/java/org/apache/poi/xwpf/XWPFDocument.java b/src/ooxml/java/org/apache/poi/xwpf/XWPFDocument.java index 4b54ed863..05b716d75 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/XWPFDocument.java +++ b/src/ooxml/java/org/apache/poi/xwpf/XWPFDocument.java @@ -24,6 +24,7 @@ import org.openxml4j.exceptions.InvalidFormatException; import org.openxml4j.exceptions.OpenXML4JException; import org.openxml4j.opc.Package; import org.openxml4j.opc.PackagePart; +import org.openxml4j.opc.PackageRelationshipCollection; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDocument1; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTStyles; @@ -47,6 +48,7 @@ public class XWPFDocument extends POIXMLDocument { public static final String HEADER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml"; public static final String STYLES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml"; public static final String STYLES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles"; + public static final String HYPERLINK_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"; private DocumentDocument wordDoc; @@ -89,4 +91,18 @@ public class XWPFDocument extends POIXMLDocument { StylesDocument.Factory.parse(parts[0].getInputStream()); return sd.getStyles(); } + + /** + * Returns all the hyperlink relations for the file. 
+ * You'll generally want to get the target to get + * the destination of the hyperlink + */ + public PackageRelationshipCollection getHyperlinks() { + try { + return getCorePart().getRelationshipsByType(HYPERLINK_RELATION_TYPE); + } catch(InvalidFormatException e) { + // Should never happen + throw new IllegalStateException(e); + } + } } diff --git a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java index 58fa4839c..bd1936d16 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java +++ b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java @@ -16,7 +16,6 @@ ==================================================================== */ package org.apache.poi.xwpf.extractor; -import java.io.File; import java.io.IOException; import org.apache.poi.POIXMLDocument; @@ -25,7 +24,9 @@ import org.apache.poi.xwpf.XWPFDocument; import org.apache.xmlbeans.XmlException; import org.openxml4j.exceptions.OpenXML4JException; import org.openxml4j.opc.Package; +import org.openxml4j.opc.PackageRelationship; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText; @@ -35,6 +36,7 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText; */ public class XWPFWordExtractor extends POIXMLTextExtractor { private XWPFDocument document; + private boolean fetchHyperlinks = false; public XWPFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException { this(new XWPFDocument(container)); @@ -56,6 +58,15 @@ public class XWPFWordExtractor extends POIXMLTextExtractor { )); System.out.println(extractor.getText()); } + + /** + * Should we also fetch the 
hyperlinks, when fetching + * the text content? Default is to only output the + * hyperlink label, and not the contents + */ + public void setFetchHyperlinks(boolean fetch) { + fetchHyperlinks = fetch; + } public String getText() { CTBody body = document.getDocumentBody(); @@ -64,9 +75,10 @@ public class XWPFWordExtractor extends POIXMLTextExtractor { // Loop over paragraphs CTP[] ps = body.getPArray(); for (int i = 0; i < ps.length; i++) { - // Loop over ranges + // Loop over ranges and hyperlinks + // TODO - properly intersperse ranges and hyperlinks CTR[] rs = ps[i].getRArray(); - for (int j = 0; j < rs.length; j++) { + for(int j = 0; j < rs.length; j++) { // Loop over text runs CTText[] texts = rs[j].getTArray(); for (int k = 0; k < texts.length; k++) { @@ -75,6 +87,26 @@ public class XWPFWordExtractor extends POIXMLTextExtractor { ); } } + + CTHyperlink[] hls = ps[i].getHyperlinkArray(); + for(CTHyperlink hl : hls) { + for(CTR r : hl.getRArray()) { + for(CTText txt : r.getTArray()) { + text.append(txt.getStringValue()); + } + } + if(fetchHyperlinks) { + String id = hl.getId(); + if(id != null) { + PackageRelationship hlRel = + document.getHyperlinks().getRelationshipByID(id); + if(hlRel != null) { + text.append(" <" + hlRel.getTargetURI().toString() + ">"); + } + } + } + } + // New line after each paragraph. 
text.append("\n"); } diff --git a/src/ooxml/testcases/org/apache/poi/xslf/TestXSLFSlideShow.java b/src/ooxml/testcases/org/apache/poi/xslf/TestXSLFSlideShow.java index c25d08a90..682fb9757 100644 --- a/src/ooxml/testcases/org/apache/poi/xslf/TestXSLFSlideShow.java +++ b/src/ooxml/testcases/org/apache/poi/xslf/TestXSLFSlideShow.java @@ -46,7 +46,7 @@ public class TestXSLFSlideShow extends TestCase { if(part.getContentType().equals(XSLFSlideShow.MAIN_CONTENT_TYPE)) { found = true; } - System.out.println(part); + //System.out.println(part); } assertTrue(found); } @@ -110,14 +110,14 @@ public class TestXSLFSlideShow extends TestCase { public void testMetadataBasics() throws Exception { XSLFSlideShow xml = new XSLFSlideShow(sampleFile); - assertNotNull(xml.getCoreProperties()); - assertNotNull(xml.getExtendedProperties()); + assertNotNull(xml.getProperties().getCoreProperties()); + assertNotNull(xml.getProperties().getExtendedProperties()); - assertEquals("Microsoft Office PowerPoint", xml.getExtendedProperties().getApplication()); - assertEquals(0, xml.getExtendedProperties().getCharacters()); - assertEquals(0, xml.getExtendedProperties().getLines()); + assertEquals("Microsoft Office PowerPoint", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication()); + assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters()); + assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines()); - assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue()); - assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue()); + assertEquals(null, xml.getProperties().getCoreProperties().getTitle()); + assertEquals(null, xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue()); } } diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/TestXWPFDocument.java 
b/src/ooxml/testcases/org/apache/poi/xwpf/TestXWPFDocument.java index 94127193f..0bd24a207 100644 --- a/src/ooxml/testcases/org/apache/poi/xwpf/TestXWPFDocument.java +++ b/src/ooxml/testcases/org/apache/poi/xwpf/TestXWPFDocument.java @@ -92,29 +92,29 @@ public class TestXWPFDocument extends TestCase { XWPFDocument xml = new XWPFDocument( POIXMLDocument.openPackage(sampleFile.toString()) ); - assertNotNull(xml.getCoreProperties()); - assertNotNull(xml.getExtendedProperties()); + assertNotNull(xml.getProperties().getCoreProperties()); + assertNotNull(xml.getProperties().getExtendedProperties()); - assertEquals("Microsoft Office Word", xml.getExtendedProperties().getApplication()); - assertEquals(1315, xml.getExtendedProperties().getCharacters()); - assertEquals(10, xml.getExtendedProperties().getLines()); + assertEquals("Microsoft Office Word", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication()); + assertEquals(1315, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters()); + assertEquals(10, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines()); - assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue()); - assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue()); + assertEquals(null, xml.getProperties().getCoreProperties().getTitle()); + assertEquals(null, xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue()); } public void testMetadataComplex() throws Exception { XWPFDocument xml = new XWPFDocument( POIXMLDocument.openPackage(complexFile.toString()) ); - assertNotNull(xml.getCoreProperties()); - assertNotNull(xml.getExtendedProperties()); + assertNotNull(xml.getProperties().getCoreProperties()); + assertNotNull(xml.getProperties().getExtendedProperties()); - assertEquals("Microsoft Office Outlook", xml.getExtendedProperties().getApplication()); - assertEquals(5184, 
xml.getExtendedProperties().getCharacters()); - assertEquals(0, xml.getExtendedProperties().getLines()); + assertEquals("Microsoft Office Outlook", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication()); + assertEquals(5184, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters()); + assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines()); - assertEquals(" ", xml.getCoreProperties().getTitleProperty().getValue()); - assertEquals(" ", xml.getCoreProperties().getSubjectProperty().getValue()); + assertEquals(" ", xml.getProperties().getCoreProperties().getTitle()); + assertEquals(" ", xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue()); } } diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java index 9a1f239bc..e62dd66ce 100644 --- a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java +++ b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java @@ -37,6 +37,12 @@ public class TestXWPFWordExtractor extends TestCase { */ private XWPFDocument xmlB; private File fileB; + + /** + * File with hyperlinks + */ + private XWPFDocument xmlC; + private File fileC; protected void setUp() throws Exception { super.setUp(); @@ -49,11 +55,17 @@ public class TestXWPFWordExtractor extends TestCase { System.getProperty("HWPF.testdata.path") + File.separator + "IllustrativeCases.docx" ); + fileC = new File( + System.getProperty("HWPF.testdata.path") + + File.separator + "TestDocument.docx" + ); assertTrue(fileA.exists()); assertTrue(fileB.exists()); + assertTrue(fileC.exists()); xmlA = new XWPFDocument(POIXMLDocument.openPackage(fileA.toString())); xmlB = new XWPFDocument(POIXMLDocument.openPackage(fileB.toString())); + xmlC = new 
XWPFDocument(POIXMLDocument.openPackage(fileC.toString())); } /** @@ -117,4 +129,32 @@ public class TestXWPFWordExtractor extends TestCase { } assertEquals(79, ps); } + + public void testGetWithHyperlinks() throws Exception { + XWPFWordExtractor extractor = + new XWPFWordExtractor(xmlC); + extractor.getText(); + extractor.setFetchHyperlinks(true); + extractor.getText(); + + // Now check contents + // TODO - fix once correctly handling contents + extractor.setFetchHyperlinks(false); + assertEquals( +// "This is a test document\nThis bit is in bold and italic\n" + +// "Back to normal\nWe have a hyperlink here, and another.\n", + "This is a test document\nThis bit is in bold and italic\n" + + "Back to normal\nWe have a here, and .hyperlinkanother\n", + extractor.getText() + ); + + extractor.setFetchHyperlinks(true); + assertEquals( +// "This is a test document\nThis bit is in bold and italic\n" + +// "Back to normal\nWe have a hyperlink here, and another.\n", + "This is a test document\nThis bit is in bold and italic\n" + + "Back to normal\nWe have a here, and .hyperlink another\n", + extractor.getText() + ); + } } diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/data/TestDocument.docx b/src/scratchpad/testcases/org/apache/poi/hwpf/data/TestDocument.docx new file mode 100755 index 000000000..058dec5e4 Binary files /dev/null and b/src/scratchpad/testcases/org/apache/poi/hwpf/data/TestDocument.docx differ