XSLF text extraction improvements relating to TIKA-712

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1175887 13f79535-47bb-0310-9956-ffa450edef68
2011-09-26 14:37:50 +00:00 · 2011-09-26 14:37:50 +00:00 · 9e1ca2caea
commit 9e1ca2caea
parent 8de1ce7c6a
5 changed files with 139 additions and 40 deletions
--- a/src/ooxml/java/org/apache/poi/xslf/extractor/XSLFPowerPointExtractor.java
+++ b/src/ooxml/java/org/apache/poi/xslf/extractor/XSLFPowerPointExtractor.java
@ -23,6 +23,8 @@ import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
 import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.xslf.XSLFSlideShow;
 import org.apache.poi.xslf.usermodel.DrawingParagraph;
+import org.apache.poi.xslf.usermodel.DrawingTextBody;
+import org.apache.poi.xslf.usermodel.DrawingTextPlaceholder;
 import org.apache.poi.xslf.usermodel.XMLSlideShow;
 import org.apache.poi.xslf.usermodel.XSLFCommentAuthors;
 import org.apache.poi.xslf.usermodel.XSLFComments;
@ -30,6 +32,7 @@ import org.apache.poi.xslf.usermodel.XSLFCommonSlideData;
 import org.apache.poi.xslf.usermodel.XSLFNotes;
 import org.apache.poi.xslf.usermodel.XSLFRelation;
 import org.apache.poi.xslf.usermodel.XSLFSlide;
+import org.apache.poi.xslf.usermodel.XSLFSlideLayout;
 import org.apache.poi.xslf.usermodel.XSLFSlideMaster;
 import org.apache.xmlbeans.XmlException;
 import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
@ -124,6 +127,7 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
         try {
            XSLFNotes notes = slide.getNotes();
            XSLFComments comments = slide.getComments();
+            XSLFSlideLayout layout = slide.getSlideLayout();
            XSLFSlideMaster master = slide.getMasterSheet();

            // TODO Do the slide's name
@ -131,11 +135,16 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {

            // Do the slide's text if requested
            if (slideText) {
-               extractText(slide.getCommonSlideData(), text);
+               extractText(slide.getCommonSlideData(), false, text);
               
-               // If there's a master sheet and it's requested, grab text from there
-               if(masterText && master != null) {
-                  extractText(master.getCommonSlideData(), text);
+               // If requested, get text from the master and its layout
+               if(masterText) {
+                  if(layout != null) {
+                     extractText(layout.getCommonSlideData(), true, text);
+                  }
+                  if(master != null) {
+                     extractText(master.getCommonSlideData(), true, text);
+                  }
               }

               // If the slide has comments, do those too
@ -158,7 +167,7 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {

            // Do the notes if requested
            if (notesText && notes != null) {
-               extractText(notes.getCommonSlideData(), text);
+               extractText(notes.getCommonSlideData(), false, text);
            }
         } catch (Exception e) {
            throw new RuntimeException(e);
@ -168,10 +177,20 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
      return text.toString();
   }
 	
-	private void extractText(XSLFCommonSlideData data, StringBuffer text) {
-        for (DrawingParagraph p : data.getText()) {
+	private void extractText(XSLFCommonSlideData data, boolean skipPlaceholders, StringBuffer text) {
+	   for(DrawingTextBody textBody : data.getDrawingText()) {
+	      if(skipPlaceholders && textBody instanceof DrawingTextPlaceholder) {
+	         DrawingTextPlaceholder ph = (DrawingTextPlaceholder)textBody;
+	         if(! ph.isPlaceholderCustom()) {
+	            // Skip non-customised placeholder text
+	            continue;
+	         }
+	      }
+	      
+	      for (DrawingParagraph p : textBody.getParagraphs()) {
            text.append(p.getText());
            text.append("\n");
-        }
-    }
+	      }
+	   }
+	}
 }
--- a/src/ooxml/java/org/apache/poi/xslf/usermodel/DrawingTextBody.java
+++ b/src/ooxml/java/org/apache/poi/xslf/usermodel/DrawingTextBody.java
@ -17,16 +17,16 @@

 package org.apache.poi.xslf.usermodel;

+import java.util.List;
+
 import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
 import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;

-import java.util.List;
-
 public class DrawingTextBody {
    private final CTTextBody textBody;

    public DrawingTextBody(CTTextBody textBody) {
-        this.textBody = textBody;
+       this.textBody = textBody;
    }

    public DrawingParagraph[] getParagraphs() {
--- a/src/ooxml/java/org/apache/poi/xslf/usermodel/DrawingTextPlaceholder.java
+++ b/src/ooxml/java/org/apache/poi/xslf/usermodel/DrawingTextPlaceholder.java
@ -0,0 +1,57 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.xslf.usermodel;
+
+import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTPlaceholder;
+import org.openxmlformats.schemas.presentationml.x2006.main.STPlaceholderType;
+
+/**
+ * A {@link DrawingTextBody} which is a placeholder
+ * @author nick
+ *
+ */
+public class DrawingTextPlaceholder extends DrawingTextBody {
+    private final CTPlaceholder placeholder; // placeholder definition supplied alongside the text body
+
+    public DrawingTextPlaceholder(CTTextBody textBody, CTPlaceholder placeholder) {
+       super(textBody);
+       this.placeholder = placeholder;
+    }
+    
+    /**
+     * What kind of placeholder is this? Returns the placeholder's type rendered as a String.
+     */
+    public String getPlaceholderType() {
+       return placeholder.getType().toString();
+    }
+
+    /**
+     * What kind of placeholder is this? Returns the placeholder's type as the schema enum value.
+     */
+    public STPlaceholderType.Enum getPlaceholderTypeEnum() {
+       return placeholder.getType();
+    }
+
+    /**
+     * Is the placeholder text customised? Reports the placeholder's {@code hasCustomPrompt} flag.
+     */
+    public boolean isPlaceholderCustom() {
+       return placeholder.getHasCustomPrompt();
+    }
+}
--- a/src/ooxml/java/org/apache/poi/xslf/usermodel/XSLFCommonSlideData.java
+++ b/src/ooxml/java/org/apache/poi/xslf/usermodel/XSLFCommonSlideData.java
@ -26,6 +26,7 @@ import org.apache.xmlbeans.impl.values.XmlAnyTypeImpl;
 import org.openxmlformats.schemas.drawingml.x2006.main.CTGraphicalObjectData;
 import org.openxmlformats.schemas.drawingml.x2006.main.CTTable;
 import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTApplicationNonVisualDrawingProps;
 import org.openxmlformats.schemas.presentationml.x2006.main.CTCommonSlideData;
 import org.openxmlformats.schemas.presentationml.x2006.main.CTGraphicalObjectFrame;
 import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
@ -42,11 +43,11 @@ public class XSLFCommonSlideData {
    public XSLFCommonSlideData(CTCommonSlideData data) {
        this.data = data;
    }
-
-    public List<DrawingParagraph> getText() {
+    
+    public List<DrawingTextBody> getDrawingText() {
        CTGroupShape gs = data.getSpTree();

-        List<DrawingParagraph> out = new ArrayList<DrawingParagraph>();
+        List<DrawingTextBody> out = new ArrayList<DrawingTextBody>();

        processShape(gs, out);

@ -77,8 +78,7 @@ public class XSLFCommonSlideData {
                    for (DrawingTableRow row : table.getRows()) {
                        for (DrawingTableCell cell : row.getCells()) {
                            DrawingTextBody textBody = cell.getTextBody();
-
-                            out.addAll(Arrays.asList(textBody.getParagraphs()));
+                            out.add(textBody);
                        }
                    }
                }
@ -89,19 +89,31 @@ public class XSLFCommonSlideData {

        return out;
    }
+    public List<DrawingParagraph> getText() {
+       List<DrawingParagraph> paragraphs = new ArrayList<DrawingParagraph>();
+       for(DrawingTextBody textBody : getDrawingText()) {
+          paragraphs.addAll(Arrays.asList(textBody.getParagraphs()));
+       }
+       return paragraphs;
+    }

-    private void processShape(CTGroupShape gs, List<DrawingParagraph> out) {
+    private void processShape(CTGroupShape gs, List<DrawingTextBody> out) {
        List<CTShape> shapes = gs.getSpList();
-        for (int i = 0; i < shapes.size(); i++) {
-            CTTextBody ctTextBody = shapes.get(i).getTxBody();
+        for (CTShape shape : shapes) {
+            CTTextBody ctTextBody = shape.getTxBody();
            if (ctTextBody==null) {
                continue;
            }
+            
+            DrawingTextBody textBody;
+            CTApplicationNonVisualDrawingProps nvpr = shape.getNvSpPr().getNvPr(); 
+            if(nvpr.isSetPh()) {
+               textBody = new DrawingTextPlaceholder(ctTextBody, nvpr.getPh());
+            } else {
+               textBody = new DrawingTextBody(ctTextBody);
+            }

-            DrawingTextBody textBody = new DrawingTextBody(ctTextBody);
-
-            out.addAll(Arrays.asList(textBody.getParagraphs()));
+            out.add(textBody);
        }
    }
-
 }
--- a/src/ooxml/testcases/org/apache/poi/xslf/extractor/TestXSLFPowerPointExtractor.java
+++ b/src/ooxml/testcases/org/apache/poi/xslf/extractor/TestXSLFPowerPointExtractor.java
@ -58,9 +58,13 @@ public class TestXSLFPowerPointExtractor extends TestCase {
 		assertTrue(text.startsWith("Lorem ipsum dolor sit amet\n"));
 		assertTrue(text.contains("amet\n\n"));

-		// Our master text, for tests
+		// Our placeholder master text
+		// This shouldn't show up in the output
 		String masterText =
         "Click to edit Master title style\n" +
+         "Click to edit Master subtitle style\n" +
+         "\n\n\n\n\n\n" +
+         "Click to edit Master title style\n" +
         "Click to edit Master text styles\n" +
         "Second level\n" +
         "Third level\n" +
@ -111,17 +115,13 @@ public class TestXSLFPowerPointExtractor extends TestCase {
            "Lorem ipsum dolor sit amet\n" +
            "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
            "\n" +
-            masterText +
-            "\n\n\n" +
            "Lorem ipsum dolor sit amet\n" +
            "Lorem\n" +
            "ipsum\n" +
            "dolor\n" +
            "sit\n" +
            "amet\n" +
-            "\n" +
-            masterText +
-            "\n\n\n"
+            "\n"
            , text
      );
 		
@ -131,17 +131,14 @@ public class TestXSLFPowerPointExtractor extends TestCase {
            "Lorem ipsum dolor sit amet\n" +
            "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
            "\n" +
-            masterText +
-            "\n\n\n\n\n" +
+            "\n\n" +
            "Lorem ipsum dolor sit amet\n" +
            "Lorem\n" +
            "ipsum\n" +
            "dolor\n" +
            "sit\n" +
            "amet\n" +
-            "\n" +
-            masterText +
-            "\n\n\n\n\n"
+            "\n\n\n"
            , text
      );
 		
@ -176,6 +173,9 @@ public class TestXSLFPowerPointExtractor extends TestCase {
         new XSLFSlideShow(OPCPackage.open(slTests.openResourceAsStream("WithMaster.pptx")));
      XSLFPowerPointExtractor extractor = 
         new XSLFPowerPointExtractor(xml);
+      extractor.setSlidesByDefault(true);
+      extractor.setNotesByDefault(false);
+      extractor.setMasterByDefault(true);
      
      String text = extractor.getText();
      assertTrue(text.length() > 0);
@ -183,17 +183,28 @@ public class TestXSLFPowerPointExtractor extends TestCase {
      // Check master text is there
      assertTrue("Unable to find expected word in text\n" + text, 
            text.contains("Footer from the master slide"));
+
+      // Theme text shouldn't show up
+      String themeText = 
+         "Theme Master Title\n" +
+         "Theme Master first level\n" +
+         "And the 2nd level\n" +
+         "Our 3rd level goes here\n" +
+         "And onto the 4th, such fun….\n" +
+         "Finally is the Fifth level\n";
      
      // Check the whole text
      assertEquals(
            "First page title\n" +
            "First page subtitle\n" +
-//            "This text comes from the Master Slide\n" + // TODO
-//            "This is the Master Title\n" + // TODO
-            "\n" + // TODO Should be the above
+            "This is the Master Title\n" +
+            "This text comes from the Master Slide\n" +
+            "\n" +
+            // TODO Detect we didn't have a title, and include the master one
            "2nd page subtitle\n" +
-//          "This text comes from the Master Slide\n" + // TODO
-            "Footer from the master slide\n"
+            "Footer from the master slide\n" +
+            "This is the Master Title\n" +
+            "This text comes from the Master Slide\n"
            , text
      );
 	}