From ebbbaefe691f2550372f9b11a593bf8de112e759 Mon Sep 17 00:00:00 2001
From: Tim Allison
Date: Thu, 8 Aug 2013 14:04:07 +0000
Subject: [PATCH] 55347 - integrate textbox text extraction with Excel extractors

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1511789 13f79535-47bb-0310-9956-ffa450edef68
---
 .../poi/xssf/eventusermodel/XSSFReader.java   | 33 +++++++++++++++++++
 .../XSSFEventBasedExcelExtractor.java         | 30 ++++++++++++++++-
 .../xssf/extractor/XSSFExcelExtractor.java    | 26 ++++++++++++++-
 .../poi/xssf/usermodel/XSSFDrawing.java       |  2 +-
 .../xssf/eventusermodel/TestXSSFReader.java   | 32 ++++++++++++++++++
 .../TestXSSFEventBasedExcelExtractor.java     | 20 +++++++++++
 .../extractor/TestXSSFExcelExtractor.java     | 12 +++++++
 7 files changed, 152 insertions(+), 3 deletions(-)

diff --git a/src/ooxml/java/org/apache/poi/xssf/eventusermodel/XSSFReader.java b/src/ooxml/java/org/apache/poi/xssf/eventusermodel/XSSFReader.java
index 59ea919c5..7c250948c 100644
--- a/src/ooxml/java/org/apache/poi/xssf/eventusermodel/XSSFReader.java
+++ b/src/ooxml/java/org/apache/poi/xssf/eventusermodel/XSSFReader.java
@@ -21,6 +21,8 @@ import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
 import java.util.Map;
 
 import org.apache.poi.POIXMLException;
@@ -37,7 +39,9 @@ import org.apache.poi.xssf.model.CommentsTable;
 import org.apache.poi.xssf.model.SharedStringsTable;
 import org.apache.poi.xssf.model.StylesTable;
 import org.apache.poi.xssf.model.ThemesTable;
+import org.apache.poi.xssf.usermodel.XSSFDrawing;
 import org.apache.poi.xssf.usermodel.XSSFRelation;
+import org.apache.poi.xssf.usermodel.XSSFShape;
 import org.apache.xmlbeans.XmlException;
 import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSheet;
 import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorkbook;
@@ -273,6 +277,35 @@ public class XSSFReader {
             return null;
         }
 
+        /**
+         * Returns the shapes associated with this sheet,
+         * an empty list or null if there is an exception
+         */
+        public List<XSSFShape> getShapes() {
+            PackagePart sheetPkg = getSheetPart();
+            List<XSSFShape> shapes= new LinkedList<XSSFShape>();
+            // Do we have a comments relationship? (Only ever one if so)
+            try {
+                PackageRelationshipCollection drawingsList = sheetPkg.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation());
+                for (int i = 0; i < drawingsList.size(); i++){
+                    PackageRelationship drawings = drawingsList.getRelationship(i);
+                    PackagePartName drawingsName = PackagingURIHelper.createPartName(drawings.getTargetURI());
+                    PackagePart drawingsPart = sheetPkg.getPackage().getPart(drawingsName);
+                    XSSFDrawing drawing = new XSSFDrawing(drawingsPart, drawings);
+                    for (XSSFShape shape : drawing.getShapes()){
+                        shapes.add(shape);
+                    }
+                }
+            } catch (XmlException e){
+                return null;
+            } catch (InvalidFormatException e) {
+                return null;
+            } catch (IOException e) {
+                return null;
+            }
+            return shapes;
+        }
+
         public PackagePart getSheetPart() {
             String sheetId = ctSheet.getId();
             return sheetMap.get(sheetId);
diff --git a/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java b/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java
index 0c31fe04d..a313271e5 100644
--- a/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java
+++ b/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java
@@ -18,6 +18,7 @@ package org.apache.poi.xssf.extractor;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.List;
 import java.util.Locale;
 
 import javax.xml.parsers.ParserConfigurationException;
@@ -37,6 +38,8 @@ import org.apache.poi.xssf.eventusermodel.XSSFReader;
 import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
 import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
 import org.apache.poi.xssf.model.StylesTable;
+import org.apache.poi.xssf.usermodel.XSSFShape;
+import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
 import org.apache.xmlbeans.XmlException;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.InputSource;
@@ -54,6 +57,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor {
     private Locale locale;
     private boolean includeSheetNames = true;
     private boolean formulasNotResults = false;
+    private boolean includeTextBoxes = true;
 
     public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException {
         this(OPCPackage.open(path));
@@ -89,6 +93,14 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor {
     public void setFormulasNotResults(boolean formulasNotResults) {
         this.formulasNotResults = formulasNotResults;
     }
+
+    /**
+     * Should text from textboxes be included? Default is true
+     */
+
+    public void setIncludeTextBoxes(boolean includeTextBoxes) {
+        this.includeTextBoxes = includeTextBoxes;
+    }
 
     public void setLocale(Locale locale) {
         this.locale = locale;
@@ -175,6 +187,9 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor {
                     text.append('\n');
                 }
                 processSheet(sheetExtractor, styles, strings, stream);
+                if (includeTextBoxes){
+                    processShapes(iter.getShapes(), text);
+                }
                 stream.close();
             }
 
@@ -191,7 +206,20 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor {
         }
     }
 
-    @Override
+    private void processShapes(List<XSSFShape> shapes, StringBuffer text) {
+        if (shapes == null){
+            return;
+        }
+        for (XSSFShape shape : shapes){
+            if (shape instanceof XSSFSimpleShape){
+                String sText = ((XSSFSimpleShape)shape).getText();
+                if (sText != null && sText.length() > 0){
+                    text.append(sText).append('\n');
+                }
+            }
+        }
+    }
+    @Override
     public void close() throws IOException {
         if (container != null) {
             container.close();
diff --git a/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java b/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java
index 32702fd8f..1d7c87715 100644
--- a/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java
+++ b/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java
@@ -31,8 +31,11 @@ import org.apache.poi.ss.usermodel.DataFormatter;
 import org.apache.poi.ss.usermodel.HeaderFooter;
 import org.apache.poi.ss.usermodel.Row;
 import org.apache.poi.xssf.usermodel.XSSFCell;
+import org.apache.poi.xssf.usermodel.XSSFDrawing;
 import org.apache.poi.xssf.usermodel.XSSFRelation;
+import org.apache.poi.xssf.usermodel.XSSFShape;
 import org.apache.poi.xssf.usermodel.XSSFSheet;
+import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
 import org.apache.poi.xssf.usermodel.XSSFWorkbook;
 import org.apache.xmlbeans.XmlException;
 
@@ -52,6 +55,7 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor implements org.apach
     private boolean formulasNotResults = false;
     private boolean includeCellComments = false;
     private boolean includeHeadersFooters = true;
+    private boolean includeTextBoxes = true;
 
     /**
      * @deprecated Use {@link #XSSFExcelExtractor(org.apache.poi.openxml4j.opc.OPCPackage)} instead.
@@ -103,6 +107,13 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor implements org.apach
     public void setIncludeHeadersFooters(boolean includeHeadersFooters) {
         this.includeHeadersFooters = includeHeadersFooters;
     }
+    /**
+     * Should text within textboxes be included? Default is true
+     * @param includeTextBoxes
+     */
+    public void setIncludeTextBoxes(boolean includeTextBoxes){
+        this.includeTextBoxes = includeTextBoxes;
+    }
     /**
      * What Locale should be used for formatting numbers (based
      * on the styles applied to the cells)
@@ -180,7 +191,20 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor implements org.apach
                 }
                 text.append("\n");
             }
-
+
+            // add textboxes
+            if (includeTextBoxes){
+                XSSFDrawing drawing = sheet.createDrawingPatriarch();
+                for (XSSFShape shape : drawing.getShapes()){
+                    if (shape instanceof XSSFSimpleShape){
+                        String boxText = ((XSSFSimpleShape)shape).getText();
+                        if (boxText.length() > 0){
+                            text.append(boxText);
+                            text.append('\n');
+                        }
+                    }
+                }
+            }
             // Finally footer(s), if present
             if(includeHeadersFooters) {
                 text.append(
diff --git a/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFDrawing.java b/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFDrawing.java
index 82c5f99d1..100b539de 100644
--- a/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFDrawing.java
+++ b/src/ooxml/java/org/apache/poi/xssf/usermodel/XSSFDrawing.java
@@ -76,7 +76,7 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing {
      * @param rel the package relationship holding this drawing,
      * the relationship type must be http://schemas.openxmlformats.org/officeDocument/2006/relationships/drawing
      */
-    protected XSSFDrawing(PackagePart part, PackageRelationship rel) throws IOException, XmlException {
+    public XSSFDrawing(PackagePart part, PackageRelationship rel) throws IOException, XmlException {
         super(part, rel);
         XmlOptions options = new XmlOptions(DEFAULT_XML_OPTIONS);
         //Removing root element
diff --git a/src/ooxml/testcases/org/apache/poi/xssf/eventusermodel/TestXSSFReader.java b/src/ooxml/testcases/org/apache/poi/xssf/eventusermodel/TestXSSFReader.java
index 5baaebb90..85613d3e4 100644
--- a/src/ooxml/testcases/org/apache/poi/xssf/eventusermodel/TestXSSFReader.java
+++ b/src/ooxml/testcases/org/apache/poi/xssf/eventusermodel/TestXSSFReader.java
@@ -19,6 +19,7 @@ package org.apache.poi.xssf.eventusermodel;
 
 import java.io.InputStream;
 import java.util.Iterator;
+import java.util.List;
 
 import junit.framework.TestCase;
 
@@ -27,6 +28,8 @@ import org.apache.poi.util.IOUtils;
 import org.apache.poi.xssf.XSSFTestDataSamples;
 import org.apache.poi.xssf.model.CommentsTable;
 import org.apache.poi.xssf.usermodel.XSSFRichTextString;
+import org.apache.poi.xssf.usermodel.XSSFShape;
+import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
 import org.apache.poi.POIDataSamples;
 
 /**
@@ -164,4 +167,33 @@ public final class TestXSSFReader extends TestCase {
             stream.close();
         }
     }
+    /**
+     * Test text extraction from text box using getShapes()
+     * @throws Exception
+     */
+    public void testShapes() throws Exception{
+        OPCPackage pkg = XSSFTestDataSamples.openSamplePackage("WithTextBox.xlsx");
+        XSSFReader r = new XSSFReader(pkg);
+        XSSFReader.SheetIterator it = (XSSFReader.SheetIterator)r.getSheetsData();
+
+        StringBuilder sb = new StringBuilder();
+        while(it.hasNext())
+        {
+            it.next();
+            List<XSSFShape> shapes = it.getShapes();
+            if (shapes != null){
+                for (XSSFShape shape : shapes){
+                    if (shape instanceof XSSFSimpleShape){
+                        String t = ((XSSFSimpleShape)shape).getText();
+                        sb.append(t).append('\n');
+                    }
+                }
+            }
+        }
+        String text = sb.toString();
+        assertTrue(text.indexOf("Line 1") > -1);
+        assertTrue(text.indexOf("Line 2") > -1);
+        assertTrue(text.indexOf("Line 3") > -1);
+
+    }
 }
diff --git a/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java b/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java
index eac3700e7..1b0e6c479 100644
--- a/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java
+++ b/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFEventBasedExcelExtractor.java
@@ -17,6 +17,7 @@
 
 package org.apache.poi.xssf.extractor;
 
+import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -25,7 +26,11 @@ import junit.framework.TestCase;
 import org.apache.poi.POITextExtractor;
 import org.apache.poi.hssf.HSSFTestDataSamples;
 import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.xssf.XSSFTestDataSamples;
+import org.apache.poi.xssf.eventusermodel.XSSFReader;
+import org.apache.poi.xssf.usermodel.XSSFShape;
+import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
 
 /**
  * Tests for {@link XSSFEventBasedExcelExtractor}
@@ -167,4 +172,19 @@ public class TestXSSFEventBasedExcelExtractor extends TestCase {
         ole2Extractor.close();
         ooxmlExtractor.close();
     }
+
+    /**
+     * Test text extraction from text box using getShapes()
+     * @throws Exception
+     */
+    public void testShapes() throws Exception{
+        XSSFEventBasedExcelExtractor ooxmlExtractor = getExtractor("WithTextBox.xlsx");
+
+        String text = ooxmlExtractor.getText();
+
+        assertTrue(text.indexOf("Line 1") > -1);
+        assertTrue(text.indexOf("Line 2") > -1);
+        assertTrue(text.indexOf("Line 3") > -1);
+
+    }
 }
diff --git a/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFExcelExtractor.java b/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFExcelExtractor.java
index bc86d6f9b..b4872d118 100644
--- a/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFExcelExtractor.java
+++ b/src/ooxml/testcases/org/apache/poi/xssf/extractor/TestXSSFExcelExtractor.java
@@ -211,4 +211,16 @@ public class TestXSSFExcelExtractor extends TestCase {
 
         extractor.close();
     }
+    /**
+     * Simple test for text box text
+     * @throws IOException
+     */
+    public void testTextBoxes() throws IOException {
+        XSSFExcelExtractor extractor = getExtractor("WithTextBox.xlsx");
+        extractor.setFormulasNotResults(true);
+        String text = extractor.getText();
+        assertTrue(text.indexOf("Line 1") > -1);
+        assertTrue(text.indexOf("Line 2") > -1);
+        assertTrue(text.indexOf("Line 3") > -1);
+    }
 }