55347 - integrate textbox text extraction with Excel extractors

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1511789 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Tim Allison 2013-08-08 14:04:07 +00:00
parent 0d700b0470
commit ebbbaefe69
7 changed files with 152 additions and 3 deletions

View File

@ -21,6 +21,8 @@ import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.apache.poi.POIXMLException;
@ -37,7 +39,9 @@ import org.apache.poi.xssf.model.CommentsTable;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.model.ThemesTable;
import org.apache.poi.xssf.usermodel.XSSFDrawing;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xssf.usermodel.XSSFShape;
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSheet;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorkbook;
@ -273,6 +277,35 @@ public class XSSFReader {
return null;
}
/**
 * Returns the shapes held by the drawing parts related to the current sheet.
 *
 * @return the shapes, in drawing-relationship order; an empty list when the
 *  sheet has no drawing relationships; or <code>null</code> if a drawing
 *  part could not be resolved or parsed
 */
public List<XSSFShape> getShapes() {
    PackagePart sheetPkg = getSheetPart();
    // Only appended to here and iterated by callers, so an array-backed list suffices
    List<XSSFShape> shapes = new ArrayList<XSSFShape>();
    try {
        // Walk every DRAWINGS relationship of the sheet part and gather its shapes
        PackageRelationshipCollection drawingsList = sheetPkg.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation());
        for (int i = 0; i < drawingsList.size(); i++) {
            PackageRelationship drawings = drawingsList.getRelationship(i);
            PackagePartName drawingsName = PackagingURIHelper.createPartName(drawings.getTargetURI());
            PackagePart drawingsPart = sheetPkg.getPackage().getPart(drawingsName);
            XSSFDrawing drawing = new XSSFDrawing(drawingsPart, drawings);
            for (XSSFShape shape : drawing.getShapes()) {
                shapes.add(shape);
            }
        }
    } catch (XmlException e) {
        // null (not a partial list) is the documented failure signal
        return null;
    } catch (InvalidFormatException e) {
        return null;
    } catch (IOException e) {
        return null;
    }
    return shapes;
}
public PackagePart getSheetPart() {
String sheetId = ctSheet.getId();
return sheetMap.get(sheetId);

View File

@ -18,6 +18,7 @@ package org.apache.poi.xssf.extractor;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Locale;
import javax.xml.parsers.ParserConfigurationException;
@ -37,6 +38,8 @@ import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFShape;
import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
import org.apache.xmlbeans.XmlException;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
@ -54,6 +57,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor {
private Locale locale;
private boolean includeSheetNames = true;
private boolean formulasNotResults = false;
private boolean includeTextBoxes = true;
public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException {
this(OPCPackage.open(path));
@ -90,6 +94,14 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor {
this.formulasNotResults = formulasNotResults;
}
/**
 * Controls whether text found in text boxes is added to the
 *  extracted text. The field is initialised to true.
 *
 * @param includeTextBoxes true to include text box text, false to leave it out
 */
public void setIncludeTextBoxes(final boolean includeTextBoxes) {
    this.includeTextBoxes = includeTextBoxes;
}
public void setLocale(Locale locale) {
this.locale = locale;
}
@ -175,6 +187,9 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor {
text.append('\n');
}
processSheet(sheetExtractor, styles, strings, stream);
if (includeTextBoxes){
processShapes(iter.getShapes(), text);
}
stream.close();
}
@ -191,6 +206,19 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor {
}
}
/**
 * Appends the text of every simple shape to the given buffer, one
 *  line per shape. A null list is ignored, as are shapes that are
 *  not simple shapes or whose text is null or empty.
 *
 * @param shapes the shapes to read text from, may be null
 * @param text the buffer the shape text is appended to
 */
private void processShapes(List<XSSFShape> shapes, StringBuffer text) {
    if (shapes != null) {
        for (XSSFShape candidate : shapes) {
            // Only simple shapes expose getText()
            if (!(candidate instanceof XSSFSimpleShape)) {
                continue;
            }
            String content = ((XSSFSimpleShape) candidate).getText();
            if (content == null || content.length() == 0) {
                continue;
            }
            text.append(content);
            text.append('\n');
        }
    }
}
@Override
public void close() throws IOException {
if (container != null) {

View File

@ -31,8 +31,11 @@ import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.HeaderFooter;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFDrawing;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xssf.usermodel.XSSFShape;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.xmlbeans.XmlException;
@ -52,6 +55,7 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor implements org.apach
private boolean formulasNotResults = false;
private boolean includeCellComments = false;
private boolean includeHeadersFooters = true;
private boolean includeTextBoxes = true;
/**
* @deprecated Use {@link #XSSFExcelExtractor(org.apache.poi.openxml4j.opc.OPCPackage)} instead.
@ -103,6 +107,13 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor implements org.apach
public void setIncludeHeadersFooters(boolean includeHeadersFooters) {
this.includeHeadersFooters = includeHeadersFooters;
}
/**
 * Controls whether the text held in text boxes is added to the
 *  extracted text. The field is initialised to true.
 *
 * @param includeTextBoxes true to include text box text, false to leave it out
 */
public void setIncludeTextBoxes(final boolean includeTextBoxes) {
    this.includeTextBoxes = includeTextBoxes;
}
/**
* What Locale should be used for formatting numbers (based
* on the styles applied to the cells)
@ -181,6 +192,19 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor implements org.apach
text.append("\n");
}
// add textboxes: one line of output per simple shape that carries text
if (includeTextBoxes){
    // NOTE(review): createDrawingPatriarch() may, going by its name, create a
    // drawing when the sheet has none - confirm that is acceptable during extraction
    XSSFDrawing drawing = sheet.createDrawingPatriarch();
    for (XSSFShape shape : drawing.getShapes()){
        if (shape instanceof XSSFSimpleShape){
            String boxText = ((XSSFSimpleShape)shape).getText();
            // Guard against null as well as empty text, matching the
            // check done by the event based extractor
            if (boxText != null && boxText.length() > 0){
                text.append(boxText);
                text.append('\n');
            }
        }
    }
}
// Finally footer(s), if present
if(includeHeadersFooters) {
text.append(

View File

@ -76,7 +76,7 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing {
* @param rel the package relationship holding this drawing,
* the relationship type must be http://schemas.openxmlformats.org/officeDocument/2006/relationships/drawing
*/
protected XSSFDrawing(PackagePart part, PackageRelationship rel) throws IOException, XmlException {
public XSSFDrawing(PackagePart part, PackageRelationship rel) throws IOException, XmlException {
super(part, rel);
XmlOptions options = new XmlOptions(DEFAULT_XML_OPTIONS);
//Removing root element

View File

@ -19,6 +19,7 @@ package org.apache.poi.xssf.eventusermodel;
import java.io.InputStream;
import java.util.Iterator;
import java.util.List;
import junit.framework.TestCase;
@ -27,6 +28,8 @@ import org.apache.poi.util.IOUtils;
import org.apache.poi.xssf.XSSFTestDataSamples;
import org.apache.poi.xssf.model.CommentsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.apache.poi.xssf.usermodel.XSSFShape;
import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
import org.apache.poi.POIDataSamples;
/**
@ -164,4 +167,33 @@ public final class TestXSSFReader extends TestCase {
stream.close();
}
}
/**
 * Checks that the shapes returned by the sheet iterator's getShapes()
 *  carry the three lines of text held in the sample workbook's text box.
 * @throws Exception
 */
public void testShapes() throws Exception {
    OPCPackage pkg = XSSFTestDataSamples.openSamplePackage("WithTextBox.xlsx");
    XSSFReader reader = new XSSFReader(pkg);
    XSSFReader.SheetIterator sheets = (XSSFReader.SheetIterator) reader.getSheetsData();

    StringBuilder collected = new StringBuilder();
    while (sheets.hasNext()) {
        // Advance to the sheet; only its shapes are inspected here
        sheets.next();
        List<XSSFShape> sheetShapes = sheets.getShapes();
        if (sheetShapes == null) {
            continue;
        }
        for (XSSFShape candidate : sheetShapes) {
            if (candidate instanceof XSSFSimpleShape) {
                collected.append(((XSSFSimpleShape) candidate).getText()).append('\n');
            }
        }
    }

    String text = collected.toString();
    assertTrue(text.contains("Line 1"));
    assertTrue(text.contains("Line 2"));
    assertTrue(text.contains("Line 3"));
}
}

View File

@ -17,6 +17,7 @@
package org.apache.poi.xssf.extractor;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@ -25,7 +26,11 @@ import junit.framework.TestCase;
import org.apache.poi.POITextExtractor;
import org.apache.poi.hssf.HSSFTestDataSamples;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.XSSFTestDataSamples;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.usermodel.XSSFShape;
import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
/**
* Tests for {@link XSSFEventBasedExcelExtractor}
@ -167,4 +172,19 @@ public class TestXSSFEventBasedExcelExtractor extends TestCase {
ole2Extractor.close();
ooxmlExtractor.close();
}
/**
 * Test that text box text is part of the text returned by
 *  the event based extractor's getText()
 * @throws Exception
 */
public void testShapes() throws Exception{
    XSSFEventBasedExcelExtractor ooxmlExtractor = getExtractor("WithTextBox.xlsx");
    try {
        String text = ooxmlExtractor.getText();
        assertTrue(text.indexOf("Line 1") > -1);
        assertTrue(text.indexOf("Line 2") > -1);
        assertTrue(text.indexOf("Line 3") > -1);
    } finally {
        // Release the extractor even when an assertion fails
        ooxmlExtractor.close();
    }
}
}

View File

@ -211,4 +211,16 @@ public class TestXSSFExcelExtractor extends TestCase {
extractor.close();
}
/**
 * Simple test that text box text is included in the extracted text
 * @throws IOException
 */
public void testTextBoxes() throws IOException {
    XSSFExcelExtractor extractor = getExtractor("WithTextBox.xlsx");
    try {
        extractor.setFormulasNotResults(true);
        String text = extractor.getText();
        assertTrue(text.indexOf("Line 1") > -1);
        assertTrue(text.indexOf("Line 2") > -1);
        assertTrue(text.indexOf("Line 3") > -1);
    } finally {
        // Release the extractor even when an assertion fails
        extractor.close();
    }
}
}