55347 - integrate textbox text extraction with Excel extractors
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1511789 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
0d700b0470
commit
ebbbaefe69
@ -21,6 +21,8 @@ import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.poi.POIXMLException;
|
||||
@ -37,7 +39,9 @@ import org.apache.poi.xssf.model.CommentsTable;
|
||||
import org.apache.poi.xssf.model.SharedStringsTable;
|
||||
import org.apache.poi.xssf.model.StylesTable;
|
||||
import org.apache.poi.xssf.model.ThemesTable;
|
||||
import org.apache.poi.xssf.usermodel.XSSFDrawing;
|
||||
import org.apache.poi.xssf.usermodel.XSSFRelation;
|
||||
import org.apache.poi.xssf.usermodel.XSSFShape;
|
||||
import org.apache.xmlbeans.XmlException;
|
||||
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSheet;
|
||||
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorkbook;
|
||||
@ -273,6 +277,35 @@ public class XSSFReader {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the shapes associated with this sheet,
|
||||
* an empty list or null if there is an exception
|
||||
*/
|
||||
public List<XSSFShape> getShapes() {
|
||||
PackagePart sheetPkg = getSheetPart();
|
||||
List<XSSFShape> shapes= new LinkedList<XSSFShape>();
|
||||
// Do we have a comments relationship? (Only ever one if so)
|
||||
try {
|
||||
PackageRelationshipCollection drawingsList = sheetPkg.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation());
|
||||
for (int i = 0; i < drawingsList.size(); i++){
|
||||
PackageRelationship drawings = drawingsList.getRelationship(i);
|
||||
PackagePartName drawingsName = PackagingURIHelper.createPartName(drawings.getTargetURI());
|
||||
PackagePart drawingsPart = sheetPkg.getPackage().getPart(drawingsName);
|
||||
XSSFDrawing drawing = new XSSFDrawing(drawingsPart, drawings);
|
||||
for (XSSFShape shape : drawing.getShapes()){
|
||||
shapes.add(shape);
|
||||
}
|
||||
}
|
||||
} catch (XmlException e){
|
||||
return null;
|
||||
} catch (InvalidFormatException e) {
|
||||
return null;
|
||||
} catch (IOException e) {
|
||||
return null;
|
||||
}
|
||||
return shapes;
|
||||
}
|
||||
|
||||
public PackagePart getSheetPart() {
|
||||
String sheetId = ctSheet.getId();
|
||||
return sheetMap.get(sheetId);
|
||||
|
@ -18,6 +18,7 @@ package org.apache.poi.xssf.extractor;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
@ -37,6 +38,8 @@ import org.apache.poi.xssf.eventusermodel.XSSFReader;
|
||||
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
|
||||
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
|
||||
import org.apache.poi.xssf.model.StylesTable;
|
||||
import org.apache.poi.xssf.usermodel.XSSFShape;
|
||||
import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
|
||||
import org.apache.xmlbeans.XmlException;
|
||||
import org.xml.sax.ContentHandler;
|
||||
import org.xml.sax.InputSource;
|
||||
@ -54,6 +57,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor {
|
||||
private Locale locale;
|
||||
private boolean includeSheetNames = true;
|
||||
private boolean formulasNotResults = false;
|
||||
private boolean includeTextBoxes = true;
|
||||
|
||||
public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException {
|
||||
this(OPCPackage.open(path));
|
||||
@ -89,6 +93,14 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor {
|
||||
public void setFormulasNotResults(boolean formulasNotResults) {
|
||||
this.formulasNotResults = formulasNotResults;
|
||||
}
|
||||
|
||||
/**
|
||||
* Should text from textboxes be included? Default is true
|
||||
*/
|
||||
|
||||
public void setIncludeTextBoxes(boolean includeTextBoxes) {
|
||||
this.includeTextBoxes = includeTextBoxes;
|
||||
}
|
||||
|
||||
public void setLocale(Locale locale) {
|
||||
this.locale = locale;
|
||||
@ -175,6 +187,9 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor {
|
||||
text.append('\n');
|
||||
}
|
||||
processSheet(sheetExtractor, styles, strings, stream);
|
||||
if (includeTextBoxes){
|
||||
processShapes(iter.getShapes(), text);
|
||||
}
|
||||
stream.close();
|
||||
}
|
||||
|
||||
@ -191,7 +206,20 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
private void processShapes(List<XSSFShape> shapes, StringBuffer text) {
|
||||
if (shapes == null){
|
||||
return;
|
||||
}
|
||||
for (XSSFShape shape : shapes){
|
||||
if (shape instanceof XSSFSimpleShape){
|
||||
String sText = ((XSSFSimpleShape)shape).getText();
|
||||
if (sText != null && sText.length() > 0){
|
||||
text.append(sText).append('\n');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
if (container != null) {
|
||||
container.close();
|
||||
|
@ -31,8 +31,11 @@ import org.apache.poi.ss.usermodel.DataFormatter;
|
||||
import org.apache.poi.ss.usermodel.HeaderFooter;
|
||||
import org.apache.poi.ss.usermodel.Row;
|
||||
import org.apache.poi.xssf.usermodel.XSSFCell;
|
||||
import org.apache.poi.xssf.usermodel.XSSFDrawing;
|
||||
import org.apache.poi.xssf.usermodel.XSSFRelation;
|
||||
import org.apache.poi.xssf.usermodel.XSSFShape;
|
||||
import org.apache.poi.xssf.usermodel.XSSFSheet;
|
||||
import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||
import org.apache.xmlbeans.XmlException;
|
||||
|
||||
@ -52,6 +55,7 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor implements org.apach
|
||||
private boolean formulasNotResults = false;
|
||||
private boolean includeCellComments = false;
|
||||
private boolean includeHeadersFooters = true;
|
||||
private boolean includeTextBoxes = true;
|
||||
|
||||
/**
|
||||
* @deprecated Use {@link #XSSFExcelExtractor(org.apache.poi.openxml4j.opc.OPCPackage)} instead.
|
||||
@ -103,6 +107,13 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor implements org.apach
|
||||
public void setIncludeHeadersFooters(boolean includeHeadersFooters) {
|
||||
this.includeHeadersFooters = includeHeadersFooters;
|
||||
}
|
||||
/**
|
||||
* Should text within textboxes be included? Default is true
|
||||
* @param includeTextBoxes
|
||||
*/
|
||||
public void setIncludeTextBoxes(boolean includeTextBoxes){
|
||||
this.includeTextBoxes = includeTextBoxes;
|
||||
}
|
||||
/**
|
||||
* What Locale should be used for formatting numbers (based
|
||||
* on the styles applied to the cells)
|
||||
@ -180,7 +191,20 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor implements org.apach
|
||||
}
|
||||
text.append("\n");
|
||||
}
|
||||
|
||||
|
||||
// add textboxes
if (includeTextBoxes){
    // NOTE(review): the name createDrawingPatriarch suggests a drawing may be
    // created when the sheet has none - confirm that text extraction is not
    // meant to leave the workbook untouched
    XSSFDrawing drawing = sheet.createDrawingPatriarch();
    for (XSSFShape shape : drawing.getShapes()){
        if (shape instanceof XSSFSimpleShape){
            String boxText = ((XSSFSimpleShape)shape).getText();
            // Skip shapes without text; the null guard matches the check
            // made by the event based extractor
            if (boxText != null && boxText.length() > 0){
                text.append(boxText);
                text.append('\n');
            }
        }
    }
}
|
||||
// Finally footer(s), if present
|
||||
if(includeHeadersFooters) {
|
||||
text.append(
|
||||
|
@ -76,7 +76,7 @@ public final class XSSFDrawing extends POIXMLDocumentPart implements Drawing {
|
||||
* @param rel the package relationship holding this drawing,
|
||||
* the relationship type must be http://schemas.openxmlformats.org/officeDocument/2006/relationships/drawing
|
||||
*/
|
||||
protected XSSFDrawing(PackagePart part, PackageRelationship rel) throws IOException, XmlException {
|
||||
public XSSFDrawing(PackagePart part, PackageRelationship rel) throws IOException, XmlException {
|
||||
super(part, rel);
|
||||
XmlOptions options = new XmlOptions(DEFAULT_XML_OPTIONS);
|
||||
//Removing root element
|
||||
|
@ -19,6 +19,7 @@ package org.apache.poi.xssf.eventusermodel;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
@ -27,6 +28,8 @@ import org.apache.poi.util.IOUtils;
|
||||
import org.apache.poi.xssf.XSSFTestDataSamples;
|
||||
import org.apache.poi.xssf.model.CommentsTable;
|
||||
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
|
||||
import org.apache.poi.xssf.usermodel.XSSFShape;
|
||||
import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
|
||||
import org.apache.poi.POIDataSamples;
|
||||
|
||||
/**
|
||||
@ -164,4 +167,33 @@ public final class TestXSSFReader extends TestCase {
|
||||
stream.close();
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Test text extraction from text box using getShapes()
|
||||
* @throws Exception
|
||||
*/
|
||||
public void testShapes() throws Exception{
|
||||
OPCPackage pkg = XSSFTestDataSamples.openSamplePackage("WithTextBox.xlsx");
|
||||
XSSFReader r = new XSSFReader(pkg);
|
||||
XSSFReader.SheetIterator it = (XSSFReader.SheetIterator)r.getSheetsData();
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
while(it.hasNext())
|
||||
{
|
||||
it.next();
|
||||
List<XSSFShape> shapes = it.getShapes();
|
||||
if (shapes != null){
|
||||
for (XSSFShape shape : shapes){
|
||||
if (shape instanceof XSSFSimpleShape){
|
||||
String t = ((XSSFSimpleShape)shape).getText();
|
||||
sb.append(t).append('\n');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
String text = sb.toString();
|
||||
assertTrue(text.indexOf("Line 1") > -1);
|
||||
assertTrue(text.indexOf("Line 2") > -1);
|
||||
assertTrue(text.indexOf("Line 3") > -1);
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -17,6 +17,7 @@
|
||||
|
||||
package org.apache.poi.xssf.extractor;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@ -25,7 +26,11 @@ import junit.framework.TestCase;
|
||||
import org.apache.poi.POITextExtractor;
|
||||
import org.apache.poi.hssf.HSSFTestDataSamples;
|
||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||
import org.apache.poi.xssf.XSSFTestDataSamples;
|
||||
import org.apache.poi.xssf.eventusermodel.XSSFReader;
|
||||
import org.apache.poi.xssf.usermodel.XSSFShape;
|
||||
import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
|
||||
|
||||
/**
|
||||
* Tests for {@link XSSFEventBasedExcelExtractor}
|
||||
@ -167,4 +172,19 @@ public class TestXSSFEventBasedExcelExtractor extends TestCase {
|
||||
ole2Extractor.close();
|
||||
ooxmlExtractor.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Test text extraction from text box using getShapes()
|
||||
* @throws Exception
|
||||
*/
|
||||
public void testShapes() throws Exception{
|
||||
XSSFEventBasedExcelExtractor ooxmlExtractor = getExtractor("WithTextBox.xlsx");
|
||||
|
||||
String text = ooxmlExtractor.getText();
|
||||
|
||||
assertTrue(text.indexOf("Line 1") > -1);
|
||||
assertTrue(text.indexOf("Line 2") > -1);
|
||||
assertTrue(text.indexOf("Line 3") > -1);
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -211,4 +211,16 @@ public class TestXSSFExcelExtractor extends TestCase {
|
||||
|
||||
extractor.close();
|
||||
}
|
||||
/**
|
||||
* Simple test for text box text
|
||||
* @throws IOException
|
||||
*/
|
||||
public void testTextBoxes() throws IOException {
|
||||
XSSFExcelExtractor extractor = getExtractor("WithTextBox.xlsx");
|
||||
extractor.setFormulasNotResults(true);
|
||||
String text = extractor.getText();
|
||||
assertTrue(text.indexOf("Line 1") > -1);
|
||||
assertTrue(text.indexOf("Line 2") > -1);
|
||||
assertTrue(text.indexOf("Line 3") > -1);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user