Implement an Excel text extractor, and put all the existing text extractors under a common superclass, so they're easier to find and use

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@589224 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2007-10-27 21:57:10 +00:00
parent e145f417b7
commit 9d3658e72f
6 changed files with 352 additions and 61 deletions

View File

@ -0,0 +1,49 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi;
/**
 * Base class shared by the text extractors of the
 *  individual POI document formats.
 * The extractor for a particular format normally lives
 *  in the package org.apache.poi.[format].extractor .
 *
 * @see org.apache.poi.hssf.extractor.ExcelExtractor
 * @see org.apache.poi.hslf.extractor.PowerPointExtractor
 * @see org.apache.poi.hdgf.extractor.VisioTextExtractor
 * @see org.apache.poi.hwpf.extractor.WordExtractor
 */
public abstract class POITextExtractor {
    /** The document this extractor was created for */
    protected POIDocument document;

    /**
     * Builds a text extractor bound to the supplied document.
     *
     * @param document the document to be stored on this extractor
     */
    public POITextExtractor(POIDocument document) {
        this.document = document;
    }

    /**
     * Returns the complete text of the document.
     * The way individual pieces (cells, paragraphs etc)
     *  are delimited within the returned text is up to
     *  each implementation - consult the javadocs of the
     *  concrete extractor for the details.
     *
     * @return All the text from the document
     */
    public abstract String getText();
}

View File

@ -0,0 +1,144 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hssf.extractor;
import java.io.IOException;
import org.apache.poi.POITextExtractor;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRichTextString;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
 * A text extractor for Excel files.
 * Returns the textual content of the file, suitable for
 *  indexing by something like Lucene, but not really
 *  intended for display to the user.
 * To turn an excel file into a CSV or similar, then see
 *  the XLS2CSVmra example
 * @see org.apache.poi.hssf.eventusermodel.examples.XLS2CSVmra
 */
public class ExcelExtractor extends POITextExtractor {
    /** The workbook the text is read from */
    private HSSFWorkbook wb;
    /** Whether each sheet's name is output before its contents */
    private boolean includeSheetNames = true;
    /** Whether formula cells are output as the formula text, rather than its result */
    private boolean formulasNotResults = false;

    /**
     * Creates an extractor for an already opened workbook.
     * @param wb the workbook to extract text from
     */
    public ExcelExtractor(HSSFWorkbook wb) {
        super(wb);
        this.wb = wb;
    }

    /**
     * Creates an extractor, opening the workbook held in
     *  the given filesystem.
     * @param fs the POIFSFileSystem to build the workbook from
     * @throws IOException if the workbook cannot be created from the filesystem
     */
    public ExcelExtractor(POIFSFileSystem fs) throws IOException {
        this(new HSSFWorkbook(fs));
    }

    /**
     * Should sheet names be included? Default is true
     */
    public void setIncludeSheetNames(boolean includeSheetNames) {
        this.includeSheetNames = includeSheetNames;
    }

    /**
     * Should we return the formula itself, and not
     *  the result it produces? Default is false
     */
    public void setFormulasNotResults(boolean formulasNotResults) {
        this.formulasNotResults = formulasNotResults;
    }

    /**
     * Retrieves the text contents of the file.
     * Sheets are output in workbook order. If sheet names are
     *  enabled, each sheet starts with its name on a line of
     *  its own. Every row that exists is then output on one
     *  line, with a tab after each cell that produced text,
     *  unless that cell is in the row's last cell position.
     * @return the text of all the sheets in the workbook
     */
    public String getText() {
        StringBuffer text = new StringBuffer();

        for (int i = 0; i < wb.getNumberOfSheets(); i++) {
            HSSFSheet sheet = wb.getSheetAt(i);
            if (sheet == null) { continue; }

            if (includeSheetNames) {
                String name = wb.getSheetName(i);
                if (name != null) {
                    text.append(name);
                    text.append("\n");
                }
            }

            appendSheetText(sheet, text);
        }

        return text.toString();
    }

    /**
     * Appends every existing row of the sheet to the buffer,
     *  one row per line.
     */
    private void appendSheetText(HSSFSheet sheet, StringBuffer text) {
        int firstRow = sheet.getFirstRowNum();
        int lastRow = sheet.getLastRowNum();

        // The last row number is inclusive
        for (int j = firstRow; j <= lastRow; j++) {
            HSSFRow row = sheet.getRow(j);
            if (row == null) { continue; }

            appendRowText(row, text);

            // Finish off the row
            text.append("\n");
        }
    }

    /**
     * Appends the cells of the row to the buffer, separated
     *  by tabs. The row's terminating newline is left to the caller.
     */
    private void appendRowText(HSSFRow row, StringBuffer text) {
        int firstCell = row.getFirstCellNum();
        int lastCell = row.getLastCellNum();

        // Unlike for rows, the last cell number is treated as exclusive
        for (int k = firstCell; k < lastCell; k++) {
            HSSFCell cell = row.getCell((short) k);
            if (cell == null) { continue; }

            boolean outputContents = appendCellText(cell, text);

            // Output a tab if we're not on the last cell. Cells which
            //  gave no text never get a tab after them.
            if (outputContents && k < (lastCell - 1)) {
                text.append("\t");
            }
        }
    }

    /**
     * Appends the text of a single cell to the buffer.
     * @return true if the cell was of a type that we output,
     *  false if nothing was appended for it
     */
    private boolean appendCellText(HSSFCell cell, StringBuffer text) {
        switch (cell.getCellType()) {
            case HSSFCell.CELL_TYPE_STRING:
                text.append(cell.getRichStringCellValue().getString());
                return true;
            case HSSFCell.CELL_TYPE_NUMERIC:
                // Note - we don't apply any formatting!
                text.append(cell.getNumericCellValue());
                return true;
            case HSSFCell.CELL_TYPE_BOOLEAN:
                text.append(cell.getBooleanCellValue());
                return true;
            case HSSFCell.CELL_TYPE_FORMULA:
                if (formulasNotResults) {
                    text.append(cell.getCellFormula());
                } else {
                    appendFormulaResult(cell, text);
                }
                return true;
            default:
                // All other cell types are skipped
                return false;
        }
    }

    /**
     * Appends the result of a formula cell to the buffer.
     * The result is used as a string if a non-empty one is
     *  available, otherwise it is output as a number.
     */
    private void appendFormulaResult(HSSFCell cell, StringBuffer text) {
        // Try it as a string, if not as a number
        HSSFRichTextString str = cell.getRichStringCellValue();
        if (str != null && str.length() > 0) {
            // Use getString(), the same as for plain string cells
            text.append(str.getString());
        } else {
            // Try and treat it as a number
            double val = cell.getNumericCellValue();
            text.append(val);
        }
    }
}

View File

@ -21,6 +21,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import org.apache.poi.POITextExtractor;
import org.apache.poi.hdgf.HDGFDiagram;
import org.apache.poi.hdgf.chunks.Chunk.Command;
import org.apache.poi.hdgf.streams.ChunkStream;
@ -33,11 +34,12 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
* Can operate on the command line (outputs to stdout), or
* can return the text for you (eg for use with Lucene).
*/
public class VisioTextExtractor {
public class VisioTextExtractor extends POITextExtractor {
private HDGFDiagram hdgf;
private POIFSFileSystem fs;
public VisioTextExtractor(HDGFDiagram hdgf) {
super(hdgf);
this.hdgf = hdgf;
}
public VisioTextExtractor(POIFSFileSystem fs) throws IOException {
@ -84,6 +86,8 @@ public class VisioTextExtractor {
/**
* Returns the textual contents of the file.
* Each textual object's text will be separated
* by a newline
*/
public String getText() {
StringBuffer text = new StringBuffer();

View File

@ -22,6 +22,8 @@ package org.apache.poi.hslf.extractor;
import java.io.*;
import java.util.HashSet;
import org.apache.poi.POITextExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.hslf.*;
import org.apache.poi.hslf.model.*;
@ -34,12 +36,12 @@ import org.apache.poi.hslf.usermodel.*;
* @author Nick Burch
*/
public class PowerPointExtractor
public class PowerPointExtractor extends POITextExtractor
{
private HSLFSlideShow _hslfshow;
private SlideShow _show;
private Slide[] _slides;
private Notes[] _notes;
private HSLFSlideShow _hslfshow;
private SlideShow _show;
private Slide[] _slides;
private Notes[] _notes;
/**
* Basic extractor. Returns all the text, and optionally all the notes
@ -66,61 +68,50 @@ public class PowerPointExtractor
ppe.close();
}
/**
* Creates a PowerPointExtractor, from a file
* @param fileName The name of the file to extract from
*/
public PowerPointExtractor(String fileName) throws IOException {
_hslfshow = new HSLFSlideShow(fileName);
_show = new SlideShow(_hslfshow);
_slides = _show.getSlides();
_notes = _show.getNotes();
}
/**
* Creates a PowerPointExtractor, from a file
* @param fileName The name of the file to extract from
*/
public PowerPointExtractor(String fileName) throws IOException {
this(new FileInputStream(fileName));
}
/**
* Creates a PowerPointExtractor, from an Input Stream
* @param iStream The input stream containing the PowerPoint document
*/
public PowerPointExtractor(InputStream iStream) throws IOException {
this(new POIFSFileSystem(iStream));
}
/**
* Creates a PowerPointExtractor, from an open POIFSFileSystem
* @param fs the POIFSFileSystem containing the PowerPoint document
*/
public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
this(new HSLFSlideShow(fs));
}
/**
* Creates a PowerPointExtractor, from an Input Stream
* @param iStream The input stream containing the PowerPoint document
*/
public PowerPointExtractor(InputStream iStream) throws IOException {
_hslfshow = new HSLFSlideShow(iStream);
_show = new SlideShow(_hslfshow);
_slides = _show.getSlides();
_notes = _show.getNotes();
}
/**
* Creates a PowerPointExtractor, from a HSLFSlideShow
* @param ss the HSLFSlideShow to extract text from
*/
public PowerPointExtractor(HSLFSlideShow ss) throws IOException {
super(ss);
_hslfshow = ss;
_show = new SlideShow(_hslfshow);
_slides = _show.getSlides();
_notes = _show.getNotes();
}
/**
* Creates a PowerPointExtractor, from an open POIFSFileSystem
* @param fs the POIFSFileSystem containing the PowerPoint document
*/
public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
_hslfshow = new HSLFSlideShow(fs);
_show = new SlideShow(_hslfshow);
_slides = _show.getSlides();
_notes = _show.getNotes();
}
/**
* Creates a PowerPointExtractor, from a HSLFSlideShow
* @param ss the HSLFSlideShow to extract text from
*/
public PowerPointExtractor(HSLFSlideShow ss) throws IOException {
_hslfshow = ss;
_show = new SlideShow(_hslfshow);
_slides = _show.getSlides();
_notes = _show.getNotes();
}
/**
* Shuts down the underlying streams
*/
public void close() throws IOException {
_hslfshow.close();
_hslfshow = null;
_show = null;
_slides = null;
_notes = null;
}
/**
* Shuts down the underlying streams
*/
public void close() throws IOException {
_hslfshow.close();
_hslfshow = null;
_show = null;
_slides = null;
_notes = null;
}
/**
@ -195,4 +186,4 @@ public class PowerPointExtractor
return ret.toString();
}
}
}

View File

@ -22,6 +22,7 @@ import java.io.FileInputStream;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import org.apache.poi.POITextExtractor;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.usermodel.Paragraph;
@ -36,7 +37,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
*
* @author Nick Burch (nick at torchbox dot com)
*/
public class WordExtractor {
public class WordExtractor extends POITextExtractor {
private POIFSFileSystem fs;
private HWPFDocument doc;
@ -62,6 +63,7 @@ public class WordExtractor {
* @param doc The HWPFDocument to extract from
*/
public WordExtractor(HWPFDocument doc) throws IOException {
super(doc);
this.doc = doc;
}

View File

@ -0,0 +1,101 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hssf.extractor;
import java.io.File;
import java.io.FileInputStream;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import junit.framework.TestCase;
/**
 * Tests for {@link ExcelExtractor}, run against sample workbooks
 *  found in the directory given by the HSSF.testdata.path
 *  system property.
 */
public class TestExcelExtractor extends TestCase {
    /**
     * Builds an extractor for the named sample file.
     * The file stream is closed again before returning, once
     *  the extractor has been constructed from it.
     * @param filename name of the file within the test data directory
     * @return an extractor for the workbook in that file
     */
    private static ExcelExtractor openExtractor(String filename) throws Exception {
        String path = System.getProperty("HSSF.testdata.path");
        FileInputStream fin = new FileInputStream(path + File.separator + filename);
        try {
            return new ExcelExtractor(new POIFSFileSystem(fin));
        } finally {
            // Don't leak the file handle, whether or not the load worked
            fin.close();
        }
    }

    /**
     * A workbook with a single string cell, extracted with
     *  and then without the sheet names.
     */
    public void testSimple() throws Exception {
        ExcelExtractor extractor = openExtractor("Simple.xls");

        assertEquals("Sheet1\nreplaceMe\nSheet2\nSheet3\n", extractor.getText());

        // Now turn off sheet names
        extractor.setIncludeSheetNames(false);
        assertEquals("replaceMe\n", extractor.getText());
    }

    /**
     * A formula with a numeric result, extracted first as
     *  its result and then as the formula itself.
     */
    public void testNumericFormula() throws Exception {
        ExcelExtractor extractor = openExtractor("sumifformula.xls");

        assertEquals(
                "Sheet1\n" +
                "1000.0\t1.0\t5.0\n" +
                "2000.0\t2.0\t\n" +
                "3000.0\t3.0\t\n" +
                "4000.0\t4.0\t\n" +
                "5000.0\t5.0\t\n" +
                "Sheet2\nSheet3\n",
                extractor.getText()
        );

        extractor.setFormulasNotResults(true);

        assertEquals(
                "Sheet1\n" +
                "1000.0\t1.0\tSUMIF(A1:A5,\">4000\",B1:B5)\n" +
                "2000.0\t2.0\t\n" +
                "3000.0\t3.0\t\n" +
                "4000.0\t4.0\t\n" +
                "5000.0\t5.0\t\n" +
                "Sheet2\nSheet3\n",
                extractor.getText()
        );
    }

    /**
     * A formula that concatenates two string cells.
     */
    public void testStringConcat() throws Exception {
        ExcelExtractor extractor = openExtractor("SimpleWithFormula.xls");

        // The formula's result should come out as the
        //  concatenated text of the two cells
        assertEquals("Sheet1\nreplaceme\nreplaceme\nreplacemereplaceme\nSheet2\nSheet3\n", extractor.getText());

        extractor.setFormulasNotResults(true);

        assertEquals("Sheet1\nreplaceme\nreplaceme\nCONCATENATE(A1,A2)\nSheet2\nSheet3\n", extractor.getText());
    }

    /**
     * A formula with a string result.
     */
    public void testStringFormula() throws Exception {
        ExcelExtractor extractor = openExtractor("StringFormulas.xls");

        // Comes out as NaN if treated as a number
        // And as XYZ if treated as a string
        assertEquals("Sheet1\nXYZ\nSheet2\nSheet3\n", extractor.getText());

        extractor.setFormulasNotResults(true);

        assertEquals("Sheet1\nUPPER(\"xyz\")\nSheet2\nSheet3\n", extractor.getText());
    }
}