Implement an Excel text extractor, and put all the existing text extractors under a common superclass, so they're easier to find and use

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@589224 13f79535-47bb-0310-9956-ffa450edef68
2007-10-27 21:57:10 +00:00 · 2007-10-27 21:57:10 +00:00 · 9d3658e72f
commit 9d3658e72f
parent e145f417b7
6 changed files with 352 additions and 61 deletions
--- a/src/java/org/apache/poi/POITextExtractor.java
+++ b/src/java/org/apache/poi/POITextExtractor.java
@ -0,0 +1,49 @@
 /* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
 ==================================================================== */
 package org.apache.poi;
 /**
 * Common Parent for Text Extractors
 *  of POI Documents. 
 * You will typically find the implementation of
 *  a given format's text extractor under
 *  org.apache.poi.[format].extractor .
 * @see org.apache.poi.hssf.extractor.ExcelExtractor
 * @see org.apache.poi.hslf.extractor.PowerPointExtractor
 * @see org.apache.poi.hdgf.extractor.VisioTextExtractor
 * @see org.apache.poi.hwpf.extractor.WordExtractor
 */
 public abstract class POITextExtractor {
 	/**
 	 * The POIDocument that's open. Set once by the constructor,
 	 *  and visible to subclasses.
 	 */
 	protected POIDocument document;
 	/**
 	 * Creates a new text extractor for the given document
 	 * @param document the document this extractor will hold
 	 *  on to; it is stored as-is, with no checks applied
 	 */
 	public POITextExtractor(POIDocument document) {
 		this.document = document;
 	}
 	/**
 	 * Retrieves all the text from the document.
 	 * How cells, paragraphs etc are separated in the text
 	 *  is implementation specific - see the javadocs for
 	 *  a specific extractor for details.
 	 * @return All the text from the document
 	 */
 	public abstract String getText();
 }
--- a/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
+++ b/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
@ -0,0 +1,144 @@
 /* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
 ==================================================================== */
 package org.apache.poi.hssf.extractor;
 import java.io.IOException;
 import org.apache.poi.POITextExtractor;
 import org.apache.poi.hssf.usermodel.HSSFCell;
 import org.apache.poi.hssf.usermodel.HSSFRichTextString;
 import org.apache.poi.hssf.usermodel.HSSFRow;
 import org.apache.poi.hssf.usermodel.HSSFSheet;
 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 /**
 * A text extractor for Excel files.
 * Returns the textual content of the file, suitable for 
 *  indexing by something like Lucene, but not really
 *  intended for display to the user.
 * To turn an excel file into a CSV or similar, then see
 *  the XLS2CSVmra example
 * @see org.apache.poi.hssf.eventusermodel.examples.XLS2CSVmra
 */
 public class ExcelExtractor extends POITextExtractor{
 	private HSSFWorkbook wb;
 	private boolean includeSheetNames = true;
 	private boolean formulasNotResults = false;
 	/**
 	 * Builds an extractor around an already loaded workbook
 	 * @param wb the workbook whose text will be extracted
 	 */
 	public ExcelExtractor(HSSFWorkbook wb) {
 		super(wb);
 		this.wb = wb;
 	}
 	/**
 	 * Builds an extractor for the workbook held in the
 	 *  given filesystem
 	 * @param fs the filesystem to load the workbook from
 	 * @throws IOException if creating the workbook fails
 	 */
 	public ExcelExtractor(POIFSFileSystem fs) throws IOException {
 		this(new HSSFWorkbook(fs));
 	}
 	/**
 	 * Controls whether each sheet's name is written out
 	 *  ahead of its rows. Defaults to true
 	 */
 	public void setIncludeSheetNames(boolean includeSheetNames) {
 		this.includeSheetNames = includeSheetNames;
 	}
 	/**
 	 * Controls whether a formula cell contributes the
 	 *  formula itself (true) or the value it produced
 	 *  (false). Defaults to false
 	 */
 	public void setFormulasNotResults(boolean formulasNotResults) {
 		this.formulasNotResults = formulasNotResults;
 	}
 	/**
 	 * Retrieves the text contents of the file.
 	 * Sheets are visited in index order. If sheet names are
 	 *  enabled, a sheet's name comes first on its own line.
 	 *  Each row then follows on its own line, with cells
 	 *  separated by tabs.
 	 * @return the text of every sheet in the workbook
 	 */
 	public String getText() {
 		StringBuffer out = new StringBuffer();
 		for(int sheetIdx=0; sheetIdx<wb.getNumberOfSheets(); sheetIdx++) {
 			HSSFSheet sheet = wb.getSheetAt(sheetIdx);
 			if(sheet != null) {
 				if(includeSheetNames) {
 					String sheetName = wb.getSheetName(sheetIdx);
 					if(sheetName != null) {
 						out.append(sheetName).append("\n");
 					}
 				}
 				appendRows(sheet, out);
 			}
 		}
 		return out.toString();
 	}
 	/**
 	 * Writes every row of one sheet, each ended by a newline.
 	 * Rows that the sheet reports as null are skipped
 	 *  entirely and produce no line.
 	 */
 	private void appendRows(HSSFSheet sheet, StringBuffer out) {
 		int firstRow = sheet.getFirstRowNum();
 		int lastRow = sheet.getLastRowNum();
 		for(int rowIdx=firstRow; rowIdx<=lastRow; rowIdx++) {
 			HSSFRow row = sheet.getRow(rowIdx);
 			if(row == null) { continue; }
 			int firstCell = row.getFirstCellNum();
 			int lastCell = row.getLastCellNum();
 			for(int col=firstCell; col<lastCell; col++) {
 				HSSFCell cell = row.getCell((short)col);
 				if(cell == null) { continue; }
 				// A tab only follows a cell that actually wrote
 				//  something, and never the final column
 				if(appendCell(cell, out) && col+1 < lastCell) {
 					out.append("\t");
 				}
 			}
 			// Finish off the row
 			out.append("\n");
 		}
 	}
 	/**
 	 * Writes the text for a single cell.
 	 * Note - numbers are written without any formatting applied.
 	 * @return true if the cell's type was one we output
 	 *  (string, numeric, boolean or formula), false otherwise
 	 */
 	private boolean appendCell(HSSFCell cell, StringBuffer out) {
 		switch(cell.getCellType()) {
 			case HSSFCell.CELL_TYPE_STRING:
 				out.append(cell.getRichStringCellValue().getString());
 				return true;
 			case HSSFCell.CELL_TYPE_NUMERIC:
 				out.append(cell.getNumericCellValue());
 				return true;
 			case HSSFCell.CELL_TYPE_BOOLEAN:
 				out.append(cell.getBooleanCellValue());
 				return true;
 			case HSSFCell.CELL_TYPE_FORMULA:
 				if(formulasNotResults) {
 					out.append(cell.getCellFormula());
 					return true;
 				}
 				// Prefer the string result; fall back to the
 				//  numeric result when there's no string text
 				HSSFRichTextString strResult = cell.getRichStringCellValue();
 				if(strResult == null || strResult.length() <= 0) {
 					double numResult = cell.getNumericCellValue();
 					out.append(numResult);
 				} else {
 					out.append(strResult.toString());
 				}
 				return true;
 			default:
 				return false;
 		}
 	}
 }
--- a/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java
@ -21,6 +21,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
 import org.apache.poi.POITextExtractor;
 import org.apache.poi.hdgf.HDGFDiagram;
 import org.apache.poi.hdgf.chunks.Chunk.Command;
 import org.apache.poi.hdgf.streams.ChunkStream;
@ -33,11 +34,12 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 * Can operate on the command line (outputs to stdout), or
 *  can return the text for you (eg for use with Lucene).
 */
-public class VisioTextExtractor {
+public class VisioTextExtractor extends POITextExtractor {
 	private HDGFDiagram hdgf;
 	private POIFSFileSystem fs;
 	/**
 	 * Creates an extractor for an already opened diagram.
 	 * The diagram is handed to the POITextExtractor parent
 	 *  and also kept in the hdgf field; the fs field is not
 	 *  assigned by this constructor.
 	 * @param hdgf the diagram this extractor will work on
 	 */
 	public VisioTextExtractor(HDGFDiagram hdgf) {
 		super(hdgf);
 		this.hdgf = hdgf;
 	}
 	public VisioTextExtractor(POIFSFileSystem fs) throws IOException {
@ -84,6 +86,8 @@ public class VisioTextExtractor {
 	/**
 	 * Returns the textual contents of the file.
 	 * Each textual object's text will be separated
 	 *  by a newline
 	 */
 	public String getText() {
 		StringBuffer text = new StringBuffer();
--- a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
@ -22,6 +22,8 @@ package org.apache.poi.hslf.extractor;
 import java.io.*;
 import java.util.HashSet;
 import org.apache.poi.POITextExtractor;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.hslf.*;
 import org.apache.poi.hslf.model.*;
@ -34,12 +36,12 @@ import org.apache.poi.hslf.usermodel.*;
 * @author Nick Burch
 */
-public class PowerPointExtractor
+public class PowerPointExtractor extends POITextExtractor
 {
-  private HSLFSlideShow _hslfshow;
+	private HSLFSlideShow _hslfshow;
-  private SlideShow _show;
+	private SlideShow _show;
-  private Slide[] _slides;
+	private Slide[] _slides;
-  private Notes[] _notes;
+	private Notes[] _notes;
  /**
   * Basic extractor. Returns all the text, and optionally all the notes
@ -66,61 +68,50 @@ public class PowerPointExtractor
 	ppe.close();
  }
-  /**
+	/**
-   * Creates a PowerPointExtractor, from a file
+	 * Creates a PowerPointExtractor, from a file
-   * @param fileName The name of the file to extract from
+	 * @param fileName The name of the file to extract from
-   */
+	 */
-  public PowerPointExtractor(String fileName) throws IOException {
+	public PowerPointExtractor(String fileName) throws IOException {
-	_hslfshow = new HSLFSlideShow(fileName);
+		this(new FileInputStream(fileName));
-	_show = new SlideShow(_hslfshow);
+	}
-	_slides = _show.getSlides();
+	/**
-	_notes = _show.getNotes();
+	 * Creates a PowerPointExtractor, from an Input Stream
-  }
+	 * @param iStream The input stream containing the PowerPoint document
 	 */
 	public PowerPointExtractor(InputStream iStream) throws IOException {
 		this(new POIFSFileSystem(iStream));
 	}
 	/**
 	 * Creates a PowerPointExtractor, from an open POIFSFileSystem
 	 * @param fs the POIFSFileSystem containing the PowerPoint document
 	 */
 	public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
 		this(new HSLFSlideShow(fs));
 	}
-  /**
+	/**
-   * Creates a PowerPointExtractor, from an Input Stream
+	 * Creates a PowerPointExtractor, from a HSLFSlideShow
-   * @param iStream The input stream containing the PowerPoint document
+	 * @param ss the HSLFSlideShow to extract text from
-   */
+	 */
-  public PowerPointExtractor(InputStream iStream) throws IOException {
+	public PowerPointExtractor(HSLFSlideShow ss) throws IOException {
-	_hslfshow = new HSLFSlideShow(iStream);
+		super(ss);
-	_show = new SlideShow(_hslfshow);
+		_hslfshow = ss;
-	_slides = _show.getSlides();
+		_show = new SlideShow(_hslfshow);
-	_notes = _show.getNotes();
+		_slides = _show.getSlides();
-  }
+		_notes = _show.getNotes();
 	}
-  /**
+	/**
-   * Creates a PowerPointExtractor, from an open POIFSFileSystem
+	 * Shuts down the underlying streams
-   * @param fs the POIFSFileSystem containing the PowerPoint document
+	 */
-   */
+	public void close() throws IOException {
-  public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
+		_hslfshow.close();
-	_hslfshow = new HSLFSlideShow(fs);
+		_hslfshow = null;
-	_show = new SlideShow(_hslfshow);
+		_show = null;
-	_slides = _show.getSlides();
+		_slides = null;
-	_notes = _show.getNotes();
+		_notes = null;
-  }
+	}
  /**
   * Creates a PowerPointExtractor, from a HSLFSlideShow
   * @param ss the HSLFSlideShow to extract text from
   */
  public PowerPointExtractor(HSLFSlideShow ss) throws IOException {
 	_hslfshow = ss;
 	_show = new SlideShow(_hslfshow);
 	_slides = _show.getSlides();
 	_notes = _show.getNotes();
  }
  /**
   * Shuts down the underlying streams
   */
  public void close() throws IOException {
 	_hslfshow.close();
 	_hslfshow = null;
 	_show = null;
 	_slides = null;
 	_notes = null;
  }
  /**
@ -195,4 +186,4 @@ public class PowerPointExtractor
 	return ret.toString();
  }
-}
+}
--- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
@ -22,6 +22,7 @@ import java.io.FileInputStream;
 import java.io.UnsupportedEncodingException;
 import java.util.Iterator;
 import org.apache.poi.POITextExtractor;
 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.model.TextPiece;
 import org.apache.poi.hwpf.usermodel.Paragraph;
@ -36,7 +37,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 *
 * @author Nick Burch (nick at torchbox dot com)
 */
-public class WordExtractor {
+public class WordExtractor extends POITextExtractor {
 	private POIFSFileSystem fs;
 	private HWPFDocument doc;
@ -62,6 +63,7 @@ public class WordExtractor {
 	 * @param doc The HWPFDocument to extract from
 	 */
 	public WordExtractor(HWPFDocument doc) throws IOException {
 		// Hand the document to the common POITextExtractor parent
 		super(doc);
 		// Keep the HWPFDocument-typed reference as well; the fs
 		//  field is not assigned by this constructor
 		this.doc = doc;
 	}
--- a/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java
+++ b/src/testcases/org/apache/poi/hssf/extractor/TestExcelExtractor.java
@ -0,0 +1,101 @@
 /* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
 ==================================================================== */
 package org.apache.poi.hssf.extractor;
 import java.io.File;
 import java.io.FileInputStream;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import junit.framework.TestCase;
 public class TestExcelExtractor extends TestCase {
 	/**
 	 * Opens the named file from the directory given by the
 	 *  HSSF.testdata.path system property, and wraps it
 	 *  in an ExcelExtractor.
 	 * The input stream is always closed before returning. This
 	 *  assumes the workbook is fully loaded during construction
 	 *  and does not read the stream again later.
 	 * @param filename name of the test file, relative to the
 	 *  test data directory
 	 * @return an extractor for the file, with default settings
 	 * @throws Exception if the file can't be opened or read
 	 */
 	private ExcelExtractor openExtractor(String filename) throws Exception {
 		String path = System.getProperty("HSSF.testdata.path");
 		// Fail clearly, rather than trying to open "null/<file>"
 		assertNotNull("The HSSF.testdata.path system property must be set", path);
 		FileInputStream fin = new FileInputStream(path + File.separator + filename);
 		try {
 			return new ExcelExtractor(new POIFSFileSystem(fin));
 		} finally {
 			fin.close();
 		}
 	}
 	/**
 	 * A workbook with a single text cell, checked with
 	 *  sheet names turned on and then off
 	 */
 	public void testSimple() throws Exception {
 		ExcelExtractor extractor = openExtractor("Simple.xls");
 		assertEquals("Sheet1\nreplaceMe\nSheet2\nSheet3\n", extractor.getText());
 		// Now turn off sheet names
 		extractor.setIncludeSheetNames(false);
 		assertEquals("replaceMe\n", extractor.getText());
 	}
 	/**
 	 * A formula with a numeric result, checked as the
 	 *  result and then as the formula text
 	 */
 	public void testNumericFormula() throws Exception {
 		ExcelExtractor extractor = openExtractor("sumifformula.xls");
 		assertEquals(
 				"Sheet1\n" +
 				"1000.0\t1.0\t5.0\n" +
 				"2000.0\t2.0\t\n" +
 				"3000.0\t3.0\t\n" +
 				"4000.0\t4.0\t\n" +
 				"5000.0\t5.0\t\n" +
 				"Sheet2\nSheet3\n",
 				extractor.getText()
 		);
 		extractor.setFormulasNotResults(true);
 		assertEquals(
 				"Sheet1\n" +
 				"1000.0\t1.0\tSUMIF(A1:A5,\">4000\",B1:B5)\n" +
 				"2000.0\t2.0\t\n" +
 				"3000.0\t3.0\t\n" +
 				"4000.0\t4.0\t\n" +
 				"5000.0\t5.0\t\n" +
 				"Sheet2\nSheet3\n",
 				extractor.getText()
 		);
 	}
 	/**
 	 * A formula that joins two text cells, checked as the
 	 *  result and then as the formula text
 	 */
 	public void testStringConcat() throws Exception {
 		ExcelExtractor extractor = openExtractor("SimpleWithFormula.xls");
 		// The formula result needs to come out as a string,
 		//  ie the two "replaceme" values joined together
 		assertEquals("Sheet1\nreplaceme\nreplaceme\nreplacemereplaceme\nSheet2\nSheet3\n", extractor.getText());
 		extractor.setFormulasNotResults(true);
 		assertEquals("Sheet1\nreplaceme\nreplaceme\nCONCATENATE(A1,A2)\nSheet2\nSheet3\n", extractor.getText());
 	}
 	/**
 	 * A formula with a text result, checked as the
 	 *  result and then as the formula text
 	 */
 	public void testStringFormula() throws Exception {
 		ExcelExtractor extractor = openExtractor("StringFormulas.xls");
 		// Comes out as NaN if treated as a number
 		// And as XYZ if treated as a string
 		assertEquals("Sheet1\nXYZ\nSheet2\nSheet3\n", extractor.getText());
 		extractor.setFormulasNotResults(true);
 		assertEquals("Sheet1\nUPPER(\"xyz\")\nSheet2\nSheet3\n", extractor.getText());
 	}
 }