QuickButCruddyTextExtractor - gets all the text (including stuff you might not want), but does it fast

git-svn-id: https://svn.apache.org/repos/asf/jakarta/poi/trunk@353710 13f79535-47bb-0310-9956-ffa450edef68
2005-06-09 15:09:58 +00:00 · 2005-06-09 15:09:58 +00:00 · db67860378
commit db67860378
parent d40aa464fa
2 changed files with 214 additions and 5 deletions
--- a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
@ -126,7 +126,9 @@ public class PowerPointExtractor
  }

  /**
-   * Fetches text from the slideshow, be it slide text or note text
+   * Fetches text from the slideshow, be it slide text or note text.
+   * Because the final block of text in a TextRun normally have their
+   *  last \n stripped, we add it back
   * @param getSlideText fetch slide text
   * @param getNoteText fetch note text
   */
@ -139,6 +141,7 @@ public class PowerPointExtractor
 			TextRun[] runs = slide.getTextRuns();
 			for(int j=0; j<runs.length; j++) {
 				TextRun run = runs[j];
+				if(run != null) {
 					String text = run.getText();
 					ret.append(text);
 					if(! text.endsWith("\n")) {
@ -146,6 +149,7 @@ public class PowerPointExtractor
 					}
 				}
 			}
+		}
 		if(getNoteText) {
 			ret.append(" ");
 		}
--- a/src/scratchpad/src/org/apache/poi/hslf/extractor/QuickButCruddyTextExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hslf/extractor/QuickButCruddyTextExtractor.java
@ -0,0 +1,205 @@
+
+/* ====================================================================
+   Copyright 2002-2004   Apache Software Foundation
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+        
+
+
+package org.apache.poi.hslf.extractor;
+
+import java.io.*;
+import java.util.Vector;
+
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSDocument;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.util.LittleEndian;
+
+import org.apache.poi.hslf.record.Record;
+import org.apache.poi.hslf.record.TextHeaderAtom;
+import org.apache.poi.hslf.record.TextBytesAtom;
+import org.apache.poi.hslf.record.TextCharsAtom;
+import org.apache.poi.hslf.model.TextRun;
+
+/**
+ * This class will get all the text from a Powerpoint Document, including
+ *  all the bits you didn't want, and in a somewhat random order, but will
+ *  do it very fast.
+ * The class ignores most of the hslf classes, and doesn't use 
+ *  HSLFSlideShow. Instead, it just does a very basic scan through the
+ *  file, grabbing all the text records as it goes. It then returns the
+ *  text, either as a single string, or as a vector of all the individual
+ *  strings.
+ * Because of how it works, it will return a lot of "crud" text that you 
+ *  probably didn't want! It will return text from master slides. It will
+ *  return duplicate text, and some mangled text (powerpoint files often
+ *  have duplicate copies of slide text in them). You don't get any idea
+ *  what the text was associated with.
+ * Almost everyone will want to use @see PowerPointExtractor instead. There
+ *  are only a very small number of cases (eg some performance sensitive
+ *  lucene indexers) that would ever want to use this!
+ *
+ * @author Nick Burch
+ */
+
+public class QuickButCruddyTextExtractor
+{
+	private POIFSFileSystem fs;
+	private InputStream is;
+	private byte[] pptContents;
+
+	/**
+	 * Really basic text extractor, that will also return lots of crud text.
+	 * Takes a single argument, the file to extract from
+	 */
+	public static void main(String args[]) throws IOException
+	{
+		if(args.length < 1) {
+			System.err.println("Useage:");
+			System.err.println("\tQuickButCruddyTextExtractor <file>");
+			System.exit(1);
+		}
+
+		String file = args[0];
+
+		QuickButCruddyTextExtractor ppe = new QuickButCruddyTextExtractor(file);
+		System.out.println(ppe.getTextAsString());
+		ppe.close();
+	}
+
+	/**
+	 * Creates an extractor from a given file name
+	 * @param fileName
+	 */
+	public QuickButCruddyTextExtractor(String fileName) throws IOException {
+		this(new FileInputStream(fileName));
+	}
+
+	/**
+	 * Creates an extractor from a given input stream
+	 * @param iStream
+	 */
+	public QuickButCruddyTextExtractor(InputStream iStream) throws IOException {
+		this(new POIFSFileSystem(iStream));
+		is = iStream;
+	}
+
+	/**
+	 * Creates an extractor from a POIFS Filesystem
+	 * @param poifs
+	 */
+	public QuickButCruddyTextExtractor(POIFSFileSystem poifs) throws IOException {
+		fs = poifs;
+
+		// Find the PowerPoint bit, and get out the bytes
+		DocumentEntry docProps =
+			(DocumentEntry)fs.getRoot().getEntry("PowerPoint Document");
+		pptContents = new byte[docProps.getSize()];
+		fs.createDocumentInputStream("PowerPoint Document").read(pptContents);
+	}
+
+
+	/**
+	 * Shuts down the underlying streams
+	 */
+	public void close() throws IOException {
+		if(is != null) { is.close(); }
+		fs = null;
+	}
+
+	/**
+	 * Fetches the ALL the text of the powerpoint file, as a single string
+	 */
+	public String getTextAsString() {
+		StringBuffer ret = new StringBuffer();
+		Vector textV = getTextAsVector();
+		for(int i=0; i<textV.size(); i++) {
+			String text = (String)textV.get(i);
+			ret.append(text);
+			if(! text.endsWith("\n")) {
+				ret.append('\n');
+			}
+		}
+		return ret.toString();
+	}
+
+	/**
+	 * Fetches the ALL the text of the powerpoint file, in a vector of
+	 *  strings, one per text record
+	 */
+	public Vector getTextAsVector() {
+		Vector textV = new Vector();
+
+		// Set to the start of the file
+		int walkPos = 0;
+
+		// Start walking the file, looking for the records
+		while(walkPos != -1) {
+			int newPos = findTextRecords(walkPos,textV);
+			walkPos = newPos;
+		}
+
+		// Return what we find
+		return textV;
+	}
+
+	/**
+	 * For the given position, look if the record is a text record, and wind
+	 *  on after.
+	 * If it is a text record, grabs out the text. Whatever happens, returns
+	 *  the position of the next record, or -1 if no more.
+	 */
+	public int findTextRecords(int startPos, Vector textV) {
+		// Grab the length, and the first option byte
+		// Note that the length doesn't include the 8 byte atom header
+		int len = (int)LittleEndian.getUInt(pptContents,startPos+4);
+		byte opt = pptContents[startPos];
+
+		// If it's a container, step into it and return
+		// (If it's a container, option byte 1 BINARY_AND 0x0f will be 0x0f)
+		int container = (int)opt & 0x0f;
+		if(container == 0x0f) {
+			return (startPos+8);
+		}
+
+		// Otherwise, check the type to see if it's text
+		long type = LittleEndian.getUShort(pptContents,startPos+2);
+		TextRun trun = null;
+
+		// TextBytesAtom
+		if(type == 4008l) {
+			TextBytesAtom tba = (TextBytesAtom)Record.createRecordForType(type, pptContents, startPos, len+8);
+			trun = new TextRun((TextHeaderAtom)null,tba);
+		}
+		// TextCharsAtom
+		if(type == 4000l) {
+			TextCharsAtom tca = (TextCharsAtom)Record.createRecordForType(type, pptContents, startPos, len+8);
+			trun = new TextRun((TextHeaderAtom)null,tca);
+		}
+
+		// If we found text, save it in the vector
+		if(trun != null) {
+			textV.add(trun.getText());
+		}
+		
+		// Wind on by the atom length, and check we're not at the end
+		int newPos = (startPos + 8 + len);
+		if(newPos > (pptContents.length - 8)) { 
+			newPos = -1;
+		}
+		return newPos;
+	}
+}