QuickButCruddyTextExtractor - gets all the text (including stuff you might not want), but does it fast
git-svn-id: https://svn.apache.org/repos/asf/jakarta/poi/trunk@353710 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d40aa464fa
commit
db67860378
@ -126,7 +126,9 @@ public class PowerPointExtractor
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Fetches text from the slideshow, be it slide text or note text
|
* Fetches text from the slideshow, be it slide text or note text.
|
||||||
|
* Because the final block of text in a TextRun normally have their
|
||||||
|
* last \n stripped, we add it back
|
||||||
* @param getSlideText fetch slide text
|
* @param getSlideText fetch slide text
|
||||||
* @param getNoteText fetch note text
|
* @param getNoteText fetch note text
|
||||||
*/
|
*/
|
||||||
@ -139,10 +141,12 @@ public class PowerPointExtractor
|
|||||||
TextRun[] runs = slide.getTextRuns();
|
TextRun[] runs = slide.getTextRuns();
|
||||||
for(int j=0; j<runs.length; j++) {
|
for(int j=0; j<runs.length; j++) {
|
||||||
TextRun run = runs[j];
|
TextRun run = runs[j];
|
||||||
String text = run.getText();
|
if(run != null) {
|
||||||
ret.append(text);
|
String text = run.getText();
|
||||||
if(! text.endsWith("\n")) {
|
ret.append(text);
|
||||||
ret.append("\n");
|
if(! text.endsWith("\n")) {
|
||||||
|
ret.append("\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,205 @@
|
|||||||
|
|
||||||
|
/* ====================================================================
|
||||||
|
Copyright 2002-2004 Apache Software Foundation
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
package org.apache.poi.hslf.extractor;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.util.Vector;
|
||||||
|
|
||||||
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
import org.apache.poi.poifs.filesystem.POIFSDocument;
|
||||||
|
import org.apache.poi.poifs.filesystem.DocumentEntry;
|
||||||
|
import org.apache.poi.poifs.filesystem.DocumentInputStream;
|
||||||
|
import org.apache.poi.util.LittleEndian;
|
||||||
|
|
||||||
|
import org.apache.poi.hslf.record.Record;
|
||||||
|
import org.apache.poi.hslf.record.TextHeaderAtom;
|
||||||
|
import org.apache.poi.hslf.record.TextBytesAtom;
|
||||||
|
import org.apache.poi.hslf.record.TextCharsAtom;
|
||||||
|
import org.apache.poi.hslf.model.TextRun;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class will get all the text from a Powerpoint Document, including
|
||||||
|
* all the bits you didn't want, and in a somewhat random order, but will
|
||||||
|
* do it very fast.
|
||||||
|
* The class ignores most of the hslf classes, and doesn't use
|
||||||
|
* HSLFSlideShow. Instead, it just does a very basic scan through the
|
||||||
|
* file, grabbing all the text records as it goes. It then returns the
|
||||||
|
* text, either as a single string, or as a vector of all the individual
|
||||||
|
* strings.
|
||||||
|
* Because of how it works, it will return a lot of "crud" text that you
|
||||||
|
* probably didn't want! It will return text from master slides. It will
|
||||||
|
* return duplicate text, and some mangled text (powerpoint files often
|
||||||
|
* have duplicate copies of slide text in them). You don't get any idea
|
||||||
|
* what the text was associated with.
|
||||||
|
* Almost everyone will want to use @see PowerPointExtractor instead. There
|
||||||
|
* are only a very small number of cases (eg some performance sensitive
|
||||||
|
* lucene indexers) that would ever want to use this!
|
||||||
|
*
|
||||||
|
* @author Nick Burch
|
||||||
|
*/
|
||||||
|
|
||||||
|
public class QuickButCruddyTextExtractor
|
||||||
|
{
|
||||||
|
private POIFSFileSystem fs;
|
||||||
|
private InputStream is;
|
||||||
|
private byte[] pptContents;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Really basic text extractor, that will also return lots of crud text.
|
||||||
|
* Takes a single argument, the file to extract from
|
||||||
|
*/
|
||||||
|
public static void main(String args[]) throws IOException
|
||||||
|
{
|
||||||
|
if(args.length < 1) {
|
||||||
|
System.err.println("Useage:");
|
||||||
|
System.err.println("\tQuickButCruddyTextExtractor <file>");
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
String file = args[0];
|
||||||
|
|
||||||
|
QuickButCruddyTextExtractor ppe = new QuickButCruddyTextExtractor(file);
|
||||||
|
System.out.println(ppe.getTextAsString());
|
||||||
|
ppe.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates an extractor from a given file name
|
||||||
|
* @param fileName
|
||||||
|
*/
|
||||||
|
public QuickButCruddyTextExtractor(String fileName) throws IOException {
|
||||||
|
this(new FileInputStream(fileName));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates an extractor from a given input stream
|
||||||
|
* @param iStream
|
||||||
|
*/
|
||||||
|
public QuickButCruddyTextExtractor(InputStream iStream) throws IOException {
|
||||||
|
this(new POIFSFileSystem(iStream));
|
||||||
|
is = iStream;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates an extractor from a POIFS Filesystem
|
||||||
|
* @param poifs
|
||||||
|
*/
|
||||||
|
public QuickButCruddyTextExtractor(POIFSFileSystem poifs) throws IOException {
|
||||||
|
fs = poifs;
|
||||||
|
|
||||||
|
// Find the PowerPoint bit, and get out the bytes
|
||||||
|
DocumentEntry docProps =
|
||||||
|
(DocumentEntry)fs.getRoot().getEntry("PowerPoint Document");
|
||||||
|
pptContents = new byte[docProps.getSize()];
|
||||||
|
fs.createDocumentInputStream("PowerPoint Document").read(pptContents);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Shuts down the underlying streams
|
||||||
|
*/
|
||||||
|
public void close() throws IOException {
|
||||||
|
if(is != null) { is.close(); }
|
||||||
|
fs = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fetches the ALL the text of the powerpoint file, as a single string
|
||||||
|
*/
|
||||||
|
public String getTextAsString() {
|
||||||
|
StringBuffer ret = new StringBuffer();
|
||||||
|
Vector textV = getTextAsVector();
|
||||||
|
for(int i=0; i<textV.size(); i++) {
|
||||||
|
String text = (String)textV.get(i);
|
||||||
|
ret.append(text);
|
||||||
|
if(! text.endsWith("\n")) {
|
||||||
|
ret.append('\n');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ret.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fetches the ALL the text of the powerpoint file, in a vector of
|
||||||
|
* strings, one per text record
|
||||||
|
*/
|
||||||
|
public Vector getTextAsVector() {
|
||||||
|
Vector textV = new Vector();
|
||||||
|
|
||||||
|
// Set to the start of the file
|
||||||
|
int walkPos = 0;
|
||||||
|
|
||||||
|
// Start walking the file, looking for the records
|
||||||
|
while(walkPos != -1) {
|
||||||
|
int newPos = findTextRecords(walkPos,textV);
|
||||||
|
walkPos = newPos;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return what we find
|
||||||
|
return textV;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* For the given position, look if the record is a text record, and wind
|
||||||
|
* on after.
|
||||||
|
* If it is a text record, grabs out the text. Whatever happens, returns
|
||||||
|
* the position of the next record, or -1 if no more.
|
||||||
|
*/
|
||||||
|
public int findTextRecords(int startPos, Vector textV) {
|
||||||
|
// Grab the length, and the first option byte
|
||||||
|
// Note that the length doesn't include the 8 byte atom header
|
||||||
|
int len = (int)LittleEndian.getUInt(pptContents,startPos+4);
|
||||||
|
byte opt = pptContents[startPos];
|
||||||
|
|
||||||
|
// If it's a container, step into it and return
|
||||||
|
// (If it's a container, option byte 1 BINARY_AND 0x0f will be 0x0f)
|
||||||
|
int container = (int)opt & 0x0f;
|
||||||
|
if(container == 0x0f) {
|
||||||
|
return (startPos+8);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Otherwise, check the type to see if it's text
|
||||||
|
long type = LittleEndian.getUShort(pptContents,startPos+2);
|
||||||
|
TextRun trun = null;
|
||||||
|
|
||||||
|
// TextBytesAtom
|
||||||
|
if(type == 4008l) {
|
||||||
|
TextBytesAtom tba = (TextBytesAtom)Record.createRecordForType(type, pptContents, startPos, len+8);
|
||||||
|
trun = new TextRun((TextHeaderAtom)null,tba);
|
||||||
|
}
|
||||||
|
// TextCharsAtom
|
||||||
|
if(type == 4000l) {
|
||||||
|
TextCharsAtom tca = (TextCharsAtom)Record.createRecordForType(type, pptContents, startPos, len+8);
|
||||||
|
trun = new TextRun((TextHeaderAtom)null,tca);
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we found text, save it in the vector
|
||||||
|
if(trun != null) {
|
||||||
|
textV.add(trun.getText());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wind on by the atom length, and check we're not at the end
|
||||||
|
int newPos = (startPos + 8 + len);
|
||||||
|
if(newPos > (pptContents.length - 8)) {
|
||||||
|
newPos = -1;
|
||||||
|
}
|
||||||
|
return newPos;
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user