refactored HSSFEventFactory to use RecordFactory instead of HSSFRecordStream, see Bugzilla 47448

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@791251 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yegor Kozlov 2009-07-05 14:10:49 +00:00
parent 9c5c51ad29
commit 15ffa9ebda
5 changed files with 248 additions and 348 deletions

View File

@ -33,6 +33,7 @@
<changes> <changes>
<release version="3.5-beta7" date="2009-??-??"> <release version="3.5-beta7" date="2009-??-??">
<action dev="POI-DEVELOPERS" type="fix">47448 - Allow HSSFEventFactory to handle non-zero padding at the end of the workbook stream</action>
<action dev="POI-DEVELOPERS" type="add">47456 - Support for getting OLE object data in PowerPointExtractor</action> <action dev="POI-DEVELOPERS" type="add">47456 - Support for getting OLE object data in PowerPointExtractor</action>
<action dev="POI-DEVELOPERS" type="fix">47411 - Explicitly set the 1900 date system when creating XSSF workbooks</action> <action dev="POI-DEVELOPERS" type="fix">47411 - Explicitly set the 1900 date system when creating XSSF workbooks</action>
<action dev="POI-DEVELOPERS" type="add">47400 - Support for text extraction of footnotes, endnotes and comments in HWPF</action> <action dev="POI-DEVELOPERS" type="add">47400 - Support for text extraction of footnotes, endnotes and comments in HWPF</action>

View File

@ -134,7 +134,7 @@ public class HSSFEventFactory
Record r = null; Record r = null;
// Create a new RecordStream and use that // Create a new RecordStream and use that
HSSFRecordStream recordStream = new HSSFRecordStream(in); RecordFactoryInputStream recordStream = new RecordFactoryInputStream(in);
// Process each record as they come in // Process each record as they come in
while(going) { while(going) {

View File

@ -1,234 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hssf.eventusermodel;
import java.util.Vector;
import org.apache.poi.hssf.record.ContinueRecord;
import org.apache.poi.hssf.record.DrawingGroupRecord;
import org.apache.poi.hssf.record.DrawingRecord;
import org.apache.poi.hssf.record.ObjRecord;
import org.apache.poi.hssf.record.Record;
import org.apache.poi.hssf.record.RecordFactory;
import org.apache.poi.hssf.record.RecordFormatException;
import org.apache.poi.hssf.record.RecordInputStream;
import org.apache.poi.hssf.record.TextObjectRecord;
import org.apache.poi.hssf.record.UnknownRecord;
/**
* A stream based way to get at complete records, with
* as low a memory footprint as possible.
* Reads from a {@link RecordInputStream} and hands back one
* {@link Record} per call to {@link #nextRecord()}. Each newly
* built record is held back for one read so that a following
* {@link ContinueRecord} can still be applied before the record
* is handed out.
* Most users should use {@link HSSFEventFactory} /
* {@link HSSFListener} and have new records pushed to
* them, but this does allow for a "pull" style of coding.
*/
public class HSSFRecordStream {
/** The underlying stream that raw records are read from. */
private RecordInputStream in;
/** Have we run out of records on the stream? */
private boolean hitEOS = false;
/** Have we returned all the records there are? Ends the loop in {@link #nextRecord()}. */
private boolean complete = false;
/**
* Extra records produced when a single read yields several
* records. When present, these are handed out by
* {@link #nextRecord()} before any further reading of the
* stream takes place. Null when there are none queued.
*/
private Vector bonusRecords = null;
/**
* The next record to return, which may need to have its
* continue records passed to it before we do
*/
private Record rec = null;
/**
* The value of {@link #rec} after the most recent read.
* Used to decide which record a following ContinueRecord
* belongs to.
*/
private Record lastRec = null;
/**
* The most recent DrawingRecord seen. Starts out as an empty
* DrawingRecord so it is never null.
*/
private DrawingRecord lastDrawingRecord = new DrawingRecord();
/**
* Creates a record stream on top of the given input.
*
* @param inp the record input stream to pull records from
*/
public HSSFRecordStream(RecordInputStream inp) {
this.in = inp;
}
/**
* Returns the next (complete) record from the
* stream, or null if there are no more.
*
* @return the next record, or null once {@link #complete} has been set
*/
public Record nextRecord() {
Record r = null;
// Keep going until a record turns up or the stream is exhausted;
// a single read may legitimately produce nothing (e.g. a continue record)
while(r == null && !complete) {
// Queued bonus records take priority over reading new data
r = getBonusRecord();
// If there were none, ask for the next real record
if(r == null) {
r = getNextRecord();
}
}
// Either a record, or null because complete is now true
return r;
}
/**
* If there are any "bonus" records, that should
* be returned before processing new ones,
* removes the first one from the queue and returns it.
* The queue is reset to null once it becomes empty.
*
* @return the first queued bonus record, or null if none are queued
*/
private Record getBonusRecord() {
if(bonusRecords != null) {
Record r = (Record)bonusRecords.remove(0);
if(bonusRecords.size() == 0) {
bonusRecords = null;
}
return r;
}
return null;
}
/**
* Reads one raw record from the stream and returns the record
* that is now ready to be handed out, if any.
* A non-continue record is built and held in {@link #rec}; the
* record that was previously held is the one returned.
*
* @return the previously held record, or null if
*  this pass didn't yield a record that's
*  suitable for returning (eg was a continue record).
* @throws RecordFormatException if a ContinueRecord follows a record
*  that is not handled here and the held record is not an UnknownRecord
*/
private Record getNextRecord() {
Record toReturn = null;
if(in.hasNextRecord()) {
// Grab our next record
in.nextRecord();
short sid = in.getSid();
//
// Workbook streams are padded out to at least 4096 bytes, and that
// padding is written as (many) zeros.
//
// Parsing the padding as records is unsafe: each zero "record" is read
// as 4 bytes at once (2 for the id, 2 for the size), so if the padding
// length is not a multiple of 4 the trailing bytes cannot form a
// complete record header and would be read wrongly.
//
// A zero sid is therefore taken to mean that the rest of the stream is
// filler, and nothing more is read for this record.
//
// NOTE(review): 'complete' is not set on this path, so nextRecord()
// will call this method again for whatever follows.
//
if ( sid == 0 )
return null;
// If we are holding a record from the previous read, and this one
// isn't a continue record, then the held record is finished and
// can be handed out
if ((rec != null) && (sid != ContinueRecord.sid))
{
// This held record ought to be returned
toReturn = rec;
}
// If this record isn't a continue record,
// then build it up
if (sid != ContinueRecord.sid)
{
Record[] recs = RecordFactory.createRecord(in);
// We know that the multiple record situations
// don't contain continue records, so queue all but the
// last one to be handed out on the following calls
if (recs.length > 1) {
bonusRecords = new Vector(recs.length-1);
for (int k = 0; k < (recs.length - 1); k++) {
bonusRecords.add(recs[k]);
}
}
// Regardless of the number we created, always hold
// onto the last record to be processed on the next
// loop, in case it has any continue records
rec = recs[ recs.length - 1 ];
// Don't return it just yet though, as we probably have
// a record from the last round to return
}
else {
// Normally, ContinueRecords are handled internally
// However, in a few cases, there is a gap between a record and
// its Continue, so we have to handle them specially
// This logic is much like in RecordFactory.createRecords()
Record[] recs = RecordFactory.createRecord(in);
ContinueRecord crec = (ContinueRecord)recs[0];
if((lastRec instanceof ObjRecord) || (lastRec instanceof TextObjectRecord)) {
// You can have Obj records between a DrawingRecord
// and its continue!
lastDrawingRecord.processContinueRecord( crec.getData() );
// Hold the drawing record as the next one to return, now it's complete
// NOTE(review): this replaces whatever was held in 'rec' without
// returning it first - confirm the held record is not lost
rec = lastDrawingRecord;
}
else if((lastRec instanceof DrawingGroupRecord)) {
((DrawingGroupRecord)lastRec).processContinueRecord(crec.getData());
// Keep holding the drawing group record, now with the continue data applied
rec = lastRec;
}
else {
if (rec instanceof UnknownRecord) {
;//silently skip records we don't know about
} else {
throw new RecordFormatException("Records should handle ContinueRecord internally. Should not see this exception");
}
}
}
// Update our tracking of the last record
lastRec = rec;
if(rec instanceof DrawingRecord) {
lastDrawingRecord = (DrawingRecord)rec;
}
} else {
// No more records
hitEOS = true;
}
// If we've hit the end-of-stream, then
// finish off the held record and be done
if(hitEOS) {
complete = true;
// Return the held record if there was
// one, otherwise null
if(rec != null) {
toReturn = rec;
rec = null;
}
}
return toReturn;
}
}

View File

@ -17,21 +17,14 @@
package org.apache.poi.hssf.record; package org.apache.poi.hssf.record;
import org.apache.poi.hssf.record.chart.*;
import org.apache.poi.hssf.record.pivottable.*;
import java.io.InputStream; import java.io.InputStream;
import java.lang.reflect.Constructor; import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException; import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Modifier; import java.lang.reflect.Modifier;
import java.util.ArrayList; import java.util.*;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.poi.hssf.record.chart.*;
import org.apache.poi.hssf.record.pivottable.*;
/** /**
* Title: Record Factory<P> * Title: Record Factory<P>
@ -259,7 +252,7 @@ public final class RecordFactory {
return new Record[] { record, }; return new Record[] { record, };
} }
static Record createSingleRecord(RecordInputStream in) { public static Record createSingleRecord(RecordInputStream in) {
I_RecordCreator constructor = _recordCreatorsById.get(new Integer(in.getSid())); I_RecordCreator constructor = _recordCreatorsById.get(new Integer(in.getSid()));
if (constructor == null) { if (constructor == null) {
@ -273,7 +266,7 @@ public final class RecordFactory {
* RK record is a slightly smaller alternative to NumberRecord * RK record is a slightly smaller alternative to NumberRecord
* POI likes NumberRecord better * POI likes NumberRecord better
*/ */
private static NumberRecord convertToNumberRecord(RKRecord rk) { public static NumberRecord convertToNumberRecord(RKRecord rk) {
NumberRecord num = new NumberRecord(); NumberRecord num = new NumberRecord();
num.setColumn(rk.getColumn()); num.setColumn(rk.getColumn());
@ -286,7 +279,7 @@ public final class RecordFactory {
/** /**
* Converts a {@link MulRKRecord} into an equivalent array of {@link NumberRecord}s * Converts a {@link MulRKRecord} into an equivalent array of {@link NumberRecord}s
*/ */
private static NumberRecord[] convertRKRecords(MulRKRecord mrk) { public static NumberRecord[] convertRKRecords(MulRKRecord mrk) {
NumberRecord[] mulRecs = new NumberRecord[mrk.getNumColumns()]; NumberRecord[] mulRecs = new NumberRecord[mrk.getNumColumns()];
for (int k = 0; k < mrk.getNumColumns(); k++) { for (int k = 0; k < mrk.getNumColumns(); k++) {
@ -374,109 +367,16 @@ public final class RecordFactory {
* @exception RecordFormatException on error processing the InputStream * @exception RecordFormatException on error processing the InputStream
*/ */
public static List<Record> createRecords(InputStream in) throws RecordFormatException { public static List<Record> createRecords(InputStream in) throws RecordFormatException {
List<Record> records = new ArrayList<Record>(NUM_RECORDS); List<Record> records = new ArrayList<Record>(NUM_RECORDS);
RecordInputStream recStream = new RecordInputStream(in); RecordFactoryInputStream recStream = new RecordFactoryInputStream(new RecordInputStream(in));
DrawingRecord lastDrawingRecord = new DrawingRecord( ); recStream.setIncludeContinueRecords(true);
Record lastRecord = null;
/*
* How to recognise end of stream?
* In the best case, the underlying input stream (in) ends just after the last EOF record
* Usually however, the stream is padded with an arbitrary byte count. Excel and most apps
* reliably use zeros for padding and if this were always the case, this code could just
* skip all the (zero sized) records with sid==0. However, bug 46987 shows a file with
* non-zero padding that is read OK by Excel (Excel also fixes the padding).
*
* So to properly detect the workbook end of stream, this code has to identify the last
* EOF record. This is not so easy because the workbook bof+eof pair do not bracket the
* whole stream. The worksheets follow the workbook, but it is not easy to tell how many
* sheet sub-streams should be present. Hence we are looking for an EOF record that is not
* immediately followed by a BOF record. One extra complication is that bof+eof sub-
* streams can be nested within worksheet streams and it's not clear in these cases what
* record might follow any EOF record. So we also need to keep track of the bof/eof
* nesting level.
*/
int bofDepth=0; Record record;
boolean lastRecordWasEOFLevelZero = false; while ((record = recStream.nextRecord())!=null) {
while (recStream.hasNextRecord()) {
recStream.nextRecord();
if (lastRecordWasEOFLevelZero && recStream.getSid() != BOFRecord.sid) {
// Normally InputStream (in) contains only zero padding after this point
break;
}
Record record = createSingleRecord(recStream);
lastRecordWasEOFLevelZero = false;
if (record instanceof BOFRecord) {
bofDepth++;
records.add(record); records.add(record);
continue;
}
if (record instanceof EOFRecord) {
bofDepth--;
records.add(record);
if (bofDepth<1) {
lastRecordWasEOFLevelZero = true;
}
continue;
} }
if (record instanceof DBCellRecord) {
// Not needed by POI. Regenerated from scratch by POI when spreadsheet is written
continue;
}
if (record instanceof RKRecord) {
records.add(convertToNumberRecord((RKRecord) record));
continue;
}
if (record instanceof MulRKRecord) {
addAll(records, convertRKRecords((MulRKRecord)record));
continue;
}
if (record.getSid() == DrawingGroupRecord.sid
&& lastRecord instanceof DrawingGroupRecord) {
DrawingGroupRecord lastDGRecord = (DrawingGroupRecord) lastRecord;
lastDGRecord.join((AbstractEscherHolderRecord) record);
} else if (record.getSid() == ContinueRecord.sid) {
ContinueRecord contRec = (ContinueRecord)record;
if (lastRecord instanceof ObjRecord || lastRecord instanceof TextObjectRecord) {
// Drawing records have a very strange continue behaviour.
//There can actually be OBJ records mixed between the continues.
lastDrawingRecord.processContinueRecord(contRec.getData() );
//we must remember the position of the continue record.
//in the serialization procedure the original structure of records must be preserved
records.add(record);
} else if (lastRecord instanceof DrawingGroupRecord) {
((DrawingGroupRecord)lastRecord).processContinueRecord(contRec.getData());
} else if (lastRecord instanceof UnknownRecord) {
//Gracefully handle records that we don't know about,
//that happen to be continued
records.add(record);
} else if (lastRecord instanceof EOFRecord) {
// This is really odd, but excel still sometimes
// outputs a file like this all the same
records.add(record);
} else {
throw new RecordFormatException("Unhandled Continue Record");
}
} else {
lastRecord = record;
if (record instanceof DrawingRecord) {
lastDrawingRecord = (DrawingRecord) record;
}
records.add(record);
}
}
return records; return records;
} }
private static void addAll(List<Record> destList, Record[] srcRecs) {
for (int i = 0; i < srcRecs.length; i++) {
destList.add(srcRecs[i]);
}
}
} }

View File

@ -0,0 +1,233 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hssf.record;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
/**
 * A stream based way to get at complete records, with
 * as low a memory footprint as possible.
 * <p>
 * Wraps a {@link RecordInputStream} and returns one {@link Record} per call
 * to {@link #nextRecord()}. Along the way it converts RK / MulRK records to
 * {@link NumberRecord}s, drops {@link DBCellRecord}s, merges drawing-related
 * continue data into the record it belongs to, and stops at the last
 * top-level EOF record so that trailing padding is never parsed.
 * <p>
 * Most users should use {@link HSSFEventFactory} /
 * {@link HSSFListener} and have new records pushed to
 * them, but this does allow for a "pull" style of coding.
 */
public class RecordFactoryInputStream {

    /** The underlying stream that raw records are read from. */
    private final RecordInputStream recStream;

    /** Have we returned all the records there are? */
    private boolean complete = false;

    /**
     * Records that were produced together with an earlier one (the extra
     * NumberRecords of a MulRK record). They are handed out, in order,
     * before the stream is read again.
     */
    private final LinkedList<Record> bonusRecords = new LinkedList<Record>();

    /**
     * The most recent record that reached the general (non-special-cased)
     * path of {@link #getNextRecord()}. Used to decide what a following
     * ContinueRecord or DrawingGroupRecord belongs to.
     */
    private Record lastRecord = null;

    /** The most recent DrawingRecord seen; starts as an empty record so it is never null. */
    private DrawingRecord lastDrawingRecord = new DrawingRecord();

    /** Current BOF/EOF nesting level: incremented per BOF, decremented per EOF. */
    private int bofDepth = 0;

    /** True when the record read last was an EOF that brought {@link #bofDepth} below 1. */
    private boolean lastRecordWasEOFLevelZero = false;

    /** Whether ContinueRecords that follow Obj / TextObject records are returned to the caller. */
    private boolean includeContinueRecords = false;

    /**
     * Creates a record stream on top of the given input.
     *
     * @param inp the record input stream to pull records from
     */
    public RecordFactoryInputStream(RecordInputStream inp) {
        recStream = inp;
    }

    /**
     * Returns the next (complete) record from the
     * stream, or null if there are no more.
     *
     * @return the next record, or null once the end of the workbook stream was reached
     * @throws RecordFormatException if a ContinueRecord follows a record it cannot belong to
     */
    public Record nextRecord() {
        Record r = null;
        // A single read may yield nothing (skipped or merged records),
        // so keep going until a record turns up or the stream is done
        while (r == null && !complete) {
            // Queued bonus records take priority over reading new data
            r = getBonusRecord();
            if (r == null) {
                r = getNextRecord();
            }
        }
        return r;
    }

    /**
     * Removes and returns the first queued "bonus" record.
     *
     * @return the first queued record, or null if none are queued
     */
    private Record getBonusRecord() {
        // poll() returns null on an empty list, so no emptiness check is needed
        return bonusRecords.poll();
    }

    /**
     * Reads one raw record from the stream and turns it into the record to
     * hand out.
     *
     * @return the record to return, or null if this pass didn't produce a
     *         record that's suitable for returning (it was skipped, merged
     *         into an earlier record, or the stream has ended)
     * @throws RecordFormatException if a ContinueRecord follows a record it cannot belong to
     */
    private Record getNextRecord() {
        /*
         * How to recognise end of stream?
         * In the best case, the underlying input stream (in) ends just after the last EOF record
         * Usually however, the stream is padded with an arbitrary byte count.  Excel and most apps
         * reliably use zeros for padding and if this were always the case, this code could just
         * skip all the (zero sized) records with sid==0.  However, bug 46987 shows a file with
         * non-zero padding that is read OK by Excel (Excel also fixes the padding).
         *
         * So to properly detect the workbook end of stream, this code has to identify the last
         * EOF record.  This is not so easy because the workbook bof+eof pair do not bracket the
         * whole stream.  The worksheets follow the workbook, but it is not easy to tell how many
         * sheet sub-streams should be present.  Hence we are looking for an EOF record that is not
         * immediately followed by a BOF record.  One extra complication is that bof+eof sub-
         * streams can be nested within worksheet streams and it's not clear in these cases what
         * record might follow any EOF record.  So we also need to keep track of the bof/eof
         * nesting level.
         */
        if (!recStream.hasNextRecord()) {
            // No more records
            complete = true;
            return null;
        }

        // Grab our next record
        recStream.nextRecord();
        if (lastRecordWasEOFLevelZero && recStream.getSid() != BOFRecord.sid) {
            // Normally InputStream (in) contains only zero padding after this point
            complete = true;
            return null;
        }

        Record record = RecordFactory.createSingleRecord(recStream);
        lastRecordWasEOFLevelZero = false;

        if (record instanceof BOFRecord) {
            bofDepth++;
            return record;
        }
        if (record instanceof EOFRecord) {
            bofDepth--;
            if (bofDepth < 1) {
                lastRecordWasEOFLevelZero = true;
            }
            return record;
        }
        if (record instanceof DBCellRecord) {
            // Not needed by POI.  Regenerated from scratch by POI when spreadsheet is written
            return null;
        }
        if (record instanceof RKRecord) {
            return RecordFactory.convertToNumberRecord((RKRecord) record);
        }
        if (record instanceof MulRKRecord) {
            NumberRecord[] numbers = RecordFactory.convertRKRecords((MulRKRecord) record);
            if (numbers.length == 0) {
                // A MulRK with no columns contributes nothing; without this guard
                // both subList(1, 0) and numbers[0] below would throw
                return null;
            }
            // Hand out the first number now and queue the rest for the following calls
            List<NumberRecord> remaining = Arrays.asList(numbers).subList(1, numbers.length);
            bonusRecords.addAll(remaining);
            return numbers[0];
        }
        if (record.getSid() == DrawingGroupRecord.sid && lastRecord instanceof DrawingGroupRecord) {
            // Consecutive drawing group records are merged into the first one
            ((DrawingGroupRecord) lastRecord).join((AbstractEscherHolderRecord) record);
            return null;
        }
        if (record.getSid() == ContinueRecord.sid) {
            return handleContinueRecord((ContinueRecord) record);
        }

        // An ordinary record: remember it so later continue records can be attributed
        lastRecord = record;
        if (record instanceof DrawingRecord) {
            lastDrawingRecord = (DrawingRecord) record;
        }
        return record;
    }

    /**
     * Applies a ContinueRecord that was not consumed by the record before it,
     * based on what {@link #lastRecord} is.
     *
     * @param contRec the continue record just read
     * @return the continue record itself when it has to be passed on to the
     *         caller, or null when its data was merged into an earlier record
     * @throws RecordFormatException if the continue record follows a record type that is not handled
     */
    private Record handleContinueRecord(ContinueRecord contRec) {
        if (lastRecord instanceof ObjRecord || lastRecord instanceof TextObjectRecord) {
            // Drawing records have a very strange continue behaviour.
            // There can actually be OBJ records mixed between the continues.
            lastDrawingRecord.processContinueRecord(contRec.getData());
            // We must remember the position of the continue record: in the
            // serialization procedure the original structure of records must be
            // preserved, so callers that re-serialize ask for it to be included
            return includeContinueRecords ? contRec : null;
        }
        if (lastRecord instanceof DrawingGroupRecord) {
            ((DrawingGroupRecord) lastRecord).processContinueRecord(contRec.getData());
            return null;
        }
        if (lastRecord instanceof UnknownRecord) {
            // Gracefully handle records that we don't know about,
            // that happen to be continued
            return contRec;
        }
        if (lastRecord instanceof EOFRecord) {
            // This is really odd, but excel still sometimes
            // outputs a file like this all the same
            // NOTE(review): lastRecord is never assigned for EOF records in
            // getNextRecord(), so this branch looks unreachable - confirm intent
            return contRec;
        }
        throw new RecordFormatException("Unhandled Continue Record");
    }

    /**
     * Sets whether ContinueRecords that follow an Obj or TextObject record
     * are returned from {@link #nextRecord()} (true) or only merged into the
     * preceding drawing record (false, the default).
     *
     * @param includeContinueRecords true to have those continue records returned
     */
    public void setIncludeContinueRecords(boolean includeContinueRecords) {
        this.includeContinueRecords = includeContinueRecords;
    }
}