refactored HSSFEventFactory to use RecordFactory instead of HSSFRecordStream, see Bugzilla 47448

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@791251 13f79535-47bb-0310-9956-ffa450edef68
2009-07-05 14:10:49 +00:00 · 2009-07-05 14:10:49 +00:00 · 15ffa9ebda
commit 15ffa9ebda
parent 9c5c51ad29
5 changed files with 248 additions and 348 deletions
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@ -33,6 +33,7 @@
    <changes>
        <release version="3.5-beta7" date="2009-??-??">
           <action dev="POI-DEVELOPERS" type="fix">47448 - Allow HSSFEventFactory to handle non-zero padding at the end of the workbook stream</action>
           <action dev="POI-DEVELOPERS" type="add">47456 - Support for getting OLE object data in PowerPointExtractor</action>
           <action dev="POI-DEVELOPERS" type="fix">47411 - Explicitly set the 1900 date system when creating XSSF workbooks</action>
           <action dev="POI-DEVELOPERS" type="add">47400 - Support for text extraction of footnotes, endnotes and comments in HWPF</action>
--- a/src/java/org/apache/poi/hssf/eventusermodel/HSSFEventFactory.java
+++ b/src/java/org/apache/poi/hssf/eventusermodel/HSSFEventFactory.java
@ -134,7 +134,7 @@ public class HSSFEventFactory
 		Record r = null;
 		// Create a new RecordStream and use that
-		HSSFRecordStream recordStream = new HSSFRecordStream(in);
+		RecordFactoryInputStream recordStream = new RecordFactoryInputStream(in);
 		// Process each record as they come in
 		while(going) {
--- a/src/java/org/apache/poi/hssf/eventusermodel/HSSFRecordStream.java
+++ b/src/java/org/apache/poi/hssf/eventusermodel/HSSFRecordStream.java
@ -1,234 +0,0 @@
 /* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
 ==================================================================== */
 package org.apache.poi.hssf.eventusermodel;
 import java.util.Vector;
 import org.apache.poi.hssf.record.ContinueRecord;
 import org.apache.poi.hssf.record.DrawingGroupRecord;
 import org.apache.poi.hssf.record.DrawingRecord;
 import org.apache.poi.hssf.record.ObjRecord;
 import org.apache.poi.hssf.record.Record;
 import org.apache.poi.hssf.record.RecordFactory;
 import org.apache.poi.hssf.record.RecordFormatException;
 import org.apache.poi.hssf.record.RecordInputStream;
 import org.apache.poi.hssf.record.TextObjectRecord;
 import org.apache.poi.hssf.record.UnknownRecord;
 /**
 * A stream based way to get at complete records, with
 *  as low a memory footprint as possible.
 * This handles reading from a RecordInputStream, turning
 *  the data into full records, processing continue records
 *  etc.
 * Most users should use {@link HSSFEventFactory} /
 *  {@link HSSFListener} and have new records pushed to
 *  them, but this does allow for a "pull" style of coding.
 * <p>
 * Call {@link #nextRecord()} repeatedly until it returns null.
 * Internally, the most recently built record is held back by
 *  one step, so that a following ContinueRecord can still be
 *  applied to it before it is handed out.
 */
 public class HSSFRecordStream {
 	/** The underlying stream that the records are read from */
 	private RecordInputStream in;
 	/** Have we run out of records on the stream? */
 	private boolean hitEOS = false;
 	/** Have we returned all the records there are? */
 	private boolean complete = false;
 	/**
 	 * Sometimes we end up with a bunch of
 	 *  records. When we do, these should
 	 *  be returned before the next normal
 	 *  record processing occurs (i.e. before
 	 *  we check for continue records and
 	 *  return rec).
 	 * Null whenever there is nothing queued.
 	 */
 	private Vector bonusRecords = null;
 	/** 
 	 * The next record to return, which may need to have its
 	 *  continue records passed to it before we do
 	 */
 	private Record rec = null;
 	/**
 	 * The record held in {@link #rec} at the end of the most
 	 *  recent read pass. It is consulted when a ContinueRecord
 	 *  turns up, to decide which record the continue data
 	 *  belongs to.
 	 */
 	private Record lastRec = null;
 	/**
 	 * The most recent DrawingRecord seen
 	 */
 	private DrawingRecord lastDrawingRecord = new DrawingRecord();
 	/**
 	 * Creates a record stream on top of the given input.
 	 *
 	 * @param inp the stream to pull the records from
 	 */
 	public HSSFRecordStream(RecordInputStream inp) {
 		this.in = inp;
 	}
 	/**
 	 * Returns the next (complete) record from the 
 	 *  stream, or null if there are no more.
 	 *
 	 * @return the next record, or null once the stream is used up
 	 */
 	public Record nextRecord() {
 		Record r = null;
 		// Loop until we get something
 		// (a single read pass can legitimately come back empty, eg
 		//  because the record it built is being held back for now)
 		while(r == null && !complete) {
 			// Are there any bonus records that we need to
 			//  return?
 			r = getBonusRecord();
 			// If not, ask for the next real record
 			if(r == null) {
 				r = getNextRecord();
 			}
 		}
 		// All done
 		return r;
 	}
 	/**
 	 * If there are any "bonus" records, that should
 	 *  be returned before processing new ones, 
 	 *  grabs the next and returns it.
 	 * If not, returns null.
 	 *
 	 * @return the oldest queued bonus record, or null if none are queued
 	 */
 	private Record getBonusRecord() {
 		if(bonusRecords != null) {
 			Record r = (Record)bonusRecords.remove(0);
 			// Drop the list once drained, so that null
 			//  keeps meaning "nothing queued"
 			if(bonusRecords.size() == 0) {
 				bonusRecords = null;
 			}
 			return r;
 		}
 		return null;
 	}
 	/**
 	 * Returns the next available record, or null if
 	 *  this pass didn't return a record that's
 	 *  suitable for returning (eg was a continue record).
 	 * The record built during this pass is stored in
 	 *  {@link #rec}; what gets returned is the record that
 	 *  was being held from an earlier pass, if any.
 	 *
 	 * @return a record that is ready to be handed out, or null
 	 * @throws RecordFormatException if a ContinueRecord shows up
 	 *  after a record that isn't one of the specially handled kinds
 	 */
 	private Record getNextRecord() {
 		Record toReturn = null;
 		if(in.hasNextRecord()) {
 			// Grab our next record
 			in.nextRecord();
 			short sid = in.getSid();
            //
            // For some reason a workbook has to be at least 4096 bytes long,
            // so short workbooks have the end of their stream filled with
            // zeros (many zeros).
            //
            // That is a problem when length( all zero records ) % 4 = 1:
            // each zero record is read as 4 bytes at once ( 2 - id and 2 - size ),
            // and the last single byte would then be read WRONGLY ( the id must
            // be 2 bytes ).
            //
            // So it is better to check for a zero sid and not build a record
            // from it. A zero sid tells us that the rest of the stream data is
            // only filler that pads the workbook to a certain size.
            //
            // Note that this only ends the current pass: complete is not set
            // here, so nextRecord() will simply ask for another pass.
            //
            if ( sid == 0 )
                return null;
            // If we had a last record, and this one
            //  isn't a continue record, then pass
            //  it on to the listener
 			if ((rec != null) && (sid != ContinueRecord.sid))
 			{
 				// This last record ought to be returned
 				toReturn = rec;
 			}
 			// If this record isn't a continue record,
 			//  then build it up
 			if (sid != ContinueRecord.sid)
 			{
 				//System.out.println("creating "+sid);
 				Record[] recs = RecordFactory.createRecord(in);
 				// We know that the multiple record situations
 				//  don't contain continue records, so just
 				//  pass those on to the listener now
 				if (recs.length > 1) {
 					bonusRecords = new Vector(recs.length-1);
 					for (int k = 0; k < (recs.length - 1); k++)	{
 						bonusRecords.add(recs[k]);
 					}
 				}
 				// Regardless of the number we created, always hold
 				//  onto the last record to be processed on the next
 				//  loop, in case it has any continue records
 				rec = recs[ recs.length - 1 ];
 				// Don't return it just yet though, as we probably have
 				//  a record from the last round to return
 			}
 			else {
 				// Normally, ContinueRecords are handled internally
 				// However, in a few cases, there is a gap between a record and
 				//  its Continue, so we have to handle them specially
 				// This logic is much like in RecordFactory.createRecords()
 				Record[] recs = RecordFactory.createRecord(in);
 				ContinueRecord crec = (ContinueRecord)recs[0];
 				if((lastRec instanceof ObjRecord) || (lastRec instanceof TextObjectRecord)) {
 					// You can have Obj records between a DrawingRecord
 					//  and its continue!
 					lastDrawingRecord.processContinueRecord( crec.getData() );
 					// Trigger them on the drawing record, now it's complete
 					// NOTE(review): this replaces the pending rec, which at this
 					//  point looks to be the Obj / TextObject record itself, so
 					//  that record never seems to get returned - please confirm
 					rec = lastDrawingRecord;
 				}
 				else if((lastRec instanceof DrawingGroupRecord)) {
 					((DrawingGroupRecord)lastRec).processContinueRecord(crec.getData());
 					// Trigger them on the drawing record, now it's complete
 					rec = lastRec;
 				}
 				else {
                    if (rec instanceof UnknownRecord) {
                        ;//silently skip records we don't know about
                    } else {
 					    throw new RecordFormatException("Records should handle ContinueRecord internally. Should not see this exception");
                    }
 				}
 			}
 			// Update our tracking of the last record
 			lastRec = rec;
 			if(rec instanceof DrawingRecord) {
 				lastDrawingRecord = (DrawingRecord)rec;
 			}
 		} else {
 			// No more records
 			hitEOS = true;
 		}
 		// If we've hit the end-of-stream, then
 		//  finish off the last record and be done
 		if(hitEOS) {
 			complete = true;
 			// Return the last record if there was
 			//  one, otherwise null
 			if(rec != null) {
 				toReturn = rec;
 				rec = null;
 			}
 		}
 		return toReturn;
 	}
 }
--- a/src/java/org/apache/poi/hssf/record/RecordFactory.java
+++ b/src/java/org/apache/poi/hssf/record/RecordFactory.java
@ -17,21 +17,14 @@
 package org.apache.poi.hssf.record;
 import org.apache.poi.hssf.record.chart.*;
 import org.apache.poi.hssf.record.pivottable.*;
 import java.io.InputStream;
 import java.lang.reflect.Constructor;
 import java.lang.reflect.InvocationTargetException;
 import java.lang.reflect.Modifier;
-import java.util.ArrayList;
+import java.util.*;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import org.apache.poi.hssf.record.chart.*;
 import org.apache.poi.hssf.record.pivottable.*;
 /**
 * Title:  Record Factory<P>
@ -259,7 +252,7 @@ public final class RecordFactory {
 		return new Record[] { record, };
 	}
-	static Record createSingleRecord(RecordInputStream in) {
+	public static Record createSingleRecord(RecordInputStream in) {
 		I_RecordCreator constructor = _recordCreatorsById.get(new Integer(in.getSid()));
 		if (constructor == null) {
@ -273,7 +266,7 @@ public final class RecordFactory {
 	 * RK record is a slightly smaller alternative to NumberRecord
 	 * POI likes NumberRecord better
 	 */
-	private static NumberRecord convertToNumberRecord(RKRecord rk) {
+	public static NumberRecord convertToNumberRecord(RKRecord rk) {
 		NumberRecord num = new NumberRecord();
 		num.setColumn(rk.getColumn());
@ -286,7 +279,7 @@ public final class RecordFactory {
 	/**
 	 * Converts a {@link MulRKRecord} into an equivalent array of {@link NumberRecord}s
 	 */
-	private static NumberRecord[] convertRKRecords(MulRKRecord mrk) {
+	public static NumberRecord[] convertRKRecords(MulRKRecord mrk) {
 		NumberRecord[] mulRecs = new NumberRecord[mrk.getNumColumns()];
 		for (int k = 0; k < mrk.getNumColumns(); k++) {
@ -374,109 +367,16 @@ public final class RecordFactory {
 	 * @exception RecordFormatException on error processing the InputStream
 	 */
 	public static List<Record> createRecords(InputStream in) throws RecordFormatException {
 		List<Record> records = new ArrayList<Record>(NUM_RECORDS);
-		RecordInputStream recStream = new RecordInputStream(in);
+		RecordFactoryInputStream recStream = new RecordFactoryInputStream(new RecordInputStream(in));
-		DrawingRecord lastDrawingRecord = new DrawingRecord( );
+                recStream.setIncludeContinueRecords(true);
 		Record lastRecord = null;
 		/*
 		 * How to recognise end of stream?
 		 * In the best case, the underlying input stream (in) ends just after the last EOF record
 		 * Usually however, the stream is padded with an arbitrary byte count.  Excel and most apps
 		 * reliably use zeros for padding and if this were always the case, this code could just
 		 * skip all the (zero sized) records with sid==0.  However, bug 46987 shows a file with
 		 * non-zero padding that is read OK by Excel (Excel also fixes the padding).
 		 *
 		 * So to properly detect the workbook end of stream, this code has to identify the last
 		 * EOF record.  This is not so easy because the workbook bof+eof pair do not bracket the
 		 * whole stream.  The worksheets follow the workbook, but it is not easy to tell how many
 		 * sheet sub-streams should be present.  Hence we are looking for an EOF record that is not
 		 * immediately followed by a BOF record.  One extra complication is that bof+eof sub-
 		 * streams can be nested within worksheet streams and it's not clear in these cases what
 		 * record might follow any EOF record.  So we also need to keep track of the bof/eof
 		 * nesting level.
 		 */
-		int bofDepth=0;
+        Record record;
-		boolean lastRecordWasEOFLevelZero = false;
+		while ((record = recStream.nextRecord())!=null) {
 		while (recStream.hasNextRecord()) {
 			recStream.nextRecord();
 			if (lastRecordWasEOFLevelZero && recStream.getSid() != BOFRecord.sid) {
 				// Normally InputStream (in) contains only zero padding after this point
 				break;
 			}
 			Record record = createSingleRecord(recStream);
 			lastRecordWasEOFLevelZero = false;
 			if (record instanceof BOFRecord) {
 				bofDepth++;
                        records.add(record);
 				continue;
 			}
 			if (record instanceof EOFRecord) {
 				bofDepth--;
 				records.add(record);
 				if (bofDepth<1) {
 					lastRecordWasEOFLevelZero = true;
 				}
 				continue;
 		}
 			if (record instanceof DBCellRecord) {
 				// Not needed by POI.  Regenerated from scratch by POI when spreadsheet is written
 				continue;
 			}
 			if (record instanceof RKRecord) {
 				records.add(convertToNumberRecord((RKRecord) record));
 				continue;
 			}
 			if (record instanceof MulRKRecord) {
 				addAll(records, convertRKRecords((MulRKRecord)record));
 				continue;
 			}
 			if (record.getSid() == DrawingGroupRecord.sid
 				   && lastRecord instanceof DrawingGroupRecord) {
 				DrawingGroupRecord lastDGRecord = (DrawingGroupRecord) lastRecord;
 				lastDGRecord.join((AbstractEscherHolderRecord) record);
 			} else if (record.getSid() == ContinueRecord.sid) {
 				ContinueRecord contRec = (ContinueRecord)record;
 				if (lastRecord instanceof ObjRecord || lastRecord instanceof TextObjectRecord) {
 					// Drawing records have a very strange continue behaviour.
 					//There can actually be OBJ records mixed between the continues.
 					lastDrawingRecord.processContinueRecord(contRec.getData() );
 					//we must remember the position of the continue record.
 					//in the serialization procedure the original structure of records must be preserved
 					records.add(record);
 				} else if (lastRecord instanceof DrawingGroupRecord) {
 					((DrawingGroupRecord)lastRecord).processContinueRecord(contRec.getData());
 				} else if (lastRecord instanceof UnknownRecord) {
 					//Gracefully handle records that we don't know about,
 					//that happen to be continued
 					records.add(record);
 				} else if (lastRecord instanceof EOFRecord) {
 					// This is really odd, but excel still sometimes
 					//  outputs a file like this all the same
 					records.add(record);
 				} else {
 					throw new RecordFormatException("Unhandled Continue Record");
 				}
 			} else {
 				lastRecord = record;
 				if (record instanceof DrawingRecord) {
 					lastDrawingRecord = (DrawingRecord) record;
 				}
 				records.add(record);
 			}
 		}
 		return records;
 	}
 	/**
 	 * Appends every element of <tt>srcRecs</tt> to the end of <tt>destList</tt>,
 	 * keeping the order the elements have in the array.
 	 *
 	 * @param destList the list that receives the records
 	 * @param srcRecs the records to append
 	 */
 	private static void addAll(List<Record> destList, Record[] srcRecs) {
 		for (Record srcRec : srcRecs) {
 			destList.add(srcRec);
 		}
 	}
 }
--- a/src/java/org/apache/poi/hssf/record/RecordFactoryInputStream.java
+++ b/src/java/org/apache/poi/hssf/record/RecordFactoryInputStream.java
@ -0,0 +1,233 @@
 /* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
 ==================================================================== */
 package org.apache.poi.hssf.record;
 import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
 import org.apache.poi.hssf.eventusermodel.HSSFListener;
 import java.util.Arrays;
 import java.util.LinkedList;
 import java.util.List;
 /**
 * A stream based way to get at complete records, with
 * as low a memory footprint as possible.
 * This handles reading from a RecordInputStream, turning
 * the data into full records, processing continue records
 * etc.
 * Most users should use {@link HSSFEventFactory} /
 * {@link HSSFListener} and have new records pushed to
 * them, but this does allow for a "pull" style of coding.
 * <p>
 * Call {@link #nextRecord()} repeatedly until it returns null.
 * Reading stops either when the underlying stream has no more
 * records, or when the first record after a top level EOF record
 * is not a BOF record (whatever follows is treated as padding).
 */
 public class RecordFactoryInputStream {
    /**
     * The underlying stream that the records are created from
     */
    private final RecordInputStream recStream;
    /**
     * Have we returned all the records there are?
     */
    private boolean complete = false;
    /**
     * Sometimes one read produces a bunch of records
     * (a MulRKRecord is converted into several NumberRecords).
     * The first one is returned straight away, and the rest
     * wait here to be returned before any new record is read.
     */
    private final LinkedList<Record> bonusRecords = new LinkedList<Record>();
    /**
     * The most recent record that took the default path through
     * {@link #getNextRecord()}. BOF, EOF, DBCell, RK, MulRK and Continue
     * records, and DrawingGroup records that were joined onto their
     * predecessor, do not update this.
     */
    private Record lastRecord = null;
    /**
     * The most recent DrawingRecord seen
     */
    private DrawingRecord lastDrawingRecord = new DrawingRecord();
    /**
     * Current BOF / EOF nesting level: goes up by one for each
     * BOF record read, and down by one for each EOF record read.
     */
    private int bofDepth = 0;
    /**
     * Was the record read just before this one an EOF record
     * that brought {@link #bofDepth} below 1?
     */
    private boolean lastRecordWasEOFLevelZero = false;
    /**
     * Should the Continue records that follow an Obj / TextObject
     * record be returned from {@link #nextRecord()}?
     */
    private boolean includeContinueRecords = false;

    /**
     * Creates a record stream on top of the given input.
     *
     * @param inp the stream to pull the records from
     */
    public RecordFactoryInputStream(RecordInputStream inp) {
        recStream = inp;
    }

    /**
     * Returns the next (complete) record from the
     * stream, or null if there are no more.
     *
     * @return the next record, or null once all records have been returned
     * @throws RecordFormatException if a Continue record follows a record
     *         that it cannot be attached to
     */
    public Record nextRecord() {
        Record r = null;
        // Loop until we get something: a single read can come back
        //  empty, eg when the record read was consumed internally
        while (r == null && !complete) {
            // Are there any bonus records that we need to
            //  return?
            r = getBonusRecord();
            // If not, ask for the next real record
            if (r == null) {
                r = getNextRecord();
            }
        }
        // All done
        return r;
    }

    /**
     * If there are any "bonus" records, that should
     * be returned before processing new ones,
     * grabs the next and returns it.
     * If not, returns null.
     *
     * @return the oldest queued bonus record, or null if none are queued
     */
    private Record getBonusRecord() {
        if (bonusRecords.isEmpty()) {
            return null;
        }
        return bonusRecords.removeFirst();
    }

    /**
     * Reads one record from the underlying stream and decides
     * what to do with it.
     *
     * @return the record to hand out, or null if this pass didn't
     *         produce a record that's suitable for returning (eg it was
     *         merged into an earlier record, or the end was reached)
     * @throws RecordFormatException if a Continue record follows a record
     *         that it cannot be attached to
     */
    private Record getNextRecord() {
        /*
        * How to recognise end of stream?
        * In the best case, the underlying input stream (in) ends just after the last EOF record
        * Usually however, the stream is padded with an arbitrary byte count.  Excel and most apps
        * reliably use zeros for padding and if this were always the case, this code could just
        * skip all the (zero sized) records with sid==0.  However, bug 46987 shows a file with
        * non-zero padding that is read OK by Excel (Excel also fixes the padding).
        *
        * So to properly detect the workbook end of stream, this code has to identify the last
        * EOF record.  This is not so easy because the workbook bof+eof pair do not bracket the
        * whole stream.  The worksheets follow the workbook, but it is not easy to tell how many
        * sheet sub-streams should be present.  Hence we are looking for an EOF record that is not
        * immediately followed by a BOF record.  One extra complication is that bof+eof sub-
        * streams can be nested within worksheet streams and it's not clear in these cases what
        * record might follow any EOF record.  So we also need to keep track of the bof/eof
        * nesting level.
        */
        if (!recStream.hasNextRecord()) {
            // No more records
            complete = true;
            return null;
        }
        // Grab our next record
        recStream.nextRecord();
        if (lastRecordWasEOFLevelZero && recStream.getSid() != BOFRecord.sid) {
            // Normally InputStream (in) contains only zero padding after this point
            complete = true;
            return null;
        }
        Record record = RecordFactory.createSingleRecord(recStream);
        lastRecordWasEOFLevelZero = false;
        if (record instanceof BOFRecord) {
            bofDepth++;
            return record;
        }
        if (record instanceof EOFRecord) {
            bofDepth--;
            if (bofDepth < 1) {
                lastRecordWasEOFLevelZero = true;
            }
            return record;
        }
        if (record instanceof DBCellRecord) {
            // Not needed by POI.  Regenerated from scratch by POI when spreadsheet is written
            return null;
        }
        if (record instanceof RKRecord) {
            return RecordFactory.convertToNumberRecord((RKRecord) record);
        }
        if (record instanceof MulRKRecord) {
            return expandMulRKRecord((MulRKRecord) record);
        }
        if (record.getSid() == DrawingGroupRecord.sid
                && lastRecord instanceof DrawingGroupRecord) {
            // Merge this one into the DrawingGroupRecord before it,
            //  so there is nothing new to hand out
            DrawingGroupRecord lastDGRecord = (DrawingGroupRecord) lastRecord;
            lastDGRecord.join((AbstractEscherHolderRecord) record);
            return null;
        }
        if (record.getSid() == ContinueRecord.sid) {
            return handleContinueRecord((ContinueRecord) record);
        }
        // An ordinary record: remember it, in case a Continue follows
        lastRecord = record;
        if (record instanceof DrawingRecord) {
            lastDrawingRecord = (DrawingRecord) record;
        }
        return record;
    }

    /**
     * Converts a MulRKRecord into its NumberRecords, queueing all but
     * the first of them as bonus records.
     *
     * @param mulRk the record to convert
     * @return the first NumberRecord, or null if the conversion
     *         produced no records at all
     */
    private Record expandMulRKRecord(MulRKRecord mulRk) {
        NumberRecord[] numbers = RecordFactory.convertRKRecords(mulRk);
        if (numbers.length == 0) {
            // Nothing to hand out. Without this check, subList(1, 0)
            //  below would fail with an IllegalArgumentException
            return null;
        }
        List<NumberRecord> all = Arrays.asList(numbers);
        bonusRecords.addAll(all.subList(1, all.size()));
        return numbers[0];
    }

    /**
     * Works out what to do with a Continue record that was not
     * consumed by the record in front of it, based on {@link #lastRecord}.
     *
     * @param contRec the Continue record that was just read
     * @return the Continue record itself if it is to be handed out,
     *         or null if its data was only fed into an earlier record
     * @throws RecordFormatException if the preceding record is not one
     *         of the kinds handled here
     */
    private Record handleContinueRecord(ContinueRecord contRec) {
        if (lastRecord instanceof ObjRecord || lastRecord instanceof TextObjectRecord) {
            // Drawing records have a very strange continue behaviour.
            // There can actually be OBJ records mixed between the continues.
            lastDrawingRecord.processContinueRecord(contRec.getData());
            // We must remember the position of the continue record:
            //  in the serialization procedure the original structure
            //  of records must be preserved. Callers that don't
            //  serialize can leave includeContinueRecords switched off
            return includeContinueRecords ? contRec : null;
        }
        if (lastRecord instanceof DrawingGroupRecord) {
            ((DrawingGroupRecord) lastRecord).processContinueRecord(contRec.getData());
            return null;
        }
        if (lastRecord instanceof UnknownRecord) {
            // Gracefully handle records that we don't know about,
            //  that happen to be continued
            return contRec;
        }
        // NOTE(review): getNextRecord() returns EOF records without
        //  storing them in lastRecord, so this check looks like it can
        //  never match - please confirm before relying on it
        if (lastRecord instanceof EOFRecord) {
            // This is really odd, but excel still sometimes
            //  outputs a file like this all the same
            return contRec;
        }
        throw new RecordFormatException("Unhandled Continue Record");
    }

    /**
     * Sets whether the Continue records that follow an Obj or TextObject
     * record are returned from {@link #nextRecord()}. Either way, their
     * data is passed to the most recent DrawingRecord. The default is
     * not to return them.
     * Continue records that follow an UnknownRecord are always
     * returned, whatever this is set to.
     *
     * @param includeContinueRecords true to have those Continue records returned
     */
    public void setIncludeContinueRecords(boolean includeContinueRecords) {
        this.includeContinueRecords = includeContinueRecords;
    }
 }