Start on HSSFOptimiser, which removes un-needed cell styles and fonts, fixing up references as it does so

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@677041 13f79535-47bb-0310-9956-ffa450edef68
2008-07-15 21:15:16 +00:00 · 2008-07-15 21:15:16 +00:00 · b2edbb5332
commit b2edbb5332
parent aa11ce85c1
8 changed files with 372 additions and 2 deletions
--- a/src/documentation/content/xdocs/changes.xml
+++ b/src/documentation/content/xdocs/changes.xml
@ -37,6 +37,7 @@

 		<!-- Don't forget to update status.xml too! -->
        <release version="3.1.1-alpha1" date="2008-??-??">
+           <action dev="POI-DEVELOPERS" type="add">New helper, HSSFOptimiser, which handles removing duplicated font and style records, to avoid going over the limits in Excel</action>
           <action dev="POI-DEVELOPERS" type="fix">45322 - Fixed NPE in HSSFSheet.autoSizeColumn() when cell number format was not found</action>
           <action dev="POI-DEVELOPERS" type="add">45380 - Missing return keyword in ArrayPtg.toFormulaString()</action>
           <action dev="POI-DEVELOPERS" type="add">44958 - Record level support for Data Tables. (No formula parser support though)</action>
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@ -34,6 +34,7 @@
 	<!-- Don't forget to update changes.xml too! -->
    <changes>
        <release version="3.1.1-alpha1" date="2008-??-??">
+           <action dev="POI-DEVELOPERS" type="add">New helper, HSSFOptimiser, which handles removing duplicated font and style records, to avoid going over the limits in Excel</action>
           <action dev="POI-DEVELOPERS" type="fix">45322 - Fixed NPE in HSSFSheet.autoSizeColumn() when cell number format was not found</action>
           <action dev="POI-DEVELOPERS" type="add">45380 - Missing return keyword in ArrayPtg.toFormulaString()</action>
           <action dev="POI-DEVELOPERS" type="add">44958 - Record level support for Data Tables. (No formula parser support though)</action>
--- a/src/java/org/apache/poi/hssf/record/UnicodeString.java
+++ b/src/java/org/apache/poi/hssf/record/UnicodeString.java
@ -439,6 +439,23 @@ public class UnicodeString
      this.field_5_ext_rst = ext_rst;
    }

+
+    /**
+     * Swaps all use in the string of one font index 
+     *  for use of a different font index.
+     * Normally only called when fonts have been
+     *  removed / re-ordered.
+     * Does nothing if the string has no formatting runs.
+     *
+     * @param oldFontIndex the font index to search the formatting runs for
+     * @param newFontIndex the font index to use in place of oldFontIndex
+     */
+    public void swapFontUse(short oldFontIndex, short newFontIndex) {
+    	// Defensive guard: callers such as HSSFOptimiser invoke this
+    	//  for every string cell, and a string without any formatting
+    	//  may have no run list at all (TODO confirm against the
+    	//  field's initialisation), in which case there is nothing to swap
+    	if(field_4_format_runs == null) {
+    		return;
+    	}
+    	
+    	Iterator i = field_4_format_runs.iterator();
+    	while(i.hasNext()) {
+    		FormatRun run = (FormatRun)i.next();
+    		if(run.fontIndex == oldFontIndex) {
+    			run.fontIndex = newFontIndex;
+    		}
+    	}
+    }
+    
    /**
     * unlike the real records we return the same as "getString()" rather than debug info
     * @see #getDebugInfo()
--- a/src/java/org/apache/poi/hssf/usermodel/HSSFOptimiser.java
+++ b/src/java/org/apache/poi/hssf/usermodel/HSSFOptimiser.java
@ -0,0 +1,178 @@
+/* ====================================================================
+   Copyright 2002-2004   Apache Software Foundation
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hssf.usermodel;
+
+import java.util.HashSet;
+import java.util.IdentityHashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+import org.apache.poi.hssf.record.ExtendedFormatRecord;
+import org.apache.poi.hssf.record.FontRecord;
+import org.apache.poi.hssf.record.UnicodeString;
+
+/**
+ * Excel can get cranky if you give it files containing too
+ *  many (especially duplicate) objects, and this class can
+ *  help to avoid those.
+ * In general, it's much better to make sure you don't 
+ *  duplicate the objects in your code, as this is likely
+ *  to be much faster than creating lots and lots of
+ *  excel objects+records, only to optimise them down to
+ *  many fewer at a later stage.
+ * However, sometimes this is too hard / tricky to do, which
+ *  is where the use of this class comes in.
+ */
+public class HSSFOptimiser {
+	/**
+	 * Goes through the Workbook, optimising the fonts by
+	 *  removing duplicate ones.
+	 * For now, only works on fonts used in {@link HSSFCellStyle}
+	 *  and {@link HSSFRichTextString}. Any other font uses
+	 *  (eg charts, pictures) may well end up broken!
+	 * This can be a slow operation, especially if you have
+	 *  lots of cells, cell styles or rich text strings
+	 * @param workbook The workbook in which to optimise the fonts
+	 */
+	public static void optimiseFonts(HSSFWorkbook workbook) {
+		// Where each font has ended up, and if we need to
+		//  delete the record for it. Start off with no change.
+		// Font index 4 is never used, so the arrays have one
+		//  more slot than there are font records
+		short[] newPos = 
+			new short[workbook.getWorkbook().getNumberOfFontRecords()+1];
+		boolean[] zapRecords = new boolean[newPos.length];
+		for(int i=0; i<newPos.length; i++) {
+			newPos[i] = (short)i;
+		}
+		
+		// Get each font record up front, so we can do
+		//  deletes without getting confused
+		FontRecord[] frecs = new FontRecord[newPos.length]; 
+		for(int i=0; i<newPos.length; i++) {
+			// There is no 4!
+			if(i == 4) continue;
+			
+			frecs[i] = workbook.getWorkbook().getFontRecordAt(i);
+		}
+		
+		// Loop over each font, seeing if it is the same
+		//  as an earlier one. If it is, point users of the
+		//  later duplicate copy to the earlier one, and 
+		//  mark the later one as needing deleting
+		// Note - don't change built in fonts (those before 5)
+		for(int i=5; i<newPos.length; i++) {
+			// Check this one for being a duplicate
+			//  of an earlier one. The search stops at the
+			//  first (lowest index) match
+			int earlierDuplicate = -1;
+			for(int j=0; j<i && earlierDuplicate == -1; j++) {
+				if(j == 4) continue;
+				
+				// Nothing has been deleted yet, so the
+				//  records fetched above are still current
+				if(frecs[j].sameProperties(frecs[i])) {
+					earlierDuplicate = j;
+				}
+			}
+			
+			// If we got a duplicate, mark it as such
+			if(earlierDuplicate != -1) {
+				newPos[i] = (short)earlierDuplicate;
+				zapRecords[i] = true;
+			}
+		}
+		
+		// Update the new positions based on
+		//  deletes that have occurred between
+		//  the start and them
+		// Only need to worry about user fonts
+		for(int i=5; i<newPos.length; i++) {
+			// Find the number deleted to that
+			//  point, and adjust
+			short preDeletePos = newPos[i];
+			short newPosition = preDeletePos;
+			for(int j=0; j<preDeletePos; j++) {
+				if(zapRecords[j]) newPosition--;
+			}
+			
+			// Update the new position
+			newPos[i] = newPosition;
+		}
+		
+		// Zap the un-needed user font records
+		for(int i=5; i<newPos.length; i++) {
+			if(zapRecords[i]) {
+				workbook.getWorkbook().removeFontRecord(
+						frecs[i]
+				);
+			}
+		}
+		
+		// Tell HSSFWorkbook that it needs to
+		//  re-start its HSSFFontCache
+		workbook.resetFontCache();
+		
+		// Update the cell styles to point at the 
+		//  new locations of the fonts
+		for(int i=0; i<workbook.getWorkbook().getNumExFormats(); i++) {
+			ExtendedFormatRecord xfr = workbook.getWorkbook().getExFormatAt(i);
+			xfr.setFontIndex(
+					newPos[ xfr.getFontIndex() ]
+			);
+		}
+		
+		// Update the rich text strings to point at
+		//  the new locations of the fonts
+		// Remember that one underlying unicode string
+		//  may be shared by multiple RichTextStrings!
+		// The strings already handled are tracked by object
+		//  identity, not equals(). The strings are changed in
+		//  place, so if UnicodeString compares by content, an
+		//  already updated string could compare equal to a
+		//  different string that still needs updating, which
+		//  would then be wrongly skipped
+		Map doneUnicodeStrings = new IdentityHashMap();
+		for(int sheetNum=0; sheetNum<workbook.getNumberOfSheets(); sheetNum++) {
+			HSSFSheet s = workbook.getSheetAt(sheetNum);
+			Iterator rIt = s.rowIterator();
+			while(rIt.hasNext()) {
+				HSSFRow row = (HSSFRow)rIt.next();
+				Iterator cIt = row.cellIterator();
+				while(cIt.hasNext()) {
+					HSSFCell cell = (HSSFCell)cIt.next();
+					if(cell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
+						HSSFRichTextString rtr = cell.getRichStringCellValue();
+						UnicodeString u = rtr.getRawUnicodeString();
+						
+						// Have we done this string already?
+						if(! doneUnicodeStrings.containsKey(u)) {
+							// Update for each new position.
+							// A font only ever moves to a lower
+							//  index, and we work upwards, so a
+							//  swapped run is never swapped again
+							for(short i=5; i<newPos.length; i++) {
+								if(i != newPos[i]) {
+									u.swapFontUse(i, newPos[i]);
+								}
+							}
+							
+							// Mark as done
+							doneUnicodeStrings.put(u, Boolean.TRUE);
+						}
+					}
+				}
+			}
+		}
+	}
+	
+	/**
+	 * Goes through the Workbook, optimising the cell styles
+	 *  by removing duplicate ones.
+	 * For best results, optimise the fonts via a call to
+	 *  {@link #optimiseFonts(HSSFWorkbook)} first.
+	 * Note - this is currently an empty placeholder, and
+	 *  leaves the workbook unchanged.
+	 * @param workbook The workbook in which to optimise the cell styles
+	 */
+	public static void optimiseCellStyles(HSSFWorkbook workbook) {
+		// Not implemented yet
+	}
+}
--- a/src/java/org/apache/poi/hssf/usermodel/HSSFRichTextString.java
+++ b/src/java/org/apache/poi/hssf/usermodel/HSSFRichTextString.java
@ -67,7 +67,7 @@ public class HSSFRichTextString
    
    /** Called whenever the unicode string is modified. When it is modified
     *  we need to create a new SST index, so that other LabelSSTRecords will not
-     *  be affected by changes tat we make to this string.
+     *  be affected by changes that we make to this string.
     */
    private UnicodeString cloneStringIfRequired() {
      if (book == null)
@ -167,10 +167,25 @@ public class HSSFRichTextString
        return string.getString();
    }

-    /** Used internally by the HSSFCell to get the internal string value*/
+    /** 
+     * Used internally by the HSSFCell to get the internal 
+     * string value.
+     * Will ensure the string is not shared
+     */
    UnicodeString getUnicodeString() {
      return cloneStringIfRequired();
    }
+    
+    /**
+     * Hands back the underlying Unicode String as-is,
+     *  without first making sure it is unshared.
+     * Intended for style tweaks such as updating font
+     *  positions.
+     * As the string is probably shared, any change made
+     *  to it may well affect other RichTextStrings too!
+     */
+    UnicodeString getRawUnicodeString() {
+    	return this.string;
+    }

    /** Used internally by the HSSFCell to set the internal string value*/
    void setUnicodeString(UnicodeString str) {
--- a/src/java/org/apache/poi/hssf/usermodel/HSSFWorkbook.java
+++ b/src/java/org/apache/poi/hssf/usermodel/HSSFWorkbook.java
@ -1073,6 +1073,16 @@ public class HSSFWorkbook extends POIDocument

        return retval;
    }
+    
+    /**
+     * Throws away the fonts cache, so that every later
+     *  call to getFontAt() has to create a new object.
+     * Only meant to be called once fonts have been deleted,
+     *  which is not something you should normally do.
+     */
+    protected void resetFontCache() {
+    	this.fonts = new Hashtable();
+    }

    /**
     * create a new Cell style and add it to the workbook's style table
--- a/src/testcases/org/apache/poi/hssf/usermodel/AllUserModelTests.java
+++ b/src/testcases/org/apache/poi/hssf/usermodel/AllUserModelTests.java
@ -47,6 +47,7 @@ public class AllUserModelTests {
 		result.addTestSuite(TestHSSFDateUtil.class);
 		result.addTestSuite(TestHSSFHeaderFooter.class);
 		result.addTestSuite(TestHSSFHyperlink.class);
+		result.addTestSuite(TestHSSFOptimiser.class);
 		result.addTestSuite(TestHSSFPalette.class);
 		result.addTestSuite(TestHSSFPatriarch.class);
 		result.addTestSuite(TestHSSFPicture.class);
--- a/src/testcases/org/apache/poi/hssf/usermodel/TestHSSFOptimiser.java
+++ b/src/testcases/org/apache/poi/hssf/usermodel/TestHSSFOptimiser.java
@ -0,0 +1,147 @@
+/* ====================================================================
+   Copyright 2002-2004   Apache Software Foundation
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hssf.usermodel;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for {@link HSSFOptimiser}, checking the font counts
+ *  and the font indexes used by cell styles and rich text
+ *  strings, before and after optimising.
+ */
+public class TestHSSFOptimiser extends TestCase {
+	/**
+	 * With a single, non-duplicated user font, optimising
+	 *  must leave the font and style counts alone, and the
+	 *  cell style must still resolve to the same font.
+	 */
+	public void testDoesNoHarmIfNothingToDo() throws Exception {
+		HSSFWorkbook wb = new HSSFWorkbook();
+		
+		HSSFFont f = wb.createFont();
+		f.setFontName("Testing");
+		HSSFCellStyle s = wb.createCellStyle();
+		s.setFont(f);
+		
+		// Counts with our one extra font and one extra style
+		assertEquals(5, wb.getNumberOfFonts());
+		assertEquals(22, wb.getNumCellStyles());
+		
+		// Optimise fonts
+		HSSFOptimiser.optimiseFonts(wb);
+		
+		assertEquals(5, wb.getNumberOfFonts());
+		assertEquals(22, wb.getNumCellStyles());
+		
+		assertEquals(f, s.getFont(wb));
+		
+		// Optimise styles
+		// (call left disabled - optimiseCellStyles has no
+		//  implementation yet)
+//		HSSFOptimiser.optimiseCellStyles(wb);
+		
+		assertEquals(5, wb.getNumberOfFonts());
+		assertEquals(22, wb.getNumCellStyles());
+		
+		assertEquals(f, s.getFont(wb));
+	}
+	
+	/**
+	 * Creates six user fonts, two of which duplicate earlier
+	 *  ones, uses them from cell styles and rich text, and
+	 *  then checks that optimising removes the two duplicates
+	 *  and re-points every user at the right font index.
+	 */
+	public void testOptimiseFonts() throws Exception {
+		HSSFWorkbook wb = new HSSFWorkbook();
+		
+		// Add 6 fonts, some duplicates
+		// (f4 has the same settings as f1, and f5 as f2)
+		HSSFFont f1 = wb.createFont();
+		f1.setFontHeight((short)11);
+		f1.setFontName("Testing");
+		
+		HSSFFont f2 = wb.createFont();
+		f2.setFontHeight((short)22);
+		f2.setFontName("Also Testing");
+		
+		HSSFFont f3 = wb.createFont();
+		f3.setFontHeight((short)33);
+		f3.setFontName("Unique");
+		
+		HSSFFont f4 = wb.createFont();
+		f4.setFontHeight((short)11);
+		f4.setFontName("Testing");
+		
+		HSSFFont f5 = wb.createFont();
+		f5.setFontHeight((short)22);
+		f5.setFontName("Also Testing");
+		
+		HSSFFont f6 = wb.createFont();
+		f6.setFontHeight((short)66);
+		f6.setFontName("Also Unique");
+		
+		
+		
+		// Use four of the six fonts in cell styles
+		//  (f1, f4, f5 and f6). The asserted indexes
+		//  show the user fonts start at index 5
+		HSSFCellStyle cs1 = wb.createCellStyle();
+		cs1.setFont(f1);
+		assertEquals(5, cs1.getFontIndex());
+		
+		HSSFCellStyle cs2 = wb.createCellStyle();
+		cs2.setFont(f4);
+		assertEquals(8, cs2.getFontIndex());
+		
+		HSSFCellStyle cs3 = wb.createCellStyle();
+		cs3.setFont(f5);
+		assertEquals(9, cs3.getFontIndex());
+		
+		HSSFCellStyle cs4 = wb.createCellStyle();
+		cs4.setFont(f6);
+		assertEquals(10, cs4.getFontIndex());
+		
+		
+		// And five of them in rich text
+		//  (f1 and f2 in the first string,
+		//   f3, f5 and f6 in the second)
+		HSSFSheet s = wb.createSheet();
+		HSSFRow r = s.createRow(0);
+		
+		HSSFRichTextString rtr1 = new HSSFRichTextString("Test");
+		rtr1.applyFont(0, 2, f1);
+		rtr1.applyFont(3, 4, f2);
+		r.createCell((short)0).setCellValue(rtr1);
+		
+		HSSFRichTextString rtr2 = new HSSFRichTextString("AlsoTest");
+		rtr2.applyFont(0, 2, f3);
+		rtr2.applyFont(3, 5, f5);
+		rtr2.applyFont(6, 8, f6);
+		r.createCell((short)1).setCellValue(rtr2);
+		
+		
+		// Check what we have now
+		assertEquals(10, wb.getNumberOfFonts());
+		assertEquals(25, wb.getNumCellStyles());
+		
+		// Optimise
+		HSSFOptimiser.optimiseFonts(wb);
+		
+		// Check font count - the two duplicates have
+		//  gone, the cell styles are untouched
+		assertEquals(8, wb.getNumberOfFonts());
+		assertEquals(25, wb.getNumCellStyles());
+		
+		// Check font use in cell styles
+		assertEquals(5, cs1.getFontIndex());
+		assertEquals(5, cs2.getFontIndex()); // duplicate of 1
+		assertEquals(6, cs3.getFontIndex()); // duplicate of 2
+		assertEquals(8, cs4.getFontIndex()); // two have gone
+		
+		
+		// And in rich text
+		
+		// RTR 1 had f1 and f2, unchanged 
+		assertEquals(5, r.getCell(0).getRichStringCellValue().getFontAtIndex(0));
+		assertEquals(5, r.getCell(0).getRichStringCellValue().getFontAtIndex(1));
+		assertEquals(6, r.getCell(0).getRichStringCellValue().getFontAtIndex(3));
+		assertEquals(6, r.getCell(0).getRichStringCellValue().getFontAtIndex(4));
+		
+		// RTR 2 had f3 (unchanged), f5 (=f2) and f6 (moved down)
+		assertEquals(7, r.getCell(1).getRichStringCellValue().getFontAtIndex(0));
+		assertEquals(7, r.getCell(1).getRichStringCellValue().getFontAtIndex(1));
+		assertEquals(6, r.getCell(1).getRichStringCellValue().getFontAtIndex(3));
+		assertEquals(6, r.getCell(1).getRichStringCellValue().getFontAtIndex(4));
+		assertEquals(8, r.getCell(1).getRichStringCellValue().getFontAtIndex(6));
+		assertEquals(8, r.getCell(1).getRichStringCellValue().getFontAtIndex(7));
+	}
+}