/*
 * HDFObjectFactory.java
 *
 * Created on February 24, 2002, 2:17 PM
 */

package org.apache.poi.hdf.model;

import java.util.ArrayList;
import java.io.InputStream;
import java.io.IOException;
import java.util.List;
import java.util.TreeSet;

import org.apache.poi.hdf.model.hdftypes.*;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSDocument;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.util.LittleEndian;

/**
 * The Object Factory takes in a stream and creates the low level objects
 * that represent the data.
 *
 * <p>Construction reads the "WordDocument" stream and the table stream
 * ("0Table" or "1Table", as selected by the FIB) fully into memory and then
 * populates the text piece, section, paragraph and character run sets from
 * those two buffers.</p>
 *
 * @author andy
 */
public class HDFObjectFactory
{
    /** Name of the OLE stream that holds the main document. */
    private static final String MAIN_STREAM_NAME = "WordDocument";

    /** Size in bytes of one formatted disk page (FKP) in the main stream. */
    private static final int FKP_PAGE_SIZE = 512;

    /** Bit set in a piece descriptor's file position when the piece is not unicode. */
    private static final int COMPRESSED_PIECE_FLAG = 0x40000000;

    /** Value of a section descriptor's SEPX offset meaning "no SEPX stored". */
    private static final int NO_SEPX = 0xFFFFFFFF;

    /** OLE stuff*/
    private POIFSFileSystem _filesystem;
    /** The FIB*/
    private FileInformationBlock _fib;
    /** The DOP*/
    private DocumentProperties _dop;
    /**the StyleSheet*/
    private StyleSheet _styleSheet;
    /**list info */
    private ListTables _listTables;
    /** Font info */
    private FontTable _fonts;

    /** text pieces */
    TreeSet _text = new TreeSet();
    /** document sections */
    TreeSet _sections = new TreeSet();
    /** document paragraphs */
    TreeSet _paragraphs = new TreeSet();
    /** document character runs */
    TreeSet _characterRuns = new TreeSet();

    /** main document stream buffer*/
    byte[] _mainDocument;
    /** table stream buffer*/
    byte[] _tableBuffer;

    /**
     * Creates a new instance of HDFObjectFactory
     *
     * @param istream The InputStream that is the Word document
     * @throws IOException if the OLE container or one of its streams cannot
     *         be read completely, or if the text piece table is corrupted
     */
    public HDFObjectFactory(InputStream istream) throws IOException
    {
        //do Ole stuff
        _filesystem = new POIFSFileSystem(istream);
        _mainDocument = readDocumentStream(_filesystem, MAIN_STREAM_NAME);
        _fib = new FileInformationBlock(_mainDocument);

        initTableStream();
        initTextPieces();
        initFormattingProperties();
    }

    /**
     * Reads only the File Information Block of a Word document. The table
     * stream, text pieces and formatting properties are not parsed.
     *
     * @param istream The InputStream that is the Word document
     * @return a list whose single element is the document's
     *         FileInformationBlock
     * @throws IOException if the OLE container or the main document stream
     *         cannot be read completely
     */
    public static List getTypes(InputStream istream) throws IOException
    {
        List results = new ArrayList(1);

        //do Ole stuff
        POIFSFileSystem filesystem = new POIFSFileSystem(istream);
        byte[] mainDocument = readDocumentStream(filesystem, MAIN_STREAM_NAME);

        results.add(new FileInformationBlock(mainDocument));
        return results;
    }

    /**
     * Loads one root-level document stream of the OLE file system completely
     * into a byte array.
     *
     * <p>A single <code>InputStream.read(byte[])</code> call is allowed to
     * return fewer bytes than requested, so this method keeps reading until
     * the buffer is full and always closes the stream afterwards.</p>
     *
     * @param filesystem the OLE file system to read from
     * @param name the name of the stream in the root directory
     * @return the full contents of the stream
     * @throws IOException if the entry is missing or the stream ends before
     *         the size announced by its directory entry has been read
     */
    private static byte[] readDocumentStream(POIFSFileSystem filesystem, String name)
        throws IOException
    {
        DocumentEntry entry = (DocumentEntry)filesystem.getRoot().getEntry(name);
        byte[] buffer = new byte[entry.getSize()];

        InputStream in = filesystem.createDocumentInputStream(name);
        try
        {
            int total = 0;
            while (total < buffer.length)
            {
                int count = in.read(buffer, total, buffer.length - total);
                if (count < 0)
                {
                    throw new IOException("Unexpected end of stream '" + name
                        + "': read " + total + " of " + buffer.length + " bytes");
                }
                total += count;
            }
        }
        finally
        {
            in.close();
        }
        return buffer;
    }

    /**
     * Initializes the table stream. The FIB decides whether the stream named
     * "1Table" or the one named "0Table" is loaded.
     *
     * @throws IOException if the table stream is missing or cannot be read
     *         completely
     */
    private void initTableStream() throws IOException
    {
        String tablename = _fib.isTabletype() ? "1Table" : "0Table";

        //load the table stream into a buffer
        _tableBuffer = readDocumentStream(_filesystem, tablename);
    }

    /**
     * Initializes the text pieces. Text is divided into pieces because some
     * "pieces" may only contain unicode characters.
     *
     * @throws IOException if the piece table marker is not found where
     *         expected, including when the scan runs off the table buffer
     */
    private void initTextPieces() throws IOException
    {
        int pos = _fib.getTextPieceTableOffset();

        //skips through the prms before we reach the piece table. These contain
        //data for actual fast saved files
        while (isInTableBuffer(pos) && _tableBuffer[pos] == 1)
        {
            pos++;
            int skip = LittleEndian.getShort(_tableBuffer, pos);
            pos += 2 + skip;
        }

        if (!isInTableBuffer(pos) || _tableBuffer[pos] != 2)
        {
            throw new IOException("The text piece table is corrupted");
        }

        //parse out the text pieces
        pos++;
        int pieceTableSize = LittleEndian.getInt(_tableBuffer, pos);
        pos += 4;

        //the table is (pieces + 1) 4-byte positions followed by one 8-byte
        //descriptor per piece
        int pieces = (pieceTableSize - 4) / 12;
        int descriptorsStart = pos + ((pieces + 1) * 4);

        for (int x = 0; x < pieces; x++)
        {
            //the file position sits 2 bytes into the descriptor
            int filePos = LittleEndian.getInt(_tableBuffer, descriptorsStart + (x * 8) + 2);

            boolean unicode = (filePos & COMPRESSED_PIECE_FLAG) == 0;
            if (!unicode)
            {
                filePos &= ~COMPRESSED_PIECE_FLAG;//gives me FC in doc stream
                filePos /= 2;
            }

            int totLength = LittleEndian.getInt(_tableBuffer, pos + (x + 1) * 4)
                          - LittleEndian.getInt(_tableBuffer, pos + (x * 4));

            _text.add(new TextPiece(filePos, totLength, unicode));
        }
    }

    /**
     * Tells whether an offset is a valid index into the table stream buffer.
     *
     * @param offset the offset to check
     * @return true if the offset can be used to index _tableBuffer
     */
    private boolean isInTableBuffer(int offset)
    {
        return offset >= 0 && offset < _tableBuffer.length;
    }

    /**
     * initializes all of the formatting properties for a Word Document
     */
    private void initFormattingProperties()
    {
        createStyleSheet();
        createListTables();
        createFontTable();

        initDocumentProperties();
        initSectionProperties();
        initCharacterProperties();
        initParagraphProperties();
    }

    /**
     * initializes the CharacterProperties BTree
     */
    private void initCharacterProperties()
    {
        int charOffset = _fib.getChp_bin_table_offset();
        int charPlcSize = _fib.getChp_bin_table_size();

        int arraySize = (charPlcSize - 4)/8;

        //first we must go through the bin table and find the fkps
        for (int x = 0; x < arraySize; x++)
        {
            //get page number(has nothing to do with document page)
            //containing the chpx for the run
            int pageNumber = LittleEndian.getInt(_tableBuffer,
                charOffset + (4 * (arraySize + 1) + (4 * x)));

            byte[] fkp = new byte[FKP_PAGE_SIZE];
            System.arraycopy(_mainDocument, (pageNumber * FKP_PAGE_SIZE), fkp, 0, FKP_PAGE_SIZE);

            //take each fkp and get the chpxs; the last byte holds the run count
            int crun = LittleEndian.getUnsignedByte(fkp, FKP_PAGE_SIZE - 1);
            for (int y = 0; y < crun; y++)
            {
                //get the beginning and end fc of each character run
                int fcStart = LittleEndian.getInt(fkp, y * 4);
                int fcEnd = LittleEndian.getInt(fkp, (y + 1) * 4);

                //get the offset in fkp of the chpx for this run
                int chpxOffset = 2 * LittleEndian.getUnsignedByte(fkp, ((crun + 1) * 4) + y);

                //optimization if offset == 0 use "Normal" style
                if (chpxOffset == 0)
                {
                    _characterRuns.add(new ChpxNode(fcStart, fcEnd, new byte[0]));
                    continue;
                }

                //the first byte at the offset is the chpx length, the data follows it
                int size = LittleEndian.getUnsignedByte(fkp, chpxOffset);
                byte[] chpx = new byte[size];
                System.arraycopy(fkp, chpxOffset + 1, chpx, 0, size);

                _characterRuns.add(new ChpxNode(fcStart, fcEnd, chpx));
            }
        }
    }

    /**
     * intializes the Paragraph Properties BTree
     */
    private void initParagraphProperties()
    {
        //find paragraphs
        int parOffset = _fib.getPap_bin_table_offset();
        int parPlcSize = _fib.getPap_bin_table_size();

        int arraySize = (parPlcSize - 4)/8;

        //first we must go through the bin table and find the fkps
        for (int x = 0; x < arraySize; x++)
        {
            int pageNumber = LittleEndian.getInt(_tableBuffer,
                parOffset + (4 * (arraySize + 1) + (4 * x)));

            byte[] fkp = new byte[FKP_PAGE_SIZE];
            System.arraycopy(_mainDocument, (pageNumber * FKP_PAGE_SIZE), fkp, 0, FKP_PAGE_SIZE);

            //take each fkp and get the paps; the last byte holds the run count
            int crun = LittleEndian.getUnsignedByte(fkp, FKP_PAGE_SIZE - 1);
            for (int y = 0; y < crun; y++)
            {
                //get the beginning and end fc of each paragraph text run
                int fcStart = LittleEndian.getInt(fkp, y * 4);
                int fcEnd = LittleEndian.getInt(fkp, (y + 1) * 4);

                //get the offset in fkp of the papx for this paragraph; the
                //per-paragraph entries after the fc array are 13 bytes apart
                int papxOffset = 2 * LittleEndian.getUnsignedByte(fkp, ((crun + 1) * 4) + (y * 13));

                int size = 2 * LittleEndian.getUnsignedByte(fkp, papxOffset);
                if (size == 0)
                {
                    //a zero first byte means the real count is in the next byte
                    size = 2 * LittleEndian.getUnsignedByte(fkp, ++papxOffset);
                }
                else
                {
                    size--;
                }

                byte[] papx = new byte[size];
                System.arraycopy(fkp, ++papxOffset, papx, 0, size);

                _paragraphs.add(new PapxNode(fcStart, fcEnd, papx));
            }
        }
    }

    /**
     * initializes the SectionProperties BTree
     */
    private void initSectionProperties()
    {
        //find sections
        int fcMin = _fib.getOffsetFirstChar();
        int plcfsedFC = _fib.getSectionPlcOffset();
        int plcfsedSize = _fib.getSectionPlcSize();

        byte[] plcfsed = new byte[plcfsedSize];
        System.arraycopy(_tableBuffer, plcfsedFC, plcfsed, 0, plcfsedSize);

        //(arraySize + 1) 4-byte positions followed by one 12-byte descriptor
        //per section
        int arraySize = (plcfsedSize - 4)/16;

        for (int x = 0; x < arraySize; x++)
        {
            int sectionStart = LittleEndian.getInt(plcfsed, x * 4) + fcMin;
            int sectionEnd = LittleEndian.getInt(plcfsed, (x + 1) * 4) + fcMin;

            //the sepx offset sits 2 bytes into the descriptor
            int sepxStart = LittleEndian.getInt(plcfsed, 4 * (arraySize + 1) + (x * 12) + 2);

            byte[] sepx;
            if (sepxStart == NO_SEPX)
            {
                //no SEPX is stored for this section, so it uses the default
                //section properties; represent that with an empty sepx
                //instead of indexing the main stream with -1
                sepx = new byte[0];
            }
            else
            {
                //a 2-byte length precedes the sepx data
                int sepxSize = LittleEndian.getShort(_mainDocument, sepxStart);
                sepx = new byte[sepxSize];
                System.arraycopy(_mainDocument, sepxStart + 2, sepx, 0, sepxSize);
            }

            _sections.add(new SepxNode(x + 1, sectionStart, sectionEnd, sepx));
        }
    }

    /**
     * Initializes the DocumentProperties object unique to this document.
     */
    private void initDocumentProperties()
    {
        int pos = _fib.getDOPOffset();
        int size = _fib.getDOPSize();

        byte[] dopArray = new byte[size];
        System.arraycopy(_tableBuffer, pos, dopArray, 0, size);

        _dop = new DocumentProperties(dopArray);
    }

    /**
     * Uncompresses the StyleSheet from file into memory.
     */
    private void createStyleSheet()
    {
        int stshIndex = _fib.getStylesheetOffset();
        int stshSize = _fib.getStylesheetSize();

        byte[] stsh = new byte[stshSize];
        System.arraycopy(_tableBuffer, stshIndex, stsh, 0, stshSize);

        _styleSheet = new StyleSheet(stsh);
    }

    /**
     * Initializes the list tables for this document. The list tables are only
     * created when the FIB reports a positive offset for them; otherwise
     * _listTables stays null.
     */
    private void createListTables()
    {
        int lfoOffset = _fib.getFcPlfLfo();
        int lfoSize = _fib.getLcbPlfLfo();

        byte[] plflfo = new byte[lfoSize];
        System.arraycopy(_tableBuffer, lfoOffset, plflfo, 0, lfoSize);

        int lstOffset = _fib.getFcPlcfLst();
        int lstSize = lstOffset;
        //not sure if this is a mistake or what. I vaguely remember a trick like
        //this
        if (lstOffset > 0 && lstSize > 0)
        {
            //the size is taken as the distance from the list table to the
            //lfo table rather than read from the FIB
            lstSize = lfoOffset - lstOffset;

            byte[] plcflst = new byte[lstSize];
            System.arraycopy(_tableBuffer, lstOffset, plcflst, 0, lstSize);

            _listTables = new ListTables(plcflst, plflfo);
        }
    }

    /**
     * Initializes this document's FontTable;
     */
    private void createFontTable()
    {
        int fontTableIndex = _fib.getFonts_bin_table_offset();
        int fontTableSize = _fib.getFonts_bin_table_size();

        byte[] fontTable = new byte[fontTableSize];
        System.arraycopy(_tableBuffer, fontTableIndex, fontTable, 0, fontTableSize);

        _fonts = new FontTable(fontTable);
    }
}