More Word 6 / Word 95 Support
HWPFOldDocument now processes a few more table sections, so we can fake up some basic Ranges. This allows us to do paragraph-level text extraction. git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@960102 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
01ec911b74
commit
256e73d16d
@ -34,6 +34,7 @@
|
||||
|
||||
<changes>
|
||||
<release version="3.7-beta2" date="2010-??-??">
|
||||
<action dev="POI-DEVELOPERS" type="add">Paragraph level as well as whole-file text extraction for Word 6/95 files through HWPF</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">Text Extraction support for older Word 6 and Word 95 files via HWPF</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">49508 - Allow the addition of paragraphs to XWPF Table Cells</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">49446 - Don't consider 17.16.23 field codes as properly part of the paragraph's text</action>
|
||||
|
@ -31,7 +31,6 @@ import org.apache.poi.hwpf.model.ComplexFileTable;
|
||||
import org.apache.poi.hwpf.model.DocumentProperties;
|
||||
import org.apache.poi.hwpf.model.EscherRecordHolder;
|
||||
import org.apache.poi.hwpf.model.FSPATable;
|
||||
import org.apache.poi.hwpf.model.FileInformationBlock;
|
||||
import org.apache.poi.hwpf.model.FontTable;
|
||||
import org.apache.poi.hwpf.model.GenericPropertyNode;
|
||||
import org.apache.poi.hwpf.model.ListTables;
|
||||
@ -83,24 +82,6 @@ public final class HWPFDocument extends HWPFDocumentCore
|
||||
|
||||
protected TextPieceTable _tpt;
|
||||
|
||||
/** Contains formatting properties for text*/
|
||||
protected CHPBinTable _cbt;
|
||||
|
||||
/** Contains formatting properties for paragraphs*/
|
||||
protected PAPBinTable _pbt;
|
||||
|
||||
/** Contains formatting properties for sections.*/
|
||||
protected SectionTable _st;
|
||||
|
||||
/** Holds styles for this document.*/
|
||||
protected StyleSheet _ss;
|
||||
|
||||
/** Holds fonts for this document.*/
|
||||
protected FontTable _ft;
|
||||
|
||||
/** Hold list tables */
|
||||
protected ListTables _lt;
|
||||
|
||||
/** Holds the save history for this document. */
|
||||
protected SavedByTable _sbt;
|
||||
|
||||
@ -277,15 +258,11 @@ public final class HWPFDocument extends HWPFDocumentCore
|
||||
}
|
||||
}
|
||||
|
||||
public StyleSheet getStyleSheet()
|
||||
public TextPieceTable getTextTable()
|
||||
{
|
||||
return _ss;
|
||||
return _cft.getTextPieceTable();
|
||||
}
|
||||
|
||||
public FileInformationBlock getFileInformationBlock()
|
||||
{
|
||||
return _fib;
|
||||
}
|
||||
public CPSplitCalculator getCPSplitCalculator()
|
||||
{
|
||||
return _cpSplit;
|
||||
@ -390,11 +367,6 @@ public final class HWPFDocument extends HWPFDocumentCore
|
||||
return length;
|
||||
}
|
||||
|
||||
public ListTables getListTables()
|
||||
{
|
||||
return _lt;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a reference to the saved-by table, which holds the save history for the document.
|
||||
*
|
||||
@ -591,26 +563,6 @@ public final class HWPFDocument extends HWPFDocumentCore
|
||||
pfs.writeFilesystem(out);
|
||||
}
|
||||
|
||||
public CHPBinTable getCharacterTable()
|
||||
{
|
||||
return _cbt;
|
||||
}
|
||||
|
||||
public PAPBinTable getParagraphTable()
|
||||
{
|
||||
return _pbt;
|
||||
}
|
||||
|
||||
public SectionTable getSectionTable()
|
||||
{
|
||||
return _st;
|
||||
}
|
||||
|
||||
public TextPieceTable getTextTable()
|
||||
{
|
||||
return _cft.getTextPieceTable();
|
||||
}
|
||||
|
||||
public byte[] getDataStream()
|
||||
{
|
||||
return _dataStream;
|
||||
@ -629,11 +581,6 @@ public final class HWPFDocument extends HWPFDocumentCore
|
||||
return _lt.addList(list.getListData(), list.getOverride());
|
||||
}
|
||||
|
||||
public FontTable getFontTable()
|
||||
{
|
||||
return _ft;
|
||||
}
|
||||
|
||||
public void delete(int start, int length)
|
||||
{
|
||||
Range r = new Range(start, start + length, this);
|
||||
|
@ -23,7 +23,15 @@ import java.io.PushbackInputStream;
|
||||
|
||||
import org.apache.poi.EncryptedDocumentException;
|
||||
import org.apache.poi.POIDocument;
|
||||
import org.apache.poi.hwpf.model.CHPBinTable;
|
||||
import org.apache.poi.hwpf.model.FileInformationBlock;
|
||||
import org.apache.poi.hwpf.model.FontTable;
|
||||
import org.apache.poi.hwpf.model.ListTables;
|
||||
import org.apache.poi.hwpf.model.PAPBinTable;
|
||||
import org.apache.poi.hwpf.model.SectionTable;
|
||||
import org.apache.poi.hwpf.model.StyleSheet;
|
||||
import org.apache.poi.hwpf.model.TextPieceTable;
|
||||
import org.apache.poi.hwpf.usermodel.Range;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.DocumentEntry;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
@ -40,6 +48,24 @@ public abstract class HWPFDocumentCore extends POIDocument
|
||||
/** The FIB */
|
||||
protected FileInformationBlock _fib;
|
||||
|
||||
/** Holds styles for this document.*/
|
||||
protected StyleSheet _ss;
|
||||
|
||||
/** Contains formatting properties for text*/
|
||||
protected CHPBinTable _cbt;
|
||||
|
||||
/** Contains formatting properties for paragraphs*/
|
||||
protected PAPBinTable _pbt;
|
||||
|
||||
/** Contains formatting properties for sections.*/
|
||||
protected SectionTable _st;
|
||||
|
||||
/** Holds fonts for this document.*/
|
||||
protected FontTable _ft;
|
||||
|
||||
/** Hold list tables */
|
||||
protected ListTables _lt;
|
||||
|
||||
/** main document stream buffer*/
|
||||
protected byte[] _mainStream;
|
||||
|
||||
@ -123,6 +149,44 @@ public abstract class HWPFDocumentCore extends POIDocument
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the range which covers the whole of the
|
||||
* document, but excludes any headers and footers.
|
||||
*/
|
||||
public abstract Range getRange();
|
||||
|
||||
public abstract TextPieceTable getTextTable();
|
||||
|
||||
public CHPBinTable getCharacterTable()
|
||||
{
|
||||
return _cbt;
|
||||
}
|
||||
|
||||
public PAPBinTable getParagraphTable()
|
||||
{
|
||||
return _pbt;
|
||||
}
|
||||
|
||||
public SectionTable getSectionTable()
|
||||
{
|
||||
return _st;
|
||||
}
|
||||
|
||||
public StyleSheet getStyleSheet()
|
||||
{
|
||||
return _ss;
|
||||
}
|
||||
|
||||
public ListTables getListTables()
|
||||
{
|
||||
return _lt;
|
||||
}
|
||||
|
||||
public FontTable getFontTable()
|
||||
{
|
||||
return _ft;
|
||||
}
|
||||
|
||||
public FileInformationBlock getFileInformationBlock()
|
||||
{
|
||||
return _fib;
|
||||
|
@ -18,15 +18,15 @@ package org.apache.poi.hwpf;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.poi.hwpf.model.CHPX;
|
||||
import org.apache.poi.hwpf.model.ComplexFileTable;
|
||||
import org.apache.poi.hwpf.model.OldCHPBinTable;
|
||||
import org.apache.poi.hwpf.model.OldPAPBinTable;
|
||||
import org.apache.poi.hwpf.model.OldSectionTable;
|
||||
import org.apache.poi.hwpf.model.PieceDescriptor;
|
||||
import org.apache.poi.hwpf.model.TextPiece;
|
||||
import org.apache.poi.hwpf.model.TextPieceTable;
|
||||
import org.apache.poi.hwpf.usermodel.Range;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.util.LittleEndian;
|
||||
@ -34,11 +34,9 @@ import org.apache.poi.util.LittleEndian;
|
||||
/**
|
||||
* Provides very simple support for old (Word 6 / Word 95)
|
||||
* files.
|
||||
* TODO Provide a way to get at the properties associated
|
||||
* with each block of text
|
||||
*/
|
||||
public class HWPFOldDocument extends HWPFDocumentCore {
|
||||
private List<TextAndCHPX> contents = new ArrayList<TextAndCHPX>();
|
||||
private TextPieceTable tpt;
|
||||
|
||||
public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
|
||||
this(fs.getRoot(), fs);
|
||||
@ -49,14 +47,19 @@ public class HWPFOldDocument extends HWPFDocumentCore {
|
||||
super(directory, fs);
|
||||
|
||||
// Where are things?
|
||||
int sedTableOffset = LittleEndian.getInt(_mainStream, 0x88);
|
||||
int sedTableSize = LittleEndian.getInt(_mainStream, 0x8c);
|
||||
int chpTableOffset = LittleEndian.getInt(_mainStream, 0xb8);
|
||||
int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc);
|
||||
int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc);
|
||||
int papTableOffset = LittleEndian.getInt(_mainStream, 0xc0);
|
||||
int papTableSize = LittleEndian.getInt(_mainStream, 0xc4);
|
||||
//int shfTableOffset = LittleEndian.getInt(_mainStream, 0x60);
|
||||
//int shfTableSize = LittleEndian.getInt(_mainStream, 0x64);
|
||||
int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160);
|
||||
|
||||
// We need to get hold of the text that makes up the
|
||||
// document, which might be regular or fast-saved
|
||||
StringBuffer text = new StringBuffer();
|
||||
TextPieceTable tpt;
|
||||
if(_fib.isFComplex()) {
|
||||
ComplexFileTable cft = new ComplexFileTable(
|
||||
_mainStream, _mainStream,
|
||||
@ -68,11 +71,15 @@ public class HWPFOldDocument extends HWPFDocumentCore {
|
||||
text.append( tp.getStringBuffer() );
|
||||
}
|
||||
} else {
|
||||
// TODO Discover if these older documents can ever hold Unicode Strings?
|
||||
// (We think not, because they seem to lack a Piece table)
|
||||
// TODO Build the Piece Descriptor properly
|
||||
// TODO Can these old documents ever contain Unicode strings?
|
||||
// (We have to fake it, as they don't seem to have a proper Piece table)
|
||||
PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0);
|
||||
pd.setFilePosition(_fib.getFcMin());
|
||||
|
||||
// Generate a single Text Piece Table, with a single Text Piece
|
||||
// which covers all the (8 bit only) text in the file
|
||||
tpt = new TextPieceTable();
|
||||
byte[] textData = new byte[_fib.getFcMac()-_fib.getFcMin()];
|
||||
System.arraycopy(_mainStream, _fib.getFcMin(), textData, 0, textData.length);
|
||||
@ -85,51 +92,34 @@ public class HWPFOldDocument extends HWPFDocumentCore {
|
||||
}
|
||||
|
||||
// Now we can fetch the character and paragraph properties
|
||||
OldCHPBinTable chpTable = new OldCHPBinTable(
|
||||
_cbt = new OldCHPBinTable(
|
||||
_mainStream, chpTableOffset, chpTableSize,
|
||||
_fib.getFcMin(), tpt
|
||||
);
|
||||
_pbt = new OldPAPBinTable(
|
||||
_mainStream, papTableOffset, papTableSize,
|
||||
_fib.getFcMin(), tpt
|
||||
);
|
||||
_st = new OldSectionTable(
|
||||
_mainStream, sedTableOffset, sedTableSize,
|
||||
_fib.getFcMin(), tpt
|
||||
);
|
||||
}
|
||||
|
||||
// Finally build up runs
|
||||
for(CHPX chpx : chpTable.getTextRuns()) {
|
||||
String str = text.substring(chpx.getStart(), chpx.getEnd());
|
||||
contents.add(new TextAndCHPX(str,chpx));
|
||||
}
|
||||
public Range getRange() {
|
||||
// Life is easy when we have no footers, headers or unicode!
|
||||
return new Range(
|
||||
0, _fib.getFcMac() - _fib.getFcMin(), this
|
||||
);
|
||||
}
|
||||
|
||||
public TextPieceTable getTextTable()
|
||||
{
|
||||
return tpt;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(OutputStream out) throws IOException {
|
||||
throw new IllegalStateException("Writing is not available for the older file formats");
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves all our text, in order, along with the
|
||||
* CHPX information on each bit.
|
||||
* Every entry has the same formatting, but as yet
|
||||
* we've no way to tell what the formatting is...
|
||||
* Warnings - this will change as soon as we support
|
||||
* text formatting!
|
||||
*/
|
||||
public List<TextAndCHPX> getContents() {
|
||||
return contents;
|
||||
}
|
||||
|
||||
/**
|
||||
* Warnings - this will change as soon as we support
|
||||
* text formatting!
|
||||
*/
|
||||
public static class TextAndCHPX {
|
||||
private String text;
|
||||
private CHPX chpx;
|
||||
private TextAndCHPX(String text, CHPX chpx) {
|
||||
this.text = text;
|
||||
this.chpx = chpx;
|
||||
}
|
||||
public String getText() {
|
||||
return text;
|
||||
}
|
||||
public CHPX getChpx() {
|
||||
return chpx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -22,7 +22,6 @@ import java.io.InputStream;
|
||||
|
||||
import org.apache.poi.POIOLE2TextExtractor;
|
||||
import org.apache.poi.hwpf.HWPFOldDocument;
|
||||
import org.apache.poi.hwpf.HWPFOldDocument.TextAndCHPX;
|
||||
import org.apache.poi.hwpf.usermodel.Range;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
@ -68,12 +67,41 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
|
||||
this.doc = doc;
|
||||
}
|
||||
|
||||
@Override
|
||||
/**
|
||||
* Get the text from the word file, as an array with one String
|
||||
* per paragraph
|
||||
*/
|
||||
public String[] getParagraphText() {
|
||||
String[] ret;
|
||||
|
||||
// Extract using the model code
|
||||
try {
|
||||
Range r = doc.getRange();
|
||||
|
||||
ret = WordExtractor.getParagraphText(r);
|
||||
} catch (Exception e) {
|
||||
// Something's up with turning the text pieces into paragraphs
|
||||
// Fall back to ripping out the text pieces
|
||||
ret = new String[doc.getTextTable().getTextPieces().size()];
|
||||
for(int i=0; i<ret.length; i++) {
|
||||
ret[i] = doc.getTextTable().getTextPieces().get(i).getStringBuffer().toString();
|
||||
|
||||
// Fix the line endings
|
||||
ret[i].replaceAll("\r", "\ufffe");
|
||||
ret[i].replaceAll("\ufffe","\r\n");
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
public String getText() {
|
||||
StringBuffer text = new StringBuffer();
|
||||
for(TextAndCHPX tchpx : doc.getContents()) {
|
||||
text.append( Range.stripFields(tchpx.getText()) );
|
||||
|
||||
for(String t : getParagraphText()) {
|
||||
text.append(t);
|
||||
}
|
||||
|
||||
return text.toString();
|
||||
}
|
||||
}
|
||||
|
@ -17,13 +17,12 @@
|
||||
|
||||
package org.apache.poi.hwpf.extractor;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.util.Iterator;
|
||||
import java.util.Arrays;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.poi.POIOLE2TextExtractor;
|
||||
import org.apache.poi.hwpf.HWPFDocument;
|
||||
@ -133,7 +132,7 @@ public final class WordExtractor extends POIOLE2TextExtractor {
|
||||
return getParagraphText(r);
|
||||
}
|
||||
|
||||
private String[] getParagraphText(Range r) {
|
||||
protected static String[] getParagraphText(Range r) {
|
||||
String[] ret;
|
||||
ret = new String[r.numParagraphs()];
|
||||
for (int i = 0; i < ret.length; i++) {
|
||||
@ -215,10 +214,7 @@ public final class WordExtractor extends POIOLE2TextExtractor {
|
||||
public String getTextFromPieces() {
|
||||
StringBuffer textBuf = new StringBuffer();
|
||||
|
||||
Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
|
||||
while (textPieces.hasNext()) {
|
||||
TextPiece piece = (TextPiece) textPieces.next();
|
||||
|
||||
for(TextPiece piece : doc.getTextTable().getTextPieces()) {
|
||||
String encoding = "Cp1252";
|
||||
if (piece.isUnicode()) {
|
||||
encoding = "UTF-16LE";
|
||||
|
@ -32,7 +32,7 @@ import org.apache.poi.hwpf.sprm.SprmBuffer;
|
||||
*
|
||||
* @author Ryan Ackley
|
||||
*/
|
||||
public final class CHPBinTable
|
||||
public class CHPBinTable
|
||||
{
|
||||
/** List of character properties.*/
|
||||
protected ArrayList<CHPX> _textRuns = new ArrayList<CHPX>();
|
||||
|
@ -17,9 +17,6 @@
|
||||
|
||||
package org.apache.poi.hwpf.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.poi.poifs.common.POIFSConstants;
|
||||
import org.apache.poi.util.LittleEndian;
|
||||
|
||||
@ -31,11 +28,8 @@ import org.apache.poi.util.LittleEndian;
|
||||
* In common with the rest of the old support, it
|
||||
* is read only
|
||||
*/
|
||||
public final class OldCHPBinTable
|
||||
public final class OldCHPBinTable extends CHPBinTable
|
||||
{
|
||||
/** List of character properties.*/
|
||||
protected ArrayList<CHPX> _textRuns = new ArrayList<CHPX>();
|
||||
|
||||
/**
|
||||
* Constructor used to read an old-style binTable
|
||||
* in from a Word document.
|
||||
@ -69,9 +63,4 @@ public final class OldCHPBinTable
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public List<CHPX> getTextRuns()
|
||||
{
|
||||
return _textRuns;
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,59 @@
|
||||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.hwpf.model;
|
||||
|
||||
import org.apache.poi.poifs.common.POIFSConstants;
|
||||
import org.apache.poi.util.LittleEndian;
|
||||
|
||||
/**
|
||||
* This class holds all of the paragraph formatting
|
||||
* properties from Old (Word 6 / Word 95) documents.
|
||||
* Unlike with Word 97+, it all gets held in the
|
||||
* same stream.
|
||||
* In common with the rest of the old support, it
|
||||
* is read only
|
||||
*/
|
||||
public final class OldPAPBinTable extends PAPBinTable
|
||||
{
|
||||
public OldPAPBinTable(byte[] documentStream, int offset,
|
||||
int size, int fcMin, TextPieceTable tpt)
|
||||
{
|
||||
PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2);
|
||||
|
||||
int length = binTable.length();
|
||||
for (int x = 0; x < length; x++)
|
||||
{
|
||||
GenericPropertyNode node = binTable.getProperty(x);
|
||||
|
||||
int pageNum = LittleEndian.getShort(node.getBytes());
|
||||
int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum;
|
||||
|
||||
PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(documentStream,
|
||||
documentStream, pageOffset, fcMin, tpt);
|
||||
|
||||
int fkpSize = pfkp.size();
|
||||
|
||||
for (int y = 0; y < fkpSize; y++)
|
||||
{
|
||||
PAPX papx = pfkp.getPAPX(y);
|
||||
_paragraphs.add(papx);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,65 @@
|
||||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.hwpf.model;
|
||||
|
||||
import org.apache.poi.util.LittleEndian;
|
||||
|
||||
/**
|
||||
* This class holds all of the section formatting
|
||||
* properties from Old (Word 6 / Word 95) documents.
|
||||
* Unlike with Word 97+, it all gets held in the
|
||||
* same stream.
|
||||
* In common with the rest of the old support, it
|
||||
* is read only
|
||||
*/
|
||||
public final class OldSectionTable extends SectionTable
|
||||
{
|
||||
public OldSectionTable(byte[] documentStream, int offset,
|
||||
int size, int fcMin,
|
||||
TextPieceTable tpt)
|
||||
{
|
||||
PlexOfCps sedPlex = new PlexOfCps(documentStream, offset, size, 12);
|
||||
|
||||
int length = sedPlex.length();
|
||||
|
||||
for (int x = 0; x < length; x++)
|
||||
{
|
||||
GenericPropertyNode node = sedPlex.getProperty(x);
|
||||
SectionDescriptor sed = new SectionDescriptor(node.getBytes(), 0);
|
||||
|
||||
int fileOffset = sed.getFc();
|
||||
int startAt = node.getStart();
|
||||
int endAt = node.getEnd();
|
||||
|
||||
// check for the optimization
|
||||
if (fileOffset == 0xffffffff)
|
||||
{
|
||||
_sections.add(new SEPX(sed, startAt, endAt, tpt, new byte[0]));
|
||||
}
|
||||
else
|
||||
{
|
||||
// The first short at the offset is the size of the grpprl.
|
||||
int sepxSize = LittleEndian.getShort(documentStream, fileOffset);
|
||||
byte[] buf = new byte[sepxSize];
|
||||
fileOffset += LittleEndian.SHORT_SIZE;
|
||||
System.arraycopy(documentStream, fileOffset, buf, 0, buf.length);
|
||||
_sections.add(new SEPX(sed, startAt, endAt, tpt, buf));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -34,7 +34,7 @@ import org.apache.poi.util.LittleEndian;
|
||||
*
|
||||
* @author Ryan Ackley
|
||||
*/
|
||||
public final class PAPBinTable
|
||||
public class PAPBinTable
|
||||
{
|
||||
protected ArrayList<PAPX> _paragraphs = new ArrayList<PAPX>();
|
||||
byte[] _dataStream;
|
||||
|
@ -112,7 +112,11 @@ public final class PAPX extends BytePropertyNode {
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
return LittleEndian.getShort(buf);
|
||||
if (buf.length == 1)
|
||||
{
|
||||
return (short)LittleEndian.getUnsignedByte(buf, 0);
|
||||
}
|
||||
return LittleEndian.getShort(buf);
|
||||
}
|
||||
|
||||
public SprmBuffer getSprmBuf()
|
||||
@ -122,6 +126,11 @@ public final class PAPX extends BytePropertyNode {
|
||||
|
||||
public ParagraphProperties getParagraphProperties(StyleSheet ss)
|
||||
{
|
||||
if(ss == null) {
|
||||
// TODO Fix up for Word 6/95
|
||||
return new ParagraphProperties();
|
||||
}
|
||||
|
||||
short istd = getIstd();
|
||||
ParagraphProperties baseStyle = ss.getParagraphStyle(istd);
|
||||
ParagraphProperties props = ParagraphSprmUncompressor.uncompressPAP(baseStyle, getGrpprl(), 2);
|
||||
|
@ -27,12 +27,12 @@ import org.apache.poi.hwpf.model.io.*;
|
||||
/**
|
||||
* @author Ryan Ackley
|
||||
*/
|
||||
public final class SectionTable
|
||||
public class SectionTable
|
||||
{
|
||||
private static final int SED_SIZE = 12;
|
||||
|
||||
protected ArrayList _sections = new ArrayList();
|
||||
protected List _text;
|
||||
protected ArrayList<SEPX> _sections = new ArrayList<SEPX>();
|
||||
protected List<TextPiece> _text;
|
||||
|
||||
/** So we can know if things are unicode or not */
|
||||
private TextPieceTable tpt;
|
||||
@ -84,7 +84,7 @@ public final class SectionTable
|
||||
boolean matchAt = false;
|
||||
boolean matchHalf = false;
|
||||
for(int i=0; i<_sections.size(); i++) {
|
||||
SEPX s = (SEPX)_sections.get(i);
|
||||
SEPX s = _sections.get(i);
|
||||
if(s.getEnd() == mainEndsAt) {
|
||||
matchAt = true;
|
||||
} else if(s.getEndBytes() == mainEndsAt || s.getEndBytes() == mainEndsAt-1) {
|
||||
@ -94,7 +94,7 @@ public final class SectionTable
|
||||
if(! matchAt && matchHalf) {
|
||||
System.err.println("Your document seemed to be mostly unicode, but the section definition was in bytes! Trying anyway, but things may well go wrong!");
|
||||
for(int i=0; i<_sections.size(); i++) {
|
||||
SEPX s = (SEPX)_sections.get(i);
|
||||
SEPX s = _sections.get(i);
|
||||
GenericPropertyNode node = sedPlex.getProperty(i);
|
||||
|
||||
s.setStart( CPtoFC(node.getStart()) );
|
||||
@ -106,12 +106,12 @@ public final class SectionTable
|
||||
public void adjustForInsert(int listIndex, int length)
|
||||
{
|
||||
int size = _sections.size();
|
||||
SEPX sepx = (SEPX)_sections.get(listIndex);
|
||||
SEPX sepx = _sections.get(listIndex);
|
||||
sepx.setEnd(sepx.getEnd() + length);
|
||||
|
||||
for (int x = listIndex + 1; x < size; x++)
|
||||
{
|
||||
sepx = (SEPX)_sections.get(x);
|
||||
sepx = _sections.get(x);
|
||||
sepx.setStart(sepx.getStart() + length);
|
||||
sepx.setEnd(sepx.getEnd() + length);
|
||||
}
|
||||
@ -129,7 +129,7 @@ public final class SectionTable
|
||||
|
||||
for(int i=_text.size()-1; i>-1; i--)
|
||||
{
|
||||
TP = (TextPiece)_text.get(i);
|
||||
TP = _text.get(i);
|
||||
|
||||
if(CP >= TP.getCP()) break;
|
||||
}
|
||||
@ -142,7 +142,7 @@ public final class SectionTable
|
||||
return FC;
|
||||
}
|
||||
|
||||
public ArrayList getSections()
|
||||
public ArrayList<SEPX> getSections()
|
||||
{
|
||||
return _sections;
|
||||
}
|
||||
@ -159,7 +159,7 @@ public final class SectionTable
|
||||
|
||||
for (int x = 0; x < len; x++)
|
||||
{
|
||||
SEPX sepx = (SEPX)_sections.get(x);
|
||||
SEPX sepx = _sections.get(x);
|
||||
byte[] grpprl = sepx.getGrpprl();
|
||||
|
||||
// write the sepx to the document stream. starts with a 2 byte size
|
||||
|
@ -20,6 +20,7 @@ package org.apache.poi.hwpf.usermodel;
|
||||
import org.apache.poi.util.LittleEndian;
|
||||
|
||||
import org.apache.poi.hwpf.HWPFDocument;
|
||||
import org.apache.poi.hwpf.HWPFDocumentCore;
|
||||
|
||||
import org.apache.poi.hwpf.usermodel.CharacterRun;
|
||||
import org.apache.poi.hwpf.usermodel.Paragraph;
|
||||
@ -77,7 +78,7 @@ public class Range { // TODO -instantiable superclass
|
||||
protected int _end;
|
||||
|
||||
/** The document this range belongs to. */
|
||||
protected HWPFDocument _doc;
|
||||
protected HWPFDocumentCore _doc;
|
||||
|
||||
/** Have we loaded the section indexes yet */
|
||||
boolean _sectionRangeFound;
|
||||
@ -144,7 +145,7 @@ public class Range { // TODO -instantiable superclass
|
||||
* @param doc
|
||||
* The HWPFDocumentCore the range is based on.
|
||||
*/
|
||||
public Range(int start, int end, HWPFDocument doc) {
|
||||
public Range(int start, int end, HWPFDocumentCore doc) {
|
||||
_start = start;
|
||||
_end = end;
|
||||
_doc = doc;
|
||||
@ -1004,6 +1005,8 @@ public class Range { // TODO -instantiable superclass
|
||||
* The (signed) value that should be added to the FIB CCP fields
|
||||
*/
|
||||
protected void adjustFIB(int adjustment) {
|
||||
assert (_doc instanceof HWPFDocument);
|
||||
|
||||
// update the FIB.CCPText field (this should happen once per adjustment,
|
||||
// so we don't want it in
|
||||
// adjustForInsert() or it would get updated multiple times if the range
|
||||
@ -1011,7 +1014,7 @@ public class Range { // TODO -instantiable superclass
|
||||
// without this, OpenOffice.org (v. 2.2.x) does not see all the text in
|
||||
// the document
|
||||
|
||||
CPSplitCalculator cpS = _doc.getCPSplitCalculator();
|
||||
CPSplitCalculator cpS = ((HWPFDocument)_doc).getCPSplitCalculator();
|
||||
FileInformationBlock fib = _doc.getFileInformationBlock();
|
||||
|
||||
// Do for each affected part
|
||||
@ -1066,7 +1069,7 @@ public class Range { // TODO -instantiable superclass
|
||||
return _end;
|
||||
}
|
||||
|
||||
protected HWPFDocument getDocument() {
|
||||
protected HWPFDocumentCore getDocument() {
|
||||
|
||||
return _doc;
|
||||
}
|
||||
|
@ -256,6 +256,16 @@ public final class TestWordExtractor extends TestCase {
|
||||
assertTrue(text.contains("Paragraph 2"));
|
||||
assertTrue(text.contains("Paragraph 3. Has some RED text and some BLUE BOLD text in it"));
|
||||
assertTrue(text.contains("Last (4th) paragraph"));
|
||||
|
||||
String[] tp = w6e.getParagraphText();
|
||||
assertEquals(7, tp.length);
|
||||
assertEquals("The quick brown fox jumps over the lazy dog\r\n", tp[0]);
|
||||
assertEquals("\r\n", tp[1]);
|
||||
assertEquals("Paragraph 2\r\n", tp[2]);
|
||||
assertEquals("\r\n", tp[3]);
|
||||
assertEquals("Paragraph 3. Has some RED text and some BLUE BOLD text in it.\r\n", tp[4]);
|
||||
assertEquals("\r\n", tp[5]);
|
||||
assertEquals("Last (4th) paragraph.\r\n", tp[6]);
|
||||
}
|
||||
|
||||
public void testWord6() throws Exception {
|
||||
@ -273,5 +283,9 @@ public final class TestWordExtractor extends TestCase {
|
||||
String text = w6e.getText();
|
||||
|
||||
assertTrue(text.contains("The quick brown fox jumps over the lazy dog"));
|
||||
|
||||
String[] tp = w6e.getParagraphText();
|
||||
assertEquals(1, tp.length);
|
||||
assertEquals("The quick brown fox jumps over the lazy dog\r\n", tp[0]);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user