From af51ea4c1059148ea43c51604efea0f9351467ba Mon Sep 17 00:00:00 2001 From: Tim Allison Date: Tue, 4 Apr 2017 02:06:46 +0000 Subject: [PATCH] bug 50955 -- word 6.0 charset fix git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1790061 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/poi/TestAllFiles.java | 3 + .../org/apache/poi/util/CodePageUtil.java | 10 ++ .../poi/util/LittleEndianBig5Stream.java | 107 ++++++++++++ src/java/org/apache/poi/util/StringUtil.java | 32 +++- .../org/apache/poi/hwmf/record/HwmfFont.java | 2 +- .../org/apache/poi/hwpf/HWPFOldDocument.java | 110 ++++++++++-- .../poi/hwpf/model/ComplexFileTable.java | 21 ++- .../apache/poi/hwpf/model/OldCHPBinTable.java | 2 +- .../poi/hwpf/model/OldComplexFileTable.java | 42 +++++ .../src/org/apache/poi/hwpf/model/OldFfn.java | 161 ++++++++++++++++++ .../apache/poi/hwpf/model/OldFontTable.java | 84 +++++++++ .../apache/poi/hwpf/model/OldTextPiece.java | 120 +++++++++++++ .../poi/hwpf/model/OldTextPieceTable.java | 119 +++++++++++++ .../apache/poi/hwpf/model/PAPBinTable.java | 2 +- .../poi/hwpf/model/PieceDescriptor.java | 72 +++++--- .../org/apache/poi/hwpf/model/TextPiece.java | 10 +- .../apache/poi/hwpf/model/TextPieceTable.java | 14 +- .../poi/hwpf/usermodel/CharacterRun.java | 5 + .../converter/TestWordToConverterSuite.java | 21 ++- .../apache/poi/hwpf/usermodel/TestBugs.java | 8 +- .../hwpf/usermodel/TestHWPFOldDocument.java | 90 +++++++++- test-data/document/Bug60936.doc | Bin 0 -> 6656 bytes test-data/document/Bug60942.doc | Bin 0 -> 20480 bytes test-data/document/Bug60942b.doc | Bin 0 -> 6144 bytes 24 files changed, 971 insertions(+), 64 deletions(-) create mode 100644 src/java/org/apache/poi/util/LittleEndianBig5Stream.java create mode 100644 src/scratchpad/src/org/apache/poi/hwpf/model/OldComplexFileTable.java create mode 100644 src/scratchpad/src/org/apache/poi/hwpf/model/OldFfn.java create mode 100644 src/scratchpad/src/org/apache/poi/hwpf/model/OldFontTable.java create mode 100644 
src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java create mode 100644 src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java create mode 100644 test-data/document/Bug60936.doc create mode 100644 test-data/document/Bug60942.doc create mode 100644 test-data/document/Bug60942b.doc diff --git a/src/integrationtest/org/apache/poi/TestAllFiles.java b/src/integrationtest/org/apache/poi/TestAllFiles.java index 25ac41a9b..19edc1455 100644 --- a/src/integrationtest/org/apache/poi/TestAllFiles.java +++ b/src/integrationtest/org/apache/poi/TestAllFiles.java @@ -218,6 +218,9 @@ public class TestAllFiles { "document/Word6_sections2.doc", "document/Word95.doc", "document/word95err.doc", + "document/Bug60936.doc", + "document/Bug60942.doc", + "document/Bug60942b.doc", "hpsf/TestMickey.doc", "document/52117.doc" ); diff --git a/src/java/org/apache/poi/util/CodePageUtil.java b/src/java/org/apache/poi/util/CodePageUtil.java index 145929182..5be1c5077 100644 --- a/src/java/org/apache/poi/util/CodePageUtil.java +++ b/src/java/org/apache/poi/util/CodePageUtil.java @@ -18,6 +18,9 @@ package org.apache.poi.util; import java.io.UnsupportedEncodingException; +import java.nio.charset.Charset; +import java.util.HashSet; +import java.util.Set; /** * Utilities for working with Microsoft CodePages. @@ -27,6 +30,13 @@ import java.io.UnsupportedEncodingException; */ public class CodePageUtil { + + public static final Set VARIABLE_BYTE_CHARSETS = new HashSet(); + static { + //others? + VARIABLE_BYTE_CHARSETS.add(StringUtil.BIG5); + } + /**

Codepage 037, a special case

*/ public static final int CP_037 = 37; diff --git a/src/java/org/apache/poi/util/LittleEndianBig5Stream.java b/src/java/org/apache/poi/util/LittleEndianBig5Stream.java new file mode 100644 index 000000000..f68b1cdb9 --- /dev/null +++ b/src/java/org/apache/poi/util/LittleEndianBig5Stream.java @@ -0,0 +1,107 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.util; + +import java.io.ByteArrayInputStream; + +/** + * Stream that converts MSOffice's way of storing Big5, with + * zero-byte padding for ASCII and in LittleEndianOrder. + */ +@Internal +public class LittleEndianBig5Stream extends ByteArrayInputStream { + private static final int EOF = -1; + private static final int INVALID_PAIR = -2; + private static final int EMPTY_TRAILING = -3; + + //the char that is logically trailing in Big5 encoding + //however in LittleEndian order, this is the first encountered. 
+ int trailing = EMPTY_TRAILING; + public LittleEndianBig5Stream(byte[] buf) { + super(buf); + } + + public LittleEndianBig5Stream(byte[] buf, int offset, int length) { + super(buf, offset, length); + } + + @Override + public int read() { + + if (trailing != EMPTY_TRAILING) { + int tmp = trailing; + trailing = EMPTY_TRAILING; + return tmp; + } + int leading = readNext(); + while (leading == INVALID_PAIR) { + leading = readNext(); + } + + if (leading == EOF) { + return EOF; + } + return leading; + } + + //returns leading, sets trailing appropriately + //returns -1 if it hits the end of the stream + //returns -2 for an invalid big5 code pair + private final int readNext() { + trailing = super.read(); + if (trailing == -1) { + return EOF; + } + int leading = super.read(); + if (leading == EOF) { + return EOF; + } + int lead = leading&0xff; + if (lead > 0x80) { + return leading; + } else if (lead == 0) { + int ret = trailing; + trailing = EMPTY_TRAILING; + return ret; + } else { + int ret = trailing; + trailing = EMPTY_TRAILING; + return ret; + //return INVALID_PAIR; + } + + } + + @Override + public int read(byte[] buff, int off, int len) { + int bytesRead = 0; + for (int i = off; i < off+len; i++) { + int b = read(); + if (b == -1) { + if (bytesRead == 0) { + return -1; + } else { + return bytesRead; + } + } + bytesRead++; + buff[i] = (byte)b; + } + return bytesRead; + } +} diff --git a/src/java/org/apache/poi/util/StringUtil.java b/src/java/org/apache/poi/util/StringUtil.java index 20a6824c9..5d09dff56 100644 --- a/src/java/org/apache/poi/util/StringUtil.java +++ b/src/java/org/apache/poi/util/StringUtil.java @@ -17,6 +17,8 @@ package org.apache.poi.util; +import java.io.ByteArrayOutputStream; +import java.io.IOException; import java.nio.charset.Charset; import java.util.HashMap; import java.util.Iterator; @@ -27,9 +29,14 @@ import java.util.Map; */ @Internal public class StringUtil { + + private static final POILogger logger = POILogFactory + 
.getLogger(StringUtil.class); protected static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1"); - protected static final Charset UTF16LE = Charset.forName("UTF-16LE"); + public static final Charset UTF16LE = Charset.forName("UTF-16LE"); public static final Charset UTF8 = Charset.forName("UTF-8"); + public static final Charset WIN_1252 = Charset.forName("cp1252"); + public static final Charset BIG5 = Charset.forName("Big5"); private static Map msCodepointToUnicode; @@ -573,7 +580,28 @@ public class StringUtil { 9133, // 0xf0fe bracerightbt ' ', // 0xf0ff not defined }; - + + /** + * This tries to convert a LE byte array in Big5 to a String. + * We know MS zero-padded ascii, and we drop those. + * However, there may be areas for improvement in this. + * + * @param data + * @param offset + * @param lengthInBytes + * @return + */ + public static String littleEndianBig5Stream(byte[] data, int offset, int lengthInBytes) { + ByteArrayOutputStream os = new ByteArrayOutputStream(); + try { + IOUtils.copy(new LittleEndianBig5Stream(data, offset, lengthInBytes), os); + } catch (IOException e) { + logger.log(POILogger.WARN, + "IOException while copying a byte array stream to a byte array stream?!"); + } + return new String(os.toByteArray(), BIG5); + } + // Could be replaced with org.apache.commons.lang3.StringUtils#join @Internal public static String join(Object[] array, String separator) { diff --git a/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java b/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java index 703faa153..f6e256381 100644 --- a/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java +++ b/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java @@ -108,7 +108,7 @@ public class HwmfFont { return charset; } - static WmfCharset valueOf(int flag) { + public static WmfCharset valueOf(int flag) { for (WmfCharset cs : values()) { if (cs.flag == flag) return cs; } diff --git 
a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java index 6ff9f29bc..505789e2c 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java @@ -19,27 +19,43 @@ package org.apache.poi.hwpf; import java.io.File; import java.io.IOException; import java.io.OutputStream; +import java.nio.charset.Charset; +import org.apache.poi.hwmf.record.HwmfFont; import org.apache.poi.hwpf.model.ComplexFileTable; +import org.apache.poi.hwpf.model.FontTable; import org.apache.poi.hwpf.model.OldCHPBinTable; +import org.apache.poi.hwpf.model.OldComplexFileTable; +import org.apache.poi.hwpf.model.OldFfn; +import org.apache.poi.hwpf.model.OldFontTable; import org.apache.poi.hwpf.model.OldPAPBinTable; import org.apache.poi.hwpf.model.OldSectionTable; +import org.apache.poi.hwpf.model.OldTextPieceTable; import org.apache.poi.hwpf.model.PieceDescriptor; import org.apache.poi.hwpf.model.TextPiece; import org.apache.poi.hwpf.model.TextPieceTable; import org.apache.poi.hwpf.usermodel.Range; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.util.CodePageUtil; import org.apache.poi.util.LittleEndian; +import org.apache.poi.util.NotImplemented; +import org.apache.poi.util.StringUtil; /** * Provides very simple support for old (Word 6 / Word 95) * files. 
*/ public class HWPFOldDocument extends HWPFDocumentCore { - private TextPieceTable tpt; + + private final static Charset DEFAULT_CHARSET = StringUtil.WIN_1252; + + private OldTextPieceTable tpt; private StringBuilder _text; + + private final OldFontTable fontTable; + private final Charset guessedCharset; public HWPFOldDocument(POIFSFileSystem fs) throws IOException { this(fs.getRoot()); @@ -56,45 +72,52 @@ public class HWPFOldDocument extends HWPFDocumentCore { int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc); int papTableOffset = LittleEndian.getInt(_mainStream, 0xc0); int papTableSize = LittleEndian.getInt(_mainStream, 0xc4); - //int shfTableOffset = LittleEndian.getInt(_mainStream, 0x60); - //int shfTableSize = LittleEndian.getInt(_mainStream, 0x64); + int fontTableOffset = LittleEndian.getInt(_mainStream, 0xd0); + int fontTableSize = LittleEndian.getInt(_mainStream, 0xd4); + + fontTable = new OldFontTable(_mainStream, fontTableOffset, fontTableSize); + //TODO: figure out how to map runs/text pieces to fonts + //for now, if there's a non standard codepage in one of the fonts + //assume that the doc is in that codepage. + guessedCharset = guessCodePage(fontTable); + int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160); // We need to get hold of the text that makes up the // document, which might be regular or fast-saved ComplexFileTable cft = null; - StringBuffer text = new StringBuffer(); if(_fib.getFibBase().isFComplex()) { - cft = new ComplexFileTable( + cft = new OldComplexFileTable( _mainStream, _mainStream, - complexTableOffset, _fib.getFibBase().getFcMin() + complexTableOffset, _fib.getFibBase().getFcMin(), guessedCharset ); - tpt = cft.getTextPieceTable(); + tpt = (OldTextPieceTable)cft.getTextPieceTable(); - for(TextPiece tp : tpt.getTextPieces()) { - text.append( tp.getStringBuilder() ); - } } else { // TODO Discover if these older documents can ever hold Unicode Strings? 
// (We think not, because they seem to lack a Piece table) // TODO Build the Piece Descriptor properly // (We have to fake it, as they don't seem to have a proper Piece table) - PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0); + PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0, guessedCharset); pd.setFilePosition(_fib.getFibBase().getFcMin()); // Generate a single Text Piece Table, with a single Text Piece // which covers all the (8 bit only) text in the file - tpt = new TextPieceTable(); + tpt = new OldTextPieceTable(); byte[] textData = new byte[_fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin()]; System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(), textData, 0, textData.length); + + int numChars = textData.length; + if (CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(guessedCharset)) { + numChars /= 2; + } + TextPiece tp = new TextPiece( - 0, textData.length, textData, pd + 0, numChars, textData, pd ); tpt.add(tp); - text.append(tp.getStringBuilder()); } - _text = tpt.getText(); // Now we can fetch the character and paragraph properties @@ -133,12 +156,54 @@ public class HWPFOldDocument extends HWPFDocumentCore { } } + + /** + * Take the first codepage that is not default, ansi or symbol. + * Ideally, we'd want to track fonts with runs, but we don't yet + * know how to do that. + * + * Consider throwing an exception if > 1 unique codepage that is not default, symbol or ansi + * appears here. 
+ * + * @param fontTable + * @return + */ + private Charset guessCodePage(OldFontTable fontTable) { + + for (OldFfn oldFfn : fontTable.getFontNames()) { + HwmfFont.WmfCharset wmfCharset = HwmfFont.WmfCharset.valueOf(oldFfn.getChs()& 0xff); + if (wmfCharset != null && + wmfCharset != HwmfFont.WmfCharset.ANSI_CHARSET && + wmfCharset != HwmfFont.WmfCharset.DEFAULT_CHARSET && + wmfCharset != HwmfFont.WmfCharset.SYMBOL_CHARSET ) { + return wmfCharset.getCharset(); + } + } + return DEFAULT_CHARSET; + } + public Range getOverallRange() { // Life is easy when we have no footers, headers or unicode! return new Range( 0, _fib.getFibBase().getFcMac() - _fib.getFibBase().getFcMin(), this ); } + /** + * Use {@link #getOldFontTable()} instead!!! + * This always throws an UnsupportedOperationException. + * + * @return nothing + * @throws UnsupportedOperationException + */ + @Override + @NotImplemented + public FontTable getFontTable() { + throw new UnsupportedOperationException("Use getOldFontTable instead."); + } + + public OldFontTable getOldFontTable() { + return fontTable; + } public Range getRange() { return getOverallRange(); @@ -167,4 +232,19 @@ public class HWPFOldDocument extends HWPFDocumentCore { public void write(OutputStream out) throws IOException { throw new IllegalStateException("Writing is not available for the older file formats"); } + + /** + * As a rough heuristic (total hack), read through the font table + * and take the first non-default, non-ansi, non-symbol + * font's charset and return that. + * + * Once we figure out how to link a font to a text piece, we should + * use the font information per text piece. 
+ * + * @return charset + */ + public Charset getGuessedCharset() { + return guessedCharset; + } + } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java index 42a2fb987..dc530bd64 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/ComplexFileTable.java @@ -18,6 +18,7 @@ package org.apache.poi.hwpf.model; import java.io.IOException; +import java.nio.charset.Charset; import java.util.LinkedList; import java.util.List; @@ -26,9 +27,10 @@ import org.apache.poi.hwpf.model.io.HWPFOutputStream; import org.apache.poi.hwpf.sprm.SprmBuffer; import org.apache.poi.util.Internal; import org.apache.poi.util.LittleEndian; +import org.apache.poi.util.StringUtil; @Internal -public final class ComplexFileTable { +public class ComplexFileTable { private static final byte GRPPRL_TYPE = 1; private static final byte TEXT_PIECE_TABLE_TYPE = 2; @@ -40,7 +42,8 @@ public final class ComplexFileTable { _tpt = new TextPieceTable(); } - public ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin) throws IOException { + protected ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin, + Charset charset) throws IOException { //skips through the prms before we reach the piece table. 
These contain data //for actual fast saved files List sprmBuffers = new LinkedList(); @@ -61,7 +64,12 @@ public final class ComplexFileTable { } int pieceTableSize = LittleEndian.getInt(tableStream, ++offset); offset += LittleEndian.INT_SIZE; - _tpt = new TextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin); + _tpt = newTextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin, charset); + + } + + public ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin) throws IOException { + this(documentStream, tableStream, offset, fcMin, StringUtil.WIN_1252); } public TextPieceTable getTextPieceTable() { @@ -92,4 +100,11 @@ public final class ComplexFileTable { tableStream.write(table); } + protected TextPieceTable newTextPieceTable(byte[] documentStream, + byte[] tableStream, int offset, int pieceTableSize, int fcMin, + Charset charset) { + return new TextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin); + } + + } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java index bc3f4869b..45061ad65 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldCHPBinTable.java @@ -44,7 +44,7 @@ public final class OldCHPBinTable extends CHPBinTable * @param fcMin */ public OldCHPBinTable(byte[] documentStream, int offset, - int size, int fcMin, TextPieceTable tpt) + int size, int fcMin, OldTextPieceTable tpt) { PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2); diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldComplexFileTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldComplexFileTable.java new file mode 100644 index 000000000..25510c89e --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldComplexFileTable.java @@ -0,0 +1,42 @@ +/* 
==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.hwpf.model; + +import java.io.IOException; +import java.nio.charset.Charset; + +import org.apache.poi.util.Internal; + +@Internal +public final class OldComplexFileTable extends ComplexFileTable { + + public OldComplexFileTable(byte[] documentStream, byte[] tableStream, + int offset, int fcMin, Charset charset) throws IOException { + super(documentStream, tableStream, offset, fcMin, charset); + } + + + @Override + protected TextPieceTable newTextPieceTable(byte[] documentStream, + byte[] tableStream, int offset, + int pieceTableSize, int fcMin, Charset charset) { + return new OldTextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin, charset); + } + + +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldFfn.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldFfn.java new file mode 100644 index 000000000..d50ac4ec0 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldFfn.java @@ -0,0 +1,161 @@ +/* ==================================================================== + Licensed to the Apache 
Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.hwpf.model; + +import java.nio.charset.Charset; + +import org.apache.poi.hwmf.record.HwmfFont; +import org.apache.poi.util.Internal; +import org.apache.poi.util.LittleEndian; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; +import org.apache.poi.util.StringUtil; + +/** + * Word 6.0 Font information + */ +@Internal +public final class OldFfn { + + private static final POILogger logger = POILogFactory.getLogger(OldFfn.class); + + private byte _chs;// character set identifier + + private final String fontName; + private final String altFontName; + + private final int length; //length in bytes for this record + + /** + * try to read an OldFfn starting at offset; read no farther than end + * + * @param buf buffer from which to read + * @param offset offset at which to start + * @param fontTableEnd read no farther than this + * @return an OldFfn or null if asked to read beyond end + */ + static OldFfn build(byte[] buf, int offset, int fontTableEnd) { + int start = offset; + //preliminary bytes + if (offset + 6 > fontTableEnd) { + return null; + } + //first byte + short fontDescriptionLength = (short) 
buf[offset]; + offset += 1; + if (offset + fontDescriptionLength > fontTableEnd) { + logger.log(POILogger.WARN, "Asked to read beyond font table end. Skipping font"); + return null; + } + + //no idea what these 3 bytes do + offset += 3; + byte chs = buf[offset]; + Charset charset = null; + HwmfFont.WmfCharset wmfCharset = HwmfFont.WmfCharset.valueOf(chs & 0xff); + if (wmfCharset == null) { + logger.log(POILogger.WARN, "Couldn't find font for type: " + (chs & 0xff)); + } else { + charset = wmfCharset.getCharset(); + } + charset = charset == null ? StringUtil.WIN_1252 : charset; + offset += LittleEndian.BYTE_SIZE; + //if this byte here == 7, it _may_ signify existence of + //an alternate font name + + //not sure what the byte after the _chs does + offset += LittleEndian.BYTE_SIZE; + int fontNameLength = -1; + for (int i = offset; i < fontTableEnd; i++) { + if (buf[i] == 0) { + fontNameLength = i - offset; + break; + } + } + if (fontNameLength == -1) { + logger.log(POILogger.WARN, "Couldn't find the zero-byte delimited font name length"); + return null; + } + String fontName = new String(buf, offset, fontNameLength, charset); + String altFontName = null; + int altFontNameLength = -1; + offset += fontNameLength + 1; + if (offset - start < fontDescriptionLength) { + for (int i = offset; i <= start + fontDescriptionLength; i++) { + if (buf[i] == 0) { + altFontNameLength = i - offset; + break; + } + } + if (altFontNameLength > -1) { + altFontName = new String(buf, offset, altFontNameLength, charset); + } + } + //reset to 0 for length calculation + altFontNameLength = (altFontNameLength < 0) ? 
0 : altFontNameLength + 1;//add one for zero byte + + int len = LittleEndian.INT_SIZE + LittleEndian.BYTE_SIZE + LittleEndian.BYTE_SIZE +//6 starting bytes + fontNameLength + altFontNameLength + 1;//+1 is for the zero byte + //this len should == fontDescriptionLength + + return new OldFfn(chs, fontName, altFontName, len); + + } + + public OldFfn(byte charsetIdentifier, String fontName, String altFontName, int length) { + this._chs = charsetIdentifier; + this.fontName = fontName; + this.altFontName = altFontName; + this.length = length; + } + + public byte getChs() { + return _chs; + } + + public String getMainFontName() { + return fontName; + } + + /** + * @return altFontName if it exists, null otherwise + */ + public String getAltFontName() { + return altFontName; + } + + + /** + * @return length in bytes for this record + */ + public int getLength() { + return length; + } + + @Override + public String toString() { + return "OldFfn{" + + "_chs=" + (_chs & 0xff) + + ", fontName='" + fontName + '\'' + + ", altFontName='" + altFontName + '\'' + + ", length=" + length + + '}'; + } +} + + diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldFontTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldFontTable.java new file mode 100644 index 000000000..dfe1f95e0 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldFontTable.java @@ -0,0 +1,84 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ + +package org.apache.poi.hwpf.model; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.poi.util.Internal; +import org.apache.poi.util.LittleEndian; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; + +/** + * Font table for Word 6.0 + */ +@Internal +public final class OldFontTable { + private final static POILogger _logger = POILogFactory.getLogger(OldFontTable.class); + + // added extra facilitator members + // FFN structure containing strings of font names + private final OldFfn[] _fontNames; + + public OldFontTable(byte[] buf, int offset, int length) { + //length is stored at the index section in the table + //and it is recorded in the first short. + + + List ffns = new ArrayList(); + int fontTableLength = LittleEndian.getShort(buf, offset); + + int endOfTableOffset = offset + length; + int startOffset = offset + LittleEndian.SHORT_SIZE;//first short should == length! 
+ + while (true) { + OldFfn oldFfn = OldFfn.build(buf, startOffset, endOfTableOffset); + if (oldFfn == null) { + break; + } + ffns.add(oldFfn); + startOffset += oldFfn.getLength(); + + } + _fontNames = ffns.toArray(new OldFfn[ffns.size()]); + } + + + public OldFfn[] getFontNames() { + return _fontNames; + } + + + public String getMainFont(int chpFtc) { + if (chpFtc >= _fontNames.length) { + _logger.log(POILogger.INFO, "Mismatch in chpFtc with stringCount"); + return null; + } + + return _fontNames[chpFtc].getMainFontName(); + } + + @Override + public String toString() { + return "OldFontTable{" + + "_fontNames=" + Arrays.toString(_fontNames) + + '}'; + } +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java new file mode 100644 index 000000000..c82635bc3 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPiece.java @@ -0,0 +1,120 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ + +package org.apache.poi.hwpf.model; + + +import org.apache.poi.util.Internal; +import org.apache.poi.util.NotImplemented; + +/** + * Lightweight representation of a text piece. + * Works in the character domain, not the byte domain, so you + * need to have turned byte references into character + * references before getting here. + */ +@Internal +public class OldTextPiece extends TextPiece { + + private final byte[] rawBytes; + + /** + * @param start Beginning offset in main document stream, in characters. + * @param end Ending offset in main document stream, in characters. + * @param text The raw bytes of our text + */ + public OldTextPiece(int start, int end, byte[] text, PieceDescriptor pd) { + super(start, end, text, pd); + this.rawBytes = text; + if (end < start) { + throw new IllegalStateException("Told we're of negative size! start=" + start + " end=" + end); + } + } + + /** + * @return nothing, ever. Always throws an UnsupportedOperationException + * @throws UnsupportedOperationException + */ + @NotImplemented + @Override + public boolean isUnicode() { + throw new UnsupportedOperationException(); + } + + + public StringBuilder getStringBuilder() { + return (StringBuilder) _buf; + } + + @Override + public byte[] getRawBytes() { + byte[] buf = new byte[rawBytes.length]; + System.arraycopy(rawBytes, 0, buf, 0, rawBytes.length); + return buf; + } + + /** + * Returns part of the string. + * Works only in characters, not in bytes! + * + * @param start Local start position, in characters + * @param end Local end position, in characters + * @throws UnsupportedOperationException + */ + @Deprecated + @NotImplemented + public String substring(int start, int end) { + throw new UnsupportedOperationException(); + } + + /** + * Not implemented for OldTextPiece. 
+ * Always throws UnsupportedOperationException + */ + @Deprecated + @NotImplemented + public void adjustForDelete(int start, int length) { + throw new UnsupportedOperationException(); + } + + /** + * Returns the length, in bytes + */ + public int bytesLength() { + return rawBytes.length; + } + + @Override + public int hashCode() { + assert false : "hashCode not designed"; + return 42; // any arbitrary constant will do + } + + + /** + * Returns the character position we start at. + */ + public int getCP() { + return getStart(); + } + + public String toString() { + return "OldTextPiece from " + getStart() + " to " + getEnd() + " (" + + getPieceDescriptor() + ")"; + } + +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java new file mode 100644 index 000000000..3fd34ade0 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/OldTextPieceTable.java @@ -0,0 +1,119 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */
+package org.apache.poi.hwpf.model;
+
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Collections;
+
+import org.apache.poi.util.CodePageUtil;
+import org.apache.poi.util.Internal;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
+
+/**
+ * Text piece table made of {@link OldTextPiece}s whose piece descriptors
+ * carry a caller-supplied charset.
+ */
+@Internal
+public class OldTextPieceTable extends TextPieceTable {
+
+    private static final POILogger logger = POILogFactory
+            .getLogger(OldTextPieceTable.class);
+
+    public OldTextPieceTable() {
+        super();
+    }
+
+    /**
+     * Reads the piece table found in {@code tableStream} and copies the bytes
+     * of every piece out of {@code documentStream}.
+     *
+     * @param documentStream bytes the text pieces are copied from
+     * @param tableStream    bytes holding the plex of piece descriptors
+     * @param offset         where the plex starts in {@code tableStream}
+     * @param size           size of the plex, as passed to {@link PlexOfCps}
+     * @param fcMin          subtracted from each piece's file position to find {@code _cpMin}
+     * @param charset        charset handed to every {@link PieceDescriptor}; may be null
+     */
+    public OldTextPieceTable(byte[] documentStream, byte[] tableStream,
+                             int offset, int size, int fcMin, Charset charset) {
+        // get our plex of PieceDescriptors
+        PlexOfCps pieceTable = new PlexOfCps(tableStream, offset, size,
+                PieceDescriptor.getSizeInBytes());
+        int length = pieceTable.length();
+        PieceDescriptor[] pieces = new PieceDescriptor[length];
+        // turn each raw plex entry into a PieceDescriptor
+        for (int x = 0; x < length; x++) {
+            pieces[x] = new PieceDescriptor(pieceTable.getProperty(x).getBytes(), 0, charset);
+        }
+
+        // Pieces need not be stored in order, so scan them all for the earliest
+        // one. An empty table leaves _cpMin untouched instead of failing on pieces[0].
+        if (length > 0) {
+            _cpMin = pieces[0].getFilePosition() - fcMin;
+        }
+        for (PieceDescriptor piece : pieces) {
+            _cpMin = Math.min(_cpMin, piece.getFilePosition() - fcMin);
+        }
+
+        for (int x = 0; x < length; x++) {
+            int start = pieces[x].getFilePosition();
+            GenericPropertyNode node = pieceTable.getProperty(x);
+            // the node's start and end are in characters
+            int nodeStartChars = node.getStart();
+            int nodeEndChars = node.getEnd();
+
+            // Unicode and variable-byte charsets get two bytes per character.
+            // NOTE(review): for variable-byte text that is presumably an upper
+            // bound, so the copy is clamped to what the stream still holds.
+            int multiple = (pieces[x].isUnicode() || isVariableByte(charset)) ? 2 : 1;
+            int textSizeBytes = (nodeEndChars - nodeStartChars) * multiple;
+            int available = documentStream.length - start;
+            if (available >= 0 && textSizeBytes > available) {
+                logger.log(POILogger.WARN, "Text piece " + x + " needs " + textSizeBytes
+                        + " bytes but only " + available + " remain; truncating");
+                textSizeBytes = available;
+            }
+            byte[] buf = new byte[textSizeBytes];
+            System.arraycopy(documentStream, start, buf, 0, textSizeBytes);
+            _textPieces.add(newTextPiece(nodeStartChars, nodeEndChars, buf, pieces[x]));
+        }
+
+        // keep one list in natural order and one ordered by FCComparator
+        Collections.sort(_textPieces);
+        _textPiecesFCOrder = new ArrayList<TextPiece>(_textPieces);
+        Collections.sort(_textPiecesFCOrder, new FCComparator());
+    }
+
+    /** Builds an {@link OldTextPiece} rather than a plain {@link TextPiece}. */
+    @Override
+    protected TextPiece newTextPiece(int nodeStartChars, int nodeEndChars, byte[] buf, PieceDescriptor pd) {
+        return new OldTextPiece(nodeStartChars, nodeEndChars, buf, pd);
+    }
+
+    /** @return 2 for pieces whose charset is variable-byte, 1 otherwise */
+    @Override
+    protected int getEncodingMultiplier(TextPiece textPiece) {
+        return isVariableByte(textPiece.getPieceDescriptor().getCharset()) ? 2 : 1;
+    }
+
+    /** @return true if {@code charset} is listed in {@link CodePageUtil#VARIABLE_BYTE_CHARSETS} */
+    private static boolean isVariableByte(Charset charset) {
+        return charset != null && CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(charset);
+    }
+}
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
index 34c29511c..3979009f2 100644
--- a/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PAPBinTable.java
@@ -260,7 +260,7 @@ public class PAPBinTable
         SprmBuffer sprmBuffer = null;
         for ( PAPX papx : papxs )
         {
-            if ( papx.getGrpprl() == null || papx.getGrpprl().length == 0 )
+            if ( papx.getGrpprl() == null || papx.getGrpprl().length <= 2 )
                 continue;
             if ( sprmBuffer == null )
             {
diff --git
a/src/scratchpad/src/org/apache/poi/hwpf/model/PieceDescriptor.java b/src/scratchpad/src/org/apache/poi/hwpf/model/PieceDescriptor.java index a190f1db0..53dcc1745 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/PieceDescriptor.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/PieceDescriptor.java @@ -17,10 +17,13 @@ package org.apache.poi.hwpf.model; +import java.nio.charset.Charset; + import org.apache.poi.util.BitField; import org.apache.poi.util.BitFieldFactory; import org.apache.poi.util.Internal; import org.apache.poi.util.LittleEndian; +import org.apache.poi.util.StringUtil; @Internal public final class PieceDescriptor @@ -32,29 +35,51 @@ public final class PieceDescriptor private static BitField fCopied = BitFieldFactory.getInstance(0x04); int fc; PropertyModifier prm; - boolean unicode; + boolean unicode = false; + private final Charset charset; - public PieceDescriptor(byte[] buf, int offset) - { - descriptor = LittleEndian.getShort(buf, offset); - offset += LittleEndian.SHORT_SIZE; - fc = LittleEndian.getInt(buf, offset); - offset += LittleEndian.INT_SIZE; - prm = new PropertyModifier( LittleEndian.getShort(buf, offset)); - - // see if this piece uses unicode. - if ((fc & 0x40000000) == 0) - { - unicode = true; - } - else - { - unicode = false; - fc &= ~(0x40000000);//gives me FC in doc stream - fc /= 2; + public PieceDescriptor(byte[] buf, int offset) { + this(buf, offset, null); } + /** + * + * This initializer should only be used for HWPFOldDocuments. + * + * @param buf + * @param offset + * @param charset which charset to use if this is not unicode + */ + public PieceDescriptor(byte[] buf, int offset, Charset charset) { + descriptor = LittleEndian.getShort(buf, offset); + offset += LittleEndian.SHORT_SIZE; + fc = LittleEndian.getInt(buf, offset); + offset += LittleEndian.INT_SIZE; + prm = new PropertyModifier(LittleEndian.getShort(buf, offset)); + if (charset == null) { + // see if this piece uses unicode. 
+ //From the documentation: If the second most significant bit + //is clear, then this indicates the actual file offset of the Unicode character (two bytes). If the + //second most significant bit is set, then the actual address of the codepage-1252 + //compressed version of the Unicode character (one byte), is actually at the offset indicated + //by clearing this bit and dividing by two. + if ((fc & 0x40000000) == 0) { + unicode = true; + this.charset = null; + } else { + unicode = false; + fc &= ~(0x40000000);//gives me FC in doc stream + fc /= 2; + this.charset = StringUtil.WIN_1252; + } + } else { + if (charset == StringUtil.UTF16LE) { + unicode = true; + } + this.charset = charset; + } + } public int getFilePosition() @@ -72,6 +97,15 @@ public final class PieceDescriptor return unicode; } + /** + * + * @return charset to use if this is not a Unicode PieceDescriptor + * this can be null + */ + public Charset getCharset() { + return charset; + } + public PropertyModifier getPrm() { return prm; diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java index d432f35b6..2a63bda16 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPiece.java @@ -21,6 +21,7 @@ package org.apache.poi.hwpf.model; import java.nio.charset.Charset; import org.apache.poi.util.Internal; +import org.apache.poi.util.StringUtil; /** * Lightweight representation of a text piece. @@ -40,7 +41,6 @@ public class TextPiece extends PropertyNode { * @param start Beginning offset in main document stream, in characters. * @param end Ending offset in main document stream, in characters. 
* @param text The raw bytes of our text - * @deprecated Use {@link #TextPiece(int, int, byte[], PieceDescriptor)} * instead */ public TextPiece(int start, int end, byte[] text, PieceDescriptor pd, @@ -72,8 +72,13 @@ public class TextPiece extends PropertyNode { * Create the StringBuilder from the text and unicode flag */ private static StringBuilder buildInitSB(byte[] text, PieceDescriptor pd) { - String str = new String(text, Charset.forName(pd.isUnicode() ? "UTF-16LE" : "Cp1252")); + byte[] textBuffer = text; + if (StringUtil.BIG5.equals(pd.getCharset())) { + String txt = new StringBuilder(StringUtil.littleEndianBig5Stream(text, 0, text.length)).toString(); + return new StringBuilder(txt); + } + String str = new String(textBuffer, 0, textBuffer.length, (pd.isUnicode()) ? StringUtil.UTF16LE : pd.getCharset()); return new StringBuilder(str); } @@ -207,4 +212,5 @@ public class TextPiece extends PropertyNode { return "TextPiece from " + getStart() + " to " + getEnd() + " (" + getPieceDescriptor() + ")"; } + } diff --git a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java index 0108877c7..bbddd8645 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/model/TextPieceTable.java @@ -101,7 +101,7 @@ public class TextPieceTable implements CharIndexTranslator { System.arraycopy(documentStream, start, buf, 0, textSizeBytes); // And now build the piece - final TextPiece newTextPiece = new TextPiece(nodeStartChars, nodeEndChars, buf, + final TextPiece newTextPiece = newTextPiece(nodeStartChars, nodeEndChars, buf, pieces[x]); _textPieces.add(newTextPiece); @@ -114,6 +114,10 @@ public class TextPieceTable implements CharIndexTranslator { Collections.sort(_textPiecesFCOrder, new FCComparator()); } + protected TextPiece newTextPiece(int nodeStartChars, int nodeEndChars, byte[] buf, PieceDescriptor pd) { + return new 
TextPiece(nodeStartChars, nodeEndChars, buf, pd); + } + public void add(TextPiece piece) { _textPieces.add(piece); _textPiecesFCOrder.add(piece); @@ -249,7 +253,7 @@ public class TextPieceTable implements CharIndexTranslator { if (rangeStartBytes > rangeEndBytes) continue; - final int encodingMultiplier = textPiece.isUnicode() ? 2 : 1; + final int encodingMultiplier = getEncodingMultiplier(textPiece); final int rangeStartCp = textPiece.getStart() + (rangeStartBytes - tpStart) / encodingMultiplier; @@ -262,6 +266,10 @@ public class TextPieceTable implements CharIndexTranslator { return result.toArray(new int[result.size()][]); } + protected int getEncodingMultiplier(TextPiece textPiece) { + return textPiece.isUnicode() ? 2 : 1; + } + public int getCpMin() { return _cpMin; } @@ -439,7 +447,7 @@ public class TextPieceTable implements CharIndexTranslator { return textPlex.toByteArray(); } - private static class FCComparator implements Comparator, Serializable { + protected static class FCComparator implements Comparator, Serializable { public int compare(TextPiece textPiece, TextPiece textPiece1) { if (textPiece.getPieceDescriptor().fc > textPiece1 .getPieceDescriptor().fc) { diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/CharacterRun.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/CharacterRun.java index 730133319..5c2dc4749 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/CharacterRun.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/CharacterRun.java @@ -18,6 +18,7 @@ package org.apache.poi.hwpf.usermodel; import org.apache.poi.hwpf.HWPFDocument; +import org.apache.poi.hwpf.HWPFOldDocument; import org.apache.poi.hwpf.model.CHPX; import org.apache.poi.hwpf.model.FFData; import org.apache.poi.hwpf.model.Ffn; @@ -438,6 +439,10 @@ public final class CharacterRun extends Range public String getFontName() { + if (_doc instanceof HWPFOldDocument) { + return ((HWPFOldDocument) 
_doc).getOldFontTable().getMainFont(_props.getFtcAscii()); + } + if (_doc.getFontTable() == null) // old word format return null; diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java b/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java index f3194bf2f..5a3bc6e38 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToConverterSuite.java @@ -16,18 +16,19 @@ ==================================================================== */ package org.apache.poi.hwpf.converter; -import java.io.File; -import java.io.FilenameFilter; -import java.io.StringWriter; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; +import static org.junit.Assert.assertNotNull; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; +import java.io.File; +import java.io.FilenameFilter; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import org.apache.poi.POIDataSamples; import org.apache.poi.hwpf.HWPFDocumentCore; @@ -36,8 +37,6 @@ import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.junit.Assert.assertNotNull; - @RunWith(Parameterized.class) public class TestWordToConverterSuite { @@ -45,7 +44,11 @@ public class TestWordToConverterSuite * YK: a quick hack to exclude failing documents from the suite. 
*/ private static List failingFiles = Arrays - .asList( "ProblemExtracting.doc" ); + .asList( "ProblemExtracting.doc", + "Bug50955.doc" //basic extraction works, + // but these extractors modify the document, + // which is a no-go for this Word 6.0 file + ); @Parameterized.Parameters(name="{index}: {0}") public static Iterable files() { diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java index b1e02f35c..1ff7abd25 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java @@ -57,6 +57,7 @@ import junit.framework.TestCase; * against HWPF */ public class TestBugs{ + private static final POILogger logger = POILogFactory.getLogger(TestBugs.class); public static void assertEqualsIgnoreNewline(String expected, String actual ) @@ -536,13 +537,6 @@ public class TestBugs{ hwpfDocument.getPicturesTable().getAllPictures(); } - /** - * [FAILING] Bug 50955 - error while retrieving the text file - */ - @Test(expected=IllegalStateException.class) - public void test50955() throws IOException { - getTextOldFile("Bug50955.doc"); - } /** * [RESOLVED FIXED] Bug 51604 - replace text fails for doc (poi 3.8 beta diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java index 47017dbf7..bfe22605a 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHWPFOldDocument.java @@ -17,14 +17,19 @@ package org.apache.poi.hwpf.usermodel; +import static org.apache.poi.POITestCase.assertContains; import static org.junit.Assert.assertEquals; import java.io.IOException; +import java.nio.charset.Charset; import org.apache.poi.OldFileFormatException; +import 
org.apache.poi.hwmf.record.HwmfFont; import org.apache.poi.hwpf.HWPFOldDocument; import org.apache.poi.hwpf.HWPFTestCase; import org.apache.poi.hwpf.HWPFTestDataSamples; +import org.apache.poi.hwpf.extractor.Word6Extractor; +import org.apache.poi.hwpf.model.OldFontTable; import org.junit.Test; /** @@ -98,7 +103,7 @@ public final class TestHWPFOldDocument extends HWPFTestCase { assertEquals(1, doc.getRange().getParagraph(5).numCharacterRuns()); // Normal, superscript for 4th, normal assertEquals(3, doc.getRange().getParagraph(6).numCharacterRuns()); - + doc.close(); } @@ -143,4 +148,87 @@ public final class TestHWPFOldDocument extends HWPFTestCase { doc.getRange().getParagraph(1).text()); doc.close(); } + + @Test + public void testDefaultCodePageEncoding() throws IOException { + HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60942.doc"); + Word6Extractor ex = new Word6Extractor(doc); + String txt = ex.getText(); + assertContains(txt, "BERTHOD"); + assertContains(txt, "APPLICOLOR"); + assertContains(txt, "les meilleurs"); + assertContains(txt, "GUY LECOLE"); + } + + + @Test + public void testCodePageBug50955() throws IOException { + //windows 1251 + HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug50955.doc"); + Word6Extractor ex = new Word6Extractor(doc); + + StringBuilder sb = new StringBuilder(); + for (String p : ex.getParagraphText()) { + sb.append(p); + } + assertContains(sb.toString(), "\u043F\u0440\u0438\u0432\u0435\u0442");//Greetings! 
+ } + + @Test + public void testCodePageBug60936() throws IOException { + //windows 1250 -- this test file was generated with OpenOffice + //see https://bz.apache.org/ooo/show_bug.cgi?id=12445 for the inspiration + + + HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60936.doc"); + Word6Extractor ex = new Word6Extractor(doc); + StringBuilder sb = new StringBuilder(); + for (String p : ex.getParagraphText()) { + sb.append(p); + } + assertContains(sb.toString(), "4 sk\u00f3re a p\u0159ed 7 lety");//Greetings! + } + + @Test + public void testOldFontTableEncoding() throws IOException { + HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc"); + OldFontTable oldFontTable = doc.getOldFontTable(); + assertEquals(5, oldFontTable.getFontNames().length); + assertEquals("\u7D30\u660E\u9AD4", oldFontTable.getFontNames()[0].getMainFontName()); + assertEquals(HwmfFont.WmfCharset.CHINESEBIG5_CHARSET.getCharset(), Charset.forName("Big5")); + assertEquals("Times New Roman", oldFontTable.getFontNames()[1].getMainFontName()); + doc.close(); + + } + + @Test + public void testOldFontTableAltName() throws IOException { + HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60942b.doc"); + OldFontTable oldFontTable = doc.getOldFontTable(); + assertEquals(5, oldFontTable.getFontNames().length); + assertEquals("Roboto", oldFontTable.getFontNames()[3].getMainFontName()); + assertEquals("arial", oldFontTable.getFontNames()[3].getAltFontName()); + assertEquals("Roboto", oldFontTable.getFontNames()[4].getMainFontName()); + assertEquals("arial", oldFontTable.getFontNames()[4].getAltFontName()); + } + + + @Test + public void test51944() throws IOException { + HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc"); + Word6Extractor ex = new Word6Extractor(doc); + StringBuilder sb = new StringBuilder(); + for (String p : ex.getParagraphText()) { + sb.append(p.replaceAll("[\r\n]+", "\n")); + } + String txt = sb.toString(); + 
assertContains(txt, "Post and Fax"); + assertContains(txt, "also maintain");//this is at a critical juncture + assertContains(txt, "which are available for");//this too + + //TODO: figure out why these two aren't passing +// assertContains(txt, "\u2019\u0078 block2");//make sure smart quote is extracted correctly +// assertContains(txt, "We are able to");//not sure if we can get this easily? + } + } diff --git a/test-data/document/Bug60936.doc b/test-data/document/Bug60936.doc new file mode 100644 index 0000000000000000000000000000000000000000..e7e397d568b80815761bb5e4e9b08a812b736dbe GIT binary patch literal 6656 zcmeHL&u<$=6nNFfyPduP%0s${4j`$mPSoz5fIU}y=j)%yU}{1 zI*3Ch#1RS90~aJzi9!(w3GoNe3r9qSR3Rh|;erB42ucoAqA}l_wTWXAdu=!*lArZ? zX5Y-bdGqGY%$vFXbH&IzJAfz<1LDA) zKp(Id=m+)z1HfIt-N1f81?~Y30QUm-0rvxgz(HULcmQ}1I0Os>2_OlK0IcUj<#gQw z?OZb!5U`>j6U=QW#3wq@HTh3Z@qC>$GPYH)W*zmcWoOl+$-{}1HNTkGO-IH8C#RU7 zvNF;eIG@CaEKeSF<<$L$|GAykt+jRG8X^y|i3s-Vsl3rcWk2M5LvSKyFYpWsQ@SfyBR?-FyCSeF(zg{Ve`mCB+PAlz- z_yHQIPctkDC|6%K@rY`)c zXVu5lobD`jcSq^{>myf|pZLI3J~Wl9rt*=gTxzLZSK7)3q3w?yWdS~N zV^N9@AV>(%5rMM}VKSm)1jdMqM@S&LMQ}UhhPJ*i3W7Lwo{DgUoh@YMY-85Z zY{*-uL0a37=m|P?JgsY4!#vNY`W}k?M)4vA#V(rhQD%Tt8jo;2l|s{c(NWV@7T3fe zhQ`AxLSU3eB0}_vC@Ck5f`fR8Pjy%R0hT|ZEjWf{vix3(6}eSd1b8rK1a1nf$HStJ z0>{m)4uJ}67RC^hvSeH&sb|vGXDvIgISvjAR)aN&mJJI{AuK9~#_^FNS5!H!#oFRM z6;o1PX@0~gvG`k%KW*gog8Hm}K|O8dHIur)_0iN)K5gZwbC9cy*@l**ZiVxPIj7r( zBfozA=d0&YE}O6xZ9}(N7zQ79E+k;?^SYC8q^|CrNGgaw)^QwKir|}lL|;O+Ify{+ zmnqDFOOhd@RlZfZv57*jb6+)FwKM&vfGzDIu zhW8S97=VnMSPM8i+gZlSTk%ocrfp(@|MUGvdST5Sa7qGyTPyxXp$Cdmtbw?bpw_S=%RE1Hgv$+b;HnMMaLm&R~74^WC5nS z3c2sU51FE=R~*N2%Hw?ZKhOU^Z=Um?U-{0rYqDSbv-M&!h7Ry|$O419@=wCWy5aW9>Kk~b8+P@+6 zd_1Y(1bHEUlBQ^sPSZIfavk3-BneAC)3p{oZDu?U=?NOA0njj|QIm+?4{PO9Z_4Mj zvsU?4$a|ePOE{y|5y(!FA9c_uW+bffTSr3h9?ZSIocxR5z4v#x-z{3m+SXaaTv&A=X@yr10u5fSKR8EcUx6|dc$7=r;VoSuxtQ|fRuoY0cm znUs25ON^^M?snoOJRIO4a_PSY>~~v5+&xGqTJ9vlGjHknfy1}I#bpVFJs?xnf=n+R zbmlWc7Bf1cbLaLA5AO?wwsv&x 
z!+!^Qx9M&3s6%@^!Juy>I5;|@4sf}krVb~Ps+IszhTZBx-&pXG(E*2^e{}4yXJjxi z%o)Af?M$Temd^bPsEqy9nP(J_=c6#As9#;+;u z-D>Xv^}#N+vt8}pZ{+uT4*LSF>Pgi*I_w`C45(h;(2%dqJBV`r(Ah|`RgK2OvGjN} zepXFQL{Z0hMCs7$1wlg|)$bV#vP>3~&?c3`>GLW}AMz;*_l(0ati_YjNIKC9e;^%A zMbxYs*HVdynwSmq{uI{YsZcnjre;Ji-~z zL=%&dgjGHgACLT6+Y?$ksg9@BjsxmAI8L4h>#2kmo(M(bkkzq&qP<-m55<$}6IxP@ zY2i@%lg#dzb~c7sx5M#h1W8~rm6%PUMv(-I)-WP0G?GG<6SL`fREwiRNv;OEB#EFd z7KPfzqoK2LEt!gj)nqCWO3j|-Mu0P>1acWXLSO{uXxgHQ~hG!{d@asv!kb@=E>v-1(r(b1|V(h>D=8u_&JxZ0-n z9B6OvQio1r-nFU0*_c~>NNw+S_jbB_9(1?!e~9P#QW)Lk?&)y^+#a_YN~__}R5TTe zajp0F^r-FV*+XLiw>swbxB0;|#sM9bC71`hyLB^lA5(|K9VJCvAWH zr0s=({Nj_gOM)&7`p!DRP!_!a%6BN$ z7w|rGYB(@@=+L0ock1}y$nnv!fm6ey16a#}Q-?e! zc+}hjqe0{}oQ(qdsdth>`q_UFr{6RBnV4fz6oo%-WiCPYq6Z4=QzFYvQGBk*qD}9W zTd(~0O0LD{!y=#O5YId7H_2*WA>}+gC;f@YnMfiL8CSjOWJ<$!dL!3b^7JG7#lQRd z<9siveyw?YwQ-BSybo;BC*<-*8fKQ_sO%+;g^d4q#n ziL!1nX69Ro^4|F=14wIalxt%9_5#{}>8}a`N zx&iW-Wu}*e+@ql%Dpi>Y^HfGTv zmvcdak^*9!IT_lEjWaGe??8SuO)s0D5e+t^3n!#zT}rNM4kBi3vdC=_4pEhp>2cR| zR>&2s6H%Ct)Yq6J^bmiz5n^`(lqnU=PFj#y<)Y+l3OHnhQZX-Q6zEPW#D>0RE}&$j z`3fTm+M1T>Y?W~3Gqo4?$vIg8=r^PpA*2RDOj0Eb^bWums2X4!PzShxT|gsn)kgFM;91}~;CbMuKnu<%4*+~l zc?4LX>+{$Dm-ydx_Uhk7j}hbq#Lv;({}BE6t^a;}P&r6xL2`I2*^XYow=yjT(2;P` zu?r*}y`abc5+5(o_dNa^fQ^7P4|on(_agyzZKiZ8y*i>;d)yoj@1R4SWmu2cQXSaW~Kmd=|I_Tn0V| zJOw-rTmg6;ms-R37p!B97mwbYF{C2UgVa$m25Q(SIVu7V0 zeR{n?B_qwWrGyt1naqB`b=fNZU5`Vg?#$5lmPDFqW&tB>R{cq*Ji2F~CUyDVA5~iIZeAMrWYs4$lKz_CV@!Oux!NK5|kF^{3V6it>Rqm;(#PV*bs#NIcg^yhbtGrTp4M?DssVE*pI{bssitI$& zYF-qz$8!~WP8GRc#lljl+Q=2ei$HP;m|Z`I8JcQAc2^#2&q~uh`iFOuOTbecLMcAC;U@$>{w3m*oDH0K zn`r?sR66h4hOPN+nRB$|X0@XVjrXFW+jDem&^I!2N`KLUx*Mu)@4Cu(b@jM6-|zDW z@V=q?{6lT~_qVsxkU!Y*5c&L%_>beeG!2zp;&7Id2k0s}dwDOeqCec&T{({BsL%-s z_XGWVP_khww;acw72H~O|0VNF!)J6q_t!pfc~sQVAZ9{o_uUE^dDvOX7<43fGDl++ zIBhvMIKut?VQbX}eHO_XK~JM=7|=pxD8oK>Pl9@ATOGg31o0x4R7WDes*Y)sp*U3| zR-(Z9$Pa=vRL#Le+j^iPdA2XA*3zJutFKs_o{TkT4sQ2}MoZk5r1`9D{ zHN*EhU<^eCeVUV&aVI6H4fm|^8K=@xUx8VPd0rywhEzi{{Jz?MUE6W_9q9Xapb{CZ 
zc6Psr_Ft?&9yU)>ciR4|@a@X---_K8pGDcGlzA|4M%C9L?_3;StNm9!@*{iM-R2&+ z6YsygDQ1nG5x!e@g{oOji-KqD-J8l1ax7c6FS?T9MUY-N}w#_N$l}yzuQVvftc`?(h&%3>t z3~M~Nmz*{{ci(UOoA>*!xt3d*^s?DK_k*?qtT)*Q@U@t)v|qsCfsIg}1D*%=VkKIS zRS&?=>InRb>FZP2S;c^hhP@Htb|LJKn&DO>D_ls~-s=a*$LG=?gL2qP|7HA9st4hD z#PeNrlWBjoUiwv(!_Go4ndJ*9-`~H5c((g`9(G<#pMWslzpp}m5cEf&Z2$EP;=ch} z>?|uNt6qHi##;_#S}_(aE@{PElqImU!WL(XTJc7%8(;1rc2@dww3ttEn0u;}Td)0> zCEw!5;b#l#S4zgM$KC1kSNZQR66L?Y_z?X4MMq~x=Yif8|3sr~|5djCTD0-G9rj;* zu;;@(AK*uU$ANce6L$9-hON>5tCufHW&1BSD=OQ6eGu%wc-{U&+JKoBVXK{J-(&l) zrn3E4sU9!ef9dvRW&5vElVdTiZ2!gfVcZ>t{TJ`umHMmw^7dcWJ0dF|*aTuf&;vXO z909@r>!Hp76F>@JTd^y^Rp1%m1>hCnRp6VzYd{vb4!i^809K6F0Cm7Fpb=oppDVys zKvxmNTK6UV-z?Z1*swR|Fa7{TH$i4U$Mf0XgOtj)&s=A>{^h#q_2)M04vkx4VaKOn syzJtf$B(=`M$22+JzL6(=(bqcl`u_OUQXF6Z_y6X9OOlEtG0&!53+H`f&c&j literal 0 HcmV?d00001 diff --git a/test-data/document/Bug60942b.doc b/test-data/document/Bug60942b.doc new file mode 100644 index 0000000000000000000000000000000000000000..7ca3b9839dd0e9327eaa65af7d16ea13ad7b56c3 GIT binary patch literal 6144 zcmeHL&2Jk;6o2EaYrFY!-8i^up-U-Fp@|bwR8>Ntq&27&EtTTXARrQJdqdpDyOpyM zDdK>HI3gA0zy+zQL@8fFLi_{0aBD?{R3Q)t_YjFH9NHj+3N+^TW_ROFXzYzaqE$TB z&olexvalY67?*8pv|D1N$KkxUv{#$`IpdIJ{IsqI3v;~L(TY)Yh4s-(# z0103l&;x7-b^s3o4*@#?6?hof1v~=m1|9{HKrgTdcno+P*bDRl{Q&oGz=^9PZ>Act zfPghyFu`1xB7C9~U6TLw9M9LWLat;^n-i8gW|s2m{?uT9+B`K=)D27e#jy;_(`HV3 z1E*8?kol?ob~`oy;eWQ$nx)pxTmthDcWHwCy8Q0B#oNaxqo2PYroFvi|1b#ITL7PL zoPPjd{~iL^$BzQ+=Pv>5>v@3v{WQRxym80&T~2dbLx3BG+yr7yzx`w9-0si56P&IU ze!8Gj|D^~Yy`*~!wqV@=zb+H5Y9_Czx&eo`%7b*{Iu%st;q~h`Jc;sbvuB*xt#=sT zWA4ZQ5TAqo(T4N}z<7o9i-xKY<+*6-&R|GYV`UN0{!l+~ZrTl2sCr7p}bs_k9$ z(ba(q<)=S3l#7P)iJ^RID02zKo<}P5&-W9g8==u;QZ*u zJw3J!*}1@spsg>{E0h(vK6;Jl1ieV(%3Wlb$~J5|a$bMmkha=rgT`STqgykyjp>qc)?0^q!%v_7HDOj$Nx*++dSo27B0T!OTSa*7Z4)gej4LyML3 zV9?u!v(uXOW`D-gjJ#ILPv<5}g$YY5p}lHI($Y?_Cy}JkVO`4?jMw>8@1*!m>Yk;r z*hJ$2%IzSP4o110N|8~0)>5-(9@oS$MghK$lFS_jqC&((7b!;z(-!y=-wf>bcX9iN zw9{6>G`Rg3#b>!zhY0duP6*r-xF1i51O<;6c^wU6y&&K2Df1mgt`di*V$JYA2p;CF 
zo)&P9c=uV9A1xI1Y4rvD4RypUY6i7I8lcRXV%D6Z)+84hE)}#XiYaHs<LelJ1v4YWzS6xev?B(!Z-v8wgSKT3& z6?qcU0!*+G1RcWrLlGSNIK2C$EnLTc45Q`oa+~rF8RQBMQo=lhp?uL=G`^ZM*@paK0U=wGo7)*WT-0_&ZB@B2&bmi^`a H8~uL++M0I+ literal 0 HcmV?d00001