Bug 60608 -- improve charset handling in Hwmf

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1779519 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Tim Allison 2017-01-19 20:19:26 +00:00
parent f6388c2fff
commit a9436e9789
4 changed files with 161 additions and 40 deletions

View File

@ -29,6 +29,7 @@ import java.awt.font.TextAttribute;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.nio.charset.Charset;
import java.text.AttributedString;
import java.util.ArrayList;
import java.util.LinkedList;
@ -48,8 +49,11 @@ import org.apache.poi.hwmf.record.HwmfPenStyle.HwmfLineDash;
import org.apache.poi.sl.draw.DrawFactory;
import org.apache.poi.sl.draw.DrawFontManager;
import org.apache.poi.sl.draw.Drawable;
import org.apache.poi.util.LocaleUtil;
public class HwmfGraphics {
private static final Charset DEFAULT_CHARSET = LocaleUtil.CHARSET_1252;
private final Graphics2D graphicsCtx;
private final List<HwmfDrawProperties> propStack = new LinkedList<HwmfDrawProperties>();
private HwmfDrawProperties prop = new HwmfDrawProperties();
@ -311,14 +315,34 @@ public class HwmfGraphics {
break;
}
}
/**
 * Draws a text string without a per-character spacing array.
 * Delegates to {@link #drawString(String, Rectangle2D, int[])} with a
 * <code>null</code> dx array.
 *
 * @param text the text to draw
 * @param bounds the rectangle handed on to the three-argument overload
 * @deprecated use {@link #drawString(byte[], Rectangle2D)}
 */
@Deprecated
public void drawString(String text, Rectangle2D bounds) {
    drawString(text, bounds, null);
}
/**
 * Draws raw text bytes without a per-character spacing array.
 * Delegates to {@link #drawString(byte[], Rectangle2D, int[])} with a
 * <code>null</code> dx array.
 *
 * @param text the undecoded text bytes
 * @param bounds the rectangle handed on to the three-argument overload
 */
public void drawString(byte[] text, Rectangle2D bounds) {
    // explicit cast documents which parameter is being left out
    drawString(text, bounds, (int[]) null);
}
/**
 * Draws a text string, optionally with a per-character spacing array.
 * The string is encoded with {@link #DEFAULT_CHARSET} and handed to
 * {@link #drawString(byte[], Rectangle2D, int[])}.
 * A <code>null</code> text is ignored instead of raising a
 * {@link NullPointerException}.
 *
 * @param text the text to draw, may be <code>null</code>
 * @param bounds the rectangle handed on to the byte based overload
 * @param dx optional spacing array, may be <code>null</code>
 * @deprecated use {@link #drawString(byte[], Rectangle2D, int[])}
 */
@Deprecated
public void drawString(String text, Rectangle2D bounds, int dx[]) {
    if (text == null) {
        // nothing to draw; calling getBytes on null would throw
        return;
    }
    drawString(text.getBytes(DEFAULT_CHARSET), bounds, dx);
}
public void drawString(byte[] text, Rectangle2D bounds, int dx[]) {
HwmfFont font = prop.getFont();
if (font == null || text == null || text.isEmpty()) {
if (font == null || text == null || text.length == 0) {
return;
}
@ -326,8 +350,11 @@ public class HwmfGraphics {
// TODO: another approx. ...
double fontW = fontH/1.8;
int len = text.length();
AttributedString as = new AttributedString(text);
int len = text.length;
Charset charset = (font.getCharSet().getCharset() == null)?
DEFAULT_CHARSET : font.getCharSet().getCharset();
String textString = new String(text, charset);
AttributedString as = new AttributedString(textString);
if (dx == null || dx.length == 0) {
addAttributes(as, font);
} else {

View File

@ -19,67 +19,93 @@ package org.apache.poi.hwmf.record;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import org.apache.poi.util.LittleEndianConsts;
import org.apache.poi.util.LittleEndianInputStream;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
/**
* The Font object specifies the attributes of a logical font
*/
public class HwmfFont {
private static final POILogger logger = POILogFactory.getLogger(HwmfFont.class);
public enum WmfCharset {
/** Specifies the English character set. */
ANSI_CHARSET(0x00000000),
ANSI_CHARSET(0x00000000, "Cp1252"),
/**
* Specifies a character set based on the current system locale;
* for example, when the system locale is United States English,
* the default character set is ANSI_CHARSET.
*/
DEFAULT_CHARSET(0x00000001),
DEFAULT_CHARSET(0x00000001, "Cp1252"),
/** Specifies a character set of symbols. */
SYMBOL_CHARSET(0x00000002),
SYMBOL_CHARSET(0x00000002, ""),
/** Specifies the Apple Macintosh character set. */
MAC_CHARSET(0x0000004D),
MAC_CHARSET(0x0000004D, "MacRoman"),
/** Specifies the Japanese character set. */
SHIFTJIS_CHARSET(0x00000080),
SHIFTJIS_CHARSET(0x00000080, "Shift_JIS"),
/** Also spelled "Hangeul". Specifies the Hangul Korean character set. */
HANGUL_CHARSET(0x00000081),
HANGUL_CHARSET(0x00000081, "cp949"),
/** Also spelled "Johap". Specifies the Johab Korean character set. */
JOHAB_CHARSET(0x00000082),
JOHAB_CHARSET(0x00000082, "x-Johab"),
/** Specifies the "simplified" Chinese character set for People's Republic of China. */
GB2312_CHARSET(0x00000086),
GB2312_CHARSET(0x00000086, "GB2312"),
/**
* Specifies the "traditional" Chinese character set, used mostly in
* Taiwan and in the Hong Kong and Macao Special Administrative Regions.
*/
CHINESEBIG5_CHARSET(0x00000088),
CHINESEBIG5_CHARSET(0x00000088, "Big5"),
/** Specifies the Greek character set. */
GREEK_CHARSET(0x000000A1),
GREEK_CHARSET(0x000000A1, "Cp1253"),
/** Specifies the Turkish character set. */
TURKISH_CHARSET(0x000000A2),
TURKISH_CHARSET(0x000000A2, "Cp1254"),
/** Specifies the Vietnamese character set. */
VIETNAMESE_CHARSET(0x000000A3),
VIETNAMESE_CHARSET(0x000000A3, "Cp1258"),
/** Specifies the Hebrew character set. */
HEBREW_CHARSET(0x000000B1),
HEBREW_CHARSET(0x000000B1, "Cp1255"),
/** Specifies the Arabic character set. */
ARABIC_CHARSET(0x000000B2),
ARABIC_CHARSET(0x000000B2, "Cp1256"),
/** Specifies the Baltic (Northeastern European) character set. */
BALTIC_CHARSET(0x000000BA),
BALTIC_CHARSET(0x000000BA, "Cp1257"),
/** Specifies the Russian Cyrillic character set. */
RUSSIAN_CHARSET(0x000000CC),
RUSSIAN_CHARSET(0x000000CC, "Cp1251"),
/** Specifies the Thai character set. */
THAI_CHARSET(0x000000DE),
THAI_CHARSET(0x000000DE, "x-windows-874"),
/** Specifies a Eastern European character set. */
EASTEUROPE_CHARSET(0x000000EE),
EASTEUROPE_CHARSET(0x000000EE, "Cp1250"),
/**
* Specifies a mapping to one of the OEM code pages,
* according to the current system locale setting.
*/
OEM_CHARSET(0x000000FF);
OEM_CHARSET(0x000000FF, "Cp1252");
int flag;
WmfCharset(int flag) {
Charset charset;
WmfCharset(int flag, String javaCharsetName) {
this.flag = flag;
if (javaCharsetName.length() > 0) {
try {
charset = Charset.forName(javaCharsetName);
return;
} catch (UnsupportedCharsetException e) {
logger.log(POILogger.WARN, "Unsupported charset: "+javaCharsetName);
}
}
charset = null;
}
/**
 * Returns the Java charset resolved for this constant by the constructor.
 *
 * @return the charset for the font, or <code>null</code> if the constant was
 * declared with an empty Java charset name (as SYMBOL_CHARSET is) or if the
 * named charset is not supported by the running JVM
 */
public Charset getCharset() {
return charset;
}
static WmfCharset valueOf(int flag) {

View File

@ -19,6 +19,7 @@ package org.apache.poi.hwmf.record;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.nio.charset.Charset;
import org.apache.poi.hwmf.draw.HwmfDrawProperties;
import org.apache.poi.hwmf.draw.HwmfGraphics;
@ -27,7 +28,6 @@ import org.apache.poi.util.BitField;
import org.apache.poi.util.BitFieldFactory;
import org.apache.poi.util.LittleEndianConsts;
import org.apache.poi.util.LittleEndianInputStream;
import org.apache.poi.util.LocaleUtil;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
@ -144,7 +144,7 @@ public class HwmfText {
* length of the string.
* The string is written at the location specified by the XStart and YStart fields.
*/
private String text;
private byte[] rawTextBytes;
/**
* A 16-bit signed integer that defines the vertical (y-axis) coordinate, in logical
* units, of the point where drawing is to start.
@ -164,18 +164,33 @@ public class HwmfText {
// Reads the record payload: the string length, the text bytes padded to an even
// byte count, then yStart followed by xStart. Returns the number of bytes consumed.
// NOTE(review): this body holds both a "buf"/"text" variant and a "rawTextBytes"
// variant of the read and return statements; it looks like the before/after lines
// of a change set shown together - confirm only the rawTextBytes variant should remain.
@Override
public int init(LittleEndianInputStream leis, long recordSize, int recordFunction) throws IOException {
stringLength = leis.readShort();
// (stringLength&1) adds one padding byte when the length is odd
byte buf[] = new byte[stringLength+(stringLength&1)];
leis.readFully(buf);
text = new String(buf, 0, stringLength, LocaleUtil.CHARSET_1252).trim();
// same padded size as above; the bytes are kept undecoded
rawTextBytes = new byte[stringLength+(stringLength&1)];
leis.readFully(rawTextBytes);
yStart = leis.readShort();
xStart = leis.readShort();
// three shorts (length, yStart, xStart) plus the padded text bytes
return 3*LittleEndianConsts.SHORT_SIZE+buf.length;
return 3*LittleEndianConsts.SHORT_SIZE+rawTextBytes.length;
}
// Draws the record's text through the given graphics context.
@Override
public void draw(HwmfGraphics ctx) {
// zero-sized rectangle: only the start point (xStart, yStart) is passed on
Rectangle2D bounds = new Rectangle2D.Double(xStart, yStart, 0, 0);
// NOTE(review): a String based and a byte[] based drawString call appear back to back;
// this looks like the before/after pair of a change set - confirm only the byte[] call should remain.
ctx.drawString(text, bounds);
ctx.drawString(getTextBytes(), bounds);
}
/**
 * Decodes the text of this record.
 *
 * @param charset the charset used to decode the text bytes
 * @return the text bytes (without padding) decoded with the given charset
 */
public String getText(Charset charset) {
    final byte[] textBytes = getTextBytes();
    return new String(textBytes, charset);
}
/**
 * Extracts the text part of the raw buffer.
 *
 * @return a newly allocated array holding the first <code>stringLength</code>
 * bytes of <code>rawTextBytes</code>; the optional padding byte at the end
 * of the raw buffer is left out
 */
private byte[] getTextBytes() {
    final byte[] trimmed = new byte[stringLength];
    System.arraycopy(rawTextBytes, 0, trimmed, 0, trimmed.length);
    return trimmed;
}
}
@ -264,7 +279,7 @@ public class HwmfText {
* the length is odd, an extra byte is placed after it so that the following member (optional Dx) is
* aligned on a 16-bit boundary.
*/
private String text;
private byte[] rawTextBytes;
/**
* An optional array of 16-bit signed integers that indicate the distance between
* origins of adjacent character cells. For example, Dx[i] logical units separate the origins of
@ -300,10 +315,9 @@ public class HwmfText {
size += 4*LittleEndianConsts.SHORT_SIZE;
}
byte buf[] = new byte[stringLength+(stringLength&1)];
leis.readFully(buf);
text = new String(buf, 0, stringLength, LocaleUtil.CHARSET_1252);
size += buf.length;
rawTextBytes = new byte[stringLength+(stringLength&1)];
leis.readFully(rawTextBytes);
size += rawTextBytes.length;
if (size >= remainingRecordSize) {
logger.log(POILogger.INFO, "META_EXTTEXTOUT doesn't contain character tracking info");
@ -327,7 +341,23 @@ public class HwmfText {
// Draws the record's text, together with the dx array, through the given graphics context.
@Override
public void draw(HwmfGraphics ctx) {
// zero-sized rectangle: only the point (x, y) is passed on
Rectangle2D bounds = new Rectangle2D.Double(x, y, 0, 0);
// NOTE(review): a String based and a byte[] based drawString call appear back to back;
// this looks like the before/after pair of a change set - confirm only the byte[] call should remain.
ctx.drawString(text, bounds, dx);
ctx.drawString(getTextBytes(), bounds, dx);
}
/**
 * Decodes the text of this record.
 *
 * @param charset the charset used to decode the text bytes
 * @return the text bytes (without padding) decoded with the given charset
 */
public String getText(Charset charset) {
    final byte[] textBytes = getTextBytes();
    return new String(textBytes, charset);
}
/**
 * Extracts the text part of the raw buffer.
 *
 * @return a newly allocated array holding the first <code>stringLength</code>
 * bytes of <code>rawTextBytes</code>; the optional padding byte at the end
 * of the raw buffer is left out
 */
private byte[] getTextBytes() {
    final byte[] trimmed = new byte[stringLength];
    System.arraycopy(rawTextBytes, 0, trimmed, 0, trimmed.length);
    return trimmed;
}
}
@ -523,5 +553,9 @@ public class HwmfText {
/**
 * Installs this record's font in the draw properties of the given context.
 *
 * @param ctx the graphics context whose properties receive the font
 */
public void applyObject(HwmfGraphics ctx) {
    final HwmfDrawProperties drawProps = ctx.getProperties();
    drawProps.setFont(font);
}
/**
 * @return the font held by this record, i.e. the one that
 * {@link #applyObject(HwmfGraphics)} sets on the draw properties
 */
public HwmfFont getFont() {
    return this.font;
}
}
}

View File

@ -18,7 +18,9 @@
package org.apache.poi.hwmf;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import javax.imageio.ImageIO;
import java.awt.Dimension;
import java.awt.Graphics2D;
import java.awt.RenderingHints;
@ -31,21 +33,24 @@ import java.io.FileOutputStream;
import java.io.FilterInputStream;
import java.io.IOException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.List;
import java.util.Locale;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import javax.imageio.ImageIO;
import org.apache.poi.POIDataSamples;
import org.apache.poi.hwmf.record.HwmfFill.HwmfImageRecord;
import org.apache.poi.hwmf.record.HwmfFont;
import org.apache.poi.hwmf.record.HwmfRecord;
import org.apache.poi.hwmf.record.HwmfRecordType;
import org.apache.poi.hwmf.record.HwmfText;
import org.apache.poi.hwmf.usermodel.HwmfPicture;
import org.apache.poi.sl.usermodel.PictureData;
import org.apache.poi.sl.usermodel.PictureData.PictureType;
import org.apache.poi.sl.usermodel.SlideShow;
import org.apache.poi.sl.usermodel.SlideShowFactory;
import org.apache.poi.util.LocaleUtil;
import org.apache.poi.util.Units;
import org.junit.Ignore;
import org.junit.Test;
@ -188,4 +193,33 @@ public class TestHwmfParsing {
}
}
}
/**
 * Extracts the text of all extTextOut records of a Cyrillic sample file and
 * checks that it decodes to the expected Russian words when the charset of
 * the most recently created font is applied.
 */
@Test
@Ignore("If we decide we can use common crawl file specified, we can turn this back on")
public void testCyrillic() throws Exception {
    //TODO: move test file to framework and fix this
    File dir = new File("C:/somethingOrOther");
    File f = new File(dir, "ZMLH54SPLI76NQ7XMKVB7SMUJA2HTXTS-2.wmf");
    StringBuilder sb = new StringBuilder();
    FileInputStream fis = new FileInputStream(f);
    try {
        HwmfPicture wmf = new HwmfPicture(fis);

        Charset charset = LocaleUtil.CHARSET_1252;
        //this is pure hackery for specifying the font
        //this happens to work on this test file, but you need to
        //do what Graphics does by maintaining the stack, etc.!
        for (HwmfRecord r : wmf.getRecords()) {
            if (r.getRecordType().equals(HwmfRecordType.createFontIndirect)) {
                HwmfFont font = ((HwmfText.WmfCreateFontIndirect)r).getFont();
                // fall back to Cp1252 when the font has no usable Java charset
                Charset fontCharset = font.getCharSet().getCharset();
                charset = (fontCharset == null) ? LocaleUtil.CHARSET_1252 : fontCharset;
            }
            if (r.getRecordType().equals(HwmfRecordType.extTextOut)) {
                HwmfText.WmfExtTextOut textOut = (HwmfText.WmfExtTextOut)r;
                sb.append(textOut.getText(charset)).append("\n");
            }
        }
    } finally {
        // the stream was previously never closed; release it even if parsing fails
        fis.close();
    }
    String txt = sb.toString();
    assertTrue(txt.contains("\u041E\u0431\u0449\u043E"));
    assertTrue(txt.contains("\u0411\u0430\u043B\u0430\u043D\u0441"));
}
}