Bug 49541 - Mapping of symbol characters to unicode equivalent
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1648415 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
8e3eee35cd
commit
26ae1edcd8
@ -20,7 +20,9 @@ package org.apache.poi.util;
|
||||
import java.nio.charset.Charset;
|
||||
import java.text.FieldPosition;
|
||||
import java.text.NumberFormat;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.poi.hssf.record.RecordInputStream;
|
||||
/**
|
||||
@ -37,6 +39,7 @@ import org.apache.poi.hssf.record.RecordInputStream;
|
||||
public class StringUtil {
|
||||
private static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
|
||||
private static final Charset UTF16LE = Charset.forName("UTF-16LE");
|
||||
private static Map<Integer,Integer> msCodepointToUnicode;
|
||||
|
||||
private StringUtil() {
|
||||
// no instances of this class
|
||||
@ -396,4 +399,248 @@ public class StringUtil {
|
||||
}
|
||||
public void remove() {}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Some strings may contain encoded characters of the unicode private use area.
|
||||
* Currently the characters of the symbol fonts are mapped to the corresponding
|
||||
* characters in the normal unicode range.
|
||||
*
|
||||
* @param string the original string
|
||||
* @return the string with mapped characters
|
||||
*
|
||||
* @see <a href="http://www.alanwood.net/unicode/private_use_area.html#symbol">Private Use Area (symbol)</a>
|
||||
* @see <a href="http://www.alanwood.net/demos/symbol.html">Symbol font - Unicode alternatives for Greek and special characters in HTML</a>
|
||||
*/
|
||||
public static String mapMsCodepointString(String string) {
|
||||
if (string == null || "".equals(string)) return string;
|
||||
initMsCodepointMap();
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
final int length = string.length();
|
||||
for (int offset = 0; offset < length; ) {
|
||||
Integer msCodepoint = string.codePointAt(offset);
|
||||
Integer uniCodepoint = msCodepointToUnicode.get(msCodepoint);
|
||||
sb.appendCodePoint(uniCodepoint == null ? msCodepoint : uniCodepoint);
|
||||
offset += Character.charCount(msCodepoint);
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public static synchronized void mapMsCodepoint(int msCodepoint, int unicodeCodepoint) {
|
||||
initMsCodepointMap();
|
||||
msCodepointToUnicode.put(msCodepoint, unicodeCodepoint);
|
||||
}
|
||||
|
||||
private static synchronized void initMsCodepointMap() {
|
||||
if (msCodepointToUnicode != null) return;
|
||||
msCodepointToUnicode = new HashMap<Integer,Integer>();
|
||||
int i=0xF020;
|
||||
for (int ch : symbolMap_f020) {
|
||||
msCodepointToUnicode.put(i++, ch);
|
||||
}
|
||||
i = 0xf0a0;
|
||||
for (int ch : symbolMap_f0a0) {
|
||||
msCodepointToUnicode.put(i++, ch);
|
||||
}
|
||||
}
|
||||
|
||||
private static final int symbolMap_f020[] = {
|
||||
' ', // 0xf020 space
|
||||
'!', // 0xf021 exclam
|
||||
8704, // 0xf022 universal
|
||||
'#', // 0xf023 numbersign
|
||||
8707, // 0xf024 existential
|
||||
'%', // 0xf025 percent
|
||||
'&', // 0xf026 ampersand
|
||||
8717, // 0xf027 suchthat
|
||||
'(', // 0xf028 parenleft
|
||||
')', // 0xf029 parentright
|
||||
8727, // 0xf02a asteriskmath
|
||||
'+', // 0xf02b plus
|
||||
',', // 0xf02c comma
|
||||
8722, // 0xf02d minus sign (long -)
|
||||
'.', // 0xf02e period
|
||||
'/', // 0xf02f slash
|
||||
'0', // 0xf030 0
|
||||
'1', // 0xf031 1
|
||||
'2', // 0xf032 2
|
||||
'3', // 0xf033 3
|
||||
'4', // 0xf034 4
|
||||
'5', // 0xf035 5
|
||||
'6', // 0xf036 6
|
||||
'7', // 0xf037 7
|
||||
'8', // 0xf038 8
|
||||
'9', // 0xf039 9
|
||||
':', // 0xf03a colon
|
||||
';', // 0xf03b semicolon
|
||||
'<', // 0xf03c less
|
||||
'=', // 0xf03d equal
|
||||
'>', // 0xf03e greater
|
||||
'?', // 0xf03f question
|
||||
8773, // 0xf040 congruent
|
||||
913, // 0xf041 alpha (upper)
|
||||
914, // 0xf042 beta (upper)
|
||||
935, // 0xf043 chi (upper)
|
||||
916, // 0xf044 delta (upper)
|
||||
917, // 0xf045 epsilon (upper)
|
||||
934, // 0xf046 phi (upper)
|
||||
915, // 0xf047 gamma (upper)
|
||||
919, // 0xf048 eta (upper)
|
||||
921, // 0xf049 iota (upper)
|
||||
977, // 0xf04a theta1 (lower)
|
||||
922, // 0xf04b kappa (upper)
|
||||
923, // 0xf04c lambda (upper)
|
||||
924, // 0xf04d mu (upper)
|
||||
925, // 0xf04e nu (upper)
|
||||
927, // 0xf04f omicron (upper)
|
||||
928, // 0xf050 pi (upper)
|
||||
920, // 0xf051 theta (upper)
|
||||
929, // 0xf052 rho (upper)
|
||||
931, // 0xf053 sigma (upper)
|
||||
932, // 0xf054 tau (upper)
|
||||
933, // 0xf055 upsilon (upper)
|
||||
962, // 0xf056 simga1 (lower)
|
||||
937, // 0xf057 omega (upper)
|
||||
926, // 0xf058 xi (upper)
|
||||
936, // 0xf059 psi (upper)
|
||||
918, // 0xf05a zeta (upper)
|
||||
'[', // 0xf05b bracketleft
|
||||
8765, // 0xf05c therefore
|
||||
']', // 0xf05d bracketright
|
||||
8869, // 0xf05e perpendicular
|
||||
'_', // 0xf05f underscore
|
||||
' ', // 0xf060 radicalex (doesn't exist in unicode)
|
||||
945, // 0xf061 alpha (lower)
|
||||
946, // 0xf062 beta (lower)
|
||||
967, // 0xf063 chi (lower)
|
||||
948, // 0xf064 delta (lower)
|
||||
949, // 0xf065 epsilon (lower)
|
||||
966, // 0xf066 phi (lower)
|
||||
947, // 0xf067 gamma (lower)
|
||||
951, // 0xf068 eta (lower)
|
||||
953, // 0xf069 iota (lower)
|
||||
981, // 0xf06a phi1 (lower)
|
||||
954, // 0xf06b kappa (lower)
|
||||
955, // 0xf06c lambda (lower)
|
||||
956, // 0xf06d mu (lower)
|
||||
957, // 0xf06e nu (lower)
|
||||
959, // 0xf06f omnicron (lower)
|
||||
960, // 0xf070 pi (lower)
|
||||
952, // 0xf071 theta (lower)
|
||||
961, // 0xf072 rho (lower)
|
||||
963, // 0xf073 sigma (lower)
|
||||
964, // 0xf074 tau (lower)
|
||||
965, // 0xf075 upsilon (lower)
|
||||
982, // 0xf076 piv (lower)
|
||||
969, // 0xf077 omega (lower)
|
||||
958, // 0xf078 xi (lower)
|
||||
968, // 0xf079 psi (lower)
|
||||
950, // 0xf07a zeta (lower)
|
||||
'{', // 0xf07b braceleft
|
||||
'|', // 0xf07c bar
|
||||
'}', // 0xf07d braceright
|
||||
8764, // 0xf07e similar '~'
|
||||
' ', // 0xf07f not defined
|
||||
};
|
||||
|
||||
private static final int symbolMap_f0a0[] = {
|
||||
8364, // 0xf0a0 not defined / euro symbol
|
||||
978, // 0xf0a1 upsilon1 (upper)
|
||||
8242, // 0xf0a2 minute
|
||||
8804, // 0xf0a3 lessequal
|
||||
8260, // 0xf0a4 fraction
|
||||
8734, // 0xf0a5 infinity
|
||||
402, // 0xf0a6 florin
|
||||
9827, // 0xf0a7 club
|
||||
9830, // 0xf0a8 diamond
|
||||
9829, // 0xf0a9 heart
|
||||
9824, // 0xf0aa spade
|
||||
8596, // 0xf0ab arrowboth
|
||||
8591, // 0xf0ac arrowleft
|
||||
8593, // 0xf0ad arrowup
|
||||
8594, // 0xf0ae arrowright
|
||||
8595, // 0xf0af arrowdown
|
||||
176, // 0xf0b0 degree
|
||||
177, // 0xf0b1 plusminus
|
||||
8243, // 0xf0b2 second
|
||||
8805, // 0xf0b3 greaterequal
|
||||
215, // 0xf0b4 multiply
|
||||
181, // 0xf0b5 proportional
|
||||
8706, // 0xf0b6 partialdiff
|
||||
8729, // 0xf0b7 bullet
|
||||
247, // 0xf0b8 divide
|
||||
8800, // 0xf0b9 notequal
|
||||
8801, // 0xf0ba equivalence
|
||||
8776, // 0xf0bb approxequal
|
||||
8230, // 0xf0bc ellipsis
|
||||
9168, // 0xf0bd arrowvertex
|
||||
9135, // 0xf0be arrowhorizex
|
||||
8629, // 0xf0bf carriagereturn
|
||||
8501, // 0xf0c0 aleph
|
||||
8475, // 0xf0c1 Ifraktur
|
||||
8476, // 0xf0c2 Rfraktur
|
||||
8472, // 0xf0c3 weierstrass
|
||||
8855, // 0xf0c4 circlemultiply
|
||||
8853, // 0xf0c5 circleplus
|
||||
8709, // 0xf0c6 emptyset
|
||||
8745, // 0xf0c7 intersection
|
||||
8746, // 0xf0c8 union
|
||||
8835, // 0xf0c9 propersuperset
|
||||
8839, // 0xf0ca reflexsuperset
|
||||
8836, // 0xf0cb notsubset
|
||||
8834, // 0xf0cc propersubset
|
||||
8838, // 0xf0cd reflexsubset
|
||||
8712, // 0xf0ce element
|
||||
8713, // 0xf0cf notelement
|
||||
8736, // 0xf0d0 angle
|
||||
8711, // 0xf0d1 gradient
|
||||
174, // 0xf0d2 registerserif
|
||||
169, // 0xf0d3 copyrightserif
|
||||
8482, // 0xf0d4 trademarkserif
|
||||
8719, // 0xf0d5 product
|
||||
8730, // 0xf0d6 radical
|
||||
8901, // 0xf0d7 dotmath
|
||||
172, // 0xf0d8 logicalnot
|
||||
8743, // 0xf0d9 logicaland
|
||||
8744, // 0xf0da logicalor
|
||||
8660, // 0xf0db arrowdblboth
|
||||
8656, // 0xf0dc arrowdblleft
|
||||
8657, // 0xf0dd arrowdblup
|
||||
8658, // 0xf0de arrowdblright
|
||||
8659, // 0xf0df arrowdbldown
|
||||
9674, // 0xf0e0 lozenge
|
||||
9001, // 0xf0e1 angleleft
|
||||
174, // 0xf0e2 registersans
|
||||
169, // 0xf0e3 copyrightsans
|
||||
8482, // 0xf0e4 trademarksans
|
||||
8721, // 0xf0e5 summation
|
||||
9115, // 0xf0e6 parenlefttp
|
||||
9116, // 0xf0e7 parenleftex
|
||||
9117, // 0xf0e8 parenleftbt
|
||||
9121, // 0xf0e9 bracketlefttp
|
||||
9122, // 0xf0ea bracketleftex
|
||||
9123, // 0xf0eb bracketleftbt
|
||||
9127, // 0xf0ec bracelefttp
|
||||
9128, // 0xf0ed braceleftmid
|
||||
9129, // 0xf0ee braceleftbt
|
||||
9130, // 0xf0ef braceex
|
||||
' ', // 0xf0f0 not defined
|
||||
9002, // 0xf0f1 angleright
|
||||
8747, // 0xf0f2 integral
|
||||
8992, // 0xf0f3 integraltp
|
||||
9134, // 0xf0f4 integralex
|
||||
8993, // 0xf0f5 integralbt
|
||||
9118, // 0xf0f6 parenrighttp
|
||||
9119, // 0xf0f7 parenrightex
|
||||
9120, // 0xf0f8 parenrightbt
|
||||
9124, // 0xf0f9 bracketrighttp
|
||||
9125, // 0xf0fa bracketrightex
|
||||
9126, // 0xf0fb bracketrightbt
|
||||
9131, // 0xf0fc bracerighttp
|
||||
9132, // 0xf0fd bracerightmid
|
||||
9133, // 0xf0fe bracerightbt
|
||||
' ', // 0xf0ff not defined
|
||||
};
|
||||
}
|
||||
|
@ -58,6 +58,7 @@ import org.apache.poi.hslf.record.Record;
|
||||
import org.apache.poi.hslf.record.SlideListWithText;
|
||||
import org.apache.poi.hslf.record.SlideListWithText.SlideAtomsSet;
|
||||
import org.apache.poi.hslf.record.TextHeaderAtom;
|
||||
import org.apache.poi.util.StringUtil;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
@ -579,4 +580,19 @@ public final class TestBugs {
|
||||
inputStream.close();
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void bug49541() throws Exception {
|
||||
InputStream inputStream = new FileInputStream(_slTests.getFile("49541_symbol_map.ppt"));
|
||||
try {
|
||||
SlideShow slideShow = new SlideShow(inputStream);
|
||||
Slide slide = slideShow.getSlides()[0];
|
||||
ShapeGroup sg = (ShapeGroup)slide.getShapes()[0];
|
||||
TextBox tb = (TextBox)sg.getShapes()[0];
|
||||
String text = StringUtil.mapMsCodepointString(tb.getText());
|
||||
assertEquals("\u226575 years", text);
|
||||
} finally {
|
||||
inputStream.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
BIN
test-data/slideshow/49541_symbol_map.ppt
Normal file
BIN
test-data/slideshow/49541_symbol_map.ppt
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user