diff --git a/src/java/org/apache/poi/util/StringUtil.java b/src/java/org/apache/poi/util/StringUtil.java
index 3a323f67a..99880f500 100644
--- a/src/java/org/apache/poi/util/StringUtil.java
+++ b/src/java/org/apache/poi/util/StringUtil.java
@@ -20,7 +20,9 @@ package org.apache.poi.util;
 import java.nio.charset.Charset;
 import java.text.FieldPosition;
 import java.text.NumberFormat;
+import java.util.HashMap;
 import java.util.Iterator;
+import java.util.Map;
 
 import org.apache.poi.hssf.record.RecordInputStream;
 /**
@@ -37,6 +39,7 @@ import org.apache.poi.hssf.record.RecordInputStream;
 public class StringUtil {
     private static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
     private static final Charset UTF16LE = Charset.forName("UTF-16LE");
+    private static Map<Integer,Integer> msCodepointToUnicode;
 
     private StringUtil() {
         // no instances of this class
@@ -396,4 +399,262 @@ public class StringUtil {
         }
         public void remove() {}
     }
+
+
+    /**
+     * Some strings may contain encoded characters of the unicode private use area.
+     * Currently the characters of the symbol fonts are mapped to the corresponding
+     * characters in the normal unicode range.
+     *
+     * <p>Code points without a mapping are copied to the result unchanged. The lookup
+     * holds the class lock so that it cannot race with {@link #mapMsCodepoint(int, int)}.</p>
+     *
+     * @param string the original string
+     * @return the string with mapped characters; {@code null} or the empty string is returned as-is
+     *
+     * @see "Private Use Area (symbol)"
+     * @see "Symbol font - Unicode alternatives for Greek and special characters in HTML"
+     */
+    public static synchronized String mapMsCodepointString(String string) {
+        if (string == null || "".equals(string)) return string;
+        initMsCodepointMap();
+
+        StringBuilder sb = new StringBuilder();
+        final int length = string.length();
+        for (int offset = 0; offset < length; ) {
+            int msCodepoint = string.codePointAt(offset);
+            Integer uniCodepoint = msCodepointToUnicode.get(msCodepoint);
+            sb.appendCodePoint(uniCodepoint == null ? msCodepoint : uniCodepoint);
+            offset += Character.charCount(msCodepoint);
+        }
+
+        return sb.toString();
+    }
+
+    /**
+     * Registers or overrides a single mapping used by {@link #mapMsCodepointString(String)}.
+     *
+     * @param msCodepoint the code point to be replaced
+     * @param unicodeCodepoint the code point to replace it with
+     */
+    public static synchronized void mapMsCodepoint(int msCodepoint, int unicodeCodepoint) {
+        initMsCodepointMap();
+        msCodepointToUnicode.put(msCodepoint, unicodeCodepoint);
+    }
+
+    /**
+     * Lazily builds the lookup table from the two symbol arrays; later calls are no-ops.
+     */
+    private static synchronized void initMsCodepointMap() {
+        if (msCodepointToUnicode != null) return;
+        msCodepointToUnicode = new HashMap<Integer,Integer>();
+        int i=0xF020;
+        for (int ch : symbolMap_f020) {
+            msCodepointToUnicode.put(i++, ch);
+        }
+        i = 0xf0a0;
+        for (int ch : symbolMap_f0a0) {
+            msCodepointToUnicode.put(i++, ch);
+        }
+    }
+
+    /** Replacements for 0xf020..0xf07f, indexed by offset from 0xf020. */
+    private static final int symbolMap_f020[] = {
+        ' ', // 0xf020 space
+        '!', // 0xf021 exclam
+        8704, // 0xf022 universal
+        '#', // 0xf023 numbersign
+        8707, // 0xf024 existential
+        '%', // 0xf025 percent
+        '&', // 0xf026 ampersand
+        8717, // 0xf027 suchthat
+        '(', // 0xf028 parenleft
+        ')', // 0xf029 parenright
+        8727, // 0xf02a asteriskmath
+        '+', // 0xf02b plus
+        ',', // 0xf02c comma
+        8722, // 0xf02d minus sign (long -)
+        '.', // 0xf02e period
+        '/', // 0xf02f slash
+        '0', // 0xf030 0
+        '1', // 0xf031 1
+        '2', // 0xf032 2
+        '3', // 0xf033 3
+        '4', // 0xf034 4
+        '5', // 0xf035 5
+        '6', // 0xf036 6
+        '7', // 0xf037 7
+        '8', // 0xf038 8
+        '9', // 0xf039 9
+        ':', // 0xf03a colon
+        ';', // 0xf03b semicolon
+        '<', // 0xf03c less
+        '=', // 0xf03d equal
+        '>', // 0xf03e greater
+        '?', // 0xf03f question
+        8773, // 0xf040 congruent
+        913, // 0xf041 alpha (upper)
+        914, // 0xf042 beta (upper)
+        935, // 0xf043 chi (upper)
+        916, // 0xf044 delta (upper)
+        917, // 0xf045 epsilon (upper)
+        934, // 0xf046 phi (upper)
+        915, // 0xf047 gamma (upper)
+        919, // 0xf048 eta (upper)
+        921, // 0xf049 iota (upper)
+        977, // 0xf04a theta1 (lower)
+        922, // 0xf04b kappa (upper)
+        923, // 0xf04c lambda (upper)
+        924, // 0xf04d mu (upper)
+        925, // 0xf04e nu (upper)
+        927, // 0xf04f omicron (upper)
+        928, // 0xf050 pi (upper)
+        920, // 0xf051 theta (upper)
+        929, // 0xf052 rho (upper)
+        931, // 0xf053 sigma (upper)
+        932, // 0xf054 tau (upper)
+        933, // 0xf055 upsilon (upper)
+        962, // 0xf056 sigma1 (lower)
+        937, // 0xf057 omega (upper)
+        926, // 0xf058 xi (upper)
+        936, // 0xf059 psi (upper)
+        918, // 0xf05a zeta (upper)
+        '[', // 0xf05b bracketleft
+        8756, // 0xf05c therefore (U+2234)
+        ']', // 0xf05d bracketright
+        8869, // 0xf05e perpendicular
+        '_', // 0xf05f underscore
+        ' ', // 0xf060 radicalex (doesn't exist in unicode)
+        945, // 0xf061 alpha (lower)
+        946, // 0xf062 beta (lower)
+        967, // 0xf063 chi (lower)
+        948, // 0xf064 delta (lower)
+        949, // 0xf065 epsilon (lower)
+        966, // 0xf066 phi (lower)
+        947, // 0xf067 gamma (lower)
+        951, // 0xf068 eta (lower)
+        953, // 0xf069 iota (lower)
+        981, // 0xf06a phi1 (lower)
+        954, // 0xf06b kappa (lower)
+        955, // 0xf06c lambda (lower)
+        956, // 0xf06d mu (lower)
+        957, // 0xf06e nu (lower)
+        959, // 0xf06f omicron (lower)
+        960, // 0xf070 pi (lower)
+        952, // 0xf071 theta (lower)
+        961, // 0xf072 rho (lower)
+        963, // 0xf073 sigma (lower)
+        964, // 0xf074 tau (lower)
+        965, // 0xf075 upsilon (lower)
+        982, // 0xf076 piv (lower)
+        969, // 0xf077 omega (lower)
+        958, // 0xf078 xi (lower)
+        968, // 0xf079 psi (lower)
+        950, // 0xf07a zeta (lower)
+        '{', // 0xf07b braceleft
+        '|', // 0xf07c bar
+        '}', // 0xf07d braceright
+        8764, // 0xf07e similar '~'
+        ' ', // 0xf07f not defined
+    };
+
+    /** Replacements for 0xf0a0..0xf0ff, indexed by offset from 0xf0a0. */
+    private static final int symbolMap_f0a0[] = {
+        8364, // 0xf0a0 not defined / euro symbol
+        978, // 0xf0a1 upsilon1 (upper)
+        8242, // 0xf0a2 minute
+        8804, // 0xf0a3 lessequal
+        8260, // 0xf0a4 fraction
+        8734, // 0xf0a5 infinity
+        402, // 0xf0a6 florin
+        9827, // 0xf0a7 club
+        9830, // 0xf0a8 diamond
+        9829, // 0xf0a9 heart
+        9824, // 0xf0aa spade
+        8596, // 0xf0ab arrowboth
+        8592, // 0xf0ac arrowleft (U+2190)
+        8593, // 0xf0ad arrowup
+        8594, // 0xf0ae arrowright
+        8595, // 0xf0af arrowdown
+        176, // 0xf0b0 degree
+        177, // 0xf0b1 plusminus
+        8243, // 0xf0b2 second
+        8805, // 0xf0b3 greaterequal
+        215, // 0xf0b4 multiply
+        8733, // 0xf0b5 proportional (U+221D)
+        8706, // 0xf0b6 partialdiff
+        8729, // 0xf0b7 bullet
+        247, // 0xf0b8 divide
+        8800, // 0xf0b9 notequal
+        8801, // 0xf0ba equivalence
+        8776, // 0xf0bb approxequal
+        8230, // 0xf0bc ellipsis
+        9168, // 0xf0bd arrowvertex
+        9135, // 0xf0be arrowhorizex
+        8629, // 0xf0bf carriagereturn
+        8501, // 0xf0c0 aleph
+        8465, // 0xf0c1 Ifraktur (U+2111)
+        8476, // 0xf0c2 Rfraktur
+        8472, // 0xf0c3 weierstrass
+        8855, // 0xf0c4 circlemultiply
+        8853, // 0xf0c5 circleplus
+        8709, // 0xf0c6 emptyset
+        8745, // 0xf0c7 intersection
+        8746, // 0xf0c8 union
+        8835, // 0xf0c9 propersuperset
+        8839, // 0xf0ca reflexsuperset
+        8836, // 0xf0cb notsubset
+        8834, // 0xf0cc propersubset
+        8838, // 0xf0cd reflexsubset
+        8712, // 0xf0ce element
+        8713, // 0xf0cf notelement
+        8736, // 0xf0d0 angle
+        8711, // 0xf0d1 gradient
+        174, // 0xf0d2 registerserif
+        169, // 0xf0d3 copyrightserif
+        8482, // 0xf0d4 trademarkserif
+        8719, // 0xf0d5 product
+        8730, // 0xf0d6 radical
+        8901, // 0xf0d7 dotmath
+        172, // 0xf0d8 logicalnot
+        8743, // 0xf0d9 logicaland
+        8744, // 0xf0da logicalor
+        8660, // 0xf0db arrowdblboth
+        8656, // 0xf0dc arrowdblleft
+        8657, // 0xf0dd arrowdblup
+        8658, // 0xf0de arrowdblright
+        8659, // 0xf0df arrowdbldown
+        9674, // 0xf0e0 lozenge
+        9001, // 0xf0e1 angleleft
+        174, // 0xf0e2 registersans
+        169, // 0xf0e3 copyrightsans
+        8482, // 0xf0e4 trademarksans
+        8721, // 0xf0e5 summation
+        9115, // 0xf0e6 parenlefttp
+        9116, // 0xf0e7 parenleftex
+        9117, // 0xf0e8 parenleftbt
+        9121, // 0xf0e9 bracketlefttp
+        9122, // 0xf0ea bracketleftex
+        9123, // 0xf0eb bracketleftbt
+        9127, // 0xf0ec bracelefttp
+        9128, // 0xf0ed braceleftmid
+        9129, // 0xf0ee braceleftbt
+        9130, // 0xf0ef braceex
+        ' ', // 0xf0f0 not defined
+        9002, // 0xf0f1 angleright
+        8747, // 0xf0f2 integral
+        8992, // 0xf0f3 integraltp
+        9134, // 0xf0f4 integralex
+        8993, // 0xf0f5 integralbt
+        9118, // 0xf0f6 parenrighttp
+        9119, // 0xf0f7 parenrightex
+        9120, // 0xf0f8 parenrightbt
+        9124, // 0xf0f9 bracketrighttp
+        9125, // 0xf0fa bracketrightex
+        9126, // 0xf0fb bracketrightbt
+        9131, // 0xf0fc bracerighttp
+        9132, // 0xf0fd bracerightmid
+        9133, // 0xf0fe bracerightbt
+        ' ', // 0xf0ff not defined
+    };
 }
diff --git a/src/scratchpad/testcases/org/apache/poi/hslf/usermodel/TestBugs.java b/src/scratchpad/testcases/org/apache/poi/hslf/usermodel/TestBugs.java
index 4f5d7ae10..6b26c28d7 100644
--- a/src/scratchpad/testcases/org/apache/poi/hslf/usermodel/TestBugs.java
+++ b/src/scratchpad/testcases/org/apache/poi/hslf/usermodel/TestBugs.java
@@ -58,6 +58,7 @@ import org.apache.poi.hslf.record.Record;
 import org.apache.poi.hslf.record.SlideListWithText;
 import org.apache.poi.hslf.record.SlideListWithText.SlideAtomsSet;
 import org.apache.poi.hslf.record.TextHeaderAtom;
+import org.apache.poi.util.StringUtil;
 import org.junit.Test;
 
 /**
@@ -579,4 +580,19 @@ public final class TestBugs {
             inputStream.close();
         }
     }
+
+    @Test
+    public void bug49541() throws Exception {
+        InputStream inputStream = new FileInputStream(_slTests.getFile("49541_symbol_map.ppt"));
+        try {
+            SlideShow slideShow = new SlideShow(inputStream);
+            Slide slide = slideShow.getSlides()[0];
+            ShapeGroup sg = (ShapeGroup)slide.getShapes()[0];
+            TextBox tb = (TextBox)sg.getShapes()[0];
+            String text = StringUtil.mapMsCodepointString(tb.getText());
+            assertEquals("\u226575 years", text);
+        } finally {
+            inputStream.close();
+        }
+    }
 }
diff --git a/test-data/slideshow/49541_symbol_map.ppt b/test-data/slideshow/49541_symbol_map.ppt
new file mode 100644
index 000000000..781049290
Binary files /dev/null and b/test-data/slideshow/49541_symbol_map.ppt differ