Another attempt to fix Unicode problems in functionMetadata.txt. Made that file pure ASCII.

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@653892 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Josh Micich 2008-05-06 19:48:55 +00:00
parent 3475425c5f
commit 609058dea5
4 changed files with 136 additions and 59 deletions

View File

@ -38,14 +38,13 @@ final class FunctionMetadataReader {
private static final String METADATA_FILE_NAME = "functionMetadata.txt"; private static final String METADATA_FILE_NAME = "functionMetadata.txt";
/** plain ASCII text metadata file uses three dots for ellipsis */
private static final String ELLIPSIS = "...";
private static final Pattern TAB_DELIM_PATTERN = Pattern.compile("\t"); private static final Pattern TAB_DELIM_PATTERN = Pattern.compile("\t");
private static final Pattern SPACE_DELIM_PATTERN = Pattern.compile(" "); private static final Pattern SPACE_DELIM_PATTERN = Pattern.compile(" ");
private static final byte[] EMPTY_BYTE_ARRAY = { }; private static final byte[] EMPTY_BYTE_ARRAY = { };
// special characters from the ooo document
private static final int CHAR_ELLIPSIS_8230 = 8230;
private static final int CHAR_NDASH_8211 = 8211;
private static final String[] DIGIT_ENDING_FUNCTION_NAMES = { private static final String[] DIGIT_ENDING_FUNCTION_NAMES = {
// Digits at the end of a function might be due to a left-over footnote marker. // Digits at the end of a function might be due to a left-over footnote marker.
// except in these cases // except in these cases
@ -59,10 +58,12 @@ final class FunctionMetadataReader {
throw new RuntimeException("resource '" + METADATA_FILE_NAME + "' not found"); throw new RuntimeException("resource '" + METADATA_FILE_NAME + "' not found");
} }
BufferedReader br = null; BufferedReader br;
try { try {
br = new BufferedReader(new InputStreamReader(is,"UTF-8")); br = new BufferedReader(new InputStreamReader(is,"UTF-8"));
} catch(UnsupportedEncodingException e) { /* never happens */ } } catch(UnsupportedEncodingException e) {
throw new RuntimeException(e);
}
FunctionDataBuilder fdb = new FunctionDataBuilder(400); FunctionDataBuilder fdb = new FunctionDataBuilder(400);
try { try {
@ -127,7 +128,9 @@ final class FunctionMetadataReader {
} }
String[] array = SPACE_DELIM_PATTERN.split(codes); String[] array = SPACE_DELIM_PATTERN.split(codes);
int nItems = array.length; int nItems = array.length;
if(array[nItems-1].charAt(0) == CHAR_ELLIPSIS_8230) { if(ELLIPSIS.equals(array[nItems-1])) {
// final ellipsis is optional, and ignored
// (all unspecified params are assumed to be the same as the last)
nItems --; nItems --;
} }
byte[] result = new byte[nItems]; byte[] result = new byte[nItems];
@ -141,7 +144,6 @@ final class FunctionMetadataReader {
if(codes.length() == 1) { if(codes.length() == 1) {
switch (codes.charAt(0)) { switch (codes.charAt(0)) {
case '-': case '-':
case CHAR_NDASH_8211: // this is what the ooo doc has
return true; return true;
} }
} }

View File

@ -29,7 +29,7 @@
7 MAX 1 30 V R 7 MAX 1 30 V R
8 ROW 0 1 V R 8 ROW 0 1 V R
9 COLUMN 0 1 V R 9 COLUMN 0 1 V R
10 NA 0 0 V 10 NA 0 0 V -
11 NPV 2 30 V V R 11 NPV 2 30 V V R
12 STDEV 1 30 V R 12 STDEV 1 30 V R
13 DOLLAR 1 2 V V V 13 DOLLAR 1 2 V V V
@ -38,7 +38,7 @@
16 COS 1 1 V V 16 COS 1 1 V V
17 TAN 1 1 V V 17 TAN 1 1 V V
18 ARCTAN 1 1 V V 18 ARCTAN 1 1 V V
19 PI 0 0 V 19 PI 0 0 V -
20 SQRT 1 1 V V 20 SQRT 1 1 V V
21 EXP 1 1 V V 21 EXP 1 1 V V
22 LN 1 1 V V 22 LN 1 1 V V
@ -53,8 +53,8 @@
31 MID 3 3 V V V V 31 MID 3 3 V V V V
32 LEN 1 1 V V 32 LEN 1 1 V V
33 VALUE 1 1 V V 33 VALUE 1 1 V V
34 TRUE 0 0 V 34 TRUE 0 0 V -
35 FALSE 0 0 V 35 FALSE 0 0 V -
36 AND 1 30 V R 36 AND 1 30 V R
37 OR 1 30 V R 37 OR 1 30 V R
38 NOT 1 1 V V 38 NOT 1 1 V V
@ -80,7 +80,7 @@
60 RATE 3 6 V V V V V V V 60 RATE 3 6 V V V V V V V
61 MIRR 3 3 V R V V 61 MIRR 3 3 V R V V
62 IRR 1 2 V R V 62 IRR 1 2 V R V
63 RAND 0 0 V x 63 RAND 0 0 V - x
64 MATCH 2 3 V V R R 64 MATCH 2 3 V V R R
65 DATE 3 3 V V V V 65 DATE 3 3 V V V V
66 TIME 3 3 V V V V 66 TIME 3 3 V V V V
@ -91,7 +91,7 @@
71 HOUR 1 1 V V 71 HOUR 1 1 V V
72 MINUTE 1 1 V V 72 MINUTE 1 1 V V
73 SECOND 1 1 V V 73 SECOND 1 1 V V
74 NOW 0 0 V x 74 NOW 0 0 V - x
75 AREAS 1 1 V R 75 AREAS 1 1 V R
76 ROWS 1 1 V R 76 ROWS 1 1 V R
77 COLUMNS 1 1 V R 77 COLUMNS 1 1 V R
@ -170,10 +170,10 @@
215 JIS 1 1 V V x 215 JIS 1 1 V V x
219 ADDRESS 2 5 V V V V V V 219 ADDRESS 2 5 V V V V V V
220 DAYS360 2 2 V V V x 220 DAYS360 2 2 V V V x
221 TODAY 0 0 V x 221 TODAY 0 0 V - x
222 VDB 5 7 V V V V V V V V 222 VDB 5 7 V V V V V V V V
227 MEDIAN 1 30 V R 227 MEDIAN 1 30 V R ...
228 SUMPRODUCT 1 30 V A 228 SUMPRODUCT 1 30 V A ...
229 SINH 1 1 V V 229 SINH 1 1 V V
230 COSH 1 1 V V 230 COSH 1 1 V V
231 TANH 1 1 V V 231 TANH 1 1 V V
@ -188,7 +188,7 @@
247 DB 4 5 V V V V V V 247 DB 4 5 V V V V V V
252 FREQUENCY 2 2 A R R 252 FREQUENCY 2 2 A R R
261 ERROR.TYPE 1 1 V V 261 ERROR.TYPE 1 1 V V
269 AVEDEV 1 30 V R 269 AVEDEV 1 30 V R ...
270 BETADIST 3 5 V V V V V V 270 BETADIST 3 5 V V V V V V
271 GAMMALN 1 1 V V 271 GAMMALN 1 1 V V
272 BETAINV 3 5 V V V V V V 272 BETAINV 3 5 V V V V V V
@ -237,12 +237,12 @@
315 SLOPE 2 2 V A A 315 SLOPE 2 2 V A A
316 TTEST 4 4 V A A V V 316 TTEST 4 4 V A A V V
317 PROB 3 4 V A A V V 317 PROB 3 4 V A A V V
318 DEVSQ 1 30 V R 318 DEVSQ 1 30 V R ...
319 GEOMEAN 1 30 V R 319 GEOMEAN 1 30 V R ...
320 HARMEAN 1 30 V R 320 HARMEAN 1 30 V R ...
321 SUMSQ 0 30 V R 321 SUMSQ 0 30 V R ...
322 KURT 1 30 V R 322 KURT 1 30 V R ...
323 SKEW 1 30 V R 323 SKEW 1 30 V R ...
324 ZTEST 2 3 V R V V 324 ZTEST 2 3 V R V V
325 LARGE 2 2 V R V 325 LARGE 2 2 V R V
326 SMALL 2 2 V R V 326 SMALL 2 2 V R V
@ -274,10 +274,10 @@
358 GETPIVOTDATA 2 30 358 GETPIVOTDATA 2 30
359 HYPERLINK 1 2 V V V 359 HYPERLINK 1 2 V V V
360 PHONETIC 1 1 V R 360 PHONETIC 1 1 V R
361 AVERAGEA 1 30 V R 361 AVERAGEA 1 30 V R ...
362 MAXA 1 30 V R 362 MAXA 1 30 V R ...
363 MINA 1 30 V R 363 MINA 1 30 V R ...
364 STDEVPA 1 30 V R 364 STDEVPA 1 30 V R ...
365 VARPA 1 30 V R 365 VARPA 1 30 V R ...
366 STDEVA 1 30 V R 366 STDEVA 1 30 V R ...
367 VARA 1 30 V R 367 VARA 1 30 V R ...

View File

@ -31,7 +31,7 @@
7 MAX 1 30 V R 7 MAX 1 30 V R
8 ROW 0 1 V R 8 ROW 0 1 V R
9 COLUMN 0 1 V R 9 COLUMN 0 1 V R
10 NA 0 0 V 10 NA 0 0 V -
11 NPV 2 30 V V R 11 NPV 2 30 V V R
12 STDEV 1 30 V R 12 STDEV 1 30 V R
13 DOLLAR 1 2 V V V 13 DOLLAR 1 2 V V V
@ -40,7 +40,7 @@
16 COS 1 1 V V 16 COS 1 1 V V
17 TAN 1 1 V V 17 TAN 1 1 V V
18 ATAN 1 1 V V 18 ATAN 1 1 V V
19 PI 0 0 V 19 PI 0 0 V -
20 SQRT 1 1 V V 20 SQRT 1 1 V V
21 EXP 1 1 V V 21 EXP 1 1 V V
22 LN 1 1 V V 22 LN 1 1 V V
@ -55,8 +55,8 @@
31 MID 3 3 V V V V 31 MID 3 3 V V V V
32 LEN 1 1 V V 32 LEN 1 1 V V
33 VALUE 1 1 V V 33 VALUE 1 1 V V
34 TRUE 0 0 V 34 TRUE 0 0 V -
35 FALSE 0 0 V 35 FALSE 0 0 V -
36 AND 1 30 V R 36 AND 1 30 V R
37 OR 1 30 V R 37 OR 1 30 V R
38 NOT 1 1 V V 38 NOT 1 1 V V
@ -82,7 +82,7 @@
60 RATE 3 6 V V V V V V V 60 RATE 3 6 V V V V V V V
61 MIRR 3 3 V R V V 61 MIRR 3 3 V R V V
62 IRR 1 2 V R V 62 IRR 1 2 V R V
63 RAND 0 0 V x 63 RAND 0 0 V - x
64 MATCH 2 3 V V R R 64 MATCH 2 3 V V R R
65 DATE 3 3 V V V V 65 DATE 3 3 V V V V
66 TIME 3 3 V V V V 66 TIME 3 3 V V V V
@ -93,7 +93,7 @@
71 HOUR 1 1 V V 71 HOUR 1 1 V V
72 MINUTE 1 1 V V 72 MINUTE 1 1 V V
73 SECOND 1 1 V V 73 SECOND 1 1 V V
74 NOW 0 0 V x 74 NOW 0 0 V - x
75 AREAS 1 1 V R 75 AREAS 1 1 V R
76 ROWS 1 1 V R 76 ROWS 1 1 V R
77 COLUMNS 1 1 V R 77 COLUMNS 1 1 V R
@ -172,10 +172,10 @@
215 JIS 1 1 V V x 215 JIS 1 1 V V x
219 ADDRESS 2 5 V V V V V V 219 ADDRESS 2 5 V V V V V V
220 DAYS360 2 2 V V V x 220 DAYS360 2 2 V V V x
221 TODAY 0 0 V x 221 TODAY 0 0 V - x
222 VDB 5 7 V V V V V V V V 222 VDB 5 7 V V V V V V V V
227 MEDIAN 1 30 V R 227 MEDIAN 1 30 V R ...
228 SUMPRODUCT 1 30 V A 228 SUMPRODUCT 1 30 V A ...
229 SINH 1 1 V V 229 SINH 1 1 V V
230 COSH 1 1 V V 230 COSH 1 1 V V
231 TANH 1 1 V V 231 TANH 1 1 V V
@ -192,7 +192,7 @@
247 DB 4 5 V V V V V V 247 DB 4 5 V V V V V V
252 FREQUENCY 2 2 A R R 252 FREQUENCY 2 2 A R R
261 ERROR.TYPE 1 1 V V 261 ERROR.TYPE 1 1 V V
269 AVEDEV 1 30 V R 269 AVEDEV 1 30 V R ...
270 BETADIST 3 5 V V V V V V 270 BETADIST 3 5 V V V V V V
271 GAMMALN 1 1 V V 271 GAMMALN 1 1 V V
272 BETAINV 3 5 V V V V V V 272 BETAINV 3 5 V V V V V V
@ -241,12 +241,12 @@
315 SLOPE 2 2 V A A 315 SLOPE 2 2 V A A
316 TTEST 4 4 V A A V V 316 TTEST 4 4 V A A V V
317 PROB 3 4 V A A V V 317 PROB 3 4 V A A V V
318 DEVSQ 1 30 V R 318 DEVSQ 1 30 V R ...
319 GEOMEAN 1 30 V R 319 GEOMEAN 1 30 V R ...
320 HARMEAN 1 30 V R 320 HARMEAN 1 30 V R ...
321 SUMSQ 0 30 V R 321 SUMSQ 0 30 V R ...
322 KURT 1 30 V R 322 KURT 1 30 V R ...
323 SKEW 1 30 V R 323 SKEW 1 30 V R ...
324 ZTEST 2 3 V R V V 324 ZTEST 2 3 V R V V
325 LARGE 2 2 V R V 325 LARGE 2 2 V R V
326 SMALL 2 2 V R V 326 SMALL 2 2 V R V
@ -278,10 +278,10 @@
358 GETPIVOTDATA 2 30 358 GETPIVOTDATA 2 30
359 HYPERLINK 1 2 V V V 359 HYPERLINK 1 2 V V V
360 PHONETIC 1 1 V R 360 PHONETIC 1 1 V R
361 AVERAGEA 1 30 V R 361 AVERAGEA 1 30 V R ...
362 MAXA 1 30 V R 362 MAXA 1 30 V R ...
363 MINA 1 30 V R 363 MINA 1 30 V R ...
364 STDEVPA 1 30 V R 364 STDEVPA 1 30 V R ...
365 VARPA 1 30 V R 365 VARPA 1 30 V R ...
366 STDEVA 1 30 V R 366 STDEVA 1 30 V R ...
367 VARA 1 30 V R 367 VARA 1 30 V R ...

View File

@ -61,7 +61,32 @@ public final class ExcelFileFormatDocFunctionExtractor {
private static final String SOURCE_DOC_FILE_NAME = "excelfileformat.odt"; private static final String SOURCE_DOC_FILE_NAME = "excelfileformat.odt";
/**
 * Tells whether a character is acceptable in the strictly plain-ASCII output file.
 * Accepted characters are the visible range '!' (0x21) to '~' (0x7E), which covers
 * letters, digits and punctuation, plus space, tab, carriage return and line feed.
 *
 * @param c the character to test
 * @return <code>true</code> if <code>c</code> is one of the accepted characters,
 *         <code>false</code> for anything else (other control chars, DEL, non-ASCII)
 */
/* package */ static boolean isSimpleAscii(char c) {
    // visible characters: everything from '!' to '~'
    boolean isVisible = c >= 0x21 && c <= 0x7E;
    // the only whitespace characters tolerated below 0x21
    boolean isAllowedWhitespace = c == ' ' || c == '\t' || c == '\r' || c == '\n';
    return isVisible || isAllowedWhitespace;
}
private static final class FunctionData { private static final class FunctionData {
// special characters from the ooo document
private static final int CHAR_ELLIPSIS_8230 = 8230;
private static final int CHAR_NDASH_8211 = 8211;
private final int _index; private final int _index;
private final boolean _hasFootnote; private final boolean _hasFootnote;
@ -79,10 +104,30 @@ public final class ExcelFileFormatDocFunctionExtractor {
_name = funcName; _name = funcName;
_minParams = minParams; _minParams = minParams;
_maxParams = maxParams; _maxParams = maxParams;
_returnClass = returnClass; _returnClass = convertSpecialChars(returnClass);
_paramClasses = paramClasses; _paramClasses = convertSpecialChars(paramClasses);
_isVolatile = isVolatile; _isVolatile = isVolatile;
} }
/**
 * Produces a plain-ASCII copy of the supplied text. Characters accepted by
 * <code>isSimpleAscii</code> are copied unchanged, an en-dash (8211) becomes '-',
 * and an ellipsis character (8230) becomes three dots.
 *
 * @param ss the text to convert
 * @return the converted text
 * @throws RuntimeException if any other character is encountered
 */
private static String convertSpecialChars(String ss) {
    final int len = ss.length();
    // a little head-room, since each ellipsis char expands to three chars
    StringBuffer result = new StringBuffer(len + 4);
    for (int pos = 0; pos < len; pos++) {
        char ch = ss.charAt(pos);
        if (isSimpleAscii(ch)) {
            result.append(ch);
        } else if (ch == CHAR_NDASH_8211) {
            result.append('-');
        } else if (ch == CHAR_ELLIPSIS_8230) {
            result.append("...");
        } else {
            throw new RuntimeException("bad char (" + ((int)ch) + ") in string '" + ss + "'");
        }
    }
    return result.toString();
}
public int getIndex() { public int getIndex() {
return _index; return _index;
} }
@ -382,6 +427,33 @@ public final class ExcelFileFormatDocFunctionExtractor {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
} }
/**
 * To be sure that no tricky unicode chars make it through to the output file.
 * Every byte is checked with <code>isSimpleAscii</code> before it is handed to the
 * wrapped stream; the first offending byte causes a <code>RuntimeException</code>.
 * <p/>
 * <code>flush()</code> and <code>close()</code> are forwarded to the wrapped stream.
 * (The inherited <code>OutputStream</code> versions do nothing, which would leave the
 * wrapped stream open when this wrapper, or a stream layered on top of it, is closed.)
 */
private static final class SimpleAsciiOutputStream extends OutputStream {

    /** the stream that receives the validated bytes */
    private final OutputStream _os;

    /**
     * @param os the stream to which validated bytes are forwarded
     */
    public SimpleAsciiOutputStream(OutputStream os) {
        _os = os;
    }

    /**
     * Validates and then writes a single byte.
     *
     * @throws RuntimeException if <code>b</code> is not simple ASCII
     */
    public void write(int b) throws IOException {
        checkByte(b);
        _os.write(b);
    }

    /**
     * @throws RuntimeException if <code>b</code> is rejected by <code>isSimpleAscii</code>
     */
    private static void checkByte(int b) {
        // Bytes of 0x80 and above arrive here sign-extended (negative); the cast turns
        // them into chars >= 0xFF80, which are rejected as well.
        if (!isSimpleAscii((char)b)) {
            throw new RuntimeException("Encountered char (" + b + ") which was not simple ascii as expected");
        }
    }

    /**
     * Validates the whole requested range first, so nothing is forwarded to the
     * wrapped stream if any byte in the range is unacceptable.
     *
     * @throws RuntimeException if any byte in the range is not simple ASCII
     */
    public void write(byte[] b, int off, int len) throws IOException {
        for (int i = 0; i < len; i++) {
            checkByte(b[i + off]);
        }
        _os.write(b, off, len);
    }

    /**
     * Forwards the flush request to the wrapped stream.
     */
    public void flush() throws IOException {
        _os.flush();
    }

    /**
     * Closes the wrapped stream, releasing the underlying resource.
     */
    public void close() throws IOException {
        _os.close();
    }
}
private static void processFile(File effDocFile, File outFile) { private static void processFile(File effDocFile, File outFile) {
OutputStream os; OutputStream os;
@ -390,10 +462,13 @@ public final class ExcelFileFormatDocFunctionExtractor {
} catch (FileNotFoundException e) { } catch (FileNotFoundException e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
PrintStream ps = null; os = new SimpleAsciiOutputStream(os);
PrintStream ps;
try { try {
ps = new PrintStream(os,true, "UTF-8"); ps = new PrintStream(os, true, "UTF-8");
} catch(UnsupportedEncodingException e) {} } catch(UnsupportedEncodingException e) {
throw new RuntimeException(e);
}
outputLicenseHeader(ps); outputLicenseHeader(ps);
Class genClass = ExcelFileFormatDocFunctionExtractor.class; Class genClass = ExcelFileFormatDocFunctionExtractor.class;