Another attempt to fix unicode problems in functionMetadata.txt. Made that file pure ascii.

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@653892 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Josh Micich 2008-05-06 19:48:55 +00:00
parent 3475425c5f
commit 609058dea5
4 changed files with 136 additions and 59 deletions

View File

@ -38,14 +38,13 @@ final class FunctionMetadataReader {
private static final String METADATA_FILE_NAME = "functionMetadata.txt";
/** plain ASCII text metadata file uses three dots for ellipsis */
private static final String ELLIPSIS = "...";
private static final Pattern TAB_DELIM_PATTERN = Pattern.compile("\t");
private static final Pattern SPACE_DELIM_PATTERN = Pattern.compile(" ");
private static final byte[] EMPTY_BYTE_ARRAY = { };
// special characters from the ooo document
private static final int CHAR_ELLIPSIS_8230 = 8230;
private static final int CHAR_NDASH_8211 = 8211;
private static final String[] DIGIT_ENDING_FUNCTION_NAMES = {
// Digits at the end of a function might be due to a left-over footnote marker.
// except in these cases
@ -59,10 +58,12 @@ final class FunctionMetadataReader {
throw new RuntimeException("resource '" + METADATA_FILE_NAME + "' not found");
}
BufferedReader br = null;
BufferedReader br;
try {
br = new BufferedReader(new InputStreamReader(is,"UTF-8"));
} catch(UnsupportedEncodingException e) { /* never happens */ }
} catch(UnsupportedEncodingException e) {
throw new RuntimeException(e);
}
FunctionDataBuilder fdb = new FunctionDataBuilder(400);
try {
@ -127,7 +128,9 @@ final class FunctionMetadataReader {
}
String[] array = SPACE_DELIM_PATTERN.split(codes);
int nItems = array.length;
if(array[nItems-1].charAt(0) == CHAR_ELLIPSIS_8230) {
if(ELLIPSIS.equals(array[nItems-1])) {
// final ellipsis is optional, and ignored
// (all unspecified params are assumed to be the same as the last)
nItems --;
}
byte[] result = new byte[nItems];
@ -141,7 +144,6 @@ final class FunctionMetadataReader {
if(codes.length() == 1) {
switch (codes.charAt(0)) {
case '-':
case CHAR_NDASH_8211: // this is what the ooo doc has
return true;
}
}

View File

@ -29,7 +29,7 @@
7 MAX 1 30 V R
8 ROW 0 1 V R
9 COLUMN 0 1 V R
10 NA 0 0 V
10 NA 0 0 V -
11 NPV 2 30 V V R
12 STDEV 1 30 V R
13 DOLLAR 1 2 V V V
@ -38,7 +38,7 @@
16 COS 1 1 V V
17 TAN 1 1 V V
18 ARCTAN 1 1 V V
19 PI 0 0 V
19 PI 0 0 V -
20 SQRT 1 1 V V
21 EXP 1 1 V V
22 LN 1 1 V V
@ -53,8 +53,8 @@
31 MID 3 3 V V V V
32 LEN 1 1 V V
33 VALUE 1 1 V V
34 TRUE 0 0 V
35 FALSE 0 0 V
34 TRUE 0 0 V -
35 FALSE 0 0 V -
36 AND 1 30 V R
37 OR 1 30 V R
38 NOT 1 1 V V
@ -80,7 +80,7 @@
60 RATE 3 6 V V V V V V V
61 MIRR 3 3 V R V V
62 IRR 1 2 V R V
63 RAND 0 0 V x
63 RAND 0 0 V - x
64 MATCH 2 3 V V R R
65 DATE 3 3 V V V V
66 TIME 3 3 V V V V
@ -91,7 +91,7 @@
71 HOUR 1 1 V V
72 MINUTE 1 1 V V
73 SECOND 1 1 V V
74 NOW 0 0 V x
74 NOW 0 0 V - x
75 AREAS 1 1 V R
76 ROWS 1 1 V R
77 COLUMNS 1 1 V R
@ -170,10 +170,10 @@
215 JIS 1 1 V V x
219 ADDRESS 2 5 V V V V V V
220 DAYS360 2 2 V V V x
221 TODAY 0 0 V x
221 TODAY 0 0 V - x
222 VDB 5 7 V V V V V V V V
227 MEDIAN 1 30 V R
228 SUMPRODUCT 1 30 V A
227 MEDIAN 1 30 V R ...
228 SUMPRODUCT 1 30 V A ...
229 SINH 1 1 V V
230 COSH 1 1 V V
231 TANH 1 1 V V
@ -188,7 +188,7 @@
247 DB 4 5 V V V V V V
252 FREQUENCY 2 2 A R R
261 ERROR.TYPE 1 1 V V
269 AVEDEV 1 30 V R
269 AVEDEV 1 30 V R ...
270 BETADIST 3 5 V V V V V V
271 GAMMALN 1 1 V V
272 BETAINV 3 5 V V V V V V
@ -237,12 +237,12 @@
315 SLOPE 2 2 V A A
316 TTEST 4 4 V A A V V
317 PROB 3 4 V A A V V
318 DEVSQ 1 30 V R
319 GEOMEAN 1 30 V R
320 HARMEAN 1 30 V R
321 SUMSQ 0 30 V R
322 KURT 1 30 V R
323 SKEW 1 30 V R
318 DEVSQ 1 30 V R ...
319 GEOMEAN 1 30 V R ...
320 HARMEAN 1 30 V R ...
321 SUMSQ 0 30 V R ...
322 KURT 1 30 V R ...
323 SKEW 1 30 V R ...
324 ZTEST 2 3 V R V V
325 LARGE 2 2 V R V
326 SMALL 2 2 V R V
@ -274,10 +274,10 @@
358 GETPIVOTDATA 2 30
359 HYPERLINK 1 2 V V V
360 PHONETIC 1 1 V R
361 AVERAGEA 1 30 V R
362 MAXA 1 30 V R
363 MINA 1 30 V R
364 STDEVPA 1 30 V R
365 VARPA 1 30 V R
366 STDEVA 1 30 V R
367 VARA 1 30 V R
361 AVERAGEA 1 30 V R ...
362 MAXA 1 30 V R ...
363 MINA 1 30 V R ...
364 STDEVPA 1 30 V R ...
365 VARPA 1 30 V R ...
366 STDEVA 1 30 V R ...
367 VARA 1 30 V R ...

View File

@ -31,7 +31,7 @@
7 MAX 1 30 V R
8 ROW 0 1 V R
9 COLUMN 0 1 V R
10 NA 0 0 V
10 NA 0 0 V -
11 NPV 2 30 V V R
12 STDEV 1 30 V R
13 DOLLAR 1 2 V V V
@ -40,7 +40,7 @@
16 COS 1 1 V V
17 TAN 1 1 V V
18 ATAN 1 1 V V
19 PI 0 0 V
19 PI 0 0 V -
20 SQRT 1 1 V V
21 EXP 1 1 V V
22 LN 1 1 V V
@ -55,8 +55,8 @@
31 MID 3 3 V V V V
32 LEN 1 1 V V
33 VALUE 1 1 V V
34 TRUE 0 0 V
35 FALSE 0 0 V
34 TRUE 0 0 V -
35 FALSE 0 0 V -
36 AND 1 30 V R
37 OR 1 30 V R
38 NOT 1 1 V V
@ -82,7 +82,7 @@
60 RATE 3 6 V V V V V V V
61 MIRR 3 3 V R V V
62 IRR 1 2 V R V
63 RAND 0 0 V x
63 RAND 0 0 V - x
64 MATCH 2 3 V V R R
65 DATE 3 3 V V V V
66 TIME 3 3 V V V V
@ -93,7 +93,7 @@
71 HOUR 1 1 V V
72 MINUTE 1 1 V V
73 SECOND 1 1 V V
74 NOW 0 0 V x
74 NOW 0 0 V - x
75 AREAS 1 1 V R
76 ROWS 1 1 V R
77 COLUMNS 1 1 V R
@ -172,10 +172,10 @@
215 JIS 1 1 V V x
219 ADDRESS 2 5 V V V V V V
220 DAYS360 2 2 V V V x
221 TODAY 0 0 V x
221 TODAY 0 0 V - x
222 VDB 5 7 V V V V V V V V
227 MEDIAN 1 30 V R
228 SUMPRODUCT 1 30 V A
227 MEDIAN 1 30 V R ...
228 SUMPRODUCT 1 30 V A ...
229 SINH 1 1 V V
230 COSH 1 1 V V
231 TANH 1 1 V V
@ -192,7 +192,7 @@
247 DB 4 5 V V V V V V
252 FREQUENCY 2 2 A R R
261 ERROR.TYPE 1 1 V V
269 AVEDEV 1 30 V R
269 AVEDEV 1 30 V R ...
270 BETADIST 3 5 V V V V V V
271 GAMMALN 1 1 V V
272 BETAINV 3 5 V V V V V V
@ -241,12 +241,12 @@
315 SLOPE 2 2 V A A
316 TTEST 4 4 V A A V V
317 PROB 3 4 V A A V V
318 DEVSQ 1 30 V R
319 GEOMEAN 1 30 V R
320 HARMEAN 1 30 V R
321 SUMSQ 0 30 V R
322 KURT 1 30 V R
323 SKEW 1 30 V R
318 DEVSQ 1 30 V R ...
319 GEOMEAN 1 30 V R ...
320 HARMEAN 1 30 V R ...
321 SUMSQ 0 30 V R ...
322 KURT 1 30 V R ...
323 SKEW 1 30 V R ...
324 ZTEST 2 3 V R V V
325 LARGE 2 2 V R V
326 SMALL 2 2 V R V
@ -278,10 +278,10 @@
358 GETPIVOTDATA 2 30
359 HYPERLINK 1 2 V V V
360 PHONETIC 1 1 V R
361 AVERAGEA 1 30 V R
362 MAXA 1 30 V R
363 MINA 1 30 V R
364 STDEVPA 1 30 V R
365 VARPA 1 30 V R
366 STDEVA 1 30 V R
367 VARA 1 30 V R
361 AVERAGEA 1 30 V R ...
362 MAXA 1 30 V R ...
363 MINA 1 30 V R ...
364 STDEVPA 1 30 V R ...
365 VARPA 1 30 V R ...
366 STDEVA 1 30 V R ...
367 VARA 1 30 V R ...

View File

@ -61,7 +61,32 @@ public final class ExcelFileFormatDocFunctionExtractor {
private static final String SOURCE_DOC_FILE_NAME = "excelfileformat.odt";
/**
 * Tells whether a character is acceptable in the output file, which for
 * simplicity is kept to strictly simple ASCII.
 *
 * @param c the character to test
 * @return <code>true</code> for any character in the range '!' (0x21) to '~' (0x7E),
 *         and for space, tab, carriage return and line feed; <code>false</code> for
 *         everything else
 */
/* package */ static boolean isSimpleAscii(char c) {
	// the few whitespace chars below 0x21 that are tolerated
	if (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
		return true;
	}
	// everything from '!' to '~' (includes letters, digits, punctuation)
	return c >= 0x21 && c <= 0x7E;
}
private static final class FunctionData {
// special characters from the ooo document
private static final int CHAR_ELLIPSIS_8230 = 8230;
private static final int CHAR_NDASH_8211 = 8211;
private final int _index;
private final boolean _hasFootnote;
@ -79,10 +104,30 @@ public final class ExcelFileFormatDocFunctionExtractor {
_name = funcName;
_minParams = minParams;
_maxParams = maxParams;
_returnClass = returnClass;
_paramClasses = paramClasses;
_returnClass = convertSpecialChars(returnClass);
_paramClasses = convertSpecialChars(paramClasses);
_isVolatile = isVolatile;
}
/**
 * Builds a copy of the supplied text in which the two known special characters
 * are replaced by ASCII equivalents: the n-dash (8211) becomes '-' and the
 * ellipsis character (8230) becomes three dots. Characters accepted by
 * {@link #isSimpleAscii(char)} are copied unchanged.
 *
 * @param ss the text to convert
 * @return the converted text
 * @throws RuntimeException if any other non simple ASCII character is found
 */
private static String convertSpecialChars(String ss) {
	int len = ss.length();
	// a little head room, since each ellipsis grows from one char to three
	StringBuffer result = new StringBuffer(len + 4);
	for (int pos = 0; pos < len; pos++) {
		char ch = ss.charAt(pos);
		if (isSimpleAscii(ch)) {
			result.append(ch);
		} else if (ch == CHAR_NDASH_8211) {
			result.append('-');
		} else if (ch == CHAR_ELLIPSIS_8230) {
			result.append("...");
		} else {
			throw new RuntimeException("bad char (" + ((int)ch) + ") in string '" + ss + "'");
		}
	}
	return result.toString();
}
public int getIndex() {
return _index;
}
@ -382,6 +427,33 @@ public final class ExcelFileFormatDocFunctionExtractor {
throw new RuntimeException(e);
}
}
/**
 * To be sure that no tricky unicode chars make it through to the output file.
 * Every byte is checked with <code>isSimpleAscii</code> before it is passed on
 * to the wrapped stream; a byte that fails the check causes a
 * {@link RuntimeException}. <code>flush()</code> and <code>close()</code> are
 * forwarded to the wrapped stream, so closing this stream releases the
 * underlying resource.
 */
private static final class SimpleAsciiOutputStream extends OutputStream {

	/** the stream that receives the validated bytes */
	private final OutputStream _os;

	/**
	 * @param os the stream to which validated bytes are forwarded
	 */
	public SimpleAsciiOutputStream(OutputStream os) {
		_os = os;
	}
	/**
	 * Validates and then forwards a single byte.
	 * @throws RuntimeException if the value is not simple ASCII
	 */
	public void write(int b) throws IOException {
		checkByte(b);
		_os.write(b);
	}
	/**
	 * @throws RuntimeException if the value is not simple ASCII
	 */
	private static void checkByte(int b) {
		if (!isSimpleAscii((char)b)) {
			throw new RuntimeException("Encountered char (" + b + ") which was not simple ascii as expected");
		}
	}
	/**
	 * Validates the whole requested range first, so nothing is forwarded
	 * if any byte in it is rejected.
	 * @throws RuntimeException if any byte in the range is not simple ASCII
	 */
	public void write(byte[] b, int off, int len) throws IOException {
		for (int i = 0; i < len; i++) {
			checkByte(b[i + off]);
		}
		_os.write(b, off, len);
	}
	/**
	 * Forwards the flush; the inherited implementation does nothing.
	 */
	public void flush() throws IOException {
		_os.flush();
	}
	/**
	 * Closes the wrapped stream; the inherited implementation does nothing,
	 * which would leave the wrapped stream open.
	 */
	public void close() throws IOException {
		_os.close();
	}
}
private static void processFile(File effDocFile, File outFile) {
OutputStream os;
@ -390,10 +462,13 @@ public final class ExcelFileFormatDocFunctionExtractor {
} catch (FileNotFoundException e) {
throw new RuntimeException(e);
}
PrintStream ps = null;
os = new SimpleAsciiOutputStream(os);
PrintStream ps;
try {
ps = new PrintStream(os,true, "UTF-8");
} catch(UnsupportedEncodingException e) {}
ps = new PrintStream(os, true, "UTF-8");
} catch(UnsupportedEncodingException e) {
throw new RuntimeException(e);
}
outputLicenseHeader(ps);
Class genClass = ExcelFileFormatDocFunctionExtractor.class;