400 lines
13 KiB
Java
400 lines
13 KiB
Java
/* ====================================================================
|
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
|
contributor license agreements. See the NOTICE file distributed with
|
|
this work for additional information regarding copyright ownership.
|
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
|
(the "License"); you may not use this file except in compliance with
|
|
the License. You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
==================================================================== */
|
|
|
|
package org.apache.poi.util;
|
|
|
|
import java.nio.charset.Charset;
|
|
import java.text.FieldPosition;
|
|
import java.text.NumberFormat;
|
|
import java.util.Iterator;
|
|
|
|
import org.apache.poi.hssf.record.RecordInputStream;
|
|
/**
|
|
* Title: String Utility Description: Collection of string handling utilities<p/>
|
|
*
|
|
* Note - none of the methods in this class deals with {@link org.apache.poi.hssf.record.ContinueRecord}s.
|
|
* For such functionality, consider using {@link RecordInputStream}
|
|
*
|
|
*
|
|
*@author Andrew C. Oliver
|
|
*@author Sergei Kozello (sergeikozello at mail.ru)
|
|
*@author Toshiaki Kamoshida (kamoshida.toshiaki at future dot co dot jp)
|
|
*/
|
|
public class StringUtil {
|
|
/** Charset used by this class for single-byte ("compressed") string data. */
private static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
|
|
/** Charset used by this class for two-bytes-per-character, little endian string data. */
private static final Charset UTF16LE = Charset.forName("UTF-16LE");
|
|
|
|
/**
 * Private constructor: this class only exposes static helpers and is not
 * meant to be instantiated.
 */
private StringUtil() {
|
|
// no instances of this class
|
|
}
|
|
|
|
/**
|
|
* Given a byte array of 16-bit unicode characters in Little Endian
|
|
* format (most important byte last), return a Java String representation
|
|
* of it.
|
|
*
|
|
* { 0x16, 0x00 } -0x16
|
|
*
|
|
* @param string the byte array to be converted
|
|
* @param offset the initial offset into the
|
|
* byte array. it is assumed that string[ offset ] and string[ offset +
|
|
* 1 ] contain the first 16-bit unicode character
|
|
* @param len the length of the final string
|
|
* @return the converted string, never <code>null</code>.
|
|
* @exception ArrayIndexOutOfBoundsException if offset is out of bounds for
|
|
* the byte array (i.e., is negative or is greater than or equal to
|
|
* string.length)
|
|
* @exception IllegalArgumentException if len is too large (i.e.,
|
|
* there is not enough data in string to create a String of that
|
|
* length)
|
|
*/
|
|
public static String getFromUnicodeLE(
|
|
final byte[] string,
|
|
final int offset,
|
|
final int len)
|
|
throws ArrayIndexOutOfBoundsException, IllegalArgumentException {
|
|
if ((offset < 0) || (offset >= string.length)) {
|
|
throw new ArrayIndexOutOfBoundsException("Illegal offset " + offset + " (String data is of length " + string.length + ")");
|
|
}
|
|
if ((len < 0) || (((string.length - offset) / 2) < len)) {
|
|
throw new IllegalArgumentException("Illegal length " + len);
|
|
}
|
|
|
|
return new String(string, offset, len * 2, UTF16LE);
|
|
}
|
|
|
|
/**
|
|
* Given a byte array of 16-bit unicode characters in little endian
|
|
* format (most important byte last), return a Java String representation
|
|
* of it.
|
|
*
|
|
* { 0x16, 0x00 } -0x16
|
|
*
|
|
* @param string the byte array to be converted
|
|
* @return the converted string, never <code>null</code>
|
|
*/
|
|
public static String getFromUnicodeLE(byte[] string) {
|
|
if(string.length == 0) { return ""; }
|
|
return getFromUnicodeLE(string, 0, string.length / 2);
|
|
}
|
|
|
|
/**
|
|
* Convert String to 16-bit unicode characters in little endian format
|
|
*
|
|
* @param string the string
|
|
* @return the byte array of 16-bit unicode characters
|
|
*/
|
|
public static byte[] getToUnicodeLE(String string) {
|
|
return string.getBytes(UTF16LE);
|
|
}
|
|
|
|
/**
|
|
* Read 8 bit data (in ISO-8859-1 codepage) into a (unicode) Java
|
|
* String and return.
|
|
* (In Excel terms, read compressed 8 bit unicode as a string)
|
|
*
|
|
* @param string byte array to read
|
|
* @param offset offset to read byte array
|
|
* @param len length to read byte array
|
|
* @return String generated String instance by reading byte array
|
|
*/
|
|
public static String getFromCompressedUnicode(
|
|
final byte[] string,
|
|
final int offset,
|
|
final int len) {
|
|
int len_to_use = Math.min(len, string.length - offset);
|
|
return new String(string, offset, len_to_use, ISO_8859_1);
|
|
}
|
|
|
|
public static String readCompressedUnicode(LittleEndianInput in, int nChars) {
|
|
byte[] buf = new byte[nChars];
|
|
in.readFully(buf);
|
|
return new String(buf, ISO_8859_1);
|
|
}
|
|
|
|
/**
|
|
* InputStream <tt>in</tt> is expected to contain:
|
|
* <ol>
|
|
* <li>ushort nChars</li>
|
|
* <li>byte is16BitFlag</li>
|
|
* <li>byte[]/char[] characterData</li>
|
|
* </ol>
|
|
* For this encoding, the is16BitFlag is always present even if nChars==0.
|
|
*
|
|
* This structure is also known as a XLUnicodeString.
|
|
*/
|
|
public static String readUnicodeString(LittleEndianInput in) {
|
|
|
|
int nChars = in.readUShort();
|
|
byte flag = in.readByte();
|
|
if ((flag & 0x01) == 0) {
|
|
return readCompressedUnicode(in, nChars);
|
|
}
|
|
return readUnicodeLE(in, nChars);
|
|
}
|
|
/**
|
|
* InputStream <tt>in</tt> is expected to contain:
|
|
* <ol>
|
|
* <li>byte is16BitFlag</li>
|
|
* <li>byte[]/char[] characterData</li>
|
|
* </ol>
|
|
* For this encoding, the is16BitFlag is always present even if nChars==0.
|
|
* <br/>
|
|
* This method should be used when the nChars field is <em>not</em> stored
|
|
* as a ushort immediately before the is16BitFlag. Otherwise, {@link
|
|
* #readUnicodeString(LittleEndianInput)} can be used.
|
|
*/
|
|
public static String readUnicodeString(LittleEndianInput in, int nChars) {
|
|
byte is16Bit = in.readByte();
|
|
if ((is16Bit & 0x01) == 0) {
|
|
return readCompressedUnicode(in, nChars);
|
|
}
|
|
return readUnicodeLE(in, nChars);
|
|
}
|
|
/**
|
|
* OutputStream <tt>out</tt> will get:
|
|
* <ol>
|
|
* <li>ushort nChars</li>
|
|
* <li>byte is16BitFlag</li>
|
|
* <li>byte[]/char[] characterData</li>
|
|
* </ol>
|
|
* For this encoding, the is16BitFlag is always present even if nChars==0.
|
|
*/
|
|
public static void writeUnicodeString(LittleEndianOutput out, String value) {
|
|
|
|
int nChars = value.length();
|
|
out.writeShort(nChars);
|
|
boolean is16Bit = hasMultibyte(value);
|
|
out.writeByte(is16Bit ? 0x01 : 0x00);
|
|
if (is16Bit) {
|
|
putUnicodeLE(value, out);
|
|
} else {
|
|
putCompressedUnicode(value, out);
|
|
}
|
|
}
|
|
/**
|
|
* OutputStream <tt>out</tt> will get:
|
|
* <ol>
|
|
* <li>byte is16BitFlag</li>
|
|
* <li>byte[]/char[] characterData</li>
|
|
* </ol>
|
|
* For this encoding, the is16BitFlag is always present even if nChars==0.
|
|
* <br/>
|
|
* This method should be used when the nChars field is <em>not</em> stored
|
|
* as a ushort immediately before the is16BitFlag. Otherwise, {@link
|
|
* #writeUnicodeString(LittleEndianOutput, String)} can be used.
|
|
*/
|
|
public static void writeUnicodeStringFlagAndData(LittleEndianOutput out, String value) {
|
|
boolean is16Bit = hasMultibyte(value);
|
|
out.writeByte(is16Bit ? 0x01 : 0x00);
|
|
if (is16Bit) {
|
|
putUnicodeLE(value, out);
|
|
} else {
|
|
putCompressedUnicode(value, out);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* @return the number of bytes that would be written by {@link #writeUnicodeString(LittleEndianOutput, String)}
|
|
*/
|
|
public static int getEncodedSize(String value) {
|
|
int result = 2 + 1;
|
|
result += value.length() * (StringUtil.hasMultibyte(value) ? 2 : 1);
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
* Takes a unicode (java) string, and returns it as 8 bit data (in ISO-8859-1
|
|
* codepage).
|
|
* (In Excel terms, write compressed 8 bit unicode)
|
|
*
|
|
* @param input the String containing the data to be written
|
|
* @param output the byte array to which the data is to be written
|
|
* @param offset an offset into the byte arrat at which the data is start
|
|
* when written
|
|
*/
|
|
public static void putCompressedUnicode(String input, byte[] output, int offset) {
|
|
byte[] bytes = input.getBytes(ISO_8859_1);
|
|
System.arraycopy(bytes, 0, output, offset, bytes.length);
|
|
}
|
|
|
|
public static void putCompressedUnicode(String input, LittleEndianOutput out) {
|
|
byte[] bytes = input.getBytes(ISO_8859_1);
|
|
out.write(bytes);
|
|
}
|
|
|
|
/**
|
|
* Takes a unicode string, and returns it as little endian (most
|
|
* important byte last) bytes in the supplied byte array.
|
|
* (In Excel terms, write uncompressed unicode)
|
|
*
|
|
* @param input the String containing the unicode data to be written
|
|
* @param output the byte array to hold the uncompressed unicode, should be twice the length of the String
|
|
* @param offset the offset to start writing into the byte array
|
|
*/
|
|
public static void putUnicodeLE(String input, byte[] output, int offset) {
|
|
byte[] bytes = input.getBytes(UTF16LE);
|
|
System.arraycopy(bytes, 0, output, offset, bytes.length);
|
|
}
|
|
public static void putUnicodeLE(String input, LittleEndianOutput out) {
|
|
byte[] bytes = input.getBytes(UTF16LE);
|
|
out.write(bytes);
|
|
}
|
|
|
|
public static String readUnicodeLE(LittleEndianInput in, int nChars) {
|
|
byte[] bytes = new byte[nChars*2];
|
|
in.readFully(bytes);
|
|
return new String(bytes, UTF16LE);
|
|
}
|
|
|
|
/**
|
|
* Apply printf() like formatting to a string.
|
|
* Primarily used for logging.
|
|
* @param message the string with embedded formatting info
|
|
* eg. "This is a test %2.2"
|
|
* @param params array of values to format into the string
|
|
* @return The formatted string
|
|
*/
|
|
public static String format(String message, Object[] params) {
|
|
int currentParamNumber = 0;
|
|
StringBuffer formattedMessage = new StringBuffer();
|
|
for (int i = 0; i < message.length(); i++) {
|
|
if (message.charAt(i) == '%') {
|
|
if (currentParamNumber >= params.length) {
|
|
formattedMessage.append("?missing data?");
|
|
} else if (
|
|
(params[currentParamNumber] instanceof Number)
|
|
&& (i + 1 < message.length())) {
|
|
i
|
|
+= matchOptionalFormatting(
|
|
(Number) params[currentParamNumber++],
|
|
message.substring(i + 1),
|
|
formattedMessage);
|
|
} else {
|
|
formattedMessage.append(
|
|
params[currentParamNumber++].toString());
|
|
}
|
|
} else {
|
|
if ((message.charAt(i) == '\\')
|
|
&& (i + 1 < message.length())
|
|
&& (message.charAt(i + 1) == '%')) {
|
|
formattedMessage.append('%');
|
|
i++;
|
|
} else {
|
|
formattedMessage.append(message.charAt(i));
|
|
}
|
|
}
|
|
}
|
|
return formattedMessage.toString();
|
|
}
|
|
|
|
|
|
private static int matchOptionalFormatting(
|
|
Number number,
|
|
String formatting,
|
|
StringBuffer outputTo) {
|
|
NumberFormat numberFormat = NumberFormat.getInstance();
|
|
if ((0 < formatting.length())
|
|
&& Character.isDigit(formatting.charAt(0))) {
|
|
numberFormat.setMinimumIntegerDigits(
|
|
Integer.parseInt(formatting.charAt(0) + ""));
|
|
if ((2 < formatting.length())
|
|
&& (formatting.charAt(1) == '.')
|
|
&& Character.isDigit(formatting.charAt(2))) {
|
|
numberFormat.setMaximumFractionDigits(
|
|
Integer.parseInt(formatting.charAt(2) + ""));
|
|
numberFormat.format(number, outputTo, new FieldPosition(0));
|
|
return 3;
|
|
}
|
|
numberFormat.format(number, outputTo, new FieldPosition(0));
|
|
return 1;
|
|
} else if (
|
|
(0 < formatting.length()) && (formatting.charAt(0) == '.')) {
|
|
if ((1 < formatting.length())
|
|
&& Character.isDigit(formatting.charAt(1))) {
|
|
numberFormat.setMaximumFractionDigits(
|
|
Integer.parseInt(formatting.charAt(1) + ""));
|
|
numberFormat.format(number, outputTo, new FieldPosition(0));
|
|
return 2;
|
|
}
|
|
}
|
|
numberFormat.format(number, outputTo, new FieldPosition(0));
|
|
return 1;
|
|
}
|
|
|
|
/**
|
|
* @return the encoding we want to use, currently hardcoded to ISO-8859-1
|
|
*/
|
|
public static String getPreferredEncoding() {
|
|
return ISO_8859_1.name();
|
|
}
|
|
|
|
/**
|
|
* check the parameter has multibyte character
|
|
*
|
|
* @param value string to check
|
|
* @return boolean result true:string has at least one multibyte character
|
|
*/
|
|
public static boolean hasMultibyte(String value) {
|
|
if (value == null)
|
|
return false;
|
|
for (int i = 0; i < value.length(); i++) {
|
|
char c = value.charAt(i);
|
|
if (c > 0xFF) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* Checks to see if a given String needs to be represented as Unicode
|
|
*
|
|
* @param value
|
|
* @return true if string needs Unicode to be represented.
|
|
*/
|
|
public static boolean isUnicodeString(final String value) {
|
|
return !value.equals(new String(value.getBytes(ISO_8859_1), ISO_8859_1));
|
|
}
|
|
|
|
/**
|
|
* An Iterator over an array of Strings.
|
|
*/
|
|
public static class StringsIterator implements Iterator<String> {
|
|
private String[] strings;
|
|
private int position = 0;
|
|
public StringsIterator(String[] strings) {
|
|
if(strings != null) {
|
|
this.strings = strings;
|
|
} else {
|
|
this.strings = new String[0];
|
|
}
|
|
}
|
|
|
|
public boolean hasNext() {
|
|
return position < strings.length;
|
|
}
|
|
public String next() {
|
|
int ourPos = position++;
|
|
if(ourPos >= strings.length)
|
|
throw new ArrayIndexOutOfBoundsException(ourPos);
|
|
return strings[ourPos];
|
|
}
|
|
public void remove() {}
|
|
}
|
|
}
|