Mega Code Archive

UTF8 String utilities

//
// Copyright 2004-2005 Mort Bay Consulting Pty. Ltd.
// ------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

import java.io.UnsupportedEncodingException;

/**
 * Fast String Utilities.
 *
 * These string utilities provide both convenience methods and performance
 * improvements over most standard library versions. The main aim of the
 * optimizations is to avoid object creation unless absolutely required.
 *
 * @author Greg Wilkins (gregw)
 */
public class StringUtil {
    /** Carriage return followed by line feed. */
    public static final String CRLF = "\015\012";

    /** Value of the "line.separator" system property, defaulting to "\n". */
    public static final String __LINE_SEPARATOR = System.getProperty("line.separator", "\n");

    /**
     * Name under which this JVM knows the ISO-8859-1 charset. Taken from the
     * "ISO_8859_1" system property when set, otherwise probed at class load.
     */
    public static String __ISO_8859_1;

    static {
        String iso = System.getProperty("ISO_8859_1");
        if (iso != null) {
            __ISO_8859_1 = iso;
        } else {
            try {
                // Probe: constructing a String fails if the name is unknown.
                new String(new byte[] { (byte) 20 }, "ISO-8859-1");
                __ISO_8859_1 = "ISO-8859-1";
            } catch (java.io.UnsupportedEncodingException e) {
                __ISO_8859_1 = "ISO8859_1";
            }
        }
    }

    public final static String __UTF8 = "UTF-8";

    /**
     * Lookup table mapping each ASCII code (0..127) to its lower-case form;
     * only 'A'..'Z' differ from their index.
     */
    private static final char[] lowercases = buildLowercaseTable();

    /* ------------------------------------------------------------ */
    /** Builds the 128-entry ASCII lower-casing table. */
    private static char[] buildLowercaseTable() {
        char[] table = new char[128];
        for (int i = 0; i < table.length; i++) {
            table[i] = (i >= 'A' && i <= 'Z') ? (char) (i + ('a' - 'A')) : (char) i;
        }
        return table;
    }

    /* ------------------------------------------------------------ */
    /** Lower-cases a single char if it is ASCII, otherwise returns it unchanged. */
    private static char asciiLower(char ch) {
        return ch <= 127 ? lowercases[ch] : ch;
    }

    /* ------------------------------------------------------------ */
    /** Compares two chars, ignoring case for ASCII letters only. */
    private static boolean sameIgnoringAsciiCase(char a, char b) {
        return a == b || asciiLower(a) == asciiLower(b);
    }

    /* ------------------------------------------------------------ */
    /**
     * Fast lower case conversion. Only ASCII letters are converted; all other
     * characters are copied unchanged.
     *
     * @param s
     *            the string to convert (must not be null)
     * @return a lower case version of s; the same instance is returned when
     *         nothing needs converting, so no object is created
     */
    public static String asciiToLowerCase(String s) {
        final int len = s.length();
        for (int i = 0; i < len; i++) {
            char ch = s.charAt(i);
            if (ch <= 127 && lowercases[ch] != ch) {
                // First character needing conversion: copy once and convert
                // the remainder in place.
                char[] chars = s.toCharArray();
                for (int j = i; j < len; j++) {
                    if (chars[j] <= 127) {
                        chars[j] = lowercases[chars[j]];
                    }
                }
                return new String(chars);
            }
        }
        return s;
    }

    /* ------------------------------------------------------------ */
    /**
     * Tests whether s starts with w, ignoring the case of ASCII letters.
     *
     * @param s
     *            the string to test, may be null
     * @param w
     *            the prefix, may be null
     * @return true if w is null or s starts with w; false if s is null or
     *         shorter than w
     */
    public static boolean startsWithIgnoreCase(String s, String w) {
        if (w == null) {
            return true;
        }
        if (s == null || s.length() < w.length()) {
            return false;
        }
        for (int i = 0; i < w.length(); i++) {
            if (!sameIgnoringAsciiCase(s.charAt(i), w.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /* ------------------------------------------------------------ */
    /**
     * Tests whether s ends with w, ignoring the case of ASCII letters.
     *
     * @param s
     *            the string to test, may be null
     * @param w
     *            the suffix, may be null
     * @return true if w is null or s ends with w; false if s is null or
     *         shorter than w
     */
    public static boolean endsWithIgnoreCase(String s, String w) {
        if (w == null) {
            return true;
        }
        if (s == null) {
            return false;
        }
        final int suffixLength = w.length();
        final int start = s.length() - suffixLength;
        if (start < 0) {
            return false;
        }
        for (int i = suffixLength; i-- > 0;) {
            if (!sameIgnoringAsciiCase(s.charAt(start + i), w.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /* ------------------------------------------------------------ */
    /**
     * Compares a string with a region of a char array (case sensitive).
     *
     * @param s
     *            the string (must not be null)
     * @param buf
     *            the array holding the characters to compare against
     * @param offset
     *            index of the first character of the region in buf
     * @param length
     *            number of characters in the region
     * @return true if s has exactly {@code length} characters and they equal
     *         {@code buf[offset .. offset+length-1]}
     */
    public static boolean equals(String s, char[] buf, int offset, int length) {
        if (s.length() != length) {
            return false;
        }
        for (int i = 0; i < length; i++) {
            if (buf[offset + i] != s.charAt(i)) {
                return false;
            }
        }
        return true;
    }

    /* ------------------------------------------------------------ */
    /**
     * Decodes a region of a byte array as UTF-8.
     *
     * Regions shorter than 32 bytes are decoded with {@link Utf8StringBuffer};
     * longer ones are handed to {@link String#String(byte[], int, int, String)}.
     * Both paths agree on well-formed input. For malformed input the short
     * path substitutes '?', while the substitution made by the long path is
     * whatever the JDK decoder produces.
     *
     * @param b
     *            the bytes
     * @param offset
     *            index of the first byte to decode
     * @param length
     *            number of bytes to decode
     * @return the decoded string, or null if the JVM reports UTF-8 as
     *         unsupported (the stack trace is printed in that case)
     */
    public static String toUTF8String(byte[] b, int offset, int length) {
        try {
            if (length < 32) {
                Utf8StringBuffer buffer = new Utf8StringBuffer(length);
                buffer.append(b, offset, length);
                return buffer.toString();
            }
            return new String(b, offset, length, __UTF8);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            return null;
        }
    }

    /* ------------------------------------------------------------ */
    /**
     * Decodes a region of a byte array using the named charset.
     *
     * @param b
     *            the bytes
     * @param offset
     *            index of the first byte to decode
     * @param length
     *            number of bytes to decode
     * @param charset
     *            charset name; null or any spelling of "UTF-8" selects
     *            {@link #toUTF8String(byte[], int, int)}
     * @return the decoded string, or null if the charset is not supported
     *         (the stack trace is printed in that case)
     */
    public static String toString(byte[] b, int offset, int length, String charset) {
        if (charset == null || StringUtil.isUTF8(charset)) {
            return toUTF8String(b, offset, length);
        }
        try {
            return new String(b, offset, length, charset);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            return null;
        }
    }

    /* ------------------------------------------------------------ */
    /**
     * @param charset
     *            a charset name, may be null
     * @return true if charset names UTF-8 (compared ignoring case)
     */
    public static boolean isUTF8(String charset) {
        // The identity test is only a shortcut for callers passing the
        // __UTF8 constant itself; equalsIgnoreCase decides every other case.
        return charset == __UTF8 || __UTF8.equalsIgnoreCase(charset);
    }

    /* ------------------------------------------------------------ */
    /**
     * Strips ISO control characters from a string.
     *
     * @param name
     *            the string to clean, may be null
     * @return name without any character for which
     *         {@link Character#isISOControl(char)} is true, or null if name
     *         is null
     */
    public static String printable(String name) {
        if (name == null) {
            return null;
        }
        StringBuilder buf = new StringBuilder(name.length());
        for (int i = 0; i < name.length(); i++) {
            char c = name.charAt(i);
            if (!Character.isISOControl(c)) {
                buf.append(c);
            }
        }
        return buf.toString();
    }
}

//
// Copyright 2006 Mort Bay Consulting Pty. Ltd.
// ------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

/* ------------------------------------------------------------ */
/**
 * UTF-8 StringBuffer.
 *
 * This class wraps a standard {@link java.lang.StringBuffer} and provides
 * methods to append UTF-8 encoded bytes, that are converted into characters.
 *
 * This class is stateful and up to 6 calls to {@link #append(byte)} may be
 * needed before a character is appended to the string buffer.
 *
 * Malformed input never throws: each malformed sequence is replaced by a
 * single '?' and {@link #isError()} subsequently returns true. Code points
 * above U+FFFF are appended as a surrogate pair.
 *
 * The UTF-8 decoding is done by this class and no additional buffers or Readers
 * are used. The UTF-8 code was inspired by http://javolution.org
 */
class Utf8StringBuffer {
    /** Decoded characters. */
    StringBuffer _buffer;
    /** Number of continuation bytes still expected for the current character. */
    int _more;
    /** Payload bits accumulated so far for the current character. */
    int _bits;
    /** Set once any malformed sequence has been seen since the last reset. */
    boolean _errors;

    Utf8StringBuffer() {
        _buffer = new StringBuffer();
    }

    Utf8StringBuffer(int capacity) {
        _buffer = new StringBuffer(capacity);
    }

    /**
     * Appends a region of UTF-8 encoded bytes.
     *
     * @param b
     *            the bytes
     * @param offset
     *            index of the first byte to append
     * @param length
     *            number of bytes to append
     */
    public void append(byte[] b, int offset, int length) {
        int end = offset + length;
        for (int i = offset; i < end; i++) {
            append(b[i]);
        }
    }

    /**
     * Appends one UTF-8 encoded byte, emitting a character once the sequence
     * it belongs to is complete.
     *
     * @param b
     *            the next byte of the UTF-8 stream
     */
    public void append(byte b) {
        if (_more > 0) {
            if ((b & 0xc0) == 0x80) { // 10xxxxxx: expected continuation
                _bits = (_bits << 6) | (b & 0x3f);
                if (--_more == 0) {
                    appendDecoded(_bits);
                }
                return;
            }
            // The pending sequence was cut short. Report it, then fall through
            // so that this byte is decoded as the start of a new character
            // instead of being lost.
            malformed();
        }

        if (b >= 0) { // 0xxxxxxx: plain ASCII, including NUL
            _buffer.append((char) b);
        } else if ((b & 0xc0) == 0x80) { // 10xxxxxx: continuation with no lead byte
            malformed();
        } else if ((b & 0xe0) == 0xc0) { // 110xxxxx
            _more = 1;
            _bits = b & 0x1f;
        } else if ((b & 0xf0) == 0xe0) { // 1110xxxx
            _more = 2;
            _bits = b & 0x0f;
        } else if ((b & 0xf8) == 0xf0) { // 11110xxx
            _more = 3;
            _bits = b & 0x07;
        } else if ((b & 0xfc) == 0xf8) { // 111110xx (legacy 5-byte form)
            _more = 4;
            _bits = b & 0x03;
        } else if ((b & 0xfe) == 0xfc) { // 1111110x (legacy 6-byte form)
            _more = 5;
            _bits = b & 0x01;
        } else { // 0xFE / 0xFF can never appear in UTF-8
            malformed();
        }
    }

    /**
     * Appends a fully decoded code point, or records an error when the value
     * lies outside the Unicode range.
     */
    private void appendDecoded(int codePoint) {
        if (Character.isValidCodePoint(codePoint)) {
            // appendCodePoint emits a surrogate pair for values above U+FFFF.
            _buffer.appendCodePoint(codePoint);
        } else {
            malformed();
        }
    }

    /** Emits the replacement character, clears decoder state and flags the error. */
    private void malformed() {
        _buffer.append('?');
        _more = 0;
        _bits = 0;
        _errors = true;
    }

    /** @return the number of chars decoded so far */
    public int length() {
        return _buffer.length();
    }

    /** Discards all decoded characters, any partial sequence and the error flag. */
    public void reset() {
        _buffer.setLength(0);
        _more = 0;
        _bits = 0;
        _errors = false;
    }

    /** @return the underlying buffer itself (not a copy) */
    public StringBuffer getStringBuffer() {
        return _buffer;
    }

    /** @return the characters decoded so far */
    @Override
    public String toString() {
        return _buffer.toString();
    }

    /* ------------------------------------------------------------ */
    /**
     * @return True if there are non UTF-8 characters or incomplete UTF-8
     *         characters in the buffer.
     */
    public boolean isError() {
        return _errors || _more > 0;
    }
}