Pileus Git - ~andy/freeotp/blob - src/com/google/zxing/common/StringUtils.java

   1 /*
   2  * Copyright (C) 2010 ZXing authors
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *      http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 package com.google.zxing.common;
  18
  19 import java.util.Map;
  20
  21 import com.google.zxing.DecodeHintType;
  22
  23 /**
  24  * Common string-related functions.
  25  *
  26  * @author Sean Owen
  27  * @author Alex Dupre
  28  */
  29 public final class StringUtils {
  30
  31   private static final String PLATFORM_DEFAULT_ENCODING =
  32       System.getProperty("file.encoding");
  33   public static final String SHIFT_JIS = "SJIS";
  34   public static final String GB2312 = "GB2312";
  35   private static final String EUC_JP = "EUC_JP";
  36   private static final String UTF8 = "UTF8";
  37   private static final String ISO88591 = "ISO8859_1";
  38   private static final boolean ASSUME_SHIFT_JIS =
  39       SHIFT_JIS.equalsIgnoreCase(PLATFORM_DEFAULT_ENCODING) ||
  40       EUC_JP.equalsIgnoreCase(PLATFORM_DEFAULT_ENCODING);
  41
  42   private StringUtils() {}
  43
  44   /**
  45    * @param bytes bytes encoding a string, whose encoding should be guessed
  46    * @param hints decode hints if applicable
  47    * @return name of guessed encoding; at the moment will only guess one of:
  48    *  {@link #SHIFT_JIS}, {@link #UTF8}, {@link #ISO88591}, or the platform
  49    *  default encoding if none of these can possibly be correct
  50    */
  51   public static String guessEncoding(byte[] bytes, Map<DecodeHintType,?> hints) {
  52     if (hints != null) {
  53       String characterSet = (String) hints.get(DecodeHintType.CHARACTER_SET);
  54       if (characterSet != null) {
  55         return characterSet;
  56       }
  57     }
  58     // For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
  59     // which should be by far the most common encodings.
  60     int length = bytes.length;
  61     boolean canBeISO88591 = true;
  62     boolean canBeShiftJIS = true;
  63     boolean canBeUTF8 = true;
  64     int utf8BytesLeft = 0;
  65     //int utf8LowChars = 0;
  66     int utf2BytesChars = 0;
  67     int utf3BytesChars = 0;
  68     int utf4BytesChars = 0;
  69     int sjisBytesLeft = 0;
  70     //int sjisLowChars = 0;
  71     int sjisKatakanaChars = 0;
  72     //int sjisDoubleBytesChars = 0;
  73     int sjisCurKatakanaWordLength = 0;
  74     int sjisCurDoubleBytesWordLength = 0;
  75     int sjisMaxKatakanaWordLength = 0;
  76     int sjisMaxDoubleBytesWordLength = 0;
  77     //int isoLowChars = 0;
  78     //int isoHighChars = 0;
  79     int isoHighOther = 0;
  80
  81     boolean utf8bom = bytes.length > 3 &&
  82         bytes[0] == (byte) 0xEF &&
  83         bytes[1] == (byte) 0xBB &&
  84         bytes[2] == (byte) 0xBF;
  85
  86     for (int i = 0;
  87          i < length && (canBeISO88591 || canBeShiftJIS || canBeUTF8);
  88          i++) {
  89
  90       int value = bytes[i] & 0xFF;
  91
  92       // UTF-8 stuff
  93       if (canBeUTF8) {
  94         if (utf8BytesLeft > 0) {
  95           if ((value & 0x80) == 0) {
  96             canBeUTF8 = false;
  97           } else {
  98             utf8BytesLeft--;
  99           }
 100         } else if ((value & 0x80) != 0) {
 101           if ((value & 0x40) == 0) {
 102             canBeUTF8 = false;
 103           } else {
 104             utf8BytesLeft++;
 105             if ((value & 0x20) == 0) {
 106               utf2BytesChars++;
 107             } else {
 108               utf8BytesLeft++;
 109               if ((value & 0x10) == 0) {
 110                 utf3BytesChars++;
 111               } else {
 112                 utf8BytesLeft++;
 113                 if ((value & 0x08) == 0) {
 114                   utf4BytesChars++;
 115                 } else {
 116                   canBeUTF8 = false;
 117                 }
 118               }
 119             }
 120           }
 121         } //else {
 122           //utf8LowChars++;
 123         //}
 124       }
 125
 126       // ISO-8859-1 stuff
 127       if (canBeISO88591) {
 128         if (value > 0x7F && value < 0xA0) {
 129           canBeISO88591 = false;
 130         } else if (value > 0x9F) {
 131           if (value < 0xC0 || value == 0xD7 || value == 0xF7) {
 132             isoHighOther++;
 133           } //else {
 134             //isoHighChars++;
 135           //}
 136         } //else {
 137           //isoLowChars++;
 138         //}
 139       }
 140
 141       // Shift_JIS stuff
 142       if (canBeShiftJIS) {
 143         if (sjisBytesLeft > 0) {
 144           if (value < 0x40 || value == 0x7F || value > 0xFC) {
 145             canBeShiftJIS = false;
 146           } else {
 147             sjisBytesLeft--;
 148           }
 149         } else if (value == 0x80 || value == 0xA0 || value > 0xEF) {
 150           canBeShiftJIS = false;
 151         } else if (value > 0xA0 && value < 0xE0) {
 152           sjisKatakanaChars++;
 153           sjisCurDoubleBytesWordLength = 0;
 154           sjisCurKatakanaWordLength++;
 155           if (sjisCurKatakanaWordLength > sjisMaxKatakanaWordLength) {
 156             sjisMaxKatakanaWordLength = sjisCurKatakanaWordLength;
 157           }
 158         } else if (value > 0x7F) {
 159           sjisBytesLeft++;
 160           //sjisDoubleBytesChars++;
 161           sjisCurKatakanaWordLength = 0;
 162           sjisCurDoubleBytesWordLength++;
 163           if (sjisCurDoubleBytesWordLength > sjisMaxDoubleBytesWordLength) {
 164             sjisMaxDoubleBytesWordLength = sjisCurDoubleBytesWordLength;
 165           }
 166         } else {
 167           //sjisLowChars++;
 168           sjisCurKatakanaWordLength = 0;
 169           sjisCurDoubleBytesWordLength = 0;
 170         }
 171       }
 172     }
 173
 174     if (canBeUTF8 && utf8BytesLeft > 0) {
 175       canBeUTF8 = false;
 176     }
 177     if (canBeShiftJIS && sjisBytesLeft > 0) {
 178       canBeShiftJIS = false;
 179     }
 180
 181     // Easy -- if there is BOM or at least 1 valid not-single byte character (and no evidence it can't be UTF-8), done
 182     if (canBeUTF8 && (utf8bom || utf2BytesChars + utf3BytesChars + utf4BytesChars > 0)) {
 183       return UTF8;
 184     }
 185     // Easy -- if assuming Shift_JIS or at least 3 valid consecutive not-ascii characters (and no evidence it can't be), done
 186     if (canBeShiftJIS && (ASSUME_SHIFT_JIS || sjisMaxKatakanaWordLength >= 3 || sjisMaxDoubleBytesWordLength >= 3)) {
 187       return SHIFT_JIS;
 188     }
 189     // Distinguishing Shift_JIS and ISO-8859-1 can be a little tough for short words. The crude heuristic is:
 190     // - If we saw
 191     //   - only two consecutive katakana chars in the whole text, or
 192     //   - at least 10% of bytes that could be "upper" not-alphanumeric Latin1,
 193     // - then we conclude Shift_JIS, else ISO-8859-1
 194     if (canBeISO88591 && canBeShiftJIS) {
 195       return (sjisMaxKatakanaWordLength == 2 && sjisKatakanaChars == 2) || isoHighOther * 10 >= length
 196           ? SHIFT_JIS : ISO88591;
 197     }
 198
 199     // Otherwise, try in order ISO-8859-1, Shift JIS, UTF-8 and fall back to default platform encoding
 200     if (canBeISO88591) {
 201       return ISO88591;
 202     }
 203     if (canBeShiftJIS) {
 204       return SHIFT_JIS;
 205     }
 206     if (canBeUTF8) {
 207       return UTF8;
 208     }
 209     // Otherwise, we take a wild guess with platform encoding
 210     return PLATFORM_DEFAULT_ENCODING;
 211   }
 212
 213 }