`

Java根据文件的BOM判断文件的编码类型

阅读更多
    随着多字节文本数据的处理,尤其是非ASCII文字(典型的如中文)的出现,文件编码的判断就提上了日程。很多字节流和字符流默认处理的编码格式与程序文件的编码一致,例如:程序文件编码是UTF-8,默认处理的文本也是UTF-8。处理其他编码的文本时,如果不提供具体的编码,就非常容易把文本解析成乱码。

        当前处理的方式一般通过相关reader或writer的装饰类:
InputStreamReader(InputStream in, String charsetName) 
OutputStreamWriter(OutputStream out, String charsetName) 
 
         显式地将字符编码传进去,但是这种方式无法自动检测文件的字符编码,也就是说,此种模式仅仅支持由用户将文件的编码传进去。

    对中文字符而言,编码的区别主要在GBK(GB2312,GB18030)系列和UTF系列之间。UTF系列编码的文件通常在头部的若干个字节中已经告诉用户此文件的字符编码格式,即文件包含BOM(Byte Order Mark),此标志标明了文件的编码方式,常见的有:

BOMs:
  •    00 00 FE FF    = UTF-32, big-endian
  •    FF FE 00 00    = UTF-32, little-endian
  •    EF BB BF       = UTF-8,
  •    FE FF          = UTF-16, big-endian
  •    FF FE          = UTF-16, little-endian


    在此处提供两个输入流方法,一种是基于字符的reader:
/**
 * Character reader that picks its decoding charset from the Unicode byte order
 * mark (BOM) at the start of the wrapped byte stream, falling back to a
 * caller-supplied default encoding when no BOM is present.
 *
 * http://www.unicode.org/unicode/faq/utf_bom.html
 * BOMs:
 *   00 00 FE FF    = UTF-32, big-endian
 *   FF FE 00 00    = UTF-32, little-endian
 *   EF BB BF       = UTF-8,
 *   FE FF          = UTF-16, big-endian
 *   FF FE          = UTF-16, little-endian
 *
 * Win2k Notepad:
 *   Unicode format = UTF-16LE
 *
 * Detection is lazy: the stream is not touched until the first read().
 *
 * @author Semantic Wang
 */
public class UnicodeReader extends Reader {

	PushbackInputStream pbin;
	InputStreamReader reader = null;
	String defaultEnc;

	/** Length in bytes of the longest BOM (UTF-32); also the push-back capacity. */
	private static final int BOM_SIZE = 4;

	/**
	 * Creates a reader that falls back to GBK when the stream has no BOM.
	 *
	 * @param in
	 *            inputstream to be read
	 */
	public UnicodeReader(InputStream in) {
		this(in, "GBK");
	}

	/**
	 * Creates a reader with an explicit fallback encoding.
	 *
	 * @param in
	 *            inputstream to be read
	 * @param defaultEnc
	 *            default encoding if stream does not have BOM marker. Give NULL
	 *            to use system-level default.
	 */
	public UnicodeReader(InputStream in, String defaultEnc) {
		pbin = new PushbackInputStream(in, BOM_SIZE);
		this.defaultEnc = defaultEnc;
	}

	/**
	 * @return the fallback encoding given at construction time (may be null)
	 */
	public String getDefaultEncoding() {
		return defaultEnc;
	}

	/**
	 * Get stream encoding or NULL if stream is uninitialized. Call init() or
	 * read() method to initialize it.
	 *
	 * @return the name reported by the underlying InputStreamReader's
	 *         getEncoding(), or null before initialization
	 */
	public String getEncoding() {
		if (reader == null) {
			return null;
		}
		return reader.getEncoding();
	}

	/**
	 * Read-ahead up to four bytes and check for BOM marks. Extra bytes are
	 * unread back to the stream, only BOM bytes are skipped. Does nothing if
	 * the reader has already been initialized.
	 *
	 * Note: FF FE 00 00 is treated as a UTF-32LE BOM because the UTF-32 checks
	 * run before the UTF-16 ones.
	 *
	 * @throws IOException
	 *             if reading the stream head fails or the chosen encoding is
	 *             not supported
	 */
	protected void init() throws IOException {
		if (reader != null) {
			return;
		}

		// A single read() may legally return fewer bytes than requested, so
		// keep reading until the look-ahead buffer is full or the stream ends.
		byte[] bom = new byte[BOM_SIZE];
		int n = 0;
		while (n < bom.length) {
			int got = pbin.read(bom, n, bom.length - n);
			if (got < 0) {
				break;
			}
			n += got;
		}

		// Every comparison is guarded by n so that untouched (zero) slots of
		// the buffer can never be mistaken for stream content.
		String encoding;
		int skip;
		if (n >= 4 && bom[0] == (byte) 0x00 && bom[1] == (byte) 0x00
				&& bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF) {
			encoding = "UTF-32BE";
			skip = 4;
		} else if (n >= 4 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE
				&& bom[2] == (byte) 0x00 && bom[3] == (byte) 0x00) {
			encoding = "UTF-32LE";
			skip = 4;
		} else if (n >= 3 && bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB
				&& bom[2] == (byte) 0xBF) {
			encoding = "UTF-8";
			skip = 3;
		} else if (n >= 2 && bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF) {
			encoding = "UTF-16BE";
			skip = 2;
		} else if (n >= 2 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
			encoding = "UTF-16LE";
			skip = 2;
		} else {
			// Unicode BOM mark not found, unread all bytes
			encoding = defaultEnc;
			skip = 0;
		}

		// Push back everything that was read past the BOM.
		int unread = n - skip;
		if (unread > 0) {
			pbin.unread(bom, skip, unread);
		}

		// Use given encoding; null selects the platform default charset.
		if (encoding == null) {
			reader = new InputStreamReader(pbin);
		} else {
			reader = new InputStreamReader(pbin, encoding);
		}
	}

	/**
	 * Closes the reader and the wrapped stream. If nothing has been read yet,
	 * the stream is closed directly without probing it for a BOM first.
	 *
	 * @throws IOException
	 *             if closing the underlying stream fails
	 */
	public void close() throws IOException {
		if (reader != null) {
			reader.close();
		} else {
			pbin.close();
		}
	}

	/**
	 * Reads decoded characters, running BOM detection on first use.
	 *
	 * @param cbuf
	 *            destination buffer
	 * @param off
	 *            offset at which to start storing characters
	 * @param len
	 *            maximum number of characters to read
	 * @return the number of characters read, or -1 at end of stream
	 * @throws IOException
	 *             if detection or reading fails
	 */
	public int read(char[] cbuf, int off, int len) throws IOException {
		init();
		return reader.read(cbuf, off, len);
	}

}

另一种是基于字节的输入流InputStream:

/**
 * Byte stream wrapper that can recognise a Unicode byte order mark (BOM) at
 * the start of the wrapped stream and report the matching encoding name.
 *
 * BOM detection (and skipping of the BOM bytes) only happens when
 * {@link #getEncoding()} or {@link #init()} is called before the first
 * {@link #read()}. If read() or close() is called first, the stream is passed
 * through untouched and getEncoding() returns null.
 *
 * @author Semantic Wang
 */
public class UnicodeInputStream extends InputStream {

	PushbackInputStream pbin;
	boolean isInited = false;
	String defaultEnc;
	String encoding;

	/** Length in bytes of the longest BOM (UTF-32); also the push-back capacity. */
	private static final int BOM_SIZE = 4;

	/**
	 * Creates a stream that reports GBK when no BOM is found.
	 *
	 * @param in
	 *            inputstream to be read
	 */
	public UnicodeInputStream(InputStream in) {
		this(in, "GBK");
	}

	/**
	 * Creates a stream with an explicit fallback encoding name.
	 *
	 * @param in
	 *            inputstream to be read
	 * @param defaultEnc
	 *            encoding name reported when the stream has no BOM
	 */
	public UnicodeInputStream(InputStream in, String defaultEnc) {
		pbin = new PushbackInputStream(in, BOM_SIZE);
		this.defaultEnc = defaultEnc;
	}

	/**
	 * @return the fallback encoding given at construction time
	 */
	public String getDefaultEncoding() {
		return defaultEnc;
	}

	/**
	 * Returns the detected encoding, running BOM detection first if the stream
	 * has not been initialized yet.
	 *
	 * @return the encoding name chosen by init(), or null if read() or close()
	 *         was called before detection could run
	 * @throws IllegalStateException
	 *             if reading the stream head fails; the IOException is
	 *             attached as the cause
	 */
	public String getEncoding() {
		if (!isInited) {
			try {
				init();
			} catch (IOException ex) {
				// Chain the real I/O failure. The previous code passed the new
				// exception to its own initCause(), which itself throws
				// IllegalArgumentException and hid the original error.
				throw new IllegalStateException("Init method failed.", ex);
			}
		}
		return encoding;
	}

	/**
	 * Read-ahead up to four bytes and check for BOM marks. Extra bytes are
	 * unread back to the stream, only BOM bytes are skipped. Does nothing if
	 * the stream is already marked as initialized.
	 *
	 * Note: FF FE 00 00 is treated as a UTF-32LE BOM because the UTF-32 checks
	 * run before the UTF-16 ones.
	 *
	 * @throws IOException
	 *             if reading the stream head fails
	 */
	protected void init() throws IOException {
		if (isInited) {
			return;
		}

		// A single read() may legally return fewer bytes than requested, so
		// keep reading until the look-ahead buffer is full or the stream ends.
		byte[] bom = new byte[BOM_SIZE];
		int n = 0;
		while (n < bom.length) {
			int got = pbin.read(bom, n, bom.length - n);
			if (got < 0) {
				break;
			}
			n += got;
		}

		// Every comparison is guarded by n so that untouched (zero) slots of
		// the buffer can never be mistaken for stream content.
		int skip;
		if (n >= 4 && bom[0] == (byte) 0x00 && bom[1] == (byte) 0x00
				&& bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF) {
			encoding = "UTF-32BE";
			skip = 4;
		} else if (n >= 4 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE
				&& bom[2] == (byte) 0x00 && bom[3] == (byte) 0x00) {
			encoding = "UTF-32LE";
			skip = 4;
		} else if (n >= 3 && bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB
				&& bom[2] == (byte) 0xBF) {
			encoding = "UTF-8";
			skip = 3;
		} else if (n >= 2 && bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF) {
			encoding = "UTF-16BE";
			skip = 2;
		} else if (n >= 2 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
			encoding = "UTF-16LE";
			skip = 2;
		} else {
			// Unicode BOM mark not found, unread all bytes
			encoding = defaultEnc;
			skip = 0;
		}

		// Push back everything that was read past the BOM.
		int unread = n - skip;
		if (unread > 0) {
			pbin.unread(bom, skip, unread);
		}

		isInited = true;
	}

	/**
	 * Closes the wrapped stream. Marks the stream as initialized without
	 * running BOM detection, so a later getEncoding() will not try to read.
	 *
	 * @throws IOException
	 *             if closing the underlying stream fails
	 */
	public void close() throws IOException {
		isInited = true;
		pbin.close();
	}

	/**
	 * Reads the next byte. Deliberately does not trigger BOM detection: the
	 * stream is marked as initialized so that a later getEncoding() cannot
	 * consume bytes from the middle of the data. Call getEncoding() before the
	 * first read() to have the BOM detected and skipped.
	 *
	 * @return the next byte as 0-255, or -1 at end of stream
	 * @throws IOException
	 *             if reading the underlying stream fails
	 */
	public int read() throws IOException {
		isInited = true;
		return pbin.read();
	}

}


    最后的调用方式为:
// Open the file as raw bytes; UnicodeReader inspects the leading bytes for a BOM
// and falls back to GBK (the default of its single-argument constructor) when none is found.
InputStream in = new FileInputStream(fileName);
// Closing the BufferedReader closes the wrapped UnicodeReader.
BufferedReader reader = new BufferedReader(new UnicodeReader(in));
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics