Java根据文件的BOM判断文件的编码类型

roufenghust

浏览: 25979 次
性别:
来自: 深圳

最近访客更多访客>>

a805617894a

qiangwushuang

fendo

melin

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

中文乱码
Java 文件编码

中文乱码 java GBK UTF-8 UTF16

随着多字节文本数据的处理，尤其是非ASCII文字（典型的如中文）的出现，文件编码的判断就提上了日程。很多字节流和字符流默认能处理的编码格式与程序文件的编码一致，例如：程序文件编码是UTF-8，默认处理的文本也是UTF-8。处理其他编码的文本时，如果不提供具体的编码，就非常容易把文本解析成乱码。

当前处理的方式一般通过相关reader或writer的装饰类：

InputStreamReader(InputStream in, String charsetName)

或

OutputStreamWriter(OutputStream out, String charsetName)

实现显式地将字符编码传进去，但是无法自动探测文件的字符编码，也就是说，此种方式仅支持由用户将文件的编码传进去。

中文字符编码主要分为GBK（GB2312，GB18030）系列和UTF系列。UTF系列编码的文件通常在头部的若干个字节已经告诉用户此文件的字符编码格式，即文件包含BOM（Byte Order Mark），此标志用于标识文件的编码方式，常见的有：

BOMs:

00 00 FE FF = UTF-32, big-endian
FF FE 00 00 = UTF-32, little-endian
EF BB BF = UTF-8,
FE FF = UTF-16, big-endian
FF FE = UTF-16, little-endian

在此处提供两种输入流的实现，一种是基于字符的Reader：

/**
 * Reader that chooses the character encoding of a byte stream from its
 * leading Unicode byte order mark (BOM) and skips the BOM itself. When no BOM
 * is present the caller-supplied default encoding is used.
 *
 * See http://www.unicode.org/unicode/faq/utf_bom.html
 *
 * Recognized BOMs:
 *  00 00 FE FF    = UTF-32, big-endian
 *  FF FE 00 00    = UTF-32, little-endian
 *  EF BB BF       = UTF-8,
 *  FE FF          = UTF-16, big-endian
 *  FF FE          = UTF-16, little-endian
 *
 * Win2k Notepad:
 *  Unicode format = UTF-16LE
 *
 * The UTF-32 marks are tested before the UTF-16 ones, so a stream that
 * starts with FF FE 00 00 is treated as UTF-32LE.
 *
 * @author Semantic Wang
 */
public class UnicodeReader extends Reader {

	PushbackInputStream pbin;
	InputStreamReader reader = null;
	String defaultEnc;

	/** Length in bytes of the longest BOM that is recognized. */
	private static final int BOM_SIZE = 4;

	/**
	 * Creates a reader that falls back to "GBK" when the stream has no BOM.
	 *
	 * @param in
	 *            inputstream to be read
	 */
	public UnicodeReader(InputStream in) {
		this(in, "GBK");
	}

	/**
	 * Creates a reader with an explicit fallback encoding.
	 *
	 * @param in
	 *            inputstream to be read
	 * @param defaultEnc
	 *            default encoding if stream does not have BOM marker. Give NULL
	 *            to use system-level default.
	 */
	public UnicodeReader(InputStream in, String defaultEnc) {
		pbin = new PushbackInputStream(in, BOM_SIZE);
		this.defaultEnc = defaultEnc;
	}

	/**
	 * @return the encoding used when the stream carries no BOM; may be null
	 */
	public String getDefaultEncoding() {
		return defaultEnc;
	}

	/**
	 * Get stream encoding or NULL if stream is uninitialized. Call init() or
	 * read() method to initialize it.
	 *
	 * @return the encoding name reported by the underlying
	 *         {@link InputStreamReader}, or null before initialization
	 */
	public String getEncoding() {
		if (reader == null) {
			return null;
		}
		return reader.getEncoding();
	}

	/**
	 * Reads ahead up to four bytes and checks them for a BOM. Only the BOM
	 * bytes are consumed; any other bytes read ahead are pushed back. Does
	 * nothing if the reader has already been initialized.
	 *
	 * @throws IOException
	 *             if reading from the underlying stream fails or the selected
	 *             encoding is not supported
	 */
	protected void init() throws IOException {
		if (reader != null) {
			return;
		}

		byte[] bom = new byte[BOM_SIZE];
		int n = readLeadingBytes(bom);

		String encoding;
		int bomLength;
		// Every test is guarded by the number of bytes actually read, so the
		// zero-filled tail of a short stream can never complete a BOM.
		if (n >= 4 && bom[0] == (byte) 0x00 && bom[1] == (byte) 0x00
				&& bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF) {
			encoding = "UTF-32BE";
			bomLength = 4;
		} else if (n >= 4 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE
				&& bom[2] == (byte) 0x00 && bom[3] == (byte) 0x00) {
			encoding = "UTF-32LE";
			bomLength = 4;
		} else if (n >= 3 && bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB
				&& bom[2] == (byte) 0xBF) {
			encoding = "UTF-8";
			bomLength = 3;
		} else if (n >= 2 && bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF) {
			encoding = "UTF-16BE";
			bomLength = 2;
		} else if (n >= 2 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
			encoding = "UTF-16LE";
			bomLength = 2;
		} else {
			// Unicode BOM mark not found, unread all bytes
			encoding = defaultEnc;
			bomLength = 0;
		}

		if (n > bomLength) {
			pbin.unread(bom, bomLength, n - bomLength);
		}

		// A null encoding selects the platform default charset.
		if (encoding == null) {
			reader = new InputStreamReader(pbin);
		} else {
			reader = new InputStreamReader(pbin, encoding);
		}
	}

	/**
	 * Fills the buffer from the stream, looping because a single read may
	 * return fewer bytes than requested even when more data follows.
	 *
	 * @return the number of bytes stored, between 0 and buf.length
	 */
	private int readLeadingBytes(byte[] buf) throws IOException {
		int total = 0;
		while (total < buf.length) {
			int count = pbin.read(buf, total, buf.length - total);
			if (count <= 0) {
				break;
			}
			total += count;
		}
		return total;
	}

	/**
	 * Closes the underlying stream. No BOM detection is triggered, so closing
	 * an unread reader performs no read on the stream.
	 */
	@Override
	public void close() throws IOException {
		if (reader != null) {
			reader.close();
		} else {
			pbin.close();
		}
	}

	/**
	 * Reads decoded characters, detecting the encoding on the first call.
	 */
	@Override
	public int read(char[] cbuf, int off, int len) throws IOException {
		init();
		return reader.read(cbuf, off, len);
	}

}

另一种是基于字节的输入流InputStream：

/**
 * Byte stream that detects a leading Unicode byte order mark (BOM), reports
 * the encoding it denotes and skips the BOM so callers only see the payload.
 * When no BOM is present the caller-supplied default encoding is reported and
 * no bytes are skipped.
 *
 * Recognized BOMs:
 *  00 00 FE FF    = UTF-32, big-endian
 *  FF FE 00 00    = UTF-32, little-endian
 *  EF BB BF       = UTF-8,
 *  FE FF          = UTF-16, big-endian
 *  FF FE          = UTF-16, little-endian
 *
 * The UTF-32 marks are tested before the UTF-16 ones, so a stream that
 * starts with FF FE 00 00 is treated as UTF-32LE.
 *
 * @author Semantic Wang
 */
public class UnicodeInputStream extends InputStream {

	PushbackInputStream pbin;
	boolean isInited = false;
	String defaultEnc;
	String encoding;

	/** Length in bytes of the longest BOM that is recognized. */
	private static final int BOM_SIZE = 4;

	/**
	 * Creates a stream that reports "GBK" when the input has no BOM.
	 *
	 * @param in
	 *            inputstream to be read
	 */
	public UnicodeInputStream(InputStream in) {
		this(in, "GBK");
	}

	/**
	 * Creates a stream with an explicit fallback encoding.
	 *
	 * @param in
	 *            inputstream to be read
	 * @param defaultEnc
	 *            encoding reported when the input has no BOM; may be null
	 */
	public UnicodeInputStream(InputStream in, String defaultEnc) {
		pbin = new PushbackInputStream(in, BOM_SIZE);
		this.defaultEnc = defaultEnc;
	}

	/**
	 * @return the encoding reported when the input carries no BOM; may be null
	 */
	public String getDefaultEncoding() {
		return defaultEnc;
	}

	/**
	 * Returns the detected encoding, running BOM detection first if it has not
	 * happened yet.
	 *
	 * @return the encoding named by the BOM, or the default encoding when no
	 *         BOM was found; null if the stream was closed before detection
	 * @throws IllegalStateException
	 *             if reading the leading bytes fails; the IOException is
	 *             attached as the cause
	 */
	public String getEncoding() {
		if (!isInited) {
			try {
				init();
			} catch (IOException ex) {
				// Chain the real failure; the exception cannot be its own cause.
				throw new IllegalStateException("Init method failed.", ex);
			}
		}
		return encoding;
	}

	/**
	 * Reads ahead up to four bytes and checks them for a BOM. Only the BOM
	 * bytes are consumed; any other bytes read ahead are pushed back. Does
	 * nothing if detection has already run.
	 *
	 * @throws IOException
	 *             if reading from the underlying stream fails
	 */
	protected void init() throws IOException {
		if (isInited) {
			return;
		}

		byte[] bom = new byte[BOM_SIZE];
		int n = readLeadingBytes(bom);

		int bomLength;
		// Every test is guarded by the number of bytes actually read, so the
		// zero-filled tail of a short stream can never complete a BOM.
		if (n >= 4 && bom[0] == (byte) 0x00 && bom[1] == (byte) 0x00
				&& bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF) {
			encoding = "UTF-32BE";
			bomLength = 4;
		} else if (n >= 4 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE
				&& bom[2] == (byte) 0x00 && bom[3] == (byte) 0x00) {
			encoding = "UTF-32LE";
			bomLength = 4;
		} else if (n >= 3 && bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB
				&& bom[2] == (byte) 0xBF) {
			encoding = "UTF-8";
			bomLength = 3;
		} else if (n >= 2 && bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF) {
			encoding = "UTF-16BE";
			bomLength = 2;
		} else if (n >= 2 && bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
			encoding = "UTF-16LE";
			bomLength = 2;
		} else {
			// Unicode BOM mark not found, unread all bytes
			encoding = defaultEnc;
			bomLength = 0;
		}

		if (n > bomLength) {
			pbin.unread(bom, bomLength, n - bomLength);
		}

		isInited = true;
	}

	/**
	 * Fills the buffer from the stream, looping because a single read may
	 * return fewer bytes than requested even when more data follows.
	 *
	 * @return the number of bytes stored, between 0 and buf.length
	 */
	private int readLeadingBytes(byte[] buf) throws IOException {
		int total = 0;
		while (total < buf.length) {
			int count = pbin.read(buf, total, buf.length - total);
			if (count <= 0) {
				break;
			}
			total += count;
		}
		return total;
	}

	/**
	 * Closes the underlying stream without running BOM detection. The stream
	 * is marked initialized so a later getEncoding() does not try to read
	 * from the closed stream.
	 */
	@Override
	public void close() throws IOException {
		isInited = true;
		pbin.close();
	}

	/**
	 * Reads the next payload byte. BOM detection runs on the first call, so a
	 * BOM is never returned as data.
	 */
	@Override
	public int read() throws IOException {
		init();
		return pbin.read();
	}

	/**
	 * Bulk variant of {@link #read()}; delegates to the pushback stream so
	 * callers are not forced through byte-at-a-time reads.
	 */
	@Override
	public int read(byte[] b, int off, int len) throws IOException {
		init();
		return pbin.read(b, off, len);
	}

}

最后的调用方式为：

InputStream in = new FileInputStream(fileName);
BufferedReader reader = new BufferedReader(new UnicodeReader(in));

分享到：

免费的UML生成工具Open ModelSphere | 从今天起，做一个热爱分享的知识控

2013-05-03 12:17
浏览 1324
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

Java根据文件的BOM判断文件的编码类型

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

Java根据文件的BOM判断文件的编码类型

评论

发表评论

相关推荐

最近访客更多访客>>