Unicode 라이브러리인 icu4j 를 사용하여 텍스트 파일의 문자 집합 인코딩(character set encoding)을 알아내기.

사용

maven 설정

<!-- ICU4J: supplies com.ibm.icu.text.CharsetDetector and CharsetMatch used by the example below. -->
<dependency>
	<groupId>com.ibm.icu</groupId>
	<artifactId>icu4j</artifactId>
	<version>52.1</version>
</dependency>

CharsetDetector 클래스로 인코딩 확인

package com.ktnet.tradesign;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import org.apache.commons.io.IOUtils;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;

/**
 * Example test that uses ICU4J's {@link CharsetDetector} to guess the
 * character set encoding of a few sample text files and logs the result.
 */
public class CharDetector {
	/** Names of the sample files to inspect; relative paths resolve against the working directory. */
	private String[] data = {
			"ms949.txt",
			"utf8.txt"
	};

	Logger logger = LoggerFactory.getLogger(CharDetector.class);

	/**
	 * Reads each file listed in {@code data} completely, runs charset
	 * detection over its bytes and logs the name of the best match.
	 * Files for which the detector reports no match are logged as a warning
	 * instead of failing with a {@link NullPointerException}.
	 *
	 * @throws IOException if a file cannot be opened or read
	 */
	@Test
	public void detect() throws IOException {
		for (String fn : data) {
			byte[] byteData = readFully(new File(fn));

			// A fresh detector per file keeps state from one input out of the next.
			CharsetDetector detector = new CharsetDetector();
			detector.setText(byteData);
			CharsetMatch match = detector.detect();

			// detect() returns null when no charset matches with sufficient confidence.
			if (match == null) {
				logger.warn("{} encoding could not be detected", fn);
				continue;
			}

			logger.info("{} encoding is \"{}\"", fn, match.getName());
		}
	}

	/**
	 * Reads the whole content of {@code file} into a byte array.
	 * <p>
	 * A single {@code InputStream.read(byte[])} call is allowed to return
	 * fewer bytes than requested, so the stream is drained with
	 * {@code IOUtils.toByteArray} rather than one read into a buffer sized
	 * from {@code File.length()}.
	 *
	 * @param file the file to read
	 * @return every byte of the file
	 * @throws IOException if the file cannot be opened or read
	 */
	private static byte[] readFully(File file) throws IOException {
		FileInputStream fis = new FileInputStream(file);
		try {
			return IOUtils.toByteArray(fis);
		}
		finally {
			// Closed here so each stream is released as soon as its file is read,
			// even when reading fails part-way through.
			IOUtils.closeQuietly(fis);
		}
	}
}

문제

'똠'(EUC-KR 코드표에는 없고 MS949 에만 있는 글자)이 들어간 데이터를 입력해 보았는데, MS949 는 표준 인코딩이 아니라서 그런지 EUC-KR 로 표시됨.
한글 검출이 정확하지 않음. EUC-KR 코드표에 있는 문자만 들어 있는 파일인데도 Windows-1256 으로 검출됨.

같이 보기

Browser not supported