// www.pudn.com > charsettools_1.0.2.zip > CharsetToolkit.java


/*
* Program CharsetToolkit.java
*
* Author Keiven Ju
*
* Created 2003-12-12 5:03:51
*
* Copyright 1995-1998,2000-2003 by AKuP International, Inc.,
*
* Nanjing:
* Unit C, 26F Jianjiang Culture Building, No. 89 Zhongshannan Rd. Nanjing 210005 China
* 26F, Jianjiang Building No.89,Zhongshan South Rd, Nanjing
*
* Taipei :
* 5F, NO.6, Alley 36, Lane 26, Rueiguang Rd, Neihu District, Taipei City 114, Taiwan (R.O.C)
*
* All rights reserved.
*
* This software is the confidential and proprietary information
* of AKuP International, Inc. ("Confidential Information"). You
* shall not disclose such Confidential Information and shall use
* it only in accordance with the terms of the license agreement
* you entered into with AKuP.
*
* E-mail keiven.ju@akup.com for more information.
*/
package com.akup.charset;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;

/**
 * Charset conversion helpers with optional Simplified/Traditional Chinese
 * character substitution.
 *
 * <p>The substitution tables are loaded once, at construction time, from the
 * class-path resource {@code hcutf8.txt}. Charset names are resolved against
 * the {@code nicename}, {@code javaname} and {@code htmlname} tables of
 * {@link Encoding}. The charset constants used below ({@code GB2312},
 * {@code BIG5}, ...) are not declared in this file; presumably they are
 * inherited from {@link Encoding}.
 *
 * @author Keiven Ju
 * @since 1.0.0
 * @version 1.0.0 2003-12-12 5:03:51
 */
public class CharsetToolkit extends Encoding {

    /** Simplified --> Traditional character equivalence table (String -> String). */
    private HashMap s2thash;

    /** Traditional --> Simplified character equivalence table (String -> String). */
    private HashMap t2shash;

    /**
     * When {@code true}, every character is run through the
     * Simplified/Traditional tables; when {@code false} (the default) only
     * characters that do not survive an encode/decode round trip through the
     * target charset are.
     */
    private boolean useZhCode = false;

    /**
     * Creates a toolkit and loads the Simplified/Traditional tables.
     *
     * @since 1.0.0 2003-12-15
     */
    public CharsetToolkit() {
        super();
        init();
    }

    /**
     * Loads the Simplified/Traditional tables from the resource
     * {@code hcutf8.txt}, read as UTF-8.
     *
     * <p>Each data line is read as: first character = Simplified form, every
     * following character = a Traditional equivalent. Empty lines and lines
     * starting with {@code #} are skipped. Loading is best effort: failures
     * are reported on {@code System.err} and leave the tables (partially)
     * empty instead of failing construction.
     *
     * @since 1.0.0 2003-12-11
     */
    private void init() {
        s2thash = new HashMap();
        t2shash = new HashMap();

        InputStream pydata = getClass().getResourceAsStream("hcutf8.txt");
        if (pydata == null) {
            // Previously this surfaced as a NullPointerException stack trace.
            System.err.println("CharsetToolkit: resource hcutf8.txt not found; "
                    + "Simplified/Traditional tables are empty");
            return;
        }

        try {
            try {
                BufferedReader in = new BufferedReader(new InputStreamReader(pydata, "UTF8"));
                String dataline;
                while ((dataline = in.readLine()) != null) {
                    // Skip empty and commented lines.
                    if (dataline.length() == 0 || dataline.charAt(0) == '#') {
                        continue;
                    }
                    // A line without any equivalent carries no mapping; skipping it
                    // keeps one short line from aborting the whole load.
                    if (dataline.length() < 2) {
                        continue;
                    }
                    String simplified = dataline.substring(0, 1);
                    // Simplified to Traditional (one to many, but pick only the first).
                    s2thash.put(simplified, dataline.substring(1, 2));
                    // Traditional to Simplified (many to one).
                    for (int i = 1; i < dataline.length(); i++) {
                        t2shash.put(dataline.substring(i, i + 1), simplified);
                    }
                }
            } finally {
                pydata.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println(e);
        }
    }

    /**
     * Looks up the index of a charset name in the {@link Encoding} name tables.
     *
     * @param charsetname name to look up, compared case-insensitively against
     *                    the nice, Java and HTML names
     * @return the table index, or {@code -1} when no table contains the name
     * @since 1.0.0 2003-12-15
     */
    protected int indexCharset(String charsetname) {
        for (int i = 0; i < Encoding.TOTALTYPES; i++) {
            if (Encoding.nicename[i].equalsIgnoreCase(charsetname)
                    || Encoding.javaname[i].equalsIgnoreCase(charsetname)
                    || Encoding.htmlname[i].equalsIgnoreCase(charsetname)) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Converts a text file from one charset to another; see
     * {@link #convertFile(File, String, File, String)}.
     *
     * @param strInFile  path of the input file
     * @param srccharset charset of the input file
     * @param strOutFile path of the output file
     * @param trgcharset charset of the output file
     * @throws IOException if reading or writing fails, or a charset is not supported
     * @since 1.0.0 2003-12-15
     */
    public void convertFile(String strInFile, String srccharset, String strOutFile, String trgcharset)
            throws IOException {
        convertFile(new File(strInFile), srccharset, new File(strOutFile), trgcharset);
    }

    /**
     * Converts a text file line by line from one charset to another.
     *
     * <p>Does nothing when either charset name is {@code null}. Lines are read
     * with {@code readLine} and written followed by the platform line
     * separator, so line endings are normalised and the output always ends
     * with a line separator.
     *
     * @param strInFile  input file
     * @param srccharset charset of the input file
     * @param strOutFile output file (created or overwritten)
     * @param trgcharset charset of the output file
     * @throws UnsupportedEncodingException if either name is unknown to {@link Encoding}
     * @throws IOException if reading or writing fails
     * @since 1.0.0 2003-12-15
     */
    public void convertFile(File strInFile, String srccharset, File strOutFile, String trgcharset)
            throws IOException {
        if (srccharset == null || trgcharset == null) {
            return;
        }
        int src_code = this.indexCharset(srccharset);
        int trg_code = this.indexCharset(trgcharset);
        // Either lookup failing is an error. The former test (product < 0) missed
        // "both unknown" (-1 * -1 = 1) and "index 0 with unknown" (0 * -1 = 0).
        if (src_code < 0 || trg_code < 0) {
            System.err.println(src_code + " " + trg_code);
            throw new UnsupportedEncodingException(srccharset + " or " + trgcharset);
        }

        FileInputStream fis = new FileInputStream(strInFile);
        try {
            BufferedReader in = new BufferedReader(new InputStreamReader(fis, srccharset));
            FileOutputStream fos = new FileOutputStream(strOutFile);
            try {
                BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fos, trgcharset));
                StringBuffer buffer = new StringBuffer();
                String strline;
                while ((strline = in.readLine()) != null) {
                    this.myConvertString(strline, src_code, trg_code, trgcharset, buffer);
                    out.write(buffer.toString());
                    out.newLine();
                    buffer.setLength(0);
                }
                // Close the writer (not just the stream) on success so that both the
                // buffer and the charset encoder are flushed.
                out.close();
            } finally {
                // Releases the file handle on failure; harmless after out.close().
                fos.close();
            }
        } finally {
            fis.close();
        }
    }

    /**
     * Converts a string for the given target charset.
     *
     * @param instring   text to convert
     * @param srccharset name of the source charset
     * @param trgcharset name of the target charset
     * @return the converted text
     * @throws IOException if the target charset is not supported by the runtime
     * @since 1.0.0 2003-12-15
     */
    public String convertString(String instring, String srccharset, String trgcharset) throws IOException {
        StringBuffer sb = new StringBuffer();
        int src_code = this.indexCharset(srccharset);
        int trg_code = this.indexCharset(trgcharset);
        this.myConvertString(instring, src_code, trg_code, trgcharset, sb);
        return sb.toString();
    }

    /**
     * Appends the converted form of {@code instring} to {@code trgbuffer}.
     *
     * <p>With {@code useZhCode} set, the whole string goes through
     * {@link #convertStringZHcode}. Otherwise the string is encoded to
     * {@code trgcharset} and decoded again; every position that comes back as
     * {@code '?'} (0x3F) is taken from the input and converted through the
     * tables, and every other position is copied from the round-tripped text.
     *
     * <p>NOTE(review): this aligns the round-tripped text with the input by
     * index, which assumes both have the same length — TODO confirm this
     * holds for characters outside the Basic Multilingual Plane.
     *
     * @param instring   text to convert
     * @param src_code   {@link Encoding} index of the source charset
     * @param trg_code   {@link Encoding} index of the target charset
     * @param trgcharset name of the target charset
     * @param trgbuffer  buffer receiving the result
     * @throws IOException if the target charset is not supported by the runtime
     * @since 1.0.0 2003-12-15
     */
    private void myConvertString(String instring, int src_code, int trg_code, String trgcharset,
            StringBuffer trgbuffer) throws IOException {
        if (useZhCode) {
            this.convertStringZHcode(instring, src_code, trg_code, trgbuffer);
            return;
        }

        String trgstr = roundTrip(instring, trgcharset);
        int len = trgstr.length();
        for (int idx = 0; idx < len; idx++) {
            char ch = trgstr.charAt(idx);
            // The bounds check keeps a length mismatch from throwing.
            if (ch == 0x3F && idx < instring.length()) {
                this.convertStringZHcode(instring.substring(idx, idx + 1), src_code, trg_code, trgbuffer);
            } else {
                trgbuffer.append(ch);
            }
        }
    }

    /**
     * Encodes {@code text} to {@code charset} and decodes the bytes again.
     *
     * @param text    text to round-trip
     * @param charset charset name
     * @return the decoded text
     * @throws IOException if the charset is not supported by the runtime
     */
    private static String roundTrip(String text, String charset) throws IOException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        OutputStreamWriter writer = new OutputStreamWriter(bytes, charset);
        writer.write(text);
        writer.close();
        return bytes.toString(charset);
    }

    /**
     * Appends {@code dataline} to {@code outline}, replacing each character
     * through the Simplified-to-Traditional or Traditional-to-Simplified table
     * according to the source/target charset pair. Characters without a table
     * entry, and all characters of any other charset pair, are copied as is.
     *
     * @param dataline text to convert
     * @param src_cod  {@link Encoding} index of the source charset
     * @param trg_cod  {@link Encoding} index of the target charset
     * @param outline  buffer receiving the result
     * @since 1.0.0 2003-12-11
     */
    private void convertStringZHcode(String dataline, int src_cod, int trg_cod, StringBuffer outline) {
        // The direction depends only on the charset pair, so decide it once.
        boolean toTraditional =
                (src_cod == GB2312 || src_cod == GBK || src_cod == ISO2022CN_GB
                        || src_cod == UNICODE || src_cod == UNICODES || src_cod == UTF8)
                && (trg_cod == BIG5 || trg_cod == CNS11643 || trg_cod == UNICODET
                        || trg_cod == ISO2022CN_CNS);
        boolean toSimplified = !toTraditional
                && (src_cod == BIG5 || src_cod == CNS11643 || src_cod == UNICODET
                        || src_cod == UTF8 || src_cod == ISO2022CN_CNS || src_cod == GBK
                        || src_cod == UNICODE)
                && (trg_cod == GB2312 || trg_cod == UNICODES || trg_cod == ISO2022CN_GB);

        HashMap table = null;
        if (toTraditional) {
            table = s2thash;
        } else if (toSimplified) {
            table = t2shash;
        }

        for (int i = 0; i < dataline.length(); i++) {
            String unit = dataline.substring(i, i + 1);
            Object mapped = (table == null) ? null : table.get(unit);
            outline.append(mapped != null ? (String) mapped : unit);
        }
    }

    /**
     * Prints a side-by-side comparison of two strings to {@code System.out}:
     * for each index up to the shorter length, both characters, their numeric
     * values and the difference of those values.
     *
     * @param src first string
     * @param trg second string
     * @since 1.0.0 2003-12-15
     */
    public static void compareString(String src, String trg) {
        int len = Math.min(src.length(), trg.length());
        System.out.println("-----------------------------------");
        System.out.println("src " + " " + "int" + " " + "int" + " " + "trg" + " " + " src-trg");
        System.out.println("-----------------------------------");
        for (int i = 0; i < len; i++) {
            char srch = src.charAt(i);
            char trgh = trg.charAt(i);
            System.out.println(srch + " " + (int) srch + " " + (int) trgh + " " + trgh + " "
                    + ((int) srch - (int) trgh));
        }
    }

    /**
     * Reads a whole file into a string.
     *
     * @param strInFile path of the file to read
     * @param charset   charset of the file
     * @return the file content, or {@code null} when {@code charset} is {@code null}
     * @throws IOException if the file cannot be read or the charset is not supported
     */
    public static String readInput(String strInFile, String charset) throws IOException {
        if (charset == null) {
            return null;
        }
        StringBuffer buffer = new StringBuffer();
        FileInputStream fis = new FileInputStream(strInFile);
        try {
            BufferedReader in = new BufferedReader(new InputStreamReader(fis, charset));
            char[] chunk = new char[4096];
            int count;
            while ((count = in.read(chunk, 0, chunk.length)) > -1) {
                buffer.append(chunk, 0, count);
            }
        } finally {
            // Also runs when the charset is rejected after the file was opened.
            fis.close();
        }
        return buffer.toString();
    }

    /**
     * Writes a string to a file in the given charset. I/O failures are not
     * propagated; their stack trace is printed instead.
     *
     * @param str        text to write
     * @param strOutFile path of the output file (created or overwritten)
     * @param trgcharset charset of the output file
     * @since 1.0.0 2003-12-15
     */
    public static void writeOutput(String str, String strOutFile, String trgcharset) {
        try {
            FileOutputStream fos = new FileOutputStream(strOutFile);
            try {
                OutputStreamWriter out = new OutputStreamWriter(fos, trgcharset);
                out.write(str);
                // Close the writer on success so the charset encoder is flushed.
                out.close();
            } finally {
                // Releases the file handle on failure; harmless after out.close().
                fos.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the input with every character at or above 0x200 replaced by a
     * Java-style {@code \}{@code uXXXX} escape (four hex digits, zero-padded).
     * Characters below 0x200 are copied unchanged.
     *
     * @param instring text to escape; must not be {@code null}
     * @return the escaped text
     */
    public static String toUnicode(String instring) {
        return escapeFrom0x200(instring, "\\u", "", 4);
    }

    /**
     * Returns the input with every character at or above 0x200 replaced by an
     * HTML hexadecimal character reference ({@code &#x...;}), a form that can
     * be displayed directly in a browser. Characters below 0x200 are copied
     * unchanged.
     *
     * @param instring text to escape; must not be {@code null}
     * @return the escaped text
     * @since 1.0.0 2003-12-11
     */
    public static String toIEUnicode(String instring) {
        return escapeFrom0x200(instring, "&#x", ";", 1);
    }

    /**
     * Shared escaping loop of {@link #toUnicode} and {@link #toIEUnicode}.
     *
     * @param text         text to escape
     * @param prefix       text emitted before the hex digits
     * @param suffix       text emitted after the hex digits
     * @param minHexDigits minimum number of hex digits; shorter values are zero-padded
     * @return the escaped text
     */
    private static String escapeFrom0x200(String text, String prefix, String suffix, int minHexDigits) {
        StringBuffer out = new StringBuffer();
        for (int i = 0; i < text.length(); i++) {
            char ch = text.charAt(i);
            if (ch < 0x200) {
                out.append(ch);
                continue;
            }
            String hex = Integer.toHexString(ch);
            out.append(prefix);
            for (int pad = hex.length(); pad < minHexDigits; pad++) {
                out.append('0');
            }
            out.append(hex);
            out.append(suffix);
        }
        return out.toString();
    }

    /**
     * Round-trips the text through {@code trgcharset} and replaces every
     * position that comes back as {@code '?'} (0x3F) with the
     * {@link #toIEUnicode} form of the input character at that position.
     *
     * <p>NOTE(review): like the instance conversion, this aligns the
     * round-tripped text with the input by index — TODO confirm for
     * characters outside the Basic Multilingual Plane.
     *
     * @param srcstring  text to convert
     * @param trgcharset name of the target charset
     * @return the converted text
     * @throws IOException if the target charset is not supported by the runtime
     * @since 1.0.0 2003-12-16
     */
    public static String toFixCode(String srcstring, String trgcharset) throws IOException {
        StringBuffer trgbuffer = new StringBuffer();
        String trgstr = roundTrip(srcstring, trgcharset);
        int len = trgstr.length();
        for (int index = 0; index < len; index++) {
            char ch = trgstr.charAt(index);
            // The bounds check keeps a length mismatch from throwing.
            if (ch == 0x3F && index < srcstring.length()) {
                trgbuffer.append(toIEUnicode(srcstring.substring(index, index + 1)));
            } else {
                trgbuffer.append(ch);
            }
        }
        return trgbuffer.toString();
    }

    /**
     * Reads a file in the given charset, escapes it with {@link #toUnicode}
     * and writes the result in the same charset.
     *
     * @param chartset charset of both the input and the output file
     * @param infile   path of the input file
     * @param outfile  path of the output file
     * @throws IOException if the input file cannot be read
     */
    public static void FiletoUnicode(String chartset, String infile, String outfile) throws IOException {
        System.out.println("Input " + chartset + " file, output " + chartset + " file.");
        String str = readInput(infile, chartset);
        String strunicode = toUnicode(str);
        writeOutput(strunicode, outfile, chartset);
        System.out.println("All of Characters in file.ok.");
    }

    /**
     * Reads a file in the given charset, escapes it with {@link #toIEUnicode}
     * and writes the result in the same charset.
     *
     * @param chartset charset of both the input and the output file
     * @param infile   path of the input file
     * @param outfile  path of the output file
     * @throws IOException if the input file cannot be read
     * @since 1.0.0 2003-12-15
     */
    public static void FiletoIEUnicode(String chartset, String infile, String outfile) throws IOException {
        System.out.println("Input " + chartset + " file, output " + chartset + " file.");
        String str = readInput(infile, chartset);
        // Was toUnicode(str), which made this method identical to FiletoUnicode.
        String strunicode = toIEUnicode(str);
        writeOutput(strunicode, outfile, chartset);
        System.out.println("All of Characters in file.ok.");
    }

    /**
     * Tells whether every character is run through the Simplified/Traditional tables.
     *
     * @return the current setting
     * @since 1.0.0 2003-12-17
     */
    public boolean isUseZhCode() {
        return useZhCode;
    }

    /**
     * Selects whether every character is run through the Simplified/Traditional tables.
     *
     * @param b new setting
     * @since 1.0.0 2003-12-17
     */
    public void setUseZhCode(boolean b) {
        useZhCode = b;
    }

    /**
     * Demo entry point: reads {@code AA.txt} as GB2312, converts it for Big5,
     * writes {@code BB.txt} and prints a character comparison.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            CharsetToolkit ctk = new CharsetToolkit();
            String gbstring = readInput("AA.txt", "GB2312");
            // The result of this call is replaced right below; the call itself is
            // kept so the demo still exercises the instance conversion path.
            String bigstring = ctk.convertString(gbstring, "GB2312", "BIG5");
            bigstring = toFixCode(gbstring, "BIG5");
            writeOutput(bigstring, "BB.txt", "Big5");
            compareString(gbstring, bigstring);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}