none
unicode convert to big5 RSS feed

  • 問題

  • 大家好,

    小弟現有一個用UNICODE編碼的HTML, 因是UNICODE編碼, 所以中文字例如 "中文測試" 被編碼為 "&#20013;&#25991;&#28204;&#35430;" (即HTML數字字元參照).

    自己寫了一段小程式來轉換, 但都沒成功, 煩請前輩協助能將此HTML檔案讀進, 後將還原為 "中文測試" 的方法, 謝謝.

    下列為該UNICODE編碼的HTML

    <html>
    <body>
    <ul><li>&#20013;&#25991;&#28204;&#35430;</li>
    <li>good test</li>
    </ul></body>
    </html>

    希望能透過程式轉換為

    <html>
    <body>
    <ul><li>中文測試</li>
    <li>good test</li>
    </ul></body>
    </html>

    下列為小弟不才自己寫的程式

    using System;
    using System.IO;
    using System.Text;
    using System.Text.RegularExpressions;

    namespace w_convert
    {
        /// <summary>
        /// Console tool that reads an HTML file, expands decimal numeric character
        /// references (for example <c>&amp;#20013;</c>) into the characters they stand
        /// for, and writes the result next to the input file as <c>new_&lt;name&gt;</c>,
        /// encoded in code page 950 (Big5).
        /// </summary>
        class w_convert
        {
            private const string UsageMessage = "Please enter right string such as c:\\abc\\helloworld.htm";

            // Decimal numeric character references only. [0-9] is used instead of \d
            // because \d also matches non-ASCII digits, which are not valid here.
            private static readonly Regex NumericEntity = new Regex("&#([0-9]+);");

            /// <summary>
            /// Entry point. Expects the path of the HTML file to convert in
            /// <c>args[0]</c>. Each input line is echoed to the console and written,
            /// with its numeric references expanded, to the output file. Any exception
            /// is reported on the console instead of being propagated.
            /// </summary>
            /// <param name="args">Command-line arguments; the first one is the input path.</param>
            static void Main(string[] args)
            {
                try
                {
                    if (args.Length == 0 || args[0].Trim() == "")
                    {
                        Console.WriteLine(UsageMessage);
                        return;
                    }

                    string inputPath = args[0].Trim();
                    if (!File.Exists(inputPath))
                    {
                        Console.WriteLine("File not exist");
                        return;
                    }

                    // Path APIs handle both separators and bare file names, so no
                    // manual LastIndexOf/Substring arithmetic is needed.
                    string directory = Path.GetDirectoryName(inputPath);
                    string outputName = "new_" + Path.GetFileName(inputPath);
                    string outputPath = string.IsNullOrEmpty(directory)
                        ? outputName
                        : Path.Combine(directory, outputName);

                    Encoding big5 = Encoding.GetEncoding(950);

                    // "using" guarantees both files are closed even when an exception
                    // is thrown; append=false overwrites an existing output file.
                    using (StreamReader reader = new StreamReader(inputPath))
                    using (StreamWriter writer = new StreamWriter(outputPath, false, big5))
                    {
                        string line;
                        while ((line = reader.ReadLine()) != null)
                        {
                            Console.WriteLine(line);

                            // ReadLine strips the line terminator, so WriteLine puts one
                            // back; otherwise the whole output collapses onto one line.
                            writer.WriteLine(DecodeNumericEntities(line, big5));
                        }
                    }
                }
                catch (Exception e)
                {
                    Console.WriteLine("system error message:" + e.ToString());
                }
            }

            /// <summary>
            /// Replaces every decimal numeric character reference in
            /// <paramref name="text"/> that can be safely expanded with the character
            /// it denotes.
            /// </summary>
            /// <param name="text">One line of HTML.</param>
            /// <param name="target">Encoding the result will be written in.</param>
            /// <returns>The text with expandable references replaced.</returns>
            static string DecodeNumericEntities(string text, Encoding target)
            {
                return NumericEntity.Replace(text, delegate(Match match)
                {
                    return DecodeEntity(match, target);
                });
            }

            /// <summary>
            /// Converts a single matched reference to its character, or returns the
            /// reference unchanged when expanding it would be wrong or lossy.
            /// </summary>
            /// <param name="match">A match of the numeric reference pattern.</param>
            /// <param name="target">Encoding the result will be written in.</param>
            /// <returns>The decoded character(s) or the original reference text.</returns>
            static string DecodeEntity(Match match, Encoding target)
            {
                int codePoint;
                if (!int.TryParse(match.Groups[1].Value, out codePoint))
                {
                    // Too many digits to be a code point; leave the text alone.
                    return match.Value;
                }

                // ASCII references are usually deliberate escapes of markup characters
                // (&#60; for '<', &#38; for '&'), so expanding them could change the
                // meaning of the HTML.
                if (codePoint < 0x80)
                {
                    return match.Value;
                }

                // Surrogate values and anything above U+10FFFF are not valid code
                // points; char.ConvertFromUtf32 would throw for them.
                if (codePoint > 0x10FFFF || (codePoint >= 0xD800 && codePoint <= 0xDFFF))
                {
                    return match.Value;
                }

                string decoded = char.ConvertFromUtf32(codePoint);

                // Keep the reference when the target code page cannot represent the
                // character exactly; writing it would silently turn it into '?'.
                string roundTripped = target.GetString(target.GetBytes(decoded));
                if (roundTripped != decoded)
                {
                    return match.Value;
                }

                return decoded;
            }
        }
    }

    2006年9月15日 上午 06:33

所有回覆

  • 下列code可以達到你要的轉換方式

    測試方式:

    C:\unicode.html內容為

    <html>
    <body>
    <ul><li>&#20013;&#25991;&#28204;&#35430;</li>
    <li>good test</li>
    </ul></body>
    </html>

     

    執行這段code後,會得到c:\MS950.html,內容為

    <html>
    <body>
    <ul><li>中文測試</li>
    <li>good test</li>
    </ul></body>
    </html>

     

    只是小弟的RegularExpression實在學的頗爛,感覺上,如果這個html內容大一點的時候,就會效能很差

     

                StreamReader sr = new StreamReader("c:\\unicode.html");

                // get content of html file
                string unicodeContent = sr.ReadToEnd();
                string ms950Content = unicodeContent;

                sr.Close();

                string matchString = @"&#\d+;";

                for (Match m = Regex.Match(unicodeContent, matchString); m.Success; m = m.NextMatch())
                {
                    // get the unicode number. Ex: &#20013; => 20013
                    int unicodeNumber = -1;
                    if (Int32.TryParse(m.Value.Substring(2, m.Value.Length - 3), out unicodeNumber))
                    {
                        // convert from Unicode to MS950
                        string convertedWord = Encoding.GetEncoding(950).GetString(
                            Encoding.Convert(Encoding.Unicode, Encoding.GetEncoding(950), BitConverter.GetBytes(unicodeNumber))
                            );

                        // remove the null ending of converted string
                        convertedWord = convertedWord.Replace("\0", "");

                        // replace the original string with MS950-string
                        ms950Content = ms950Content.Replace(m.Value, convertedWord);
                    }
                }

                Console.WriteLine(ms950Content);

                StreamWriter sw = new StreamWriter("c:\\ms950.html", false, Encoding.GetEncoding(950));
                sw.Write(ms950Content);
                sw.Flush();
                sw.Close();

    2006年10月2日 上午 07:43
  • using System.IO;

    using System.Text;

    // Decodes the byte array `bytes` as UTF-16 (Encoding.Unicode) text and stores it in `str`;
    // both names are declared outside this fragment.
    // NOTE(review): this only decodes raw bytes - numeric character references such as
    // &#20013; in the resulting text would presumably still need to be expanded separately.
    str=Encoding.Unicode.GetString(bytes);

     

    這樣不可以嗎?

    2006年10月4日 下午 04:09