none
How to detect encoding file in ANSI, UTF8 and UTF8 without BOM RRS feed

  • Вопрос

  • Hi all,

    I am having a problem detecting the encoding of a .txt/.csv file. I need to distinguish between files in ANSI, UTF-8 with a BOM, and UTF-8 without a BOM, but the problem is that ANSI and UTF-8 without a BOM are reported as the same encoding. I checked with the function below and saw that it returns the same encoding for both. So, how can I detect a file encoded as UTF-8 without a BOM? I need to handle this case in my code.

    Thanks.

    ///////////////////////////////////////////////////////////////////

    /// <summary>
    /// Detects the text encoding of a file by inspecting its byte order mark (BOM).
    /// </summary>
    /// <param name="srcFile">Path of the file to inspect.</param>
    /// <returns>
    /// The encoding announced by the BOM, or <see cref="Encoding.Default"/> when no known
    /// BOM is present. A UTF-8 file without a BOM therefore also yields the default encoding.
    /// </returns>
    public Encoding GetFileEncoding(string srcFile)
    {
        // The longest BOM recognized here (UTF-32) is four bytes long.
        byte[] buffer = new byte[4];
        int read = 0;

        // Open read-only so read-only files can be inspected, and dispose the stream
        // even when Read throws.
        using (FileStream file = File.OpenRead(srcFile))
        {
            int chunk;
            while (read < buffer.Length
                   && (chunk = file.Read(buffer, read, buffer.Length - read)) > 0)
            {
                read += chunk;
            }
        }

        // UTF-8: EF BB BF
        if (read >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF)
        {
            return Encoding.UTF8;
        }

        // UTF-32 little-endian: FF FE 00 00.
        // Must be tested before UTF-16 LE, whose BOM (FF FE) is a prefix of this one.
        if (read >= 4 && buffer[0] == 0xFF && buffer[1] == 0xFE && buffer[2] == 0x00 && buffer[3] == 0x00)
        {
            return Encoding.UTF32;
        }

        // UTF-32 big-endian: 00 00 FE FF
        if (read >= 4 && buffer[0] == 0x00 && buffer[1] == 0x00 && buffer[2] == 0xFE && buffer[3] == 0xFF)
        {
            return new UTF32Encoding(true, true);
        }

        // UTF-16 little-endian (code page 1200): FF FE
        if (read >= 2 && buffer[0] == 0xFF && buffer[1] == 0xFE)
        {
            return Encoding.Unicode;
        }

        // UTF-16 big-endian (code page 1201): FE FF
        if (read >= 2 && buffer[0] == 0xFE && buffer[1] == 0xFF)
        {
            return Encoding.BigEndianUnicode;
        }

        // UTF-7: 2B 2F 76
        if (read >= 3 && buffer[0] == 0x2B && buffer[1] == 0x2F && buffer[2] == 0x76)
        {
            return Encoding.UTF7;
        }

        // No BOM: fall back to Encoding.Default (the ANSI code page on .NET Framework).
        return Encoding.Default;
    }

    //////////////////////////////////////////////

Ответы

  • Hi,

    There is no 100% reliable way to determine whether a byte stream is ANSI or UTF-8 (without BOM).

    If by ANSI you really meant ASCII, things are a little simpler - if there are any bytes over x7F, then you could infer it's UTF-8 (or bad ASCII, or another code page altogether; only you know if you can exclude those possibilities).

    If it really is ANSI (and btw, there is no such single code page. There is a set of Windows Code Pages, the most common of which is code page 1252, so I will assume you mean that), you are going to have to do a little more work. You could look to see if the bytes above x7F are legal UTF-8. They will always come in groups of at least two: a lead byte followed by one or more continuation bytes. The original UTF-8 design allowed up to 6 bytes per character, but current UTF-8 (RFC 3629) uses at most 4. (See here: http://en.wikipedia.org/wiki/UTF-8).

    So, if the byte stream is not legal UTF-8, you could infer it's ANSI (Code Page 1252). (Or just bad UTF-8 or something else. Again, only you know if these can be excluded.)

    If the byte stream has no byte values over x7F, you're in luck - either encoding will work.

    HTH,

    Nick

    • Помечено в качестве ответа khoana 9 июля 2014 г. 9:45

Все ответы

  • Hi,

    There is no 100% reliable way to determine whether a byte stream is ANSI or UTF-8 (without BOM).

    If by ANSI you really meant ASCII, things are a little simpler - if there are any bytes over x7F, then you could infer it's UTF-8 (or bad ASCII, or another code page altogether; only you know if you can exclude those possibilities).

    If it really is ANSI (and btw, there is no such single code page. There is a set of Windows Code Pages, the most common of which is code page 1252, so I will assume you mean that), you are going to have to do a little more work. You could look to see if the bytes above x7F are legal UTF-8. They will always come in groups of at least two: a lead byte followed by one or more continuation bytes. The original UTF-8 design allowed up to 6 bytes per character, but current UTF-8 (RFC 3629) uses at most 4. (See here: http://en.wikipedia.org/wiki/UTF-8).

    So, if the byte stream is not legal UTF-8, you could infer it's ANSI (Code Page 1252). (Or just bad UTF-8 or something else. Again, only you know if these can be excluded.)

    If the byte stream has no byte values over x7F, you're in luck - either encoding will work.

    HTH,

    Nick

    • Помечено в качестве ответа khoana 9 июля 2014 г. 9:45
  • What you want is to detect UTF-8 without a BOM, which is only possible if the file contains special (non-ASCII) characters, so do the following:

    /// <summary>
    /// Detects the text encoding of a file: first by its byte order mark (BOM), then,
    /// when no BOM is present, by asking <c>validateUtf8whitBOM</c> whether the content
    /// looks like UTF-8.
    /// </summary>
    /// <param name="srcFile">Path of the file to inspect.</param>
    /// <returns>
    /// The encoding announced by the BOM; otherwise a BOM-less <see cref="UTF8Encoding"/>
    /// when <c>validateUtf8whitBOM</c> returns true; otherwise <see cref="Encoding.Default"/>.
    /// </returns>
    public Encoding GetFileEncoding(string srcFile)
    {
        // The longest BOM recognized here (UTF-32) is four bytes long.
        byte[] buffer = new byte[4];
        int read = 0;

        // Open read-only so read-only files can be inspected, and dispose the stream
        // even when Read throws.
        using (FileStream file = File.OpenRead(srcFile))
        {
            int chunk;
            while (read < buffer.Length
                   && (chunk = file.Read(buffer, read, buffer.Length - read)) > 0)
            {
                read += chunk;
            }
        }

        // UTF-8: EF BB BF
        if (read >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF)
        {
            return Encoding.UTF8;
        }

        // UTF-32 little-endian: FF FE 00 00.
        // Must be tested before UTF-16 LE, whose BOM (FF FE) is a prefix of this one.
        if (read >= 4 && buffer[0] == 0xFF && buffer[1] == 0xFE && buffer[2] == 0x00 && buffer[3] == 0x00)
        {
            return Encoding.UTF32;
        }

        // UTF-32 big-endian: 00 00 FE FF
        if (read >= 4 && buffer[0] == 0x00 && buffer[1] == 0x00 && buffer[2] == 0xFE && buffer[3] == 0xFF)
        {
            return new UTF32Encoding(true, true);
        }

        // UTF-16 little-endian (code page 1200): FF FE
        if (read >= 2 && buffer[0] == 0xFF && buffer[1] == 0xFE)
        {
            return Encoding.Unicode;
        }

        // UTF-16 big-endian (code page 1201): FE FF
        if (read >= 2 && buffer[0] == 0xFE && buffer[1] == 0xFF)
        {
            return Encoding.BigEndianUnicode;
        }

        // UTF-7: 2B 2F 76
        if (read >= 3 && buffer[0] == 0x2B && buffer[1] == 0x2F && buffer[2] == 0x76)
        {
            return Encoding.UTF7;
        }

        // No BOM: let the content check decide between BOM-less UTF-8 and the default.
        if (validateUtf8whitBOM(srcFile))
        {
            return new UTF8Encoding(false);
        }

        // Fall back to Encoding.Default (the ANSI code page on .NET Framework).
        return Encoding.Default;
    }

    /// <summary>
    /// Checks whether a file's content is recognizable as UTF-8: it must contain at least
    /// one non-ASCII byte and every byte sequence must be well-formed UTF-8.
    /// </summary>
    /// <param name="FileSource">Path of the file to examine.</param>
    /// <returns>
    /// <c>true</c> when the file holds valid UTF-8 with at least one byte above 0x7F;
    /// <c>false</c> for pure-ASCII files (where ANSI and UTF-8 decode identically) and
    /// for files containing byte sequences that are not legal UTF-8.
    /// </returns>
    /// <remarks>
    /// This method does not look for a byte order mark itself. The whole file is read
    /// into memory.
    /// </remarks>
    private bool validateUtf8whitBOM(string FileSource)
    {
        byte[] content = File.ReadAllBytes(FileSource);

        // Bytes 0x00-0x7F mean the same thing in ASCII, the Windows code pages and UTF-8,
        // so a file without any higher byte gives no evidence for UTF-8.
        bool hasNonAscii = false;
        foreach (byte b in content)
        {
            if (b > 0x7F)
            {
                hasNonAscii = true;
                break;
            }
        }

        if (!hasNonAscii)
        {
            return false;
        }

        // A strict decoder (throwOnInvalidBytes: true) rejects malformed sequences
        // instead of silently replacing them with U+FFFD.
        UTF8Encoding strictUtf8 = new UTF8Encoding(false, true);
        try
        {
            strictUtf8.GetCharCount(content);
            return true;
        }
        catch (DecoderFallbackException)
        {
            // Not legal UTF-8, so most likely an ANSI code page or another encoding.
            return false;
        }
    }

    6 февраля 2015 г. 4:37
  • TextUTF8 is never used. Is it as designed?
    13 октября 2017 г. 17:17