zoukankan      html  css  js  c++  java
  • VC2005解决UNICODE字符集下CStdioFile的WriteString无法写入中文

    在VC2005环境下, 以下代码无法实现使用CStdioFile向文本文件中写入中文(用notepad.exe查看不到写入的中文)

    // Example that fails: the Chinese characters never reach the file.
    CStdioFile file;

    file.Open(…);

    file.WriteString(_T("abc你好"));// only "abc" is written

    在VC2005中的解决办法:

    使用setlocale语句设定区域

    #include <locale>// header file

    CStdioFile file;

    file.Open(…);

    // Query the current LC_CTYPE locale name and keep a copy so it can be restored afterwards.
    char* old_locale = _strdup( setlocale(LC_CTYPE,NULL) );

    setlocale( LC_CTYPE, "chs" );// select the "chs" locale

    file.WriteString(_T("abc你好"));// now written correctly

    setlocale( LC_CTYPE, old_locale );// restore the previous locale setting

    free( old_locale );// release the copy made by _strdup

    简化处理可以仅使用语句setlocale( LC_CTYPE, "chs" )。

    setlocale:

    函数原形为:char *setlocale( int category, const char *locale );

    头文件:<locale.h>

    兼容性(Compatibility):ANSI 标准、Win 95、Win NT

    对于简体中文可以使用如下设置:setlocale( LC_ALL, "chs" );

    为什么一定要调用setlocale呢?

    因为C/C++语言标准规定程序启动时的区域设置(locale)为"C",其字符集只保证包含ASCII字符集的一个子集。以用mbstowcs把一个含有两个汉字的多字节串cstr转换为宽字符串为例:在"C"区域设置下,mbstowcs会将cstr中所包含的字符串看作是ASCII编码的字符,而不认为它是一个chs(简体中文)编码的字符串,所以它会将每一个中文拆成2个单字节字符分别进行转换,这样得到的结果就是由4个wchar_t字符组成的串。那么如何才能够让mbstowcs正常工作呢?在调用mbstowcs进行转换之前,必须明确地告诉mbstowcs目前cstr串中包含的是chs编码的字符串,这通过调用setlocale( LC_ALL, "chs" )来完成。需要注意的是,这个函数会改变整个应用程序的区域设置,用完后应当重新调用setlocale( LC_ALL, "C" )(或恢复事先保存的设置)来还原。这样就可以保证mbstowcs在转换时将cstr中的串看作是中文串,并且转换成为2个wchar_t字符,而不是4个。UNICODE工程中的CStdioFile::WriteString情况类似,只是方向相反:向文本模式的流写入时,需要按当前区域设置把宽字符转换成多字节字符,在"C"区域设置下中文无法转换,所以只写入了abc。

    本地化设置需要具备三个条件:

    a. 语言代码 (Language Code)

    b. 国家代码 (Country Code)

    c. 编码(Encoding)

    本地名字可以用下面这些部分来构造:

    语言代码_国家代码.编码 比如(zh_CN.UTF-8, en_US等)

    locale的别名表见 /usr/lib/X11/locale/locale.alias(以Debian GNU/Linux为例)

    setlocale语言字符串参考

    另外还有一种方法就是重新写CStdioFile的派生类CStdioFileEx(网上有)。

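    // C++ does not seem to provide a class that can read and write Unicode-format text files, so I wrote the class below. It is simple to use; try it a few times and it will become clear.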

    #pragma once

    // CStdioFile extension that reads and writes text files either as ANSI
    // (char) text or as Unicode (wchar_t) text introduced by a byte-order mark.
    // Which of the two applies is tracked in m_bIsUnicodeText, which
    // PreprocessFlags() sets from the open flags / the file's first character.
    class CStdioFileEx: public CStdioFile
    {
    public:
        // Creates an object with no file attached; the text mode starts as ANSI.
        CStdioFileEx();

        // Creates the object and opens lpszFileName with nOpenFlags
        // (CFile open flags, optionally combined with modeWriteUnicode).
        CStdioFileEx( LPCTSTR lpszFileName, UINT nOpenFlags );

        // Opens lpszFileName after running the flags through PreprocessFlags().
        virtual BOOL Open( LPCTSTR lpszFileName, UINT nOpenFlags, CFileException* pError = NULL );

        // Reads one line; forwards to ReadWideString or ReadAnsiString
        // depending on whether _UNICODE is defined.
        virtual BOOL ReadString(CString& rString);

        // Reads one line as wide characters, converting if the file is ANSI.
        BOOL ReadWideString(CStringW& rString);

        // Reads one line as narrow characters, converting if the file is Unicode.
        BOOL ReadAnsiString(CStringA& rString);

        // Writes a string; forwards to WriteWideString or WriteAnsiString
        // depending on whether _UNICODE is defined.
        virtual void WriteString(LPCTSTR lpsz);

        // Writes a wide string, converting if the file is ANSI.
        void WriteWideString(LPCWSTR lpsz);

        // Writes a narrow string, converting if the file is Unicode.
        void WriteAnsiString(LPCSTR lpsz);

        // True when the file is being treated as Unicode text.
        // Declared const: it only reports state and never modifies the object.
        bool IsUnicodeFormat() const {return m_bIsUnicodeText;}

        // Number of characters in the file, not counting the byte-order mark.
        unsigned long GetCharCount();

        // Additional flag to allow Unicode text format writing
        enum {modeWriteUnicode = 0x100000};

        // Reports whether the file at sFilePath starts with the Unicode byte-order mark.
        static bool IsFileUnicode(const CString& sFilePath);

    protected:
        // Works out m_bIsUnicodeText from the flags/file and adjusts nOpenFlags
        // in place; the adjusted flags are also returned.
        UINT PreprocessFlags(const CString& sFilePath, UINT& nOpenFlags);

        bool    m_bIsUnicodeText;   // true: file is handled as wchar_t text with a BOM
    };

    // .cpp file

    #include "stdafx.h"

    #include "StdioFileEx.h"

    // UCS defines a character called "ZERO WIDTH NO-BREAK SPACE" whose code is FEFF, whereas FFFE is not
    // a character in UCS and therefore should never appear in real data. The UCS specification recommends
    // transmitting "ZERO WIDTH NO-BREAK SPACE" before the byte stream itself. A receiver that gets FEFF
    // then knows the stream is big-endian; one that gets FFFE knows the stream is little-endian.
    // For this reason "ZERO WIDTH NO-BREAK SPACE" is also called the BOM (byte order mark).
    // UTF-8 does not need a BOM to indicate byte order, but a BOM can be used to identify the encoding.
    // The UTF-8 encoding of "ZERO WIDTH NO-BREAK SPACE" is EF BB BF, so a receiver that sees a stream
    // starting with EF BB BF knows it is UTF-8.
    // Windows uses the BOM to mark the encoding of text files.
    // Some older browsers and text editors do not support the BOM.

    #define UNICODE_BOM        0xFEFF//Unicode "byte order mark" which goes at start of file

    // Default constructor: no file is opened and the object starts out
    // treating text as ANSI (m_bIsUnicodeText == false).
    CStdioFileEx::CStdioFileEx()
        : CStdioFile()
        , m_bIsUnicodeText(false)
    {
    }

    // Constructs the object and opens lpszFileName with nOpenFlags.
    //
    // The open is done in the constructor body through this class's Open(),
    // which runs PreprocessFlags() and then CStdioFile::Open(). Calling
    // PreprocessFlags() from the base-class initializer instead would invoke a
    // member function (and write m_bIsUnicodeText) before the base sub-object
    // exists, which the C++ standard classifies as undefined behaviour.
    //
    // Throws: an invalid-argument exception (AfxThrowInvalidArgException) when
    //         lpszFileName is NULL; a CFileException carrying the cause, OS
    //         error and file name reported by Open() when the open fails.
    //         NOTE(review): this is intended to match what the MFC base-class
    //         constructor does on failure - confirm against the MFC sources in use.
    CStdioFileEx::CStdioFileEx(LPCTSTR lpszFileName,UINT nOpenFlags)
        : CStdioFile()
        , m_bIsUnicodeText(false)
    {
        ASSERT(lpszFileName != NULL);
        if (lpszFileName == NULL)
        {
            AfxThrowInvalidArgException();
        }

        CFileException openError;
        if (!Open(lpszFileName, nOpenFlags, &openError))
        {
            AfxThrowFileException(openError.m_cause, openError.m_lOsError, openError.m_strFileName);
        }
    }

    // Opens lpszFileName.
    //   lpszFileName - path of the file to open.
    //   nOpenFlags   - CFile open flags, optionally with modeWriteUnicode.
    //   pError       - optional exception object passed through to CStdioFile::Open.
    // Returns whatever CStdioFile::Open returns.
    BOOL CStdioFileEx::Open(LPCTSTR lpszFileName,UINT nOpenFlags,CFileException* pError /*=NULL*/)
    {
        // PreprocessFlags records whether the file is Unicode text and hands
        // back the flag set (possibly adjusted) that the base class should see.
        const UINT nEffectiveFlags = PreprocessFlags(lpszFileName, nOpenFlags);

        return CStdioFile::Open(lpszFileName, nEffectiveFlags, pError);
    }

    // Reads the next line into rString using the reader that matches the
    // build's TCHAR width, and returns that reader's result.
    BOOL CStdioFileEx::ReadString(CString& rString)
    {
    #ifndef _UNICODE
        // CString is a narrow string in this configuration.
        const BOOL bGotLine = ReadAnsiString(rString);
    #else
        // CString is a wide string in this configuration.
        const BOOL bGotLine = ReadWideString(rString);
    #endif

        return bGotLine;
    }

    BOOL CStdioFileEx::ReadWideString(CStringW& rString)

    {

    _ASSERTE(m_pStream);

    rString = L"";      // empty string without deallocating

    if(m_bIsUnicodeText)

    {

        // If at position 0, discard byte-order mark before reading

        if(GetPosition() == 0)

        {

         wchar_t bom;

         Read(&bom, sizeof(wchar_t));

        }

        const int nMaxSize = 128;

        LPWSTR lpsz = rString.GetBuffer(nMaxSize);

        LPWSTR lpszResult;

        int nLen = 0;

        for (;;)

        {

         lpszResult = fgetws(lpsz, nMaxSize+1, m_pStream);

         rString.ReleaseBuffer();

         // handle error/eof case

         if (lpszResult == NULL && !feof(m_pStream))

         {

          Afx_clearerr_s(m_pStream);

          AfxThrowFileException(CFileException::genericException, _doserrno,

           m_strFileName);

         }

         // if string is read completely or EOF

         if (lpszResult == NULL ||

          (nLen = (int)lstrlenW(lpsz)) < nMaxSize ||

          lpsz[nLen-1] == '\n')

          break;

         nLen = rString.GetLength();

         lpsz = rString.GetBuffer(nMaxSize + nLen) + nLen;

        }

        //remove crlf if exist.

        nLen = rString.GetLength();

        if (nLen > 1 && rString.Mid(nLen-2) == L"\r\n")

        {

         rString.GetBufferSetLength(nLen-2);

        }

        return rString.GetLength() > 0;

    }

    else

    {

        CStringA ansiString;

        BOOL bRetval = ReadAnsiString(ansiString);

        //setlocale(LC_ALL, "chs_chn.936");//no need

        rString = ansiString;

        return bRetval;

    }

    }

    BOOL CStdioFileEx::ReadAnsiString(CStringA& rString)

    {

    _ASSERTE(m_pStream);

    rString = "";      // empty string without deallocating

    if(!m_bIsUnicodeText)

    {

        const int nMaxSize = 128;

        LPSTR lpsz = rString.GetBuffer(nMaxSize);

        LPSTR lpszResult;

        int nLen = 0;

        for (;;)

        {

         lpszResult = fgets(lpsz, nMaxSize+1, m_pStream);

         rString.ReleaseBuffer();

         // handle error/eof case

         if (lpszResult == NULL && !feof(m_pStream))

         {

          Afx_clearerr_s(m_pStream);

          AfxThrowFileException(CFileException::genericException, _doserrno,

           m_strFileName);

         }

         // if string is read completely or EOF

         if (lpszResult == NULL ||

          (nLen = (int)lstrlenA(lpsz)) < nMaxSize ||

          lpsz[nLen-1] == '\n')

          break;

         nLen = rString.GetLength();

         lpsz = rString.GetBuffer(nMaxSize + nLen) + nLen;

        }

        //remove crlf if exist.

        nLen = rString.GetLength();

        if (nLen > 1 && rString.Mid(nLen-2) == "\r\n")

        {

         rString.GetBufferSetLength(nLen-2);

        }

        return rString.GetLength() > 0;

    }

    else

    {

        CStringW wideString;

        BOOL bRetval = ReadWideString(wideString);

        //setlocale(LC_ALL, "chs_chn.936");//no need

        rString = wideString;

        return bRetval;

    }

    }

    // Purpose:    Override of the base-class WriteString. Hands the TCHAR string
    //       to the writer that matches the build's character width; that
    //       writer then emits Unicode or multibyte text according to the
    //       file's mode.
    void CStdioFileEx::WriteString(LPCTSTR lpsz)
    {
    #ifndef _UNICODE
        WriteAnsiString(lpsz);   // LPCTSTR is a narrow string here
    #else
        WriteWideString(lpsz);   // LPCTSTR is a wide string here
    #endif
    }

    // Writes the wide string lpsz to the file.
    //
    // Unicode file: a byte-order mark is written first when the file position
    // is 0, then the string is written with fputws.
    // ANSI file: the string is narrowed and passed to WriteAnsiString.
    //
    // Throws an invalid-argument exception when lpsz is NULL, and a
    // CFileException (diskFull) when the stream write fails.
    void CStdioFileEx::WriteWideString(LPCWSTR lpsz)
    {
        ASSERT(lpsz != NULL);
        if (lpsz == NULL)
        {
            AfxThrowInvalidArgException();
        }

        if (!m_bIsUnicodeText)
        {
            // The CW2A temporary stays alive until the call has returned.
            WriteAnsiString(CW2A(lpsz));
            return;
        }

        ASSERT(m_pStream != NULL);

        // If writing Unicode and at the start of the file, need to write byte mark
        if (GetPosition() == 0)
        {
            const wchar_t cBOM = (wchar_t)UNICODE_BOM;
            // Write through the stdio stream (CStdioFile::Write), the same
            // channel fputws uses below, rather than through the raw file
            // handle, so buffered and unbuffered output are never mixed.
            CStdioFile::Write(&cBOM, sizeof(wchar_t));
        }

        // fputws signals failure with EOF. The old comparison against _TEOF
        // became a comparison against WEOF (0xFFFF) in UNICODE builds and could
        // therefore never detect an error there.
        if (fputws(lpsz, m_pStream) == EOF)
            AfxThrowFileException(CFileException::diskFull, _doserrno, m_strFileName);
    }

    // Writes the narrow string lpsz to the file.
    //
    // ANSI file: the string is written with fputs.
    // Unicode file: the string is widened and passed to WriteWideString.
    //
    // Throws an invalid-argument exception when lpsz is NULL, and a
    // CFileException (diskFull) when the stream write fails.
    void CStdioFileEx::WriteAnsiString(LPCSTR lpsz)
    {
        ASSERT(lpsz != NULL);
        if (lpsz == NULL)
        {
            AfxThrowInvalidArgException();
        }

        if (m_bIsUnicodeText)
        {
            // The CA2W temporary stays alive until the call has returned.
            WriteWideString(CA2W(lpsz));
            return;
        }

        ASSERT(m_pStream != NULL);

        // fputs signals failure with EOF. The old comparison against _TEOF
        // became a comparison against WEOF (0xFFFF) in UNICODE builds and could
        // therefore never detect an error there.
        if (fputs(lpsz, m_pStream) == EOF)
            AfxThrowFileException(CFileException::diskFull, _doserrno, m_strFileName);
    }

    // Decides whether the file is to be handled as Unicode text and adjusts the
    // open flags to match.
    //   sFilePath  - path of the file about to be opened (inspected only when
    //                an existing file is opened for text reading).
    //   nOpenFlags - in/out: the caller's flags; switched from typeText to
    //                typeBinary when the file is Unicode.
    // Returns the value left in nOpenFlags. Sets m_bIsUnicodeText as a side effect.
    UINT CStdioFileEx::PreprocessFlags(const CString& sFilePath, UINT& nOpenFlags)
    {
        m_bIsUnicodeText = false;

        const bool bUnicodeWriteRequested = (nOpenFlags & CStdioFileEx::modeWriteUnicode) != 0;

        if (bUnicodeWriteRequested)
        {
            // If we have writeUnicode we must have write or writeRead as well
            ASSERT(nOpenFlags & CFile::modeWrite || nOpenFlags & CFile::modeReadWrite);
            m_bIsUnicodeText = true;
        }
        else
        {
            // Reading in text mode and not creating: look at the file itself.
            const bool bTextMode = (nOpenFlags & CFile::typeText) != 0;
            const bool bCreating = (nOpenFlags & CFile::modeCreate) != 0;
            const bool bWriting  = (nOpenFlags & CFile::modeWrite) != 0;

            if (bTextMode && !bCreating && !bWriting)
            {
                m_bIsUnicodeText = IsFileUnicode(sFilePath);
            }
        }

        if (!m_bIsUnicodeText)
        {
            return nOpenFlags;
        }

        // Reading or writing a Unicode-format text file requires typeBinary,
        // because the mode changes how fputws/fgetws behave (see MSDN).
        nOpenFlags = (nOpenFlags & ~(CFile::typeText)) | CFile::typeBinary;

        return nOpenFlags;
    }

    // Purpose:    Determines whether a file is Unicode by reading the first character and detecting

    //       whether it's the Unicode byte marker.

    bool CStdioFileEx::IsFileUnicode(const CString& sFilePath)

    {

    CFile      file;

    wchar_t     cFirstChar;

    CFileException exFile;

    bool      bIsUnicode = false;

    // Open file in binary mode and read first character

    if (file.Open(sFilePath, CFile::typeBinary | CFile::modeRead, &exFile))

    {

        // If byte is Unicode byte-order marker, let's say it's Unicode

        if (file.Read(&cFirstChar, sizeof(wchar_t)) > 0 && cFirstChar == (wchar_t)UNICODE_BOM)

        {

         bIsUnicode = true;

        }

        file.Close();

    }

    else

    {

        // Handle error here if you like

    }

    return bIsUnicode;

    }

    unsigned long CStdioFileEx::GetCharCount()

    {

    int      nCharSize;

    unsigned long nByteCount, nCharCount = 0;

    if (m_pStream)

    {

        // Get size of chars in file

        nCharSize = m_bIsUnicodeText ? sizeof(wchar_t): sizeof(char);

        // If Unicode, remove byte order mark from count

        nByteCount = (unsigned long)GetLength();

        if (m_bIsUnicodeText)

        {

         nByteCount = nByteCount - sizeof(wchar_t);

        }

        // Calc chars

        nCharCount = (nByteCount / nCharSize);

    }

    return nCharCount;

    }

  • 相关阅读:
    块级元素与行级元素(内联元素)
    css中属性值继承小解
    form表单
    html,xhtml和xml
    html中的标签分类
    如何把HTML标记分类
    实现对HashMap的value排序
    装饰者模式
    实现一个简单的二叉树容器,并且实现中序、先序、后续遍历
    Java中java.util.concurrent包下的4中线程池代码示例
  • 原文地址:https://www.cnblogs.com/me115/p/1715569.html
Copyright © 2011-2022 走看看