代码之家  ›  专栏  ›  技术社区  ›  catphive

使用 locale 和流实现 mbsrtowcs 和 wcsrtombs 的 C++ 等价物

  •  4
  • catphive  · 技术社区  · 14 年前

    mbsrtowcs 和 wcsrtombs 这类函数,是否有使用 std::locale 的 C++ 等价物?

    我试图找出使用标准库在std::string和std::wstring之间来回转换的最佳方法。似乎std::locale几乎可以做到这一点,但我对一些细节或它可能具有的限制有点怀疑。

    一些细节:我在linux上,它使用utf-8作为本机编码。我想从utf-8 std::string转到std::wstring,然后在不丢失信息的情况下返回。

    我认为windows上的locale可能有一些限制,但我并不特别关心它们。只要答案在Linux上有效,并且除了libstdc++之外没有其他依赖项,也就是说没有boost依赖项,我就很高兴了。

    如能提供背景资料的链接,将不胜感激。

    注意:这里似乎存在一些混淆。在utf-8中,多个char可以表示单个字符,因此在wchar_t和char之间转换时不考虑这一点的函数将不起作用。

    2 回复  |  直到 14 年前
        1
  •  3
  •   Community Neeleshkumar S    7 年前

    locale 对于这项任务来说有些大材小用——utf-8和utf-16可以通过简单的二进制运算来回转换。这里有一些代码,基于我的 answer to an earlier question 。

    // Converts a NUL-terminated UTF-16 wide string to UTF-8.
    //
    // in: pointer to NUL-terminated UTF-16 code units; a null pointer yields
    //     an empty string.
    // Returns the UTF-8 encoding of the input.
    //
    // A high surrogate (0xD800-0xDBFF) followed by a low surrogate
    // (0xDC00-0xDFFF) is combined into a single code point. A low surrogate
    // with no preceding high surrogate is replaced by U+FFFD instead of being
    // decoded as an unrelated character. A high surrogate that is not
    // followed by a low surrogate is dropped.
    std::string UTF16to8(const wchar_t * in)
    {
        std::string out;
        if (!in)
            return out;

        unsigned int codepoint = 0;
        bool have_high = false;  // true while a high surrogate awaits its partner
        for (;  *in != 0;  ++in)
        {
            if (*in >= 0xd800 && *in <= 0xdbff)
            {
                // Upper 10 bits of the supplementary code point; the low
                // surrogate fills in the lower 10 bits on the next iteration.
                codepoint = ((*in - 0xd800) << 10) + 0x10000;
                have_high = true;
                continue;
            }

            if (*in >= 0xdc00 && *in <= 0xdfff)
            {
                if (have_high)
                    codepoint |= *in - 0xdc00;
                else
                    codepoint = 0xfffd;  // unpaired low surrogate
            }
            else
                codepoint = *in;
            have_high = false;

            if (codepoint <= 0x7f)
                out.append(1, static_cast<char>(codepoint));
            else if (codepoint <= 0x7ff)
            {
                out.append(1, static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f)));
                out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
            }
            else if (codepoint <= 0xffff)
            {
                out.append(1, static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f)));
                out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
                out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
            }
            else
            {
                out.append(1, static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07)));
                out.append(1, static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f)));
                out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
                out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
            }
            codepoint = 0;
        }
        return out;
    }
    
    // Converts a NUL-terminated UTF-8 byte string to UTF-16.
    //
    // in: NUL-terminated UTF-8 bytes; a null pointer yields an empty string.
    // Returns one wchar_t per UTF-16 code unit; code points above 0xFFFF are
    // written as a high/low surrogate pair.
    //
    // A continuation byte (0x80-0xBF) that does not belong to a multi-byte
    // sequence is replaced by U+FFFD. Other malformed input (truncated
    // sequences, overlong forms, lead bytes above 0xF7) is not validated.
    std::wstring UTF8to16(const char * in)
    {
        std::wstring out;
        if (!in)
            return out;

        unsigned int codepoint = 0;
        int following = 0;  // continuation bytes still expected
        for (;  *in != 0;  ++in)
        {
            unsigned char ch = *in;
            if (ch <= 0x7f)
            {
                codepoint = ch;
                following = 0;
            }
            else if (ch <= 0xbf)
            {
                if (following > 0)
                {
                    codepoint = (codepoint << 6) | (ch & 0x3f);
                    --following;
                }
                else
                    codepoint = 0xfffd;  // stray continuation byte
            }
            else if (ch <= 0xdf)
            {
                codepoint = ch & 0x1f;
                following = 1;
            }
            else if (ch <= 0xef)
            {
                codepoint = ch & 0x0f;
                following = 2;
            }
            else
            {
                codepoint = ch & 0x07;
                following = 3;
            }
            if (following == 0)
            {
                if (codepoint > 0xffff)
                {
                    // Surrogates encode (codepoint - 0x10000): the top 10
                    // bits go in the high unit, the bottom 10 in the low.
                    out.append(1, static_cast<wchar_t>(0xd800 + ((codepoint - 0x10000) >> 10)));
                    out.append(1, static_cast<wchar_t>(0xdc00 + (codepoint & 0x03ff)));
                }
                else
                    out.append(1, static_cast<wchar_t>(codepoint));
                codepoint = 0;
            }
        }
        return out;
    }
    

    如果你的wchar_t是32位而不是16位,这里有一个版本(未测试)。

    // Encodes a NUL-terminated wide string as UTF-8, treating every wchar_t
    // as one complete code point.
    //
    // in: NUL-terminated wide string; a null pointer yields an empty string.
    // Returns the UTF-8 bytes: one byte for values up to 0x7F, two up to
    // 0x7FF, three up to 0xFFFF, and four for anything larger.
    //
    // Asserts that wchar_t is at least 4 bytes wide. Code point values are
    // not range-checked.
    std::string UTF32to8(const wchar_t * in)
    {
        assert(sizeof(wchar_t) >= 4);
        std::string utf8;
        if (!in)
            return utf8;

        const wchar_t * cursor = in;
        while (*cursor != 0)
        {
            const unsigned int cp = *cursor;
            ++cursor;

            if (cp <= 0x7f)
            {
                utf8 += static_cast<char>(cp);
                continue;
            }

            // Choose the lead byte and how many continuation bytes follow it.
            unsigned char lead;
            int trailing;
            if (cp <= 0x7ff)
            {
                lead = static_cast<unsigned char>(0xc0 | ((cp >> 6) & 0x1f));
                trailing = 1;
            }
            else if (cp <= 0xffff)
            {
                lead = static_cast<unsigned char>(0xe0 | ((cp >> 12) & 0x0f));
                trailing = 2;
            }
            else
            {
                lead = static_cast<unsigned char>(0xf0 | ((cp >> 18) & 0x07));
                trailing = 3;
            }
            utf8 += static_cast<char>(lead);

            // Each continuation byte carries 6 payload bits, highest first.
            for (int shift = 6 * (trailing - 1); shift >= 0; shift -= 6)
                utf8 += static_cast<char>(0x80 | ((cp >> shift) & 0x3f));
        }
        return utf8;
    }
    
    // Decodes a NUL-terminated UTF-8 byte string into a wide string with one
    // wchar_t per code point.
    //
    // in: NUL-terminated UTF-8 bytes; a null pointer yields an empty string.
    // Returns the decoded code points.
    //
    // Asserts that wchar_t is at least 4 bytes wide. A continuation byte
    // (0x80-0xBF) that does not belong to a multi-byte sequence is replaced
    // by U+FFFD rather than being emitted as an embedded NUL. Other malformed
    // input (truncated sequences, overlong forms, lead bytes above 0xF7) is
    // not validated.
    std::wstring UTF8to32(const char * in)
    {
        assert(sizeof(wchar_t) >= 4);
        std::wstring out;
        if (!in)
            return out;

        wchar_t codepoint = 0;
        int following = 0;  // continuation bytes still expected
        for (;  *in != 0;  ++in)
        {
            unsigned char ch = *in;
            if (ch <= 0x7f)
            {
                codepoint = ch;
                following = 0;
            }
            else if (ch <= 0xbf)
            {
                if (following > 0)
                {
                    codepoint = (codepoint << 6) | (ch & 0x3f);
                    --following;
                }
                else
                    codepoint = 0xfffd;  // stray continuation byte
            }
            else if (ch <= 0xdf)
            {
                codepoint = ch & 0x1f;
                following = 1;
            }
            else if (ch <= 0xef)
            {
                codepoint = ch & 0x0f;
                following = 2;
            }
            else
            {
                codepoint = ch & 0x07;
                following = 3;
            }
            if (following == 0)
            {
                out.append(1, codepoint);
                codepoint = 0;
            }
        }
        return out;
    }
    
        2
  •  1
  •   wengseng    14 年前

    你试过创建一些简单的函数吗?

    // Builds a wide string by converting each char of src to a wchar_t,
    // element by element. No character-encoding conversion is performed, so
    // the result has exactly src.length() elements.
    std::wstring StringToWString(const std::string& src)
    {
        return std::wstring(src.begin(), src.end());
    }
    
    
    // Builds a narrow string by converting each wchar_t of src to a char,
    // element by element. No character-encoding conversion is performed, so
    // the result has exactly src.length() elements and values that do not
    // fit in a char are narrowed.
    std::string WStringToString(const std::wstring& src)
    {
        std::string narrow;
        narrow.reserve(src.length());
        for (std::wstring::const_iterator it = src.begin(); it != src.end(); ++it)
            narrow.push_back(static_cast<char>(*it));
        return narrow;
    }
    
    void main()
    {
     string s1 = "Hello World!";
     wstring s2 = StringToWString(s1);
     s1 = WStringToString(s2);
    }