代码之家 › 专栏 › 技术社区 › iveqy

将 =C3=B6 转换为 ö(假设为 UTF-8)

utf-8 c

iveqy · 技术社区 · 6 年前

我正在尝试读取一封电子邮件,它采用 quoted-printable 编码,因此包含以下内容:

=C3=B6

应该转换成

ö

我知道 C3 B6 是 ö 的 UTF-8 编码值,但我不太明白如何把 char * str = '=C3=B6' 转换成 char * str = 'ö' 。

我使用的是Linux,但会将代码移植到Windows,所以我需要一个多平台的解决方案。

我该怎么做?

3 回复 | 直到 6 年前

chux 6 年前

开始行动。

解析字符串 "=C3=B6" ,把其中的 2 个字节按十六进制字符读出。然后组成一个字符串来打印(并希望输出会被按 UTF-8 解释)。对于 printf("%s", ...) ,C 标准的说法是“没有为多字节字符作出特殊规定”。 YMMV(实际效果因环境而异)。

#include "stdio.h"
/* Demo: the quoted-printable text "=C3=B6" stands for the two bytes
   0xC3 0xB6.  Lines 1-3 print that same byte pair obtained three ways. */
int main(void) {
  char * str = "=C3=B6";
  /* The still-encoded text, for comparison. */
  printf("%s\n", str);
  /* 1: the character typed directly (this literal must be stored as the
     UTF-8 bytes C3 B6, i.e. the source file must be saved as UTF-8). */
  printf("1 %s\n", "ö");
  /* 2: the same two bytes spelled with hex escapes. */
  printf("2 %s\n", "\xC3\xB6");
  /* 3: the bytes parsed out of the "=XX=XX" form.  The third element stays
     zero so the array is a terminated string. */
  unsigned char a[3] = { 0 };
  if (sscanf("=c3=b6", "=%hhx=%hhx", &a[0], &a[1]) == 2) {
    printf("3 %s\n", a);
  }
  return 0;
}

输出

=C3=B6
1 ö
2 ö
3 ö

Craig Estey 6 年前

这是你应该开始做的事情。

我已经对它进行了测试,它似乎对您提供的输入有效。它有一些错误检查,但不是很多。

#include <stdio.h>

// hexnib -- convert ascii hex digit to binary value
//
// Only the low 8 bits of chr are examined.  '0'-'9', 'A'-'F' and 'a'-'f'
// yield the digit's value (0-15).  Any other character is returned unchanged
// (after masking): no error is signalled, so a caller that needs validation
// must check the character itself.
int
hexnib(int chr)
{

    chr &= 0xFF;

    if ((chr >= '0') && (chr <= '9'))
        return chr - '0';

    if ((chr >= 'A') && (chr <= 'F'))
        return chr - 'A' + 10;

    // lowercase digits are legal hex too
    if ((chr >= 'a') && (chr <= 'f'))
        return chr - 'a' + 10;

    // not a hex digit: hand the (masked) character back as-is
    return chr;
}

// convert -- decode a quoted-printable string into raw bytes
//
// utf8: destination buffer; receives the decoded bytes followed by a
//       terminating 0.  Every input character produces at most one output
//       byte, so a buffer as large as the input (including its terminator)
//       is always enough.
// quo:  0-terminated quoted-printable input.
//
// Ordinary characters are copied through.  Each "=XY" triplet is replaced by
// the single byte whose hexadecimal value is XY.  An '=' that is cut short by
// the end of the string is dropped.  X and Y are not validated here; they are
// passed to hexnib() as they are.
void
convert(char *utf8,const char *quo)
{
    int chr;
    int acc;

    while (1) {
        chr = *quo++;
        if (chr == 0)
            break;

        // handle ordinary char (i.e. _not_ start of =XY)
        if (chr != '=') {
            *utf8++ = chr;
            continue;
        }

        // get X value (high nibble)
        chr = *quo++;
        if (chr == 0)
            break;
        acc = hexnib(chr);

        // get Y value (low nibble)
        chr = *quo++;
        if (chr == 0)
            break;

        // each hex digit carries 4 bits, so make room for exactly one nibble
        acc <<= 4;
        acc |= hexnib(chr);

        // store the decoded byte
        *utf8++ = (char) acc;
    }

    // store end of string
    *utf8 = 0;
}

// main -- decode quoted-printable text line by line and write it to stdout
//
// With a file name as first argument that file is decoded; without one,
// stdin is decoded.  Returns 0 on success, 1 if the file cannot be opened.
int
main(int argc,char **argv)
{
    char *fname;
    FILE *fi;
    char ibuf[1000];
    char obuf[1000];

    // skip the program name
    --argc;
    ++argv;

    fname = *argv;
    if (fname != NULL) {
        fi = fopen(fname,"r");
        if (fi == NULL) {
            // without this check fgets() would be handed a NULL stream
            perror(fname);
            return 1;
        }
    }
    else
        fi = stdin;

    while (1) {
        // NOTE: fgets returns at most sizeof(ibuf) - 1 characters per call,
        // so a longer line is decoded in pieces
        char *cp = fgets(ibuf,sizeof(ibuf),fi);
        if (cp == NULL)
            break;

        // obuf is the same size as ibuf and decoding never adds characters
        convert(obuf,ibuf);

        fputs(obuf,stdout);
    }

    // only close what we opened ourselves
    if (fname != NULL)
        fclose(fi);

    return 0;
}

Nominal Animal 6 年前

解码 quoted-printable 字符串涉及两个方面:

忽略软换行符。这些是 = 然后是换行符。
将 = 后跟两个十六进制数字的序列,转换为编码值等于该十六进制值的字符。

解码数据有三种主要方法:

输入过滤器。不使用例如 fgetc() ,而是使用一个读取并解码 quoted-printable 输入的函数。
转换到新缓冲区。参见 Craig Estey 对同一问题的回答中的 convert() 函数。
就地转换。这是可行的,因为每个有效的 quoted-printable 编码字符串至少与解码后的字符串一样长。

输入过滤器。为了简单起见,让我们一次处理一个字符。(请注意,许多 UTF-8 字符的长度超过一个 char。)

首先,我们需要一个助手函数来将十六进制数字字符转换为各自的算术值:

/* Map one hexadecimal digit character to its numeric value.
   Returns 0..15 for '0'-'9', 'A'-'F' and 'a'-'f', and -1 for anything else.
   Every digit is listed explicitly, so nothing is assumed about how the
   character set orders its letters or digits. */
static inline int hex_digit(const int c)
{
    int value = -1;     /* stays -1 unless c is a hex digit */

    switch (c) {
    case '0': value =  0; break;
    case '1': value =  1; break;
    case '2': value =  2; break;
    case '3': value =  3; break;
    case '4': value =  4; break;
    case '5': value =  5; break;
    case '6': value =  6; break;
    case '7': value =  7; break;
    case '8': value =  8; break;
    case '9': value =  9; break;
    case 'a': case 'A': value = 10; break;
    case 'b': case 'B': value = 11; break;
    case 'c': case 'C': value = 12; break;
    case 'd': case 'D': value = 13; break;
    case 'e': case 'E': value = 14; break;
    case 'f': case 'F': value = 15; break;
    default:
        break;
    }

    return value;
}

在大多数情况下,您也可以将其写为

/* Map one hexadecimal digit character to its numeric value.
   Returns 0..15 for '0'-'9', 'A'-'F' and 'a'-'f', and -1 for anything else.
   This form relies on 'A'..'F' and 'a'..'f' having consecutive codes in the
   execution character set (the digits '0'..'9' always do). */
static inline int hex_digit(const int c)
{
    if (c >= '0' && c <= '9')
        return c - '0';
    else
    if (c >= 'A' && c <= 'F')
        return c - 'A' + 10;
    else
    if (c >= 'a' && c <= 'f')   /* upper bound must be lowercase 'f' */
        return c - 'a' + 10;
    else
        return -1;
}

甚至像

/* Lookup table indexed by character code: the value of that hex digit,
   or -1 for non-digits.  It holds all zeros until init_hex_digit_values()
   has been called. */
static signed char  hex_digit_value[UCHAR_MAX + 1];

/* Map one hexadecimal digit character to its numeric value (0..15), or -1.
   The argument is reduced to an unsigned char before indexing, so any int
   is a safe index into the table. */
static inline int hex_digit(const int c)
{
    const unsigned char  slot = (unsigned char)c;
    return hex_digit_value[slot];
}

/* Fill the lookup table.  Must be called before hex_digit() is used. */
static inline void init_hex_digit_values(void)
{
    static const char  decimal[] = "0123456789";
    static const char  upper[]   = "ABCDEF";
    static const char  lower[]   = "abcdef";
    int  k;

    /* Everything is "not a hex digit" unless set below. */
    for (k = 0; k <= UCHAR_MAX; k++)
        hex_digit_value[k] = -1;

    /* The position within each string is the digit's value. */
    for (k = 0; decimal[k] != '\0'; k++)
        hex_digit_value[(unsigned char)decimal[k]] = (signed char)k;

    /* Letters start at ten; both cases map to the same value. */
    for (k = 0; upper[k] != '\0'; k++) {
        hex_digit_value[(unsigned char)upper[k]] = (signed char)(10 + k);
        hex_digit_value[(unsigned char)lower[k]] = (signed char)(10 + k);
    }
}

其中 init_hex_digit_values() 需要在程序开始时调用一次。我更喜欢第一种形式,因为它的可移植性最好,但第二种形式是你通常会见到的。

第三种形式,使用 hex_digit_value[] 数组,是一个过早优化的例子。在某些情况下,它可能比其他形式稍快一些(但这种差异在实践中显然太小,并不重要),不过如果要用同一份代码支持差异很大的单字节字符集(例如 EBCDIC 和 ASCII),它可能会很有用。

首先,从包含引用的可打印数据的流(文件或句柄)中读取解码字符:

/* Read one decoded character from a quoted-printable stream.

   Returns the next character of the decoded data, or EOF at end of input,
   on a stream error, or when `from` is NULL.

   - "=" followed by a newline is a soft line break and is skipped.
   - "=XY" with two hex digits yields the byte with value 0xXY.
   - An "=" not followed by two hex digits is returned as a literal '=';
     the characters read after it are pushed back so that later calls
     return them.
   - An "=" that is the very last character of the input yields EOF. */
int get_quoted_printable_char(FILE *from)
{
    int  c, c2, hi, lo;

    /* Paranoid check. */
    if (!from || ferror(from) || feof(from))
        return EOF;

    /* The loop only repeats for soft line breaks, so any number of
       consecutive ones is skipped. */
    while (1) {

        c = fgetc(from);
        if (c != '=')
            return c;

        /* Soft newline? */
        c = fgetc(from);
        if (c == '\n')
            continue;

        /* '=' at the end of input? */
        if (c == EOF)
            return EOF;

        hi = hex_digit(c);
        if (hi < 0) {
            /* Invalid input; emit '=' instead. */
            ungetc(c, from);
            return '=';
        }

        c2 = fgetc(from);
        if (c2 == EOF) {
            /* Invalid input; emit '=' <c> instead. */
            ungetc(c, from);
            return '=';
        }

        lo = hex_digit(c2);
        if (lo < 0) {
            /* Invalid input; try to emit '=' <c> <c2> instead.
               Standard C guarantees only one character of pushback,
               so the second ungetc() may fail on some implementations. */
            ungetc(c2, from);
            ungetc(c, from);
            return '=';
        }

        return lo + 16 * hi;
    }
}

函数中之所以使用循环,是为了处理输入中出现多个连续软换行符的情况。这本不应该发生,但如果真的发生了,我们确实希望把它们全部忽略。

如果要将引用的可打印流复制到文件中,您只需要上面的内容,例如

/* Decode the quoted-printable stream `source` into the file `filename`.

   Returns
      0  on success (`source` has been closed),
     -1  if `source` is NULL or already in an error state,
     -2  if `filename` is NULL or empty,
     -3  if `filename` cannot be opened for writing,
     -4  if reading from, or closing, `source` fails,
     -5  if writing to, or closing, the target file fails.

   On -4 and -5 the partially written file is removed.  `source` is closed
   only on success, or when closing it is itself what failed. */
int save(FILE *source, const char *filename)
{
    FILE  *target;
    int    c;
    int    write_failed = 0;

    if (!source || ferror(source))
        return -1;  /* Invalid source handle */

    if (!filename || !*filename)
        return -2;  /* Invalid filename */

    target = fopen(filename, "w");
    if (!target)
        return -3;  /* Cannot open filename for writing */

    while (1) {
        c = get_quoted_printable_char(source);
        if (c == EOF)
            break;

        if (fputc(c, target) == EOF) {
            write_failed = 1;
            break;
        }
    }

    /* A failed write leaves the source short of end-of-file; report it as
       a write error before the end-of-file check can mistake it for a
       read error. */
    if (write_failed) {
        fclose(target);
        remove(filename);
        return -5; /* Write error */
    }

    if (!feof(source) || ferror(source)) {
        fclose(target);
        remove(filename);
        return -4; /* Error reading source. */
    }
    if (fclose(source)) {
        fclose(target);
        remove(filename);
        return -4; /* Error closing source (delayed read error). */
    }

    if (ferror(target) || fflush(target)) {
        fclose(target);
        remove(filename);
        return -5; /* Write error */
    }
    if (fclose(target)) {
        remove(filename);
        return -5; /* Error closing target; delayed write error */
    }

    /* Success. */
    return 0;
}

特别注意防止读写错误。它不是非常快,因为它依赖C库来缓冲输入,但速度也不是非常慢。实际上,它不使用任何显式缓冲区(依赖标准C库来决定如何缓冲源和正在写入的文件),这使得它在总体上是可以接受的。

转换为新缓冲区或就地转换非常类似:

/* Decode the quoted-printable string `src` into `dst`.

   Returns the length of the decoded string, not counting the terminating
   '\0' that is always written.  errno is set to 0 on success.  If either
   pointer is NULL, nothing is written, errno is set to EINVAL and 0 is
   returned.

   `dst` may be the same buffer as `src`: the output never gets ahead of
   the input.

   - "=XY" with two hex digits becomes the byte with value 0xXY.
   - "=" followed by a newline (\n, \r, \r\n or \n\r) is a soft line break
     and produces no output.
   - An "=" at the very end of the string is dropped.
   - Any other "=" is copied through unchanged.
   - Every hard newline (\n, \r, \r\n or \n\r) becomes a single '\n'. */
size_t  decode_quoted_printable(char *dst, const char *src)
{
    char *out = dst;

    if (src == NULL || dst == NULL) {
        errno = EINVAL;
        return 0;
    }

    while (*src != '\0') {
        const char  cur = src[0];

        /* Hard newline: swallow an optional partner character (the
           "other" newline character) and emit exactly one '\n'. */
        if (cur == '\n' || cur == '\r') {
            const char  partner = (cur == '\n') ? '\r' : '\n';
            src += (src[1] == partner) ? 2 : 1;
            *(out++) = '\n';
            continue;
        }

        /* Plain character: copy it through. */
        if (cur != '=') {
            *(out++) = cur;
            src++;
            continue;
        }

        /* From here on, cur is '='. */
        {
            const char  next = src[1];

            /* '=' as the last character of the string: dropped. */
            if (next == '\0')
                break;

            /* Soft line break: '=' plus a newline in any of the four
               conventions; nothing is emitted. */
            if (next == '\n' || next == '\r') {
                const char  partner = (next == '\n') ? '\r' : '\n';
                src += (src[2] == partner) ? 3 : 2;
                continue;
            }

            /* src[2] is safe to read: src[1] is not the terminator. */
            {
                const int  hi = hex_digit((unsigned char)next);
                const int  lo = hex_digit((unsigned char)(src[2]));

                if (hi < 0 || lo < 0) {
                    /* Not a valid escape.  Be permissive: emit the '='
                       itself and let the following characters be
                       processed normally. */
                    *(out++) = *(src++);
                    continue;
                }

                *(out++) = (char)(16*hi + lo);
                src += 3;
            }
        }
    }

    /* Terminate the result so it is a proper string. */
    *out = '\0';

    /* errno is always set by this function: nonzero iff there was an
       error.  That keeps a zero return for an empty input unambiguous. */
    errno = 0;
    return (size_t)(out - dst);
}

请注意,由于字符串字面量不能被修改,因此不能这样写: char *data = "foo"; decode_quoted_printable(data, data); 。

不过你可以这样写: char data[] = "foo"; decode_quoted_printable(data, data); ,因为它声明的是一个字符数组,只是恰好用字符串 "foo" 进行了初始化。

请注意,上述功能还可以自动进行通用换行转换。也就是说,它支持所有四个换行约定, \r\n , \n\r , \r 和 \n ,并将它们全部转换为标准C \n 换行符。

目标缓冲区必须至少与源缓冲区一样长,并且可以使用与源相同的目标缓冲区,只要它是可变的(不是文本字符串,也不是指向文本字符串)。

与从流中逐个获取解码字符的方法相比,不同之处在于后一种方法(缓冲区转换)要求全部内容都位于内存缓冲区中。这是优点还是缺点,取决于具体场景。