pike.git / src / builtin_functions.c

version» Context lines:

pike.git/src/builtin_functions.c:2052:    out = end_shared_string(out);    pop_n_elems(args);    push_string(out);   }      /*! @decl string utf8_to_string(string(0..255) s)    *! @decl string utf8_to_string(string(0..255) s, int extended)    *!    *! Converts an UTF-8 byte-stream into a string.    *! +  *! @param s +  *! String of UTF-8 encoded data to decode. +  *! +  *! @param extended +  *! Bitmask with extension options. +  *! @int +  *! @value 1 +  *! Accept and decode the extension used by @[string_to_utf8()]. +  *! @value 2 +  *! Accept and decode UTF-8 encoded UTF-16 (ie accept and +  *! decode valid surrogates). +  *! @endint +  *!    *! @note    *! Throws an error if the stream is not a legal UTF-8 byte-stream.    *! -  *! Accepts and decodes the extension used by @[string_to_utf8()] if -  *! @[extended] is @expr{1@}. -  *! +     *! @note    *! In conformance with RFC 3629 and Unicode 3.1 and later,    *! non-shortest forms are not decoded. An error is thrown instead.    *!    *! @seealso    *! @[Locale.Charset.encoder()], @[string_to_unicode()], @[string_to_utf8()],    *! @[unicode_to_string()]    */   PMOD_EXPORT void f_utf8_to_string(INT32 args)   {
pike.git/src/builtin_functions.c:2106:    */       if ((c & 0xc0) == 0x80) {    bad_arg_error ("utf8_to_string", Pike_sp - args, args, 1,    NULL, Pike_sp - args,    "Invalid continuation character 0x%02x "    "at index %"PRINTPTRDIFFT"d.\n",    c, i);    }    - #define GET_CONT_CHAR(in, i, c) do { \ + #define GET_CHAR(in, i, c) do { \    i++; \    if (i >= in->len) \    bad_arg_error ("utf8_to_string", Pike_sp - args, args, 1, \    NULL, Pike_sp - args, \    "Truncated UTF-8 sequence at end of string.\n"); \    c = STR0 (in)[i]; \ -  +  } while(0) + #define GET_CONT_CHAR(in, i, c) do { \ +  GET_CHAR(in, i, c); \    if ((c & 0xc0) != 0x80) \    bad_arg_error ("utf8_to_string", Pike_sp - args, args, 1, \    NULL, Pike_sp - args, \    "Expected continuation character at index %d, " \    "got 0x%02x.\n", \    i, c); \    } while (0)      #define UTF8_SEQ_ERROR(prefix, c, i, problem) do { \    bad_arg_error ("utf8_to_string", Pike_sp - args, args, 1, \
pike.git/src/builtin_functions.c:2149:    }       else if ((c & 0xf0) == 0xe0) {    /* 16bit */    if (c == 0xe0) {    GET_CONT_CHAR (in, i, c);    if (!(c & 0x20))    UTF8_SEQ_ERROR ("0xe0 ", c, i - 1, "is a non-shortest form");    cont = 1;    } -  else if (!extended && c == 0xed) { +  else if (!(extended & 1) && c == 0xed) {    GET_CONT_CHAR (in, i, c); -  if (c > 0x9f) +  if (c & 0x20) { +  /* Surrogate. */ +  if (!(extended & 2)) {    UTF8_SEQ_ERROR ("0xed ", c, i - 1, "would decode to " -  "an invalid surrogate character"); +  "a UTF-16 surrogate character"); +  } +  if (c & 0x10) { +  UTF8_SEQ_ERROR ("0xed ", c, i - 1, "would decode to " +  "a UTF-16 low surrogate character"); +  } +  GET_CONT_CHAR(in, i, c); +  +  GET_CHAR (in, i, c); +  if (c != 0xed) { +  UTF8_SEQ_ERROR ("", c, i-1, "UTF-16 low surrogate " +  "character required"); +  } +  GET_CONT_CHAR (in, i, c); +  if ((c & 0xf0) != 0xb0) { +  UTF8_SEQ_ERROR ("0xed ", c, i-1, "UTF-16 low surrogate " +  "character required"); +  } +  shift = 2; +  }    cont = 1;    }    else    cont = 2;    if (shift < 1) {    shift = 1;    }    }       else {    if ((c & 0xf8) == 0xf0) {    /* 21bit */    if (c == 0xf0) {    GET_CONT_CHAR (in, i, c);    if (!(c & 0x30))    UTF8_SEQ_ERROR ("0xf0 ", c, i - 1, "is a non-shortest form");    cont = 2;    } -  else if (!extended) { +  else if (!(extended & 1)) {    if (c > 0xf4)    UTF8_SEQ_ERROR ("", c, i, "would decode to "    "a character outside the valid UTF-8 range");    else if (c == 0xf4) {    GET_CONT_CHAR (in, i, c);    if (c > 0x8f)    UTF8_SEQ_ERROR ("0xf4 ", c, i - 1, "would decode to "    "a character outside the valid UTF-8 range");    cont = 2;    }
pike.git/src/builtin_functions.c:2196:    else    cont = 3;    }       else if (c == 0xff)    bad_arg_error ("utf8_to_string", Pike_sp - args, args, 1,    NULL, Pike_sp - args,    "Invalid character 0xff at index %"PRINTPTRDIFFT"d.\n",    i);    -  else if (!extended) +  else if (!(extended & 1))    UTF8_SEQ_ERROR ("", c, i, "would decode to "    "a character outside the valid UTF-8 range");       else {    if ((c & 0xfc) == 0xf8) {    /* 26bit */    if (c == 0xf8) {    GET_CONT_CHAR (in, i, c);    if (!(c & 0x38))    UTF8_SEQ_ERROR ("0xf8 ", c, i - 1, "is a non-shortest form");
pike.git/src/builtin_functions.c:2240:    }    }       if (shift < 2)    shift = 2;    }       while(cont--)    GET_CONT_CHAR (in, i, c);    + #undef GET_CHAR   #undef GET_CONT_CHAR   #undef UTF8_SEQ_ERROR    }    }    if (len == in->len) {    /* 7bit in == 7bit out */    pop_n_elems(args-1);    return;    }   
pike.git/src/builtin_functions.c:2329:    c &= 0x01;    } else {    /* 36bit */    cont = 6;    c = 0;    }    while(cont--) {    unsigned int c2 = STR0(in)[i++] & 0x3f;    c = (c << 6) | c2;    } +  if ((extended & 2) && (c & 0xfc00) == 0xdc00) { +  /* Low surrogate */ +  c &= 0x3ff; +  c |= ((out_str[--j] & 0x3ff)<<10) | 0x10000;    } -  +  }    out_str[j++] = c;    }    break;    }    }      #ifdef PIKE_DEBUG    if (j != len) {    Pike_fatal("utf8_to_string(): Calculated and actual lengths differ: "    "%"PRINTPTRDIFFT"d != %"PRINTPTRDIFFT"d\n",