pike.git/
src/
builtin_functions.c
Branch:
Tag:
Non-build tags
All tags
No tags
2012-02-23
2012-02-23 12:55:30 by Henrik Grubbström (Grubba) <grubba@grubba.org>
6aebbf605c0c7b4030451983cc1649363806a7d5 (
61
lines) (+
51
/-
10
)
[
Show
|
Annotate
]
Branch:
7.9
utf8_to_string(): Added support for optionally decoding surrogates.
2059:
*! *! Converts a UTF-8 byte-stream into a string. *!
+
*! @param s
+
*! String of UTF-8 encoded data to decode.
+
*!
+
*! @param extended
+
*! Bitmask with extension options.
+
*! @int
+
*! @value 1
+
*! Accept and decode the extension used by @[string_to_utf8()].
+
*! @value 2
+
*! Accept and decode UTF-8 encoded UTF-16 (i.e. accept and
+
*! decode valid surrogates).
+
*! @endint
+
*!
*! @note *! Throws an error if the stream is not a legal UTF-8 byte-stream. *!
-
*! Accepts and decodes the extension used by @[string_to_utf8()] if
-
*! @[extended] is @expr{1@}.
-
*!
+
*! @note *! In conformance with RFC 3629 and Unicode 3.1 and later, *! non-shortest forms are not decoded. An error is thrown instead.
2113:
c, i); }
-
#define GET_
CONT_
CHAR(in, i, c) do { \
+
#define GET_CHAR(in, i, c) do {
\
i++; \ if (i >= in->len) \ bad_arg_error ("utf8_to_string", Pike_sp - args, args, 1, \ NULL, Pike_sp - args, \ "Truncated UTF-8 sequence at end of string.\n"); \ c = STR0 (in)[i]; \
-
+
} while(0)
+
#define GET_CONT_CHAR(in, i, c) do { \
+
GET_CHAR(in, i, c); \
if ((c & 0xc0) != 0x80) \ bad_arg_error ("utf8_to_string", Pike_sp - args, args, 1, \ NULL, Pike_sp - args, \
2156:
UTF8_SEQ_ERROR ("0xe0 ", c, i - 1, "is a non-shortest form"); cont = 1; }
-
else if (!extended && c == 0xed) {
+
else if (!
(
extended &
1)
&
&
c == 0xed) {
GET_CONT_CHAR (in, i, c);
-
if (c
>
0x9f
)
+
if (c
&
0x20
)
{
+
/* Surrogate. */
+
if (!(extended & 2)) {
UTF8_SEQ_ERROR ("0xed ", c, i - 1, "would decode to "
-
"
an
invalid
surrogate character");
+
"
a
UTF-16
surrogate character");
+
}
+
if (c & 0x10) {
+
UTF8_SEQ_ERROR ("0xed ", c, i - 1, "would decode to "
+
"a UTF-16 low surrogate character");
+
}
+
GET_CONT_CHAR(in, i, c);
+
+
GET_CHAR (in, i, c);
+
if (c != 0xed) {
+
UTF8_SEQ_ERROR ("", c, i-1, "UTF-16 low surrogate "
+
"character required");
+
}
+
GET_CONT_CHAR (in, i, c);
+
if ((c & 0xf0) != 0xb0) {
+
UTF8_SEQ_ERROR ("0xed ", c, i-1, "UTF-16 low surrogate "
+
"character required");
+
}
+
shift = 2;
+
}
cont = 1; } else
2179:
UTF8_SEQ_ERROR ("0xf0 ", c, i - 1, "is a non-shortest form"); cont = 2; }
-
else if (!extended) {
+
else if (!
(
extended
& 1
)
)
{
if (c > 0xf4) UTF8_SEQ_ERROR ("", c, i, "would decode to " "a character outside the valid UTF-8 range");
2203:
"Invalid character 0xff at index %"PRINTPTRDIFFT"d.\n", i);
-
else if (!extended)
+
else if (!
(
extended
& 1
)
)
UTF8_SEQ_ERROR ("", c, i, "would decode to " "a character outside the valid UTF-8 range");
2247:
while(cont--) GET_CONT_CHAR (in, i, c);
+
#undef GET_CHAR
#undef GET_CONT_CHAR #undef UTF8_SEQ_ERROR }
2336:
unsigned int c2 = STR0(in)[i++] & 0x3f; c = (c << 6) | c2; }
+
if ((extended & 2) && (c & 0xfc00) == 0xdc00) {
+
/* Low surrogate */
+
c &= 0x3ff;
+
c |= ((out_str[--j] & 0x3ff)<<10) | 0x10000;
}
-
+
}
out_str[j++] = c; } break;