2005-04-02
2005-04-02 11:43:03 by Martin Stjernholm <mast@lysator.liu.se>
-
dee8943255b4fb97a831cdb66c1b0cd15f497d12
(225 lines)
(+170/-55)
[
Show
| Annotate
]
Branch: 7.9
Made string_to_utf8 and utf8_to_string unicode 3.1 compliant by not allowing
chars outside the valid ranges and by not decoding non-shortest forms.
Rev: src/builtin_functions.c:1.594
Rev: src/testsuite.in:1.753
2:
|| This file is part of Pike. For copyright information see COPYRIGHT.
|| Pike is distributed under GPL, LGPL and MPL. See the file COPYING
|| for more information.
- || $Id: builtin_functions.c,v 1.593 2005/03/16 12:18:19 grubba Exp $
+ || $Id: builtin_functions.c,v 1.594 2005/04/02 11:43:03 mast Exp $
*/
#include "global.h"
1766:
/*! @decl string string_to_utf8(string s)
*! @decl string string_to_utf8(string s, int extended)
*!
- *! Converts a string into an UTF8 compliant byte-stream.
+ *! Converts a string into a UTF-8 compliant byte-stream.
*!
*! @note
- *! Throws an error if characters not valid in an UTF8 stream are
- *! encountered. Valid characters are in the range 0x00000000 - 0x7fffffff.
+ *! Throws an error if characters not valid in a UTF-8 stream are
+ *! encountered. Valid characters are in the ranges
+ *! @expr{0x00000000-0x0000d7ff@} and @expr{0x0000e000-0x0010ffff@}.
*!
- *! If @[extended] is 1, characters in the range 0x80000000-0xfffffffff
- *! will also be accepted, and encoded using a non-standard UTF8 extension.
+ *! If @[extended] is 1 then characters outside the valid ranges are
+ *! accepted too and encoded using the same algorithm. Such encoded
+ *! characters are however not UTF-8 compliant.
*!
*! @seealso
*! @[Locale.Charset.encoder()], @[string_to_unicode()],
1802:
if (c & ~0xffff) {
/* 17bit or more. */
len++;
+ if (!extended && c > 0x10ffff)
+ bad_arg_error ("string_to_utf8", Pike_sp - args, args, 1,
+ NULL, Pike_sp - args,
+ "Character 0x%08x at index %"PRINTPTRDIFFT"d is "
+ "outside the allowed range.\n",
+ c, i);
if (c & ~0x1fffff) {
/* 22bit or more. */
len++;
1810:
len++;
if (c & ~0x7fffffff) {
/* 32bit or more. */
- if (!extended) {
- Pike_error("string_to_utf8(): "
- "Value 0x%08x (index %ld) is larger than 31 bits.\n",
- c, PTRDIFF_T_TO_LONG(i));
- }
+
len++;
/* FIXME: Needs fixing when we get 64bit chars... */
}
}
}
}
-
+ else if (!extended && c >= 0xd800 && c <= 0xdfff)
+ bad_arg_error ("string_to_utf8", Pike_sp - args, args, 1,
+ NULL, Pike_sp - args,
+ "Character 0x%08x at index %"PRINTPTRDIFFT"d is "
+ "in the surrogate range and therefore invalid.\n",
+ c, i);
}
}
}
1881: Inside #if defined(PIKE_DEBUG)
#ifdef PIKE_DEBUG
if (len != j) {
Pike_fatal("string_to_utf8(): Calculated and actual lengths differ: "
- "%ld != %ld\n",
- PTRDIFF_T_TO_LONG(len), PTRDIFF_T_TO_LONG(j));
+ "%"PRINTPTRDIFFT"d != %"PRINTPTRDIFFT"d\n", len, j);
}
#endif /* PIKE_DEBUG */
out = end_shared_string(out);
1893:
/*! @decl string utf8_to_string(string s)
*! @decl string utf8_to_string(string s, int extended)
*!
- *! Converts an UTF8 byte-stream into a string.
+ *! Converts a UTF-8 byte-stream into a string.
*!
*! @note
- *! Throws an error if the stream is not a legal UFT8 byte-stream.
+ *! Throws an error if the stream is not a legal UTF-8 byte-stream.
*!
*! Accepts and decodes the extension used by @[string_to_utf8()], if
*! @[extended] is @expr{1@}.
*!
-
+ *! @note
+ *! In conformance with RFC 3629 and Unicode 3.1 and later,
+ *! non-shortest forms are not decoded. An error will be thrown
+ *! instead.
+ *!
*! @seealso
*! @[Locale.Charset.encoder()], @[string_to_unicode()], @[string_to_utf8()],
*! @[unicode_to_string()]
1909:
{
struct pike_string *in;
struct pike_string *out;
- int len = 0;
+ ptrdiff_t len = 0;
int shift = 0;
- int i,j;
+ ptrdiff_t i,j;
INT_TYPE extended = 0;
get_all_args("utf8_to_string", args, "%S.%i", &in, &extended);
1921:
len++;
if (c & 0x80) {
int cont = 0;
+
+ /* From table 3-6 in the Unicode standard 4.0: Well-Formed UTF-8
+ * Byte Sequences
+ *
+ * Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte
+ * 000000-00007f 00-7f
+ * 000080-0007ff c2-df 80-bf
+ * 000800-000fff e0 a0-bf 80-bf
+ * 001000-00cfff e1-ec 80-bf 80-bf
+ * 00d000-00d7ff ed 80-9f 80-bf
+ * 00e000-00ffff ee-ef 80-bf 80-bf
+ * 010000-03ffff f0 90-bf 80-bf 80-bf
+ * 040000-0fffff f1-f3 80-bf 80-bf 80-bf
+ * 100000-10ffff f4 80-8f 80-bf 80-bf
+ */
+
if ((c & 0xc0) == 0x80) {
- Pike_error("utf8_to_string(): "
- "Unexpected continuation block 0x%02x at index %d.\n",
+ bad_arg_error ("utf8_to_string", Pike_sp - args, args, 1,
+ NULL, Pike_sp - args,
+ "Invalid continuation character 0x%02x "
+ "at index %"PRINTPTRDIFFT"d.\n",
c, i);
}
-
+
+ /* GET_CONT_CHAR(in, i, c): step to the next byte of the string `in`
+  * and load it into `c`, requiring it to be a UTF-8 continuation byte
+  * (bit pattern 10xxxxxx).
+  *
+  *   in - string being scanned; its `len` and `str` members are read.
+  *   i  - index lvalue; incremented before the byte is read.
+  *   c  - lvalue that receives the byte, read as unsigned char.
+  *
+  * Reports a bad-argument error when the string ends before the byte
+  * or when the byte is not a continuation byte. Relies on `args` being
+  * in scope at the expansion site.
+  * NOTE(review): the code after each check assumes bad_arg_error() does
+  * not return - confirm against its declaration.
+  *
+  * The index is printed with PRINTPTRDIFFT since `i` is a ptrdiff_t;
+  * a plain "%d" would mismatch the argument type.
+  */
+ #define GET_CONT_CHAR(in, i, c) do { \
+ (i)++; \
+ if ((i) >= (in)->len) \
+ bad_arg_error ("utf8_to_string", Pike_sp - args, args, 1, \
+ NULL, Pike_sp - args, \
+ "Truncated UTF-8 sequence at end of string.\n"); \
+ (c) = ((unsigned char *)((in)->str))[(i)]; \
+ if (((c) & 0xc0) != 0x80) \
+ bad_arg_error ("utf8_to_string", Pike_sp - args, args, 1, \
+ NULL, Pike_sp - args, \
+ "Expected continuation character at index " \
+ "%"PRINTPTRDIFFT"d, got 0x%02x.\n", \
+ (i), (c)); \
+ } while (0)
+
+ /* UTF8_SEQ_ERROR(lead_text, byte, pos, what): report a bad-argument
+  * error for utf8_to_string() about the UTF-8 sequence at index `pos`.
+  *
+  *   lead_text - string printed immediately before the byte value,
+  *               e.g. "" or "0xe0 ".
+  *   byte      - byte value printed as two hex digits.
+  *   pos       - index printed with the PRINTPTRDIFFT format.
+  *   what      - string describing the problem; ends the message.
+  *
+  * Relies on `args` being in scope at the expansion site.
+  */
+ #define UTF8_SEQ_ERROR(lead_text, byte, pos, what) do { \
+ bad_arg_error ("utf8_to_string", \
+ Pike_sp - args, args, 1, NULL, Pike_sp - args, \
+ "UTF-8 sequence beginning with %s0x%02x " \
+ "at index %"PRINTPTRDIFFT"d %s.\n", \
+ lead_text, byte, pos, what); \
+ } while (0)
+
if ((c & 0xe0) == 0xc0) {
/* 11bit */
-
+ if (!(c & 0x1e))
+ UTF8_SEQ_ERROR ("", c, i, "is a non-shortest form");
cont = 1;
if (c & 0x1c) {
if (shift < 1) {
shift = 1;
}
}
- } else if ((c & 0xf0) == 0xe0) {
+ }
+
+ else if ((c & 0xf0) == 0xe0) {
/* 16bit */
-
+ if (c == 0xe0) {
+ GET_CONT_CHAR (in, i, c);
+ if (!(c & 0x20))
+ UTF8_SEQ_ERROR ("0xe0 ", c, i - 1, "is a non-shortest form");
+ cont = 1;
+ }
+ else if (!extended && c == 0xed) {
+ GET_CONT_CHAR (in, i, c);
+ if (c > 0x9f)
+ UTF8_SEQ_ERROR ("0xed ", c, i - 1, "would decode to "
+ "an invalid surrogate character");
+ cont = 1;
+ }
+ else
cont = 2;
if (shift < 1) {
shift = 1;
}
- } else {
+ }
+
+ else {
+ if (shift < 2)
shift = 2;
-
+
if ((c & 0xf8) == 0xf0) {
/* 21bit */
-
+ if (c == 0xf0) {
+ GET_CONT_CHAR (in, i, c);
+ if (!(c & 0x30))
+ UTF8_SEQ_ERROR ("0xf0 ", c, i - 1, "is a non-shortest form");
+ cont = 2;
+ }
+ else if (!extended) {
+ if (c > 0xf4)
+ UTF8_SEQ_ERROR ("", c, i, "would decode to "
+ "a character outside the valid UTF-8 range");
+ else if (c == 0xf4) {
+ GET_CONT_CHAR (in, i, c);
+ if (c > 0x8f)
+ UTF8_SEQ_ERROR ("0xf4 ", c, i - 1, "would decode to "
+ "a character outside the valid UTF-8 range");
+ cont = 2;
+ }
+ else
cont = 3;
- } else if ((c & 0xfc) == 0xf8) {
+ }
+ else
+ cont = 3;
+ }
+
+ else if (c == 0xff)
+ bad_arg_error ("utf8_to_string", Pike_sp - args, args, 1,
+ NULL, Pike_sp - args,
+ "Invalid character 0xff at index %"PRINTPTRDIFFT"d.\n",
+ i);
+
+ else if (!extended)
+ UTF8_SEQ_ERROR ("", c, i, "would decode to "
+ "a character outside the valid UTF-8 range");
+
+ else {
+ if ((c & 0xfc) == 0xf8) {
/* 26bit */
-
+ if (c == 0xf8) {
+ GET_CONT_CHAR (in, i, c);
+ if (!(c & 0x38))
+ UTF8_SEQ_ERROR ("0xf8 ", c, i - 1, "is a non-shortest form");
+ cont = 3;
+ }
+ else
cont = 4;
} else if ((c & 0xfe) == 0xfc) {
/* 31bit */
-
+ if (c == 0xfc) {
+ GET_CONT_CHAR (in, i, c);
+ if (!(c & 0x3c))
+ UTF8_SEQ_ERROR ("0xfc ", c, i - 1, "is a non-shortest form");
+ cont = 4;
+ }
+ else
cont = 5;
} else if (c == 0xfe) {
/* 36bit */
- if (!extended) {
- Pike_error("utf8_to_string(): "
- "Character 0xfe at index %d when not in extended mode.\n",
- i);
+ GET_CONT_CHAR (in, i, c);
+ if (!(c & 0x3e))
+ UTF8_SEQ_ERROR ("0xfe ", c, i - 1, "is a non-shortest form");
+ else if (c & 0x3c)
+ UTF8_SEQ_ERROR ("0xfe ", c, i - 1, "would decode to "
+ "a too large character value");
+ cont = 5;
}
- cont = 6;
- } else {
- Pike_error("utf8_to_string(): "
- "Unexpected character 0xff at index %d.\n",
- i);
+
}
}
- while(cont--) {
- i++;
- if (i >= in->len) {
- Pike_error("utf8_to_string(): Truncated UTF8 sequence.\n");
+
+ while(cont--)
+ GET_CONT_CHAR (in, i, c);
}
- c = ((unsigned char *)(in->str))[i];
- if ((c & 0xc0) != 0x80) {
- Pike_error("utf8_to_string(): "
- "Expected continuation character at index %d (got 0x%02x).\n",
- i, c);
+
}
- }
- }
- }
+
if (len == in->len) {
/* 7bit in == 7bit out */
pop_n_elems(args-1);
1993:
if (c & 0x80) {
int cont = 0;
- /* NOTE: The tests aren't as paranoid here, since we've
- * already tested the string above.
+ /* NOTE: No tests here since we've already tested the string
+ * above.
*/
if ((c & 0xe0) == 0xc0) {
/* 11bit */