pike.git / src / builtin_functions.c

version» Context lines:

pike.git/src/builtin_functions.c:1:   /*   || This file is part of Pike. For copyright information see COPYRIGHT.   || Pike is distributed under GPL, LGPL and MPL. See the file COPYING   || for more information. - || $Id: builtin_functions.c,v 1.474 2004/12/22 12:50:09 grubba Exp $ + || $Id: builtin_functions.c,v 1.475 2005/04/02 14:25:13 mast Exp $   */      #include "global.h" - RCSID("$Id: builtin_functions.c,v 1.474 2004/12/22 12:50:09 grubba Exp $"); + RCSID("$Id: builtin_functions.c,v 1.475 2005/04/02 14:25:13 mast Exp $");   #include "interpret.h"   #include "svalue.h"   #include "pike_macros.h"   #include "object.h"   #include "program.h"   #include "array.h"   #include "pike_error.h"   #include "constants.h"   #include "mapping.h"   #include "stralloc.h"
pike.git/src/builtin_functions.c:1534:    }   #endif /* PIKE_DEBUG */    out = end_shared_string(out);    pop_n_elems(args);    push_string(out);   }      /*! @decl string utf8_to_string(string s)    *! @decl string utf8_to_string(string s, int extended)    *! -  *! Converts an UTF8 byte-stream into a string. +  *! Converts an UTF-8 byte-stream into a string.    *!    *! @note -  *! Throws an error if the stream is not a legal UFT8 byte-stream. +  *! Throws an error if the stream is not a legal UTF-8 byte-stream.    *!    *! Accepts and decodes the extension used by @[string_to_utf8()], if -  *! @[extended] is @tt{1@}. +  *! @[extended] is @expr{1@}.    *! -  +  *! @note +  *! In conformance with RFC 3629 and Unicode 3.1 and later, +  *! non-shortest forms are not decoded. An error will be thrown +  *! instead. +  *!    *! @seealso    *! @[Locale.Charset.encoder()], @[string_to_unicode()], @[string_to_utf8()],    *! @[unicode_to_string()]    */   PMOD_EXPORT void f_utf8_to_string(INT32 args)   {    struct pike_string *in;    struct pike_string *out; -  int len = 0; +  ptrdiff_t len = 0;    int shift = 0; -  int i,j; +  ptrdiff_t i,j;    int extended = 0;       get_all_args("utf8_to_string", args, "%S", &in);       if (args > 1) {    if (Pike_sp[1-args].type != T_INT) {    SIMPLE_BAD_ARG_ERROR("utf8_to_string()", 2, "int|void");    }    extended = Pike_sp[1-args].u.integer;    }       for(i=0; i < in->len; i++) {    unsigned int c = ((unsigned char *)in->str)[i];    len++;    if (c & 0x80) {    int cont = 0; -  +  +  /* From table 3-6 in the Unicode standard 4.0: Well-Formed UTF-8 +  * Byte Sequences +  * +  * Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte +  * 000000-00007f 00-7f +  * 000080-0007ff c2-df 80-bf +  * 000800-000fff e0 a0-bf 80-bf +  * 001000-00cfff e1-ec 80-bf 80-bf +  * 00d000-00d7ff ed 80-9f 80-bf +  * 00e000-00ffff ee-ef 80-bf 80-bf +  * 010000-03ffff f0 90-bf 80-bf 80-bf +  * 040000-0fffff f1-f3 80-bf 80-bf 80-bf +  * 100000-10ffff f4 80-8f 80-bf 80-bf +  */ +     if ((c & 0xc0) == 0x80) { -  Pike_error("utf8_to_string(): " -  "Unexpected continuation block 0x%02x at index %d.\n", +  bad_arg_error ("utf8_to_string", Pike_sp - args, args, 1, +  NULL, Pike_sp - args, +  "Invalid continuation character 0x%02x " +  "at index %"PRINTPTRDIFFT"d.\n",    c, i);    } -  +  + #define GET_CONT_CHAR(in, i, c) do { \ +  i++; \ +  if (i >= in->len) \ +  bad_arg_error ("utf8_to_string", Pike_sp - args, args, 1, \ +  NULL, Pike_sp - args, \ +  "Truncated UTF-8 sequence at end of string.\n"); \ +  c = ((unsigned char *)(in->str))[i]; \ +  if ((c & 0xc0) != 0x80) \ +  bad_arg_error ("utf8_to_string", Pike_sp - args, args, 1, \ +  NULL, Pike_sp - args, \ +  "Expected continuation character at index %d, " \ +  "got 0x%02x.\n", \ +  i, c); \ +  } while (0) +  + #define UTF8_SEQ_ERROR(prefix, c, i, problem) do { \ +  bad_arg_error ("utf8_to_string", Pike_sp - args, args, 1, \ +  NULL, Pike_sp - args, \ +  "UTF-8 sequence beginning with %s0x%02x " \ +  "at index %"PRINTPTRDIFFT"d %s.\n", \ +  prefix, c, i, problem); \ +  } while (0) +     if ((c & 0xe0) == 0xc0) {    /* 11bit */ -  +  if (!(c & 0x1e)) +  UTF8_SEQ_ERROR ("", c, i, "is a non-shortest form");    cont = 1;    if (c & 0x1c) {    if (shift < 1) {    shift = 1;    }    } -  } else if ((c & 0xf0) == 0xe0) { +  } +  +  else if ((c & 0xf0) == 0xe0) {    /* 16bit */ -  +  if (c == 0xe0) { +  GET_CONT_CHAR (in, i, c); +  if (!(c & 0x20)) +  UTF8_SEQ_ERROR ("0xe0 ", c, i - 1, "is a non-shortest form"); +  cont = 1; +  } +  else    cont = 2;    if (shift < 1) {    shift = 1;    } -  } else { +  } +  +  else { +  if (shift < 2)    shift = 2; -  +     if ((c & 0xf8) == 0xf0) {    /* 21bit */ -  +  if (c == 0xf0) { +  GET_CONT_CHAR (in, i, c); +  if (!(c & 0x30)) +  UTF8_SEQ_ERROR ("0xf0 ", c, i - 1, "is a non-shortest form"); +  cont = 2; +  } +  else    cont = 3; -  } else if ((c & 0xfc) == 0xf8) { +  } +  +  else if (c == 0xff) +  bad_arg_error ("utf8_to_string", Pike_sp - args, args, 1, +  NULL, Pike_sp - args, +  "Invalid character 0xff at index %"PRINTPTRDIFFT"d.\n", +  i); +  +  else { +  if ((c & 0xfc) == 0xf8) {    /* 26bit */ -  +  if (c == 0xf8) { +  GET_CONT_CHAR (in, i, c); +  if (!(c & 0x38)) +  UTF8_SEQ_ERROR ("0xf8 ", c, i - 1, "is a non-shortest form"); +  cont = 3; +  } +  else    cont = 4;    } else if ((c & 0xfe) == 0xfc) {    /* 31bit */ -  +  if (c == 0xfc) { +  GET_CONT_CHAR (in, i, c); +  if (!(c & 0x3c)) +  UTF8_SEQ_ERROR ("0xfc ", c, i - 1, "is a non-shortest form"); +  cont = 4; +  } +  else    cont = 5;    } else if (c == 0xfe) {    /* 36bit */ -  if (!extended) { -  Pike_error("utf8_to_string(): " -  "Character 0xfe at index %d when not in extended mode.\n", -  i); +  if (!extended) +  UTF8_SEQ_ERROR ("", c, i - 1, "would decode to " +  "a too large character value"); +  GET_CONT_CHAR (in, i, c); +  if (!(c & 0x3e)) +  UTF8_SEQ_ERROR ("0xfe ", c, i - 1, "is a non-shortest form"); +  else if (c & 0x3c) +  UTF8_SEQ_ERROR ("0xfe ", c, i - 1, "would decode to " +  "a too large character value"); +  cont = 5;    } -  cont = 6; -  } else { -  Pike_error("utf8_to_string(): " -  "Unexpected character 0xff at index %d.\n", -  i); +     }    } -  while(cont--) { -  i++; -  if (i >= in->len) { -  Pike_error("utf8_to_string(): Truncated UTF8 sequence.\n"); +  +  while(cont--) +  GET_CONT_CHAR (in, i, c);    } -  c = ((unsigned char *)(in->str))[i]; -  if ((c & 0xc0) != 0x80) { -  Pike_error("utf8_to_string(): " -  "Expected continuation character at index %d (got 0x%02x).\n", -  i, c); +     } -  } -  } -  } +     if (len == in->len) {    /* 7bit in == 7bit out */    pop_n_elems(args-1);    return;    }       out = begin_wide_shared_string(len, shift);       for(j=i=0; i < in->len; i++) {    unsigned int c = ((unsigned char *)in->str)[i];       if (c & 0x80) {    int cont = 0;    -  /* NOTE: The tests aren't as paranoid here, since we've -  * already tested the string above. +  /* NOTE: No tests here since we've already tested the string +  * above.    */    if ((c & 0xe0) == 0xc0) {    /* 11bit */    cont = 1;    c &= 0x1f;    } else if ((c & 0xf0) == 0xe0) {    /* 16bit */    cont = 2;    c &= 0x0f;    } else if ((c & 0xf8) == 0xf0) {