pike.git / src / builtin_functions.c

version» Context lines:

pike.git/src/builtin_functions.c:2201:    get_all_args("utf8_to_string", args, "%S.%i", &in, &extended);       check_string_range(in, 1, &min, &max);       if (min >= 0 && max <= 0x7f) {    /* 7bit string -- already valid utf8. */    pop_n_elems(args - 1);    return;    }    -  for(i=0; i < in->len; i++) { -  unsigned int c = STR0(in)[i]; -  len++; -  if (c & 0x80) { -  int cont = 0; +  len = pike_string_utf8_decode_length(in, args, extended, &shift);    -  /* From table 3-6 in the Unicode standard 4.0: Well-Formed UTF-8 -  * Byte Sequences -  * -  * Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte -  * 000000-00007f 00-7f -  * 000080-0007ff c2-df 80-bf -  * 000800-000fff e0 a0-bf 80-bf -  * 001000-00cfff e1-ec 80-bf 80-bf -  * 00d000-00d7ff ed 80-9f 80-bf -  * 00e000-00ffff ee-ef 80-bf 80-bf -  * 010000-03ffff f0 90-bf 80-bf 80-bf -  * 040000-0fffff f1-f3 80-bf 80-bf 80-bf -  * 100000-10ffff f4 80-8f 80-bf 80-bf -  */ -  -  if ((c & 0xc0) == 0x80) { -  bad_arg_error ("utf8_to_string", Pike_sp - args, args, 1, -  NULL, Pike_sp - args, -  "Invalid continuation character 0x%02x " -  "at index %"PRINTPTRDIFFT"d.\n", -  c, i); -  } -  - #define GET_CHAR(in, i, c) do { \ -  i++; \ -  if (i >= in->len) \ -  bad_arg_error ("utf8_to_string", Pike_sp - args, args, 1, \ -  NULL, Pike_sp - args, \ -  "Truncated UTF-8 sequence at end of string.\n"); \ -  c = STR0 (in)[i]; \ -  } while(0) - #define GET_CONT_CHAR(in, i, c) do { \ -  GET_CHAR(in, i, c); \ -  if ((c & 0xc0) != 0x80) \ -  bad_arg_error ("utf8_to_string", Pike_sp - args, args, 1, \ -  NULL, Pike_sp - args, \ -  "Expected continuation character at index %d, " \ -  "got 0x%02x.\n", \ -  i, c); \ -  } while (0) -  - #define UTF8_SEQ_ERROR(prefix, c, i, problem) do { \ -  bad_arg_error ("utf8_to_string", Pike_sp - args, args, 1, \ -  NULL, Pike_sp - args, \ -  "UTF-8 sequence beginning with %s0x%02x " \ -  "at index %"PRINTPTRDIFFT"d %s.\n", \ -  prefix, c, i, problem); \ -  } while (0) -  -  if ((c & 0xe0) == 0xc0) { -  /* 11bit */ -  if (!(c & 0x1e)) -  UTF8_SEQ_ERROR ("", c, i, "is a non-shortest form"); -  cont = 1; -  if (c & 0x1c) { -  if (shift < 1) { -  shift = 1; -  } -  } -  } -  -  else if ((c & 0xf0) == 0xe0) { -  /* 16bit */ -  if (c == 0xe0) { -  GET_CONT_CHAR (in, i, c); -  if (!(c & 0x20)) -  UTF8_SEQ_ERROR ("0xe0 ", c, i - 1, "is a non-shortest form"); -  cont = 1; -  } -  else if (!(extended & 1) && c == 0xed) { -  GET_CONT_CHAR (in, i, c); -  if (c & 0x20) { -  /* Surrogate. */ -  if (!(extended & 2)) { -  UTF8_SEQ_ERROR ("0xed ", c, i - 1, "would decode to " -  "a UTF-16 surrogate character"); -  } -  if (c & 0x10) { -  UTF8_SEQ_ERROR ("0xed ", c, i - 1, "would decode to " -  "a UTF-16 low surrogate character"); -  } -  GET_CONT_CHAR(in, i, c); -  -  GET_CHAR (in, i, c); -  if (c != 0xed) { -  UTF8_SEQ_ERROR ("", c, i-1, "UTF-16 low surrogate " -  "character required"); -  } -  GET_CONT_CHAR (in, i, c); -  if ((c & 0xf0) != 0xb0) { -  UTF8_SEQ_ERROR ("0xed ", c, i-1, "UTF-16 low surrogate " -  "character required"); -  } -  shift = 2; -  } -  cont = 1; -  } -  else -  cont = 2; -  if (shift < 1) { -  shift = 1; -  } -  } -  -  else { -  if ((c & 0xf8) == 0xf0) { -  /* 21bit */ -  if (c == 0xf0) { -  GET_CONT_CHAR (in, i, c); -  if (!(c & 0x30)) -  UTF8_SEQ_ERROR ("0xf0 ", c, i - 1, "is a non-shortest form"); -  cont = 2; -  } -  else if (!(extended & 1)) { -  if (c > 0xf4) -  UTF8_SEQ_ERROR ("", c, i, "would decode to " -  "a character outside the valid UTF-8 range"); -  else if (c == 0xf4) { -  GET_CONT_CHAR (in, i, c); -  if (c > 0x8f) -  UTF8_SEQ_ERROR ("0xf4 ", c, i - 1, "would decode to " -  "a character outside the valid UTF-8 range"); -  cont = 2; -  } -  else -  cont = 3; -  } -  else -  cont = 3; -  } -  -  else if (c == 0xff) -  bad_arg_error ("utf8_to_string", Pike_sp - args, args, 1, -  NULL, Pike_sp - args, -  "Invalid character 0xff at index %"PRINTPTRDIFFT"d.\n", -  i); -  -  else if (!(extended & 1)) -  UTF8_SEQ_ERROR ("", c, i, "would decode to " -  "a character outside the valid UTF-8 range"); -  -  else { -  if ((c & 0xfc) == 0xf8) { -  /* 26bit */ -  if (c == 0xf8) { -  GET_CONT_CHAR (in, i, c); -  if (!(c & 0x38)) -  UTF8_SEQ_ERROR ("0xf8 ", c, i - 1, "is a non-shortest form"); -  cont = 3; -  } -  else -  cont = 4; -  } else if ((c & 0xfe) == 0xfc) { -  /* 31bit */ -  if (c == 0xfc) { -  GET_CONT_CHAR (in, i, c); -  if (!(c & 0x3c)) -  UTF8_SEQ_ERROR ("0xfc ", c, i - 1, "is a non-shortest form"); -  cont = 4; -  } -  else -  cont = 5; -  } else if (c == 0xfe) { -  /* 36bit */ -  GET_CONT_CHAR (in, i, c); -  if (!(c & 0x3e)) -  UTF8_SEQ_ERROR ("0xfe ", c, i - 1, "is a non-shortest form"); -  else if (c & 0x3c) -  UTF8_SEQ_ERROR ("0xfe ", c, i - 1, "would decode to " -  "a too large character value"); -  cont = 5; -  } -  } -  -  if (shift < 2) -  shift = 2; -  } -  -  while(cont--) -  GET_CONT_CHAR (in, i, c); -  - #undef GET_CHAR - #undef GET_CONT_CHAR - #undef UTF8_SEQ_ERROR -  } -  } +     if (len == in->len) {    /* 7bit in == 7bit out */    pop_n_elems(args-1);    return;    }       out = begin_wide_shared_string(len, shift);       switch (shift) {    case 0: {