Branch: Tag:

2006-08-15

2006-08-15 14:52:34 by Henrik Grubbström (Grubba) <grubba@grubba.org>

Backported charset stuff from Pike 7.6.

Rev: lib/modules/Sql.pmod/mysql.pike:1.18
Rev: lib/modules/Sql.pmod/sql_util.pmod:1.11

1:   /* -  * $Id: mysql.pike,v 1.17 2002/11/27 15:40:34 mast Exp $ +  * $Id: mysql.pike,v 1.18 2006/08/15 14:52:34 grubba Exp $    *    * Glue for the Mysql-module    */
13: Inside #if constant(Mysql.mysql)
     inherit Mysql.mysql;    + #define UNICODE_DECODE_MODE 1 // Unicode decode mode + #define LATIN1_UNICODE_ENCODE_MODE 2 // Unicode encode mode with latin1 charset + #define UTF8_UNICODE_ENCODE_MODE 4 // Unicode encode mode with utf8 charset +  + // Set to the above if the connection is in utf8-mode. Enable latin1 + // unicode encode mode by default; it should be compatible with + // earlier pike versions. + static int utf8_mode; +  + // The charset, either "latin1" or "utf8", currently assigned to + // character_set_client when unicode encode mode is enabled. Zero when + // the connection charset has been set to something else than "latin1" + // or "unicode". + static string send_charset; +  + static void update_unicode_encode_mode_from_charset (string charset) + { +  switch (charset) { // Lowercase assumed. +  case "latin1": +  utf8_mode |= LATIN1_UNICODE_ENCODE_MODE; +  utf8_mode &= ~UTF8_UNICODE_ENCODE_MODE; +  send_charset = "latin1"; +  break; +  case "unicode": +  utf8_mode |= UTF8_UNICODE_ENCODE_MODE; +  utf8_mode &= ~LATIN1_UNICODE_ENCODE_MODE; +  send_charset = "utf8"; +  break; +  default: +  // Wrong charset - the mode can't be used. +  utf8_mode |= LATIN1_UNICODE_ENCODE_MODE|UTF8_UNICODE_ENCODE_MODE; +  send_charset = 0; +  break; +  } + } +  + int(0..1) set_unicode_encode_mode (int enable) + //! Enables or disables unicode encode mode. + //! + //! In this mode, if the server supports UTF-8 and the connection + //! charset is @expr{latin1@} (the default) or @expr{unicode@} then + //! @[big_query] handles wide unicode queries. Enabled by default. + //! + //! Unicode encode mode works as follows: Eight bit strings are sent + //! as @expr{latin1@} and wide strings are sent using @expr{utf8@}. + //! @[big_query] sends @expr{SET character_set_client@} statements as + //! necessary to update the charset on the server side. If the server + //! doesn't support that then it fails, but the wide string query + //! would fail anyway. + //! + //! 
To make this transparent, string literals with introducers (e.g. + //! @expr{_binary 'foo'@}) are excluded from the UTF-8 encoding. This + //! means that @[big_query] needs to do some superficial parsing of + //! the query when it is a wide string. + //! + //! @returns + //! @int + //! @value 1 + //! Unicode encode mode is enabled. + //! @value 0 + //! Unicode encode mode couldn't be enabled because an + //! incompatible connection charset is set. You need to do + //! @expr{@[set_charset]("latin1")@} or + //! @expr{@[set_charset]("unicode")@} to enable it. + //! @endint + //! + //! @note + //! Note that this mode doesn't affect the MySQL system variable + //! @expr{character_set_connection@}, i.e. it will still be set to + //! @expr{latin1@} by default which means server functions like + //! @expr{UPPER()@} won't handle non-@expr{latin1@} characters + //! correctly in all cases. + //! + //! To fix that, do @expr{@[set_charset]("unicode")@}. That will + //! allow unicode encode mode to work while @expr{utf8@} is fully + //! enabled at the server side. + //! + //! Tip: If you enable @expr{utf8@} on the server side, you need to + //! send raw binary strings as @expr{_binary'...'@}. Otherwise they + //! will get UTF-8 encoded by the server. + //! + //! @note + //! When unicode encode mode is enabled and the connection charset + //! is @expr{latin1@}, the charset accepted by @[big_query] is not + //! quite Unicode since @expr{latin1@} is based on @expr{cp1252@}. + //! The differences are in the range @expr{0x80..0x9f@} where + //! Unicode has control chars. + //! + //! This small discrepancy is not present when the connection + //! charset is @expr{unicode@}. + //! + //! @seealso + //! 
@[set_unicode_decode_mode], @[set_charset] + { +  if (enable) +  update_unicode_encode_mode_from_charset (lower_case (get_charset())); +  else { +  utf8_mode &= ~(LATIN1_UNICODE_ENCODE_MODE|UTF8_UNICODE_ENCODE_MODE); +  send_charset = 0; +  } +  return !!send_charset; + } +  + int get_unicode_encode_mode() + //! Returns nonzero if unicode encode mode is enabled, zero otherwise. + //! + //! @seealso + //! @[set_unicode_encode_mode] + { +  return !!send_charset; + } +  + void set_unicode_decode_mode (int enable) + //! Enable or disable unicode decode mode. + //! + //! In this mode, if the server supports UTF-8 then non-binary text + //! strings in results are are automatically decoded to (possibly + //! wide) unicode strings. Not enabled by default. + //! + //! The statement "@expr{SET character_set_results = utf8@}" is sent + //! to the server to enable the mode. When the mode is disabled, + //! "@expr{SET character_set_results = xxx@}" is sent, where + //! @expr{xxx@} is the connection charset that @[get_charset] returns. + //! + //! @param enable + //! Nonzero enables this feature, zero disables it. + //! + //! @throws + //! Throws an exception if the server doesn't support this, i.e. if + //! the statement above fails. The MySQL system variable + //! @expr{character_set_results@} was added in MySQL 4.1.1. + //! + //! @note + //! This mode is not compatible with earlier pike versions. You need + //! to run in compatibility mode <= 7.6 to have it disabled by + //! default. + //! + //! @seealso + //! @[set_unicode_encode_mode] + { +  if (enable) { +  ::big_query ("SET character_set_results = utf8"); +  utf8_mode |= UNICODE_DECODE_MODE; +  } +  else { +  ::big_query ("SET character_set_results = " + get_charset()); +  utf8_mode &= ~UNICODE_DECODE_MODE; +  } + } +  + int get_unicode_decode_mode() + //! Returns nonzero if unicode decode mode is enabled, zero otherwise. + //! + //! @seealso + //! 
@[set_unicode_decode_mode] + { +  return utf8_mode & UNICODE_DECODE_MODE; + } +  + void set_charset (string charset) + //! Changes the connection charset. Works similar to sending the query + //! @expr{SET NAMES @[charset]@} but also records the charset on the + //! client side so that various client functions work correctly. + //! + //! @[charset] is a MySQL charset name or the special value + //! @expr{"unicode"@} (see below). You can use @expr{SHOW CHARACTER + //! SET@} to get a list of valid charsets. + //! + //! Specifying @expr{"unicode"@} as charset is the same as + //! @expr{"utf8"@} except that unicode encode and decode modes are + //! enabled too. Briefly, this means that you can send queries as + //! unencoded unicode strings and will get back non-binary text + //! results as unencoded unicode strings. See + //! @[set_unicode_encode_mode] and @[set_unicode_decode_mode] for + //! further details. + //! + //! @throws + //! Throws an exception if the server doesn't support this, i.e. if + //! the statement @expr{SET NAMES@} fails. Support for it was added + //! in MySQL 4.1.0. + //! + //! @note + //! If @[charset] is @expr{"latin1"@} and unicode encode mode is + //! enabled (the default) then @[big_query] can send wide unicode + //! queries transparently if the server supports UTF-8. See + //! @[set_unicode_encode_mode]. + //! + //! @note + //! If unicode decode mode is already enabled (see + //! @[set_unicode_decode_mode]) then this function won't affect the + //! result charset (i.e. the MySQL system variable + //! @expr{character_set_results@}). + //! + //! Actually, a query @expr{SET character_set_results = utf8@} will + //! be sent immediately after setting the charset as above if + //! unicode decode mode is enabled and @[charset] isn't + //! @expr{"utf8"@}. + //! + //! @note + //! You should always use either this function or the + //! @expr{"mysql_charset_name"@} option to @[create] to set the + //! 
connection charset, or more specifically the charset that the + //! server expects queries to have (i.e. the MySQL system variable + //! @expr{character_set_client@}). Otherwise @[big_query] might not + //! work correctly. + //! + //! Afterwards you may change the system variable + //! @expr{character_set_connection@}, and also + //! @expr{character_set_results@} if unicode decode mode isn't + //! enabled. + //! + //! @note + //! The MySQL @expr{latin1@} charset is close to Windows + //! @expr{cp1252@}. The difference from ISO-8859-1 is a bunch of + //! printable chars in the range @expr{0x80..0x9f@} (which contains + //! control chars in ISO-8859-1). For instance, the euro currency + //! sign is @expr{0x80@}. + //! + //! You can use the @expr{mysql-latin1@} encoding in the + //! @[Locale.Charset] module to do conversions, or just use the + //! special @expr{"unicode"@} charset instead. + //! + //! @seealso + //! @[get_charset], @[set_unicode_encode_mode], @[set_unicode_decode_mode] + { +  charset = lower_case (charset); +  +  ::set_charset (charset == "unicode" ? "utf8" : charset); +  +  if (charset == "unicode" || +  utf8_mode & (LATIN1_UNICODE_ENCODE_MODE|UTF8_UNICODE_ENCODE_MODE)) +  update_unicode_encode_mode_from_charset (charset); +  +  if (charset == "unicode") +  utf8_mode |= UNICODE_DECODE_MODE; +  else if (utf8_mode & UNICODE_DECODE_MODE && charset != "utf8") +  // This setting has been overridden by ::set_charset, so we need +  // to reinstate it. +  ::big_query ("SET character_set_results = utf8"); + } +  + string get_charset() + //! Returns the MySQL name for the current connection charset. + //! + //! Returns @expr{"unicode"@} if unicode encode mode is enabled and + //! UTF-8 is used on the server side (i.e. in + //! @expr{character_set_connection@}). + //! + //! @note + //! In servers with full charset support (i.e. MySQL 4.1.0 or + //! later), this corresponds to the MySQL system variable + //! 
@expr{character_set_client@} (with one exception - see next + //! note) and thus controls the charset in which queries are sent. + //! The charset used for text strings in results might be something + //! else (and typically is if unicode decode mode is enabled; see + //! @[set_unicode_decode_mode]). + //! + //! @note + //! If the returned charset is @expr{latin1@} or @expr{unicode@} and + //! unicode encode mode is enabled (the default) then + //! @expr{character_set_client@} in the server might be either + //! @expr{latin1@} or @expr{utf8@}, depending on the last sent + //! query. See @[set_unicode_encode_mode] for more info. + //! + //! @seealso + //! @[set_charset] + { +  if (utf8_mode & UTF8_UNICODE_ENCODE_MODE && send_charset) +  return "unicode"; +  return ::get_charset(); + } +    #if constant( Mysql.mysql.MYSQL_NO_ADD_DROP_DB )   // Documented in the C-file.   void create_db( string db )
37: Inside #if constant(Mysql.mysql)
   ({ "\\\\", "\\\"", "\\0", "\\\'", "\\n", "\\r" })));   }    + string latin1_to_utf8 (string s) + //! Converts a string in MySQL @expr{latin1@} format to UTF-8. + { +  return string_to_utf8 (replace (s, ([ +  "\x80": "\u20AC", /*"\x81": "\u0081",*/ "\x82": "\u201A", "\x83": "\u0192", +  "\x84": "\u201E", "\x85": "\u2026", "\x86": "\u2020", "\x87": "\u2021", +  "\x88": "\u02C6", "\x89": "\u2030", "\x8a": "\u0160", "\x8b": "\u2039", +  "\x8c": "\u0152", /*"\x8d": "\u008D",*/ "\x8e": "\u017D", /*"\x8f": "\u008F",*/ +  /*"\x90": "\u0090",*/ "\x91": "\u2018", "\x92": "\u2019", "\x93": "\u201C", +  "\x94": "\u201D", "\x95": "\u2022", "\x96": "\u2013", "\x97": "\u2014", +  "\x98": "\u02DC", "\x99": "\u2122", "\x9a": "\u0161", "\x9b": "\u203A", +  "\x9c": "\u0153", /*"\x9d": "\u009D",*/ "\x9e": "\u017E", "\x9f": "\u0178", +  ]))); + } +  + string utf8_encode_query (string q, function(string:string) encode_fn) + //! Encodes the appropriate sections of the query with @[encode_fn]. + //! Everything except strings prefixed by an introducer (i.e. + //! @expr{_something@} or @expr{N@}) is encoded. + { +  // We need to find the segments that shouldn't be encoded. +  string e = ""; +  while (1) { +  sscanf(q, "%[^\'\"]%s", string prefix, string suffix); +  e += encode_fn (prefix); +  +  if (suffix == "") break; +  +  string quote = suffix[..0]; +  int start = 1; +  int end; +  while ((end = search(suffix, quote, start)) >= 0) { +  if (suffix[end-1] == '\\') { +  // Count the number of preceding back-slashes. +  // if odd, continue searching after the quote. +  int i; +  for (i = 2; i < end; i++) { +  if (suffix[end - i] != '\\') break; +  } +  if (!(i & 1)) { +  start = end+1; +  continue; +  } +  } +  if (sizeof(suffix) == end+1) break; +  if (suffix[end+1] == quote[0]) { +  // Quote quoted by doubling. 
+  start = end+2; +  continue; +  } +  break; +  } +  + #define IS_IDENTIFIER_CHAR(chr) (Unicode.is_wordchar (chr) || \ +  (<'_', '$'>)[chr]) +  +  int intpos = -1; +  +  // Optimize the use of _binary. +  if (has_suffix (prefix, "_binary")) +  intpos = sizeof (prefix) - sizeof ("_binary"); +  else if (has_suffix (prefix, "_binary ")) +  intpos = sizeof (prefix) - sizeof ("_binary "); +  +  else { +  // Find the white-space suffix of the prefix. +  int i = sizeof(prefix); +  while (i--) { +  if (!(< ' ', '\n', '\r', '\t' >)[prefix[i]]) break; +  } +  +  if (i >= 0) { +  if ((<'n', 'N'>)[prefix[i]]) +  // Probably got a national charset string. +  intpos = i; +  else { +  // The following assumes all possible charset names contain +  // only [a-zA-Z0-9_$] and are max 32 chars (from +  // MY_CS_NAME_SIZE in m_ctype.h). +  sscanf (reverse (prefix[i - 33..i]), "%[a-zA-Z0-9_$]%s", +  string rev_intro, string rest); +  if (sizeof (rev_intro) && rev_intro[-1] == '_' && sizeof (rest)) +  intpos = i - sizeof (rev_intro) + 1; +  } +  } +  } +  +  int got_introducer; +  if (intpos == 0) +  // The prefix begins with the introducer. +  got_introducer = 1; +  else if (intpos > 0) { +  // Check that the introducer sequence we found isn't a suffix of +  // some longer keyword or identifier. +  int prechar = prefix[intpos - 1]; +  if (!IS_IDENTIFIER_CHAR (prechar)) +  got_introducer = 1; +  } +  +  if (got_introducer) { +  string s = suffix[..end]; +  if (String.width (s) > 8) { +  string encoding = prefix[intpos..]; +  if (has_prefix (encoding, "_")) +  sscanf (encoding[1..], "%[a-zA-Z0-9]", encoding); +  else +  encoding = "utf8"; // Gotta be "N". 
+  s = s[1..sizeof (s) - 2]; +  if (sizeof (s) > 40) s = sprintf ("%O...", s[..37]); +  else s = sprintf ("%O", s); +  predef::error ("A string in the query should be %s encoded " +  "but it is wide: %s\n", encoding, s); +  } +  e += s; +  } else { +  e += encode_fn (suffix[..end]); +  } +  +  q = suffix[end+1..]; +  } +  return e; + } +    // The following time conversion functions assumes the SQL server   // handles time in this local timezone. They map the special zero   // time/date spec to 0.
144: Inside #if constant(Mysql.mysql)
   }   }    - //! - int|object big_query(string q, mapping(string|int:mixed)|void bindings) + Mysql.mysql_result big_query (string query, +  mapping(string|int:mixed)|void bindings, +  void|string charset)   { -  if (!bindings) -  return ::big_query(q); -  return ::big_query(.sql_util.emulate_bindings(q,bindings,this_object())); +  if (bindings) +  query = .sql_util.emulate_bindings(query,bindings,this); +  +  string restore_charset; +  if (charset) { +  restore_charset = send_charset || get_charset(); +  if (charset != restore_charset) +  ::big_query ("SET character_set_client=" + charset); +  else +  restore_charset = 0;    }    -  +  else if (send_charset) { +  string new_send_charset; +  +  if (utf8_mode & LATIN1_UNICODE_ENCODE_MODE) { +  if (String.width (query) == 8) +  new_send_charset = "latin1"; +  else { +  query = utf8_encode_query (query, latin1_to_utf8); +  new_send_charset = "utf8"; +  } +  } +  +  else { /* utf8_mode & UTF8_UNICODE_ENCODE_MODE */ +  if (_can_send_as_latin1 (query)) +  new_send_charset = "latin1"; +  else { +  query = utf8_encode_query (query, string_to_utf8); +  new_send_charset = "utf8"; +  } +  } +  +  if (new_send_charset != send_charset) { +  mixed err; +  if (err = ::big_query("SET character_set_client=" + new_send_charset)) { +  if (new_send_charset == "utf8") +  predef::error ("The query is a wide string " +  "and the MySQL server doesn't support UTF-8: %s\n", +  describe_error (err)); +  throw(err); +  } +  send_charset = new_send_charset; +  } +  } +  +  int|object res = ::big_query(query); +  +  if (restore_charset) { +  if (send_charset && (<"latin1", "utf8">)[charset]) +  send_charset = charset; +  else +  ::big_query("SET character_set_client=" + restore_charset); +  } +  +  if (!objectp(res)) return res; +  +  if (utf8_mode & UNICODE_DECODE_MODE) { +  return .sql_util.UnicodeWrapper(res); +  } +  return res; + } +    int(0..1) is_keyword( string name )   //! Return 1 if the argument @[name] is a mysql keyword.   
{ -  +  // FIXME: Document which version of MySQL this is up-to-date with.    return (<    "action", "add", "aggregate", "all", "alter", "after", "and", "as",    "asc", "avg", "avg_row_length", "auto_increment", "between", "bigint",
196: Inside #if constant(Mysql.mysql)
   >)[ lower_case(name) ];   }    + static void create(string|void host, string|void database, +  string|void user, string|void password, +  mapping(string:string|int)|void options) + { +  if (options) { +  string charset = options->mysql_charset_name || "latin1"; +  if (charset == "unicode") +  options->mysql_charset_name = "utf8"; +  +  ::create(host||"", database||"", user||"", password||"", options); +  +  update_unicode_encode_mode_from_charset (lower_case (charset)); +  +  if (charset == "unicode") +  utf8_mode |= UNICODE_DECODE_MODE; +  else if (options->unicode_decode_mode) +  set_unicode_decode_mode (1); +  +  } else { +  ::create(host||"", database||"", user||"", password||""); +  +  update_unicode_encode_mode_from_charset ("latin1"); +  } + } +    #endif /* constant(Mysql.mysql) */