Branch: Tag:

2008-02-27

2008-02-27 08:53:38 by Tobias Liin <liin@roxen.com>

Fixed bug #4473. The module now decodes fetched data before saving it in the database as UTF-8. It uses the charset from the Content-Type header and falls back to the meta tag for HTML, and to the BOM plus the encoding attribute for XML.

Rev: server/modules/tags/insert_cached_href.pike:1.21

7:   //<locale-token project="mod_insert_cached_href">LOCALE</locale-token>   #define LOCALE(X,Y) _DEF_LOCALE("mod_insert_cached_href",X,Y)    - constant cvs_version = "$Id: insert_cached_href.pike,v 1.20 2008/02/11 10:27:24 jonasw Exp $"; + constant cvs_version = "$Id: insert_cached_href.pike,v 1.21 2008/02/27 08:53:38 liin Exp $";      constant thread_safe = 1;   constant module_type = MODULE_TAG;
190:   */   public string get_result_sync(HTTPClient client, mapping args, mapping header) {    if (!is_redirect(client->status) || !MAX_REDIRECTS) -  return client->data(); +  return decode_data(client->data(), client->con->headers);       int counter;    string location = client->con->headers->location;       if (!location || !sizeof(location)) -  return client->data(); +  return decode_data(client->data(), client->con->headers);       DWRITE("Following redirect from " + (string)client->url +    " to " + location);
212:    location = new_client->con->headers->location;       if (!location || !sizeof(location)) -  return new_client->data(); +  return decode_data(new_client->data(), new_client->con->headers);       DWRITE("Following redirect from " + (string)new_client->url +    " to " + location);
224:    counter++;    }    -  return new_client->data(); +  return decode_data(new_client->data(), new_client->con->headers);   }      /*
286:    // In practice a server never runs unthreaded. Keep it    // simple and only return when status code < 300:    if(client && client->status > 0 && client->status < 300) { -  href_database->update_data(to_fetch["url"], client->data()); -  return client->data(); +  string data = decode_data(client->data(), client->headers); +  href_database->update_data(to_fetch["url"], data); +  return data;    } else    return "";   #endif
427:    if (result && sizeof(result) && result[0]["data"] != "") {    DWRITE("get_data(): Returning cached data for " + args["cached-href"]);    -  return result[0]["data"]; +  return utf8_to_string(result[0]["data"]);    } else if (!args["pure-db"]) {    DWRITE("get_data(): No cached data existed for " + args["cached-href"] + " so performing a synchronous fetch");   
530:    , url));       sql_query("UPDATE " + data_table + " SET data=%s, latest_write=%d WHERE url=%s", -  data, time(), url); +  string_to_utf8(data), time(), url);       sql_query("UPDATE " + request_table + " SET next_fetch=next_fetch + " + (24 * 3600)    + " WHERE time_of_day > 0 AND " + time() + " > next_fetch AND url='"
657:    string res = href_database->get_data(Attributes(args)->get_db_args(),    (["x-roxen-recursion-depth":recursion_depth]));    +  // DEPRECATED attribute 'decode-xml'. Keep it during transition period for upgrades, +  // since there will be undecoded data in the database until the first fetch for each +  // URL. The same type of decoding now occur upon saving the data in the database    if(args["decode-xml"]) {    // Parse xml header and recode content to internal representation.    mixed result = catch {    res = Parser.XML.Simple()->autoconvert(res);    };    -  if (result) { -  werror("INSERT_CACHED_HREF: An error occurred trying to decode the data from " + -  args["cached-href"] + ".\n"); -  } -  +     // Remove any bytes potentially still preceeding the first '<' in the xml file    return res[search(res, "<")..];    }
826:       if (href_database)    if (orig_url) -  href_database->update_data(orig_url, con->data()); +  href_database->update_data(orig_url, decode_data(con->data(), con->headers));    else -  href_database->update_data((string)url, con->data()); +  href_database->update_data((string)url, decode_data(con->data(), con->headers));       if (sync)    queue->write("@");
887:   }   #endif    + /* +  Decodes data based on 1) HTTP headers or 2) fallbacks on +  data content, meta http-equiv for html and BOM + encoding='' +  for xml + */ + string decode_data(string data, mapping headers) { +  function get_ct_cs = +  lambda(string ct) { +  string cs; +  foreach((ct/";")[1..], string s) { +  string s2 = String.trim_all_whites(s); +  string _cs; +  if(sscanf(s2, "charset=%s", _cs) == 1) +  cs = String.trim_all_whites(_cs); +  } +  return cs; +  };    -  +  function get_cs_from_html = +  lambda(string data) { +  string cs; +  Parser.HTML parser = Parser.HTML(); +  parser->case_insensitive_tag(1); +  parser->lazy_entity_end(1); +  parser->ignore_unknown(1); +  parser->match_tag(0); +  parser->add_tags( ([ "meta": lambda( Parser.HTML p, mapping m) +  { +  if(m["content"] && m["http-equiv"] && +  lower_case(m["http-equiv"]) == "content-type") +  cs = get_ct_cs(m["content"]); +  } ]) ); +  parser->finish(data); +  return cs; +  };    -  +  function get_cs_from_xml_enc = +  lambda(string data) { +  string cs,tmp; +  sscanf(data, "%*s<?xml%s?>%*s", tmp); +  sscanf(lower_case(tmp), "%*sencoding=\"%s\"%*s", cs); +  if (!cs) +  cs = "utf-8"; // UTF-8 is default XML encoding when omitted +  return cs; +  };    -  +  string ct, cs; +  +  if(!(ct = headers["content-type"])) { +  // Don't even try to decode, might be binary for all we know +  return data; +  } +  +  ct = String.trim_all_whites(lower_case(ct)); +  +  // If text, look for charset: +  if(has_prefix(ct,"text/") || has_prefix(ct, "application/xml")) { +  cs = get_ct_cs(ct); +  +  if (!cs) { +  // No charset in content-type header, look in data for encoding hints +  +  if(has_prefix(ct, "text/html")) { +  cs = get_cs_from_html(data); +  } else if(has_prefix(ct, "text/xml") || has_prefix(ct, "application/xml")) { +  string data2; +  mixed result = catch { +  data2 = Parser.XML.Simple()->autoconvert(data); +  }; +  +  if (!result) +  return remove_bom(data2); +  +  cs = 
get_cs_from_xml_enc(data); +  } +  } +  } +  +  if(has_prefix(ct, "text/xml") || has_prefix(ct, "application/xml")) +  data = remove_bom(data); +  +  if(cs) { +  catch { +  data = Locale.Charset.decoder(cs)->feed(data)->drain(); +  return data; +  }; +  } +  +  return data; + } +  + string remove_bom(string data) { +  return data[search(data, "<")..]; + } +    TAGDOCUMENTATION;   #ifdef manual   constant tagdoc=([
1005: Inside #if defined(manual)
     <attr name='decode-xml' value='string'>   <p> +  <i>(DEPRECATED. All text content is now decoded automatically.)</i>    If provided the resulting content will be decoded to the internal    charset representation by looking at a potential BOM (Byte Order    Mark) and the specified encoding in the XML header. Defaults to UTF-8