Roxen.git / server / modules / tags / insert_cached_href.pike

version» Context lines:

Roxen.git/server/modules/tags/insert_cached_href.pike:1:   // This is a roxen module. Copyright © 2000 - 2004, Roxen IS.   //      #include <module.h>   inherit "module";      //<locale-token project="mod_insert_cached_href">LOCALE</locale-token>   #define LOCALE(X,Y) _DEF_LOCALE("mod_insert_cached_href",X,Y)    - constant cvs_version = "$Id: insert_cached_href.pike,v 1.20 2008/02/11 10:27:24 jonasw Exp $"; + constant cvs_version = "$Id: insert_cached_href.pike,v 1.21 2008/02/27 08:53:38 liin Exp $";      constant thread_safe = 1;   constant module_type = MODULE_TAG;   LocaleString module_name = LOCALE(1, "Tags: Insert cached href");   LocaleString module_doc = LOCALE(2, "This module contains the RXML tag \"insert "    "cached-href\". Useful when implementing e.g."    " RSS syndication.");      #if DEBUG_INSERT_CACHED_HREF   #define DWRITE(x) report_debug("INSERT_CACHED_HREF: " + x + "\n")
Roxen.git/server/modules/tags/insert_cached_href.pike:183:       return 0;   }

/*
  Resolves the final reply of a synchronous fetch and returns its
  decoded body.

  As long as the current reply is a redirect (according to
  is_redirect()) and carries a non-empty Location header, a new GET
  request is issued for that location. At most MAX_REDIRECTS hops are
  followed; a zero MAX_REDIRECTS turns redirect handling off.

  client - the client whose request has already been run.
  args   - request arguments; NOTE: args["cached-href"] is overwritten
           with each redirect target that is followed.
  header - extra request headers, handed on to every new HTTPClient.

  Returns the body of the last reply received, passed through
  decode_data() together with that reply's headers.
*/
public string get_result_sync(HTTPClient client, mapping args, mapping header) {
  HTTPClient current = client;
  int hops;

  if (MAX_REDIRECTS) {
    while (is_redirect(current->status)) {
      string target = current->con->headers->location;

      // A redirect without a usable Location header can not be followed;
      // settle for whatever this reply contained.
      if (!target || !sizeof(target))
        break;

      DWRITE("Following redirect from " + (string)current->url +
             " to " + target);

      args["cached-href"] = target;
      current = HTTPClient("GET", args, header);

      // Every hop remembers the URL of the initial client, not the
      // URL of the intermediate redirect.
      current->orig_url = (string)client->url;
      current->run();

      if (++hops >= MAX_REDIRECTS)
        break;
    }
  }

  return decode_data(current->data(), current->con->headers);
}

/*    Takes action based on HTTP status codes in reply.    Asynchronous:   */   public void get_result_async(HTTPClient client, mapping args, mapping header) {    if (!is_redirect(client->status))    return;   
Roxen.git/server/modules/tags/insert_cached_href.pike:279: Inside #if defined(THREADS)
   client->run();       if (to_fetch["sync"])    return get_result_sync(client, args, header);   #else    client = Protocols.HTTP.get_url(to_fetch["url"], 0);       // In practice a server never runs unthreaded. Keep it    // simple and only return when status code < 300:    if(client && client->status > 0 && client->status < 300) { -  href_database->update_data(to_fetch["url"], client->data()); -  return client->data(); +  string data = decode_data(client->data(), client->headers); +  href_database->update_data(to_fetch["url"], data); +  return data;    } else    return "";   #endif   }         /* This class represents the database in which the data of the URL:s are stored */   class HrefDatabase {    private constant request_table_def = "url VARCHAR(255) NOT NULL,"    "fetch_interval INT UNSIGNED NOT NULL,"
Roxen.git/server/modules/tags/insert_cached_href.pike:420:    sql_query("INSERT IGNORE INTO " + data_table + " values (%s, '', 0)",    args["cached-href"]);       result = sql_query("SELECT data FROM " + data_table + " WHERE url='" +    args["cached-href"] + "' AND (" + time() + " - latest_write < "    + args["fresh-time"] + " OR " + args["fresh-time"] + " = 0)");       if (result && sizeof(result) && result[0]["data"] != "") {    DWRITE("get_data(): Returning cached data for " + args["cached-href"]);    -  return result[0]["data"]; +  return utf8_to_string(result[0]["data"]);    } else if (!args["pure-db"]) {    DWRITE("get_data(): No cached data existed for " + args["cached-href"] + " so performing a synchronous fetch");       string data = fetch_url((["url":args["cached-href"], "timeout":args["timeout"],    "sync":1]), header);       return data;    } else {    DWRITE("get_data(): No cached data existed for " + args["cached-href"] + " and pure-db data "    "was desired, so simply returning the empty string");
Roxen.git/server/modules/tags/insert_cached_href.pike:523:    }       return to_fetch;    }

  /*
    Stores freshly fetched data for a URL and moves a passed
    time-of-day schedule forward.

    url  - the URL the data belongs to; used as key in both tables.
    data - the fetched data. It is UTF-8 encoded with string_to_utf8()
           before being written.

    Two statements are run:
     1. data_table: data and latest_write are updated for the URL.
     2. request_table: next_fetch is advanced by 24 hours for rows of
        this URL that have time_of_day > 0 and whose next_fetch has
        already passed.

    All values are passed as query bindings (%s / %d) instead of being
    concatenated into the SQL text, so that URLs containing quotes or
    other SQL metacharacters are handled correctly.
  */
  public void update_data(string url, string data) {
    DWRITE(sprintf("update_data(): Saving the fetched data to the db for url %s"
                   , url));

    // One timestamp for both statements, so they agree on "now".
    int now = time();

    sql_query("UPDATE " + data_table + " SET data=%s, latest_write=%d WHERE url=%s",
              string_to_utf8(data), now, url);

    sql_query("UPDATE " + request_table + " SET next_fetch=next_fetch + %d"
              " WHERE time_of_day > 0 AND %d > next_fetch AND url=%s",
              24 * 3600, now, url);
  }

}      /* This class represents a set of attributes given to the tag 'insert cached-href' */   class Attributes {   
Roxen.git/server/modules/tags/insert_cached_href.pike:650:    recursion_depth++;       if(args->nocache)    NOCACHE();    else    CACHE(60);       string res = href_database->get_data(Attributes(args)->get_db_args(),    (["x-roxen-recursion-depth":recursion_depth]));    +  // DEPRECATED attribute 'decode-xml'. Keep it during transition period for upgrades, +  // since there will be undecoded data in the database until the first fetch for each +  // URL. The same type of decoding now occur upon saving the data in the database    if(args["decode-xml"]) {    // Parse xml header and recode content to internal representation.    mixed result = catch {    res = Parser.XML.Simple()->autoconvert(res);    };    -  if (result) { -  werror("INSERT_CACHED_HREF: An error occurred trying to decode the data from " + -  args["cached-href"] + ".\n"); -  } -  +     // Remove any bytes potentially still preceeding the first '<' in the xml file    return res[search(res, "<")..];    }       return res;    }   }      #ifdef THREADS   
Roxen.git/server/modules/tags/insert_cached_href.pike:819:    queue->write("@");    }

  /*
    Handles a successfully received reply body.

    Records the status of the connection, calls finish_up() and, when
    a database is available, saves the decoded body. The body is saved
    under orig_url when that is set (it is set by get_result_sync()
    when following redirects), otherwise under this client's own URL.
    If the request is synchronous, "@" is finally written to the queue
    (presumably to wake up the waiting caller - confirm against the
    code reading the queue).
  */
  void data_ok() {
    DWRITE("Received data from " + (string)url + " OK");
    status = con->status;
    finish_up();

    if (href_database) {
      string key = orig_url || (string)url;
      href_database->update_data(key, decode_data(con->data(), con->headers));
    }

    if (sync)
      queue->write("@");
  }

  void data_fail() {    DWRITE("Receiving data from " + (string)url + " FAILED");    status = 0;    finish_up();   
Roxen.git/server/modules/tags/insert_cached_href.pike:880:    else    do_method("GET", args["cached-href"], 0, headers);       timeout = args["timeout"];    sync = args["sync"];    }      }   #endif

/*
  Decodes fetched data to the internal string representation.

  data    - the reply body as received.
  headers - the reply headers; only "content-type" is consulted.

  The charset is determined by, in order:
   1) the charset parameter of the Content-Type header,
   2) for text/html: a <meta http-equiv="Content-Type"> or
      <meta charset> tag in the data,
   3) for text/xml and application/xml: Parser.XML.Simple()->autoconvert()
      (BOM + XML declaration), falling back on the encoding given in
      the XML declaration, or UTF-8 if there is none.

  Data without a Content-Type header, and data that is neither text/*
  nor application/xml*, is returned untouched since it might be binary.
  If the charset turns out to be unknown or the decoding fails, the
  data is returned undecoded (best effort).

  For XML content everything preceding the first '<' (e.g. a BOM) is
  removed. This is done AFTER the charset decoding, since removing
  leading bytes first would discard the byte order mark and could
  misalign multi-byte encodings such as UTF-16.

  NOTE(review): "+xml" media types such as application/rss+xml are not
  treated as XML here - confirm whether that is intended.
*/
string decode_data(string data, mapping headers) {
  // Returns the (lower-cased, unquoted) charset parameter of a
  // Content-Type style value, or 0 if there is none. If the parameter
  // occurs several times the last non-empty one wins.
  function get_ct_cs =
    lambda(string ct) {
      string cs;
      foreach((ct / ";")[1..], string param) {
        string value;
        if (sscanf(String.trim_all_whites(lower_case(param)),
                   "charset=%s", value) == 1) {
          // The value may be a quoted-string: charset="utf-8"
          value = String.trim_all_whites(value - "\"" - "'");
          if (sizeof(value))
            cs = value;
        }
      }
      return cs;
    };

  // Looks for a charset declaration in <meta> tags of an HTML document.
  function get_cs_from_html =
    lambda(string html) {
      string cs;
      Parser.HTML parser = Parser.HTML();
      parser->case_insensitive_tag(1);
      parser->lazy_entity_end(1);
      parser->ignore_unknown(1);
      parser->match_tag(0);
      parser->add_tags( ([ "meta": lambda(Parser.HTML p, mapping m)
                                   {
                                     string found;
                                     if (m["content"] && m["http-equiv"] &&
                                         lower_case(m["http-equiv"]) == "content-type")
                                       found = get_ct_cs(m["content"]);
                                     else if (m["charset"])
                                       found = lower_case(String.trim_all_whites(m["charset"]));
                                     if (found && sizeof(found))
                                       cs = found;
                                   } ]) );
      parser->finish(html);
      return cs;
    };

  // Returns the encoding stated in the XML declaration, or "utf-8"
  // (the XML default) when there is no declaration or no encoding.
  function get_cs_from_xml_enc =
    lambda(string xml) {
      string decl, cs;
      sscanf(xml, "%*s<?xml%s?>", decl);
      if (decl) {
        // XML allows whitespace around '=' and both kinds of quotes.
        decl = replace(lower_case(decl),
                       ({ " ", "\t", "\r", "\n" }), ({ "", "", "", "" }));
        sscanf(decl, "%*sencoding=\"%s\"", cs);
        if (!cs)
          sscanf(decl, "%*sencoding='%s'", cs);
      }
      if (!cs || !sizeof(cs))
        cs = "utf-8"; // UTF-8 is default XML encoding when omitted
      return cs;
    };

  string ct = headers && headers["content-type"];

  if (!ct) {
    // Don't even try to decode, might be binary for all we know
    return data;
  }

  ct = String.trim_all_whites(lower_case(ct));

  int is_xml = has_prefix(ct, "text/xml") || has_prefix(ct, "application/xml");

  if (!has_prefix(ct, "text/") && !is_xml)
    return data;

  string cs = get_ct_cs(ct);

  if (!cs) {
    // No charset in content-type header, look in data for encoding hints
    if (has_prefix(ct, "text/html")) {
      cs = get_cs_from_html(data);
    } else if (is_xml) {
      string converted;
      mixed err = catch {
        converted = Parser.XML.Simple()->autoconvert(data);
      };

      if (!err)
        return remove_bom(converted);

      cs = get_cs_from_xml_enc(data);
    }
  }

  if (cs) {
    // Best effort: on failure data is left as it was received.
    mixed err = catch {
      data = Locale.Charset.decoder(cs)->feed(data)->drain();
    };
    if (err)
      DWRITE("decode_data(): Failed to decode data using charset " + cs);
  }

  if (is_xml)
    data = remove_bom(data);

  return data;
}

/*
  Removes everything preceding the first '<' in data, e.g. a byte
  order mark in front of an XML document. Data that doesn't contain
  any '<' at all is returned unchanged.
*/
string remove_bom(string data) {
  int start = search(data, "<");
  return start > 0 ? data[start..] : data;
}

TAGDOCUMENTATION;   #ifdef manual   constant tagdoc=([       "insert#cached-href":#"<desc type='plugin'>   <p>    <short>This tag inserts the contents of the provided URL,    as read from a database.</short>    The database is updated repeatedly in the background by a background    process that is initiated and run as soon as this module is
Roxen.git/server/modules/tags/insert_cached_href.pike:998: Inside #if defined(manual)
  <attr name='nocache' value='string'>   <p>    If provided the resulting page will get a zero cache time in the RAM cache.    The default time is up to 60 seconds depending on the cache limit imposed by    other RXML tags on the same page.   </p>   </attr>      <attr name='decode-xml' value='string'>   <p> +  <i>(DEPRECATED. All text content is now decoded automatically.)</i>    If provided the resulting content will be decoded to the internal    charset representation by looking at a potential BOM (Byte Order    Mark) and the specified encoding in the XML header. Defaults to UTF-8    if no BOM or encoding was found.   </p>   </attr>",   ]);   #endif