41adb62012-01-23Marcus Wellhardh // This is a roxen module. Copyright © 2012, Roxen IS. #include <module.h> inherit "module";
1ef7b82012-05-10Tobias Liin constant cvs_version = "$Id: periodic-fetcher.pike,v 1.2 2012/05/10 05:53:49 liin Exp $";
41adb62012-01-23Marcus Wellhardh constant thread_safe = 1; constant module_type = MODULE_ZERO; LocaleString module_name = "Periodic Fetcher"; LocaleString module_doc = #"<p>The module fetches a list of URLs periodically. The periodicity can be specified per URL. The main purpose is to populate caches and keep them warm. </p> <p>The list of URLs is specified in a file in the site itself. This file is fetched on startup and the module has to be reloaded in order to update the list. </p> <p>The module uses the curl binary to simulate external requests.</p>"; constant curl_redirs = "5"; void ERROR_MSG(sprintf_format fmt, sprintf_args ... args) { report_error (module_name + ": " + fmt, @args); } void DEBUG_MSG(sprintf_format fmt, sprintf_args ... args) { if(query("debug")) report_debug (module_name + ": " + fmt, @args); } string status() { string res = "<p>"; res += sprintf("Queue size: %d<br>", sizeof(event_queue)); res += sprintf("Crawler status: %s", crawler_status); if(global_events) { res += "<br/><br/>\n"; res += "<table border='1' cellpadding='2' cellspacing='0'>\n"; res += " <tr>\n"; res += " <th align='left'>URL</th>\n"; res += " <th align='left'>Period</th>\n"; res += " <th align='left'>Low</th>\n"; res += " <th align='left'>High</th>\n"; res += " <th align='left'>Last</th>\n"; res += " <th align='left'>Count</th>\n"; res += " </tr>\n"; foreach(global_events, Event event) { res += sprintf("<tr>\n" " <td>%s</td>\n" " <td>%d</td>\n" " <td>%f</td>\n" " <td>%f</td>\n" " <td>%f</td>\n" " <td>%d</td>\n" "</tr>\n", event->url, event->period, event->low/1000000.0, event->high/1000000.0, event->last/1000000.0, event->count); } res += "</table>\n"; } return res+"</p>"; } mapping(string:function) query_action_buttons() { return ([ "Start Crawler": start_crawler, "Stop Crawler": stop_crawler ]); } class Event { string url; int period; int time; int count; int last = UNDEFINED; int high = UNDEFINED; int low = UNDEFINED; void create(string _url, int _period, int|void _time) { url = _url; period = _period; time = _time; } string _sprintf() { return sprintf("Event(%O, %d, %d)", url, period, time); } void update_statistics(int t) { if(t < low || low == UNDEFINED) low = t; if(t > high || high == UNDEFINED) high = t; last = t; count++; } } ADT.Priority_queue event_queue; array(Event) global_events; function do_fetch_co; function start_crawler_co; string crawler_status = "<font color='FFB700'><b>Waiting</b></font>"; void create() { defvar("crawl_src", "http://localhost/periodic-crawl.txt", "Crawl list URL", TYPE_STRING, "<p>The URL to the file that contains the list of URLs to fetch. " "It should be a text file with one URL, and its periodicity in " "seconds separated by space, per line, e.g:</p>" "<pre>" " http://localhost:8080/ 5<br/>" " http://localhost:8080/news 10<br/>" " http://localhost:8080/sports 10<br/>" " http://localhost:8080/rss.xml?category=3455&id=47 20" "</pre>"); defvar("crawl_delay", 60, "Crawl Delay", TYPE_INT, "Wait this amount of second before starting the crawler after " "the roxen server has started or the module has been reloaded."); defvar("curl_path", "/usr/bin/curl", "Curl Path", TYPE_STRING, "The path to the curl binary."); defvar("curl_timeout", 300, "Curl Timeout", TYPE_INT, "The timeout in seconds for each fetch."); defvar("debug", 0, "Debug", TYPE_FLAG, "Activate to print debug messages in the debug log."); defvar("enable", 1, "Enable", TYPE_FLAG, "Enable/Disable the crawler."); } void start() { } void stop() { stop_crawler(); } void ready_to_receive_requests() { event_queue = ADT.Priority_queue(); if(!query("enable")) { crawler_status = "<b>Crawler disabled</b>"; return; }
1ef7b82012-05-10Tobias Liin  roxen.background_run(1, init_crawler); } void init_crawler() {
41adb62012-01-23Marcus Wellhardh  array(Event) events = fetch_events(query("crawl_src")); if(!events) { return; } // Populate queue foreach(events, Event event) schedule_event(event); global_events = events; // Give the server some time before starting the crawler start_crawler_co = roxen.background_run(query("crawl_delay"), start_crawler); } array(Event) fetch_events(string crawl_src) { RequestID id = roxen.InternalRequestID(); id->set_url(crawl_src); string path = Standards.URI(crawl_src)->path; // Get content of crawl file. string crawl_file = my_configuration()->try_get_file(path, id); // werror("%O\n", crawl_file); if (!crawl_file) { ERROR_MSG("Can't fetch crawl source file: %O\n", query("crawl_src")); crawler_status = sprintf("<font color='BC311B'>" " <b>Can't fetch crawl source file: %O.</b>" "</font>", query("crawl_src")); return 0; } // One URL per line. array(string) lines = (crawl_file-"\r") / "\n" - ({""}); array(Event) events = ({ }); foreach(lines, string line) { array fields = line / " " - ({""}); if(sizeof(fields) < 2) { ERROR_MSG("Parse error in crawl source file:\n%s\n", crawl_file); crawler_status = sprintf("<font color='BC311B'>" " <b>Parse error in crawl source file: %O.</b>" "</font>", query("crawl_src")); return 0; } events += ({ Event(fields[0], (int)fields[1]) }); } return events; } void start_crawler() { DEBUG_MSG("Starting Crawler\n"); if(!sizeof(event_queue)) { ERROR_MSG("Queue empty\n"); return; } crawler_status = "<font color='5BBF27'><b>Running</b></font>"; schedule_next(); } void stop_crawler() { DEBUG_MSG("Stopping Crawler\n"); if(start_crawler_co) { remove_call_out(start_crawler_co); } if(do_fetch_co) { remove_call_out(do_fetch_co); } crawler_status = "<b>Stopped</b>"; } void schedule_event(Event event) { event->time = time() + event->period; event_queue->push(event->time, event); } void do_fetch() { Event event = event_queue->pop(); // werror("do_fetch: %O\n", event); int fetch_time = fetch_url(event->url); if(fetch_time >= 0) { event->update_statistics(fetch_time); DEBUG_MSG("%O Pe:%d Lo:%f Hi:%f La:%f Co:%d\n", event->url, event->period, event->low/1000000.0, event->high/1000000.0, event->last/1000000.0, event->count); } schedule_event(event); schedule_next(); } void schedule_next() { Event event = event_queue->peek(); if(!event) return; do_fetch_co = roxen.background_run(event->time - time(), do_fetch); } int fetch_url(string url) { DEBUG_MSG("Fetching %O\n", url); Stdio.File stderr = Stdio.File(); array command_args = ({ query("curl_path"), "-o", "/dev/null", "--max-redirs", (string)curl_redirs, "--max-time", (string)query("curl_timeout"), //"--stderr", "/dev/null", "--silent", "--show-error", url }); mixed err = catch { int start_time = gethrtime(); object process = Process.create_process(command_args, ([ "stderr": stderr->pipe() ]) ); int code = process->wait(); string err_msg = stderr->read(); if(sizeof(err_msg)) ERROR_MSG("%O\n", err_msg); process = 0; if (code) // curl exit code = 0 for success { ERROR_MSG("Process %s failed with exit code %d\n", query("curl_path"), code); return -1; } return gethrtime() - start_time; }; if (err) { ERROR_MSG("Failed to to fetch %s\n", url); ERROR_MSG(describe_backtrace(err)); return -1; } }