Roxen.git/server/modules/misc/periodic-fetcher.pike:
// This is a roxen module. Copyright © 2012, Roxen IS.

#include <module.h>
inherit "module";

constant cvs_version = "$Id$";
constant thread_safe = 1;
constant module_type = MODULE_ZERO;

LocaleString module_name = "Periodic Fetcher";
LocaleString module_doc =
#"<p>The module fetches a list of URLs periodically. The periodicity can
be specified per URL. The main purpose is to populate caches and keep
them warm.</p>

<p>The list of URLs is specified in a file in the site itself. This
file is fetched on startup and the module has to be reloaded in order
to update the list.</p>

<p>The module uses the curl binary to simulate external requests.</p>";
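
// Maximum number of redirects curl may follow for each fetch; kept as
// a string since it is passed directly on the curl command line.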
constant curl_redirs = "5";
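
// Logging helpers. Both prefix messages with the module name;
// DEBUG_MSG only logs when the "debug" module setting is enabled.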
void ERROR_MSG(sprintf_format fmt, sprintf_args ... args)
{
  report_error (module_name + ": " + fmt, @args);
}

void DEBUG_MSG(sprintf_format fmt, sprintf_args ... args)
{
  if(query("debug"))
    report_debug (module_name + ": " + fmt, @args);
}
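
// Renders the module's status page in the administration interface:
// queue size, crawler state, and a per-URL table of fetch statistics
// (timings converted from microseconds to seconds for display).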
string status()
{
  string res = "<p>";
  res += sprintf("Queue size: %d<br>", sizeof(event_queue));
  res += sprintf("Crawler status: %s", crawler_status);
  if(global_events)
  {
    res += "<br/><br/>\n";
    res += "<table border='1' cellpadding='2' cellspacing='0'>\n";
    res += " <tr>\n";
    res += "  <th align='left'>URL</th>\n";
    res += "  <th align='left'>Period</th>\n";
    res += "  <th align='left'>Host</th>\n";
    res += "  <th align='left'>Low</th>\n";
    res += "  <th align='left'>High</th>\n";
    res += "  <th align='left'>Last</th>\n";
    res += "  <th align='left'>Count</th>\n";
    res += " </tr>\n";
    foreach(global_events, Event event)
    {
      res += sprintf("<tr>\n"
                     " <td>%s</td>\n"
                     " <td>%d</td>\n"
                     " <td>%s</td>\n"
                     " <td>%f</td>\n"
                     " <td>%f</td>\n"
                     " <td>%f</td>\n"
                     " <td>%d</td>\n"
                     "</tr>\n",
                     event->url, event->period, event->host||"",
                     event->low/1000000.0,
                     event->high/1000000.0,
                     event->last/1000000.0,
                     event->count);
    }
    res += "</table>\n";
  }
  return res+"</p>";
}
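
// Adds "Start Crawler"/"Stop Crawler" action buttons to the module's
// settings page in the administration interface.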
mapping(string:function) query_action_buttons() {
  return ([ "Start Crawler": start_crawler,
            "Stop Crawler": stop_crawler ]);
}
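
// One entry in the crawl schedule: the URL to fetch, its period in
// seconds, an optional Host header value, the next scheduled fetch
// time, and fetch statistics. low/high/last hold wall-clock fetch
// times in microseconds, as returned by gethrtime().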
class Event
{
  string url;
  int period;
  string host;
  int time;
  int count;

  int last = UNDEFINED;
  int high = UNDEFINED;
  int low = UNDEFINED;

  void create(string _url, int _period, string _host)
  {
    url = _url;
    period = _period;
    host = _host;
  }

  string _sprintf(int t)
  {
    return sprintf("Event(%O, %d, %O, %d)", url, period, host, time);
  }

  void update_statistics(int t)
  {
    if(t < low || low == UNDEFINED)
      low = t;

    if(t > high || high == UNDEFINED)
      high = t;

    last = t;
    count++;
  }
}
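
// Global crawler state: the time-ordered event queue, the parsed
// event list (kept for the status page), handles to pending
// background jobs, and the status string shown in the administration
// interface.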
ADT.Priority_queue event_queue;
array(Event) global_events;
function do_fetch_co;
function start_crawler_co;
string crawler_status = "<font color='FFB700'><b>Waiting</b></font>";
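
// Module settings: where to fetch the crawl list, how long to wait
// before the first crawl, which curl binary to use and its timeout,
// plus debug and enable flags.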
void create()
{
  defvar("crawl_src", "http://localhost/periodic-crawl.txt",
         "Crawl list URL", TYPE_STRING,
         "<p>The URL to the file that contains the list of URLs to fetch. "
         "It should be a text file with one URL, and its periodicity in "
         "seconds separated by space, per line. It is also possible to "
         "specify an optional host header at the end of the line, e.g.:</p>"
         "<pre>"
         " http://localhost:8080/ 5<br/>"
         " http://localhost:8080/ 5 mobile.roxen.com<br/>"
         " http://localhost:8080/news 10<br/>"
         " http://localhost:8080/sports 10<br/>"
         " http://localhost:8080/rss.xml?category=3455&id=47 20"
         "</pre>");

  defvar("crawl_delay", 60,
         "Crawl Delay", TYPE_INT,
         "Wait this number of seconds before starting the crawler after "
         "the roxen server has started or the module has been reloaded.");

  defvar("curl_path", "/usr/bin/curl",
         "Curl Path", TYPE_STRING,
         "The path to the curl binary.");

  defvar("curl_timeout", 300,
         "Curl Timeout", TYPE_INT,
         "The timeout in seconds for each fetch.");

  defvar("debug", 0,
         "Debug", TYPE_FLAG,
         "Activate to print debug messages in the debug log.");

  defvar("enable", 1,
         "Enable", TYPE_FLAG,
         "Enable/Disable the crawler.");
}
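
// Module lifecycle. Nothing is needed on start; on stop any pending
// crawler jobs are cancelled. The crawl list itself is loaded from
// ready_to_receive_requests(), since fetching it requires the site to
// be able to serve internal requests.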
void start()
{
}

void stop()
{
  stop_crawler();
}

void ready_to_receive_requests()
{
  event_queue = ADT.Priority_queue();

  if(!query("enable"))
  {
    crawler_status = "<b>Crawler disabled</b>";
    return;
  }

  roxen.background_run(1, init_crawler);
}
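
// Fetches and parses the crawl list, seeds the queue with one event
// per URL, and schedules the crawler to start after the configured
// delay.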
void init_crawler() {
  array(Event) events = fetch_events(query("crawl_src"));
  if(!events)
  {
    return;
  }

  // Populate queue
  foreach(events, Event event)
    schedule_event(event);

  global_events = events;

  // Give the server some time before starting the crawler
  start_crawler_co = roxen.background_run(query("crawl_delay"), start_crawler);
}
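
// Retrieves the crawl list through an internal request to this
// configuration and parses it into Event objects. Each line must
// contain a URL and a period, optionally followed by a host header
// value. Returns 0 (and flags the problem on the status page) if the
// file cannot be fetched or parsed.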
array(Event) fetch_events(string crawl_src)
{
  RequestID id = roxen.InternalRequestID();
  id->set_url(crawl_src);

  string path = Standards.URI(crawl_src)->path;

  // Get content of crawl file.
  string crawl_file = my_configuration()->try_get_file(path, id);
  // werror("%O\n", crawl_file);
  if (!crawl_file)
  {
    ERROR_MSG("Can't fetch crawl source file: %O\n", query("crawl_src"));
    crawler_status =
      sprintf("<font color='BC311B'>"
              " <b>Can't fetch crawl source file: %O.</b>"
              "</font>",
              query("crawl_src"));
    return 0;
  }

  // One URL per line.
  array(string) lines = (crawl_file-"\r") / "\n" - ({""});
  array(Event) events = ({ });
  foreach(lines, string line)
  {
    array fields = line / " " - ({""});
    if(sizeof(fields) < 2)
    {
      ERROR_MSG("Parse error in crawl source file:\n%s\n", crawl_file);
      crawler_status =
        sprintf("<font color='BC311B'>"
                " <b>Parse error in crawl source file: %O.</b>"
                "</font>",
                query("crawl_src"));
      return 0;
    }

    events += ({ Event(fields[0], (int)fields[1],
                       (sizeof(fields) >= 3) ? fields[2] : 0) });
  }
  return events;
}
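
// Starts the fetch loop if the queue is non-empty. Stopping cancels
// the pending background jobs (the handles saved from
// roxen.background_run()) so no further fetches are scheduled.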
void start_crawler()
{
  DEBUG_MSG("Starting Crawler\n");
  if(!sizeof(event_queue))
  {
    ERROR_MSG("Queue empty\n");
    return;
  }
  crawler_status = "<font color='5BBF27'><b>Running</b></font>";
  schedule_next();
}

void stop_crawler()
{
  DEBUG_MSG("Stopping Crawler\n");
  if(start_crawler_co)
  {
    remove_call_out(start_crawler_co);
  }
  if(do_fetch_co)
  {
    remove_call_out(do_fetch_co);
  }

  crawler_status = "<b>Stopped</b>";
}
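
// The scheduling cycle: schedule_event() queues an event at its next
// due time, do_fetch() pops the most urgent event, fetches it,
// records statistics and requeues it, and schedule_next() arms a
// background job for when the head of the queue is due.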
void schedule_event(Event event)
{
  event->time = time() + event->period;
  event_queue->push(event->time, event);
}

void do_fetch()
{
  Event event = event_queue->pop();
  // werror("do_fetch: %O\n", event);
  int fetch_time = fetch_url(event->url, event->host);
  if(fetch_time >= 0)
  {
    event->update_statistics(fetch_time);

    DEBUG_MSG("%O Pe:%d Ho:%O Lo:%f Hi:%f La:%f Co:%d\n",
              event->url, event->period, event->host||"",
              event->low/1000000.0,
              event->high/1000000.0,
              event->last/1000000.0,
              event->count);
  }

  schedule_event(event);
  schedule_next();
}

void schedule_next()
{
  Event event = event_queue->peek();
  if(!event)
    return;
  do_fetch_co = roxen.background_run(event->time - time(), do_fetch);
}
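
// Fetches a single URL by running the curl binary synchronously and
// returns the elapsed wall-clock time in microseconds, or -1 on
// failure. The response body is discarded (-o /dev/null); anything
// curl writes to stderr is logged as an error.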
int fetch_url(string url, string|void host)
{
  DEBUG_MSG("Fetching %O, host: %O\n", url, host||"");
  Stdio.File stderr = Stdio.File();
  array command_args = ({ query("curl_path"),
                          "-o", "/dev/null",
                          "--max-redirs", (string)curl_redirs,
                          "--max-time", (string)query("curl_timeout"),
                          //"--stderr", "/dev/null",
                          "--silent",
                          "--show-error" });

  if(host)
    command_args += ({ "--header", "Host: "+host });

  command_args += ({ url });

  mixed err = catch
  {
    int start_time = gethrtime();
    object process =
      Process.create_process(command_args, ([ "stderr": stderr->pipe() ]) );

    int code = process->wait();
    string err_msg = stderr->read();

    if(sizeof(err_msg))
      ERROR_MSG("%O\n", err_msg);

    process = 0;

    if (code) // curl exit code = 0 for success
    {
      ERROR_MSG("Process %s failed with exit code %d\n",
                query("curl_path"), code);
      return -1;
    }

    return gethrtime() - start_time;
  };

  if (err)
  {
    ERROR_MSG("Failed to fetch %s\n", url);
    ERROR_MSG("%s", describe_backtrace(err));
    return -1;
  }
}