#include <module.h>
inherit "module";

constant cvs_version = "$Id$";

constant thread_safe = 1;
constant module_type = MODULE_ZERO;
LocaleString module_name = "Periodic Fetcher";
LocaleString module_doc =
#"<p>The module fetches a list of URLs periodically. The periodicity can
be specified per URL. The main purpose is to populate caches and keep
them warm. </p>
<p>The list of URLs is specified in a file in the site itself. This
file is fetched on startup and the module has to be reloaded in order
to update the list. </p>
<p>The module uses the curl binary to simulate external requests.</p>";
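
// The crawl list is plain text, one entry per line:
//   <url-or-path> <period-in-seconds> [host-header]
// For example (from the "Crawl list URL" setting documentation below):
//   http://localhost:8080/ 5
//   http://localhost:8080/ 5 mobile.roxen.com
//   /rss.xml?category=3455&id=47 20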
// Maximum number of redirects curl is allowed to follow.
constant curl_redirs = "5";

// Logging helpers; prefix all messages with the module name.
void ERROR_MSG(sprintf_format fmt, sprintf_args ... args)
{
  report_error (module_name + ": " + fmt, @args);
}

void DEBUG_MSG(sprintf_format fmt, sprintf_args ... args)
{
  if(query("debug"))
    report_debug (module_name + ": " + fmt, @args);
}

string status()
{
  string res = "<p>";
  res += sprintf("Queue size: %d<br>", sizeof(event_queue));
  res += sprintf("Crawler status: %s", crawler_status);
  if(global_events)
  {
    res += "<br/><br/>\n";
    res += "<table border='1' cellpadding='2' cellspacing='0'>\n";
    res += " <tr>\n";
    res += " <th align='left'>URL</th>\n";
    res += " <th align='left'>Period</th>\n";
    res += " <th align='left'>Host</th>\n";
    res += " <th align='left'>Low</th>\n";
    res += " <th align='left'>High</th>\n";
    res += " <th align='left'>Last</th>\n";
    res += " <th align='left'>Count</th>\n";
    res += " </tr>\n";
    foreach(global_events, Event event)
    {
      res += sprintf("<tr>\n"
                     " <td>%s</td>\n"
                     " <td>%d</td>\n"
                     " <td>%s</td>\n"
                     " <td>%f</td>\n"
                     " <td>%f</td>\n"
                     " <td>%f</td>\n"
                     " <td>%d</td>\n"
                     "</tr>\n",
                     event->url, event->period, event->host||"",
                     event->low/1000000.0,
                     event->high/1000000.0,
                     event->last/1000000.0,
                     event->count);
    }
    res += "</table>\n";
  }
  return res+"</p>";
}

mapping(string:function) query_action_buttons() {
  return ([ "Start Crawler": start_crawler,
            "Stop Crawler": stop_crawler ]);
}
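
// One entry in the crawl list: a URL, its fetch period in seconds and an
// optional Host header, plus bookkeeping for the next scheduled fetch and
// the low/high/last fetch-time statistics (in microseconds).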
class Event
{
  string url;
  int period;
  string host;
  int time;   // Next scheduled fetch (unix time).
  int count;
  int last = UNDEFINED;
  int high = UNDEFINED;
  int low = UNDEFINED;

  // The host argument is optional; lines without a host header pass 0.
  void create(string _url, int _period, string|void _host)
  {
    url = _url;
    period = _period;
    host = _host;
  }

  string _sprintf(int t)
  {
    return sprintf("Event(%O, %d, %O, %d)", url, period, host, time);
  }

  // Record the latest fetch time and update the low/high watermarks.
  void update_statistics(int t)
  {
    if(t < low || low == UNDEFINED)
      low = t;
    if(t > high || high == UNDEFINED)
      high = t;
    last = t;
    count++;
  }
}

// Events ordered by their next scheduled fetch time.
ADT.Priority_queue event_queue;
// All events parsed from the crawl source file.
array(Event) global_events;
// Call out handles, kept so pending fetches can be cancelled in stop_crawler().
function do_fetch_co;
function start_crawler_co;

string crawler_status = "<font color='FFB700'><b>Waiting</b></font>";

void create()
{
  defvar("crawl_src", "http://localhost/periodic-crawl.txt",
         "Crawl list URL", TYPE_STRING,
         "<p>The URL to the file that contains the list of URLs or paths "
         "to fetch. It should be a text file with one entry per line: a "
         "URL or path followed by its periodicity in seconds, separated "
         "by a space. An optional host header may be added at the end of "
         "the line, e.g.:</p>"
         "<pre>"
         " http://localhost:8080/ 5<br/>"
         " http://localhost:8080/ 5 mobile.roxen.com<br/>"
         " http://localhost:8080/news 10<br/>"
         " http://localhost:8080/sports 10<br/>"
         " /rss.xml?category=3455&id=47 20"
         "</pre>"
         "<p>When a path is provided instead of a URL, a full URL will be "
         "constructed by prefixing the path with the URL in the "
         "'Base URL' setting.</p>");

  defvar("base_url", "http://localhost:8080",
         "Base URL", TYPE_STRING,
         "For lines in the text file that contain a path instead of a URL, "
         "this URL is prepended to construct a complete URL. This is useful "
         "if the frontends need to crawl using separate URLs.");

  defvar("crawl_delay", 60,
         "Crawl Delay", TYPE_INT,
         "Wait this number of seconds before starting the crawler after "
         "the roxen server has started or the module has been reloaded.");

  defvar("curl_path", "/usr/bin/curl",
         "Curl Path", TYPE_STRING,
         "The path to the curl binary.");

  defvar("curl_timeout", 300,
         "Curl Timeout", TYPE_INT,
         "The timeout in seconds for each fetch.");

  defvar("debug", 0,
         "Debug", TYPE_FLAG,
         "Activate to print debug messages in the debug log.");

  defvar("enable", 1,
         "Enable", TYPE_FLAG,
         "Enable/Disable the crawler.");
}

void start()
{
}

void stop()
{
  stop_crawler();
}
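
// Called by Roxen once the server is ready to accept requests. The crawl
// itself is deferred: init_crawler runs after one second, and the crawler
// proper starts only after the configured Crawl Delay has passed.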
void ready_to_receive_requests()
{
  event_queue = ADT.Priority_queue();
  if(!query("enable"))
  {
    crawler_status = "<b>Crawler disabled</b>";
    return;
  }
  roxen.background_run(1, init_crawler);
}

void init_crawler() {
  array(Event) events = fetch_events(query("crawl_src"));
  if(!events)
  {
    return;
  }
  foreach(events, Event event)
    schedule_event(event);
  global_events = events;
  start_crawler_co = roxen.background_run(query("crawl_delay"), start_crawler);
}
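
// Fetches the crawl source file through an internal request and parses it,
// returning an array of Events, or 0 if the file is missing or malformed.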
array(Event) fetch_events(string crawl_src)
{
  RequestID id = roxen.InternalRequestID();
  id->set_url(crawl_src);
  string path = Standards.URI(crawl_src)->path;
  string crawl_file = my_configuration()->try_get_file(path, id);
  if (!crawl_file)
  {
    ERROR_MSG("Can't fetch crawl source file: %O\n", query("crawl_src"));
    crawler_status =
      sprintf("<font color='BC311B'>"
              " <b>Can't fetch crawl source file: %O.</b>"
              "</font>",
              query("crawl_src"));
    return 0;
  }
  array(string) lines = (crawl_file-"\r") / "\n" - ({""});
  array(Event) events = ({ });
  foreach(lines, string line)
  {
    string url;
    array fields = line / " " - ({""});
    if(sizeof(fields) < 2)
    {
      ERROR_MSG("Parse error in crawl source file:\n%s\n", crawl_file);
      crawler_status =
        sprintf("<font color='BC311B'>"
                " <b>Parse error in crawl source file: %O.</b>"
                "</font>",
                query("crawl_src"));
      return 0;
    }
    if (has_value(fields[0], "://")) {
      url = fields[0];
    } else {
      url = query("base_url") + fields[0];
    }
    events += ({ Event(url, (int)fields[1],
                       (sizeof(fields) >= 3)? fields[2] : 0) });
  }
  return events;
}

void start_crawler()
{
  DEBUG_MSG("Starting Crawler\n");
  if(!sizeof(event_queue))
  {
    ERROR_MSG("Queue empty\n");
    return;
  }
  crawler_status = "<font color='5BBF27'><b>Running</b></font>";
  schedule_next();
}

void stop_crawler()
{
  DEBUG_MSG("Stopping Crawler\n");
  if(start_crawler_co)
  {
    remove_call_out(start_crawler_co);
  }
  if(do_fetch_co)
  {
    remove_call_out(do_fetch_co);
  }
  crawler_status = "<b>Stopped</b>";
}

void schedule_event(Event event)
{
  event->time = time() + event->period;
  event_queue->push(event->time, event);
}
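
// The crawl loop: pop the event that is due, fetch it, record statistics,
// then push it back with a new deadline and schedule a call out for
// whichever event is now first in the queue.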
void do_fetch()
{
  Event event = event_queue->pop();
  int fetch_time = fetch_url(event->url, event->host);
  if(fetch_time >= 0)
  {
    event->update_statistics(fetch_time);
    DEBUG_MSG("%O Pe:%d Ho:%O Lo:%f Hi:%f La:%f Co:%d\n",
              event->url, event->period, event->host||"",
              event->low/1000000.0,
              event->high/1000000.0,
              event->last/1000000.0,
              event->count);
  }
  schedule_event(event);
  schedule_next();
}

void schedule_next()
{
  Event event = event_queue->peek();
  if(!event)
    return;
  do_fetch_co = roxen.background_run(event->time - time(), do_fetch);
}
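
// Shells out to curl and blocks until it exits. Returns the elapsed time
// in microseconds (gethrtime() resolution), or -1 if curl failed.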
int fetch_url(string url, string|void host)
{
  DEBUG_MSG("Fetching %O, host: %O\n", url, host||"");
  Stdio.File stderr = Stdio.File();
  array command_args = ({ query("curl_path"),
                          "-o", "/dev/null",
                          "--max-redirs", (string)curl_redirs,
                          "--max-time", (string)query("curl_timeout"),
                          "--silent",
                          "--show-error" });
  if(host)
    command_args += ({ "--header", "Host: "+host });
  command_args += ({ url });

  mixed err = catch
  {
    int start_time = gethrtime();
    object process =
      Process.create_process(command_args, ([ "stderr": stderr->pipe() ]) );
    int code = process->wait();
    string err_msg = stderr->read();
    if(sizeof(err_msg))
      ERROR_MSG("%O\n", err_msg);
    process = 0;
    if (code)
    {
      ERROR_MSG("Process %s failed with exit code %d\n",
                query("curl_path"), code);
      return -1;
    }
    return gethrtime() - start_time;
  };
  if (err)
  {
    ERROR_MSG("Failed to fetch %s\n", url);
    // Pass the backtrace as an argument, not as the format string, so any
    // '%' characters in it cannot break sprintf.
    ERROR_MSG("%s", describe_backtrace(err));
    return -1;
  }
}