#pike __REAL_VERSION__

// Cannot dump this because of the #if constant(Roxen.xxx) check below.
constant dont_dump_module = 1;

// Filter for text/html

// Enables the intrawise.* entries in the fields list below.
#define INTRAWISE

inherit .Base;

// Content types this filter is declared for.
constant contenttypes = ({ "text/html" });

// Field names declared by this filter. The intrawise.* names are only
// included when INTRAWISE is defined (see above).
// NOTE(review): presumably .Base or the indexer uses this list to know
// which fields the filter may produce -- confirm against .Base.
constant fields = ({ "body", "title", "keywords", "description", "robots",
                     "headline", "modified", "author", "summary",
#ifdef INTRAWISE
                     "intrawise.folderid", "intrawise.type",
#endif
});
e1d7152000-11-24Johan Schön 
// Callback that accepts any arguments and always yields 0. It is
// registered with the parsers wherever a handler has to exist but
// must not produce any output.
protected int(0..0) return_zero(mixed ... args)
{
  return 0;
}
40a44d2004-08-07Johan Schön 
// Preconfigured parser set up in create(); filter() clones it for
// every document instead of configuring a new one each time.
protected Parser.HTML parser;

// Parser used by clean(). It is created with
// Parser.html_entity_parser(1) and its tag callback replaces every
// tag with an empty string.
protected Parser.HTML cleaner;

// Maps "&name;" to the replacement text taken from
// Parser.html_entities. Filled in by create().
protected mapping entities;
40a44d2004-08-07Johan Schön 
// Sets up the shared state of the filter: the template parser that
// filter() clones, the cleaner parser used by clean(), and the
// "&name;" -> replacement lookup table.
protected void create()
{
  // --- Template for the per-document parser ---
  parser = Parser.HTML();
  parser->case_insensitive_tag(1);
  parser->lazy_entity_end(1);
  parser->ignore_unknown(1);
  parser->match_tag(0);
  parser->add_quote_tag("?", return_zero, "?");
  parser->_set_tag_callback(lambda(Parser.HTML html, string tag_data) {
                              // Do nothing! Callback still needed so that
                              // unknown tags aren't sent to
                              // _set_data_callback.
                            });

  // The contents of these containers are replaced by an empty string.
  mapping(string:string) ignored_containers = ([
    "script": "",
    "style":  "",
  ]);
  parser->add_containers(ignored_containers);

  // --- Parser used by clean() ---
  cleaner = Parser.html_entity_parser(1);
  cleaner->case_insensitive_tag(1);
  cleaner->lazy_entity_end(1);
  cleaner->ignore_unknown(1);
  cleaner->match_tag(0);
  cleaner->add_quote_tag("!--", return_zero, "--");
  cleaner->add_quote_tag("?", return_zero, "?");
  // Every tag is replaced by an empty string.
  cleaner->_set_tag_callback(lambda(Parser.HTML html, string tag_data) {
                               return ({ "" });
                             });

  // --- Entity lookup table, keyed on the complete "&name;" form ---
  entities = ([]);
  foreach(Parser.html_entities; string name; string replacement)
    entities["&" + name + ";"] = replacement;
}
// Runs data through the shared cleaner parser (configured in
// create()) and returns what the parser produced.
protected string clean(string data)
{
  // finish() feeds the data and ends the pass; the result is then
  // collected with read().
  object finished = cleaner->finish(data);
  return finished->read();
}
// Stores one header (or <meta> name/content pair) in res->fields.
//
// header - Header name; it is matched and stored in lower case.
// value  - Header value.
// res    - Output object whose fields mapping is updated.
//
// "robots" values are accumulated as a comma separated list. For
// "last-modified" an attempt is made to also store the decoded time in
// the "mtime" field; errors from the decoding are ignored. All headers
// except "robots" are then stored under their lower-cased name. That
// includes "mtime", "description", "keywords", "modified", "author",
// "intrawise.folderid" and "intrawise.type", which get no special
// treatment.
void parse_http_header(string header, string value, .Output res)
{
  string name = lower_case(header);

  if (name == "robots") {
    mixed previous = res->fields->robots;
    res->fields->robots =
      stringp(previous) ? previous + "," + value : value;
    return;
  }

  if (name == "last-modified") {
    catch {
#if constant(Roxen.parse_since)
      // Roxen.parse_since() supports multiple time formats.
      res->fields->mtime = (string)Roxen.parse_since(value)[0];
#elif constant(Protocols.HTTP.Server.http_decode_date)
      // Protocols.HTTP.Server.http_decode_date() currently
      // only supports the format specified by the RFC.
      res->fields->mtime =
        (string)Protocols.HTTP.Server.http_decode_date(value);
#else
      // Fallback for Pike 7.4.
      Calendar.ISO_UTC.Second s=
        Calendar.ISO_UTC.parse("%e, %D %M %Y %h:%m:%s GMT", value);
      res->fields->mtime = (string)(s && s->unix_time());
#endif
    };
    // The raw header value is stored as well, below.
  }

  res->fields[name] = value;
}
// Extracts text, metadata fields and links from an HTML document.
//
// uri             - URI of the document. It is replaced by the href of
//                   a <base> tag if one is found and can be parsed by
//                   Standards.URI. Finally passed to
//                   res->fix_relative_links().
// data            - The document, either as a string or as an object
//                   that is read with read().
// content_type    - Not used by this function.
// headers         - Header mapping. It is passed to
//                   .Charset.decode_http() and every entry is fed to
//                   parse_http_header().
// default_charset - Passed on to .Charset.decode_http().
//
// Returns a .Output object where links holds the collected links and
// fields holds "body", "title", "description", "keywords", possibly
// "headline", plus whatever parse_http_header() stored.
//
// Text between <!-- robots:noindex --> and <!-- /robots:noindex -->
// is left out of the fields, and links between
// <!-- robots:nofollow --> and <!-- /robots:nofollow --> are not
// collected.
.Output filter(Standards.URI uri, string|Stdio.File data,
               string content_type, mapping headers,
               string|void default_charset )
{
  // Sink for body text. Assigned further down, before parsing starts;
  // parse_noindex() temporarily redirects it to return_zero().
  function(string...:void) dadd;
  .Output res=.Output();

  if(objectp(data))
    data=data->read();

  data = .Charset.decode_http( data, headers, default_charset );

  foreach(headers; string header; string value) {
    parse_http_header(header, value, res);
  }

#if 0
  array parse_rank(Parser.HTML p, mapping m, string c)
  {
    if(!m->name)
      return ({});

    if(res->fields[m->name])
      res->fields[m->name] += " " + clean(c);
    else
      res->fields[m->name] = clean(c);
    return ({});
  };
#endif

  // <meta>: handled like an HTTP header, using name or http-equiv as
  // the header name.
  array parse_meta(Parser.HTML p, mapping m, mapping e)
  {
    if (e->noindex) return ({ });
    parse_http_header(m->name||m["http-equiv"]||"",
                      m->contents||m->content||m->data||"", res);
    return ({});
  };

  _WhiteFish.LinkFarm lf = _WhiteFish.LinkFarm();
  function low_ladd = lf->add;

  // Adds a link to the link farm after decoding HTML entities in it.
  void ladd(string html_href)
  {
    low_ladd(Parser.parse_html_entities(html_href, 1));
  };

  // <title>: the cleaned contents become the title field. The raw
  // contents are returned to the parser as well.
  array(string) parse_title(Parser.HTML p, mapping m, string c, mapping e)
  {
    if (e->noindex) return ({ });
    res->fields->title = clean(c);
    return ({c});
  };

  // FIXME: Push the a contents to the description field of the
  // document referenced to by this tag.
  array parse_a(Parser.HTML p, mapping m, mapping e)
  {
    // FIXME: We should try to decode the source with the
    // charset indicated in m->charset.
    // FIXME: We should set the document language to the
    // language indicated in m->hreflang.
    if(m->href && !e->nofollow) ladd( m->href );

    // FIXME: Push the value of m->title to the title field of
    // the referenced document.
    //    if(m->title)
    //      dadd(" ", m->title, " ");
    return ({});
  };

  // FIXME: The longdesc information should be pushed to the
  // description field of the frame src URL when it is indexed.
  array parse_frame(Parser.HTML p, mapping m, mapping e)
  {
    if(m->src && !e->nofollow) ladd( m->src );
    return ({});
  };

  // FIXME: This information should be pushed to the body field
  // of the image file, if it is indexed.
  array parse_img(Parser.HTML p, mapping m, mapping e)
  {
    if( !e->noindex ) {
      if( m->alt && sizeof(m->alt) )
        dadd(" ", clean(m->alt));
      if( m->title && sizeof(m->title) )
        dadd(" ", clean(m->title));
    }
    return ({});
  };

  array parse_applet(Parser.HTML p, mapping m, mapping e)
  {
    // FIXME: The alt information should be pushed to the body field
    // of all the resources linked from this tag.
    if( m->src && !e->nofollow ) ladd( m->src );
    if( m->archive && !e->nofollow )
      ladd( m->archive); // URL to a GNU-ZIP file with classes needed
                         // by the applet.
    if( m->code && !e->nofollow )
      ladd( m->code ); // URL to the applets code/class.
    if( m->codebase && !e->nofollow ) ladd( m->codebase );
    if( m->alt && sizeof(m->alt) )
      if( !e->noindex )
        dadd(" ", clean(m->alt));
    return ({});
  };

  // <area>, <bgsound>
  array parse_src_alt(Parser.HTML p, mapping m, mapping e)
  {
    // FIXME: The alt information should be pushed to the body field
    // of all the resources linked from this tag.
    if( m->src && !e->nofollow ) ladd( m->src );
    if( m->alt && sizeof(m->alt) )
      if( !e->noindex )
        dadd(" ", clean(m->alt));
    return ({});
  };

  array parse_background(Parser.HTML p, mapping m, mapping e)
  {
    if( m->background && !e->nofollow ) ladd( m->background );
    return ({});
  };

  array parse_embed(Parser.HTML p, mapping m, mapping e)
  {
    if( m->pluginspage && !e->nofollow )
      ladd( m->pluginspage ); // Where the required plugin can be downloaded.
    if( m->pluginurl && !e->nofollow )
      ladd( m->pluginurl ); // Similar to pluginspage, but for java archives.
    if( m->src && !e->nofollow ) ladd( m->src );
    return ({});
  };

  array parse_layer(Parser.HTML p, mapping m, mapping e)
  {
    if( m->background && !e->nofollow ) ladd( m->background );
    if( m->src && !e->nofollow ) ladd( m->src );
    return ({});
  };

  array parse_object(Parser.HTML p, mapping m, mapping e)
  {
    if( m->archive && !e->nofollow ) ladd( m->archive );
    if( m->classid && !e->nofollow ) ladd( m->classid );
    if( m->code && !e->nofollow ) ladd( m->code );
    if( m->codebase && !e->nofollow ) ladd( m->codebase );
    if( m->data && !e->nofollow ) ladd( m->data );
    if( m->standby && sizeof(m->standby) )
      if ( !e->noindex )
        dadd(" ", clean(m->standby) );
    return ({});
  };

  // <base>: replaces the document URI if the href can be parsed;
  // errors from Standards.URI are ignored.
  array parse_base(Parser.HTML p, mapping m)
  {
    if(m->href)
      catch(uri = Standards.URI(m->href));
    return ({});
  };

  array parse_q(Parser.HTML p, mapping m, mapping e)
  {
    if( m->cite && !e->nofollow ) ladd( m->cite );
    return ({});
  };

  array parse_xml(Parser.HTML p, mapping m, mapping e)
  {
    if( m->ns && !e->nofollow ) ladd( m->ns );
    if( m->src && !e->nofollow ) ladd( m->src );
    return ({});
  };

  // <h1>, <h2>, <h3>: cleaned contents are appended to the headline
  // field.
  array parse_headline(Parser.HTML p, mapping m, string c, mapping e)
  {
    if (e->noindex) return ({ });
    if(!res->fields->headline)
      res->fields->headline = "";
    res->fields->headline += " " + clean(c);
    return ({});
  };

  // <noindex>, <no-index>, <no_index>: the contents are run through a
  // clone of the parser with data, entity, title and headline handling
  // disabled and with dadd silenced, so only the remaining tag
  // callbacks (e.g. the link collecting ones) have any effect. With a
  // nofollow attribute the contents are skipped altogether.
  array parse_noindex(Parser.HTML p, mapping m, string c)
  {
    if(m->nofollow)
      return ({});
    Parser.HTML parser = p->clone();
    parser->_set_data_callback(return_zero);
    parser->_set_entity_callback(return_zero);
    parser->add_tags( ([ "title" : return_zero,
                         "h1"    : return_zero,
                         "h2"    : return_zero,
                         "h3"    : return_zero, ]) );
    function odadd = dadd;
    dadd = return_zero;
    parser->finish(c);
    dadd = odadd;
    return ({});
  };

  // <!-- robots:noindex -->...<!-- /robots:noindex -->
  // <!-- robots:nofollow -->...<!-- /robots:nofollow -->
  //
  // The counters in extra track how many such regions are currently
  // open. They are never decremented below zero: a closing marker
  // without a matching opening one would otherwise leave a negative
  // (i.e. true) value, which hides all following content and makes the
  // next real noindex/nofollow region count as not being one.
  mixed parse_comment(Parser.HTML p, string c, mapping extra)
  {
    if(has_value(c, "/robots:noindex")) {
      if(extra->noindex > 0)
        extra->noindex--;
    }
    else if(has_value(c, "robots:noindex"))
      extra->noindex++;

    if(has_value(c, "/robots:nofollow")) {
      if(extra->nofollow > 0)
        extra->nofollow--;
    }
    else if(has_value(c, "robots:nofollow"))
      extra->nofollow++;

    return ({""});
  };

  String.Buffer databuf=String.Buffer(sizeof(data));
  // Per-document parser, cloned from the template built in create().
  Parser.HTML parser = parser->clone();
  // The extra mapping is handed to the callbacks as their last
  // argument; it carries the noindex/nofollow counters.
  mapping extra = ([]);
  parser->set_extra(extra);
  parser->add_quote_tag("!--", parse_comment, "--");
  parser->add_quote_tag("![CDATA[",
                        lambda(Parser.HTML p, string data, mapping e) {
                          if(!e->noindex)
                            dadd(data);
                        },
                        "]]");

  //  parser->add_container("rank",parse_rank);
  parser->add_containers( ([ "title":parse_title,
                             "h1": parse_headline,
                             "h2": parse_headline,
                             "h3": parse_headline,
                             "noindex": parse_noindex,
                             "no-index": parse_noindex,
                             "no_index": parse_noindex, ]) );

  parser->add_tags( ([ "meta":parse_meta,
                       "a":parse_a,
                       "base": parse_base,
                       "link":parse_a,
                       "frame":parse_frame,
                       "iframe":parse_frame,
                       "layer":parse_layer,
                       "ilayer":parse_layer,
                       "img":parse_img,
                       "applet":parse_applet,
                       "area":parse_src_alt,
                       "bgsound":parse_src_alt,
                       "sound":parse_src_alt,
                       "body":parse_background,
                       "table":parse_background,
                       "td":parse_background,
                       "object": parse_object,
                       "q":parse_q,
                       "embed":parse_embed,
                       "xml":parse_xml
  ]) );

  dadd = databuf->add;

  // Set once data has been added; makes the next data chunk get a
  // separating space. Cleared when an entity is added, so that the
  // entity is not separated from the data following it.
  int space;
  parser->_set_data_callback(lambda(Parser.HTML p, string data, mapping e) {
                               if (!e->noindex) {
                                 if(space) dadd(" ");
                                 dadd(data);
                                 space = 1;
                               }
                             });

  parser->_set_entity_callback(lambda(Parser.HTML p, string data, mapping e) {
                                 if(!e->noindex) {
                                   if(entities[data]) {
                                     dadd(entities[data]);
                                     space = 0;
                                     return;
                                   }
                                   // NOTE(review): data is looked up above
                                   // in its "&name;" form; confirm that
                                   // Parser.decode_numeric_xml_entity()
                                   // accepts that form too and not only
                                   // the bare "#nnn" part.
                                   string c =
                                     Parser.decode_numeric_xml_entity(data);
                                   if(c) {
                                     space = 0;
                                     dadd(c);
                                   }
                                 }
                               });

  // Note that this discards any title, description or keywords value
  // stored from the headers above; only values found in the document
  // itself end up in these fields.
  res->fields->title="";
  res->fields->description="";
  res->fields->keywords="";

  parser->finish(data);

  res->links = lf->read();
  res->fields->body=databuf->get();
  res->fix_relative_links(uri);

  return res;
}