87e9262001-06-22Martin Nilsson // This file is part of Roxen Search
6351652009-05-25Martin Stjernholm // Copyright © 2001 - 2009, Roxen IS. All rights reserved.
87e9262001-06-22Martin Nilsson //
6351652009-05-25Martin Stjernholm // $Id: Indexer.pmod,v 1.19 2009/05/25 18:26:52 mast Exp $
87e9262001-06-22Martin Nilsson 
28d6b82001-08-08Martin Nilsson //!
//! Re-indexes one document in @[db] from an already filtered set of
//! fields.
//!
//! The document identified by @[uri] and @[language] is first removed
//! from @[db]. After that, every entry in @[fields] whose value has a
//! non-zero size is passed through
//! @[Search.Utils.tokenize_and_normalize] and the resulting words are
//! stored with @expr{db->insert_words@} under the name of the field.
//! Finally, if @expr{fields->mtime@} cast to an integer is non-zero,
//! it is recorded with @expr{db->set_lastmodified@}.
//!
//! @param db
//!   The database the document is written to.
//! @param uri
//!   The URI of the document.
//! @param language
//!   Optional language of the document; passed on unchanged to @[db].
//! @param fields
//!   Mapping from field name to the text of that field.
void index_document(Search.Database.Base db,
                    string|Standards.URI uri,
                    void|string language,
                    mapping fields)
{
  // Drop the previous version of the document before adding the new one.
  db->remove_document(uri, language);

  int modified = (int)fields->mtime;

  foreach(indices(fields), string field_name)
  {
    string text = fields[field_name];

    // Nothing to tokenize for zero-sized values.
    if( !sizeof(text) )
      continue;

    db->insert_words(uri, language, field_name,
                     Search.Utils.tokenize_and_normalize(text));
  }

  if( modified )
    db->set_lastmodified(uri, language, modified);

  // Tokenize any anchor fields (currently disabled):
  //
  //   int source_hash=hash((string)uri)&0xf;
  //   foreach(indices(uri_anchors|| ({ })), string link_uri)
  //   {
  //     array(string) words=
  //       Search.Utils.tokenize_and_normalize(uri_anchors[link_uri]);
  //     db->insert_words(link_uri, 0, "anchor", words, source_hash);
  //   }
}
28d6b82001-08-08Martin Nilsson //!
//! Maps a file name extension to a media type.
//!
//! @param extension
//!   The extension to look up with @[MIME.ext_to_media_type].
//! @returns
//!   The value from @[MIME.ext_to_media_type] if it is true, otherwise
//!   @expr{"application/octet-stream"@}.
string extension_to_type(string extension)
{
  string media_type = MIME.ext_to_media_type(extension);
  if (media_type)
    return media_type;
  return "application/octet-stream";
}

//! Maps a file name to a media type based on the text after its last
//! @expr{"."@}.
//!
//! @param filename
//!   The file name to examine.
//! @returns
//!   The result of @[extension_to_type] for the last dot-separated
//!   part of @[filename], or for the pseudo extension
//!   @expr{"default"@} when @[filename] contains no dot.
string filename_to_type(string filename)
{
  array(string) parts = filename / ".";
  string extension = (sizeof(parts) < 2) ? "default" : parts[-1];
  return extension_to_type(extension);
}

//! Runs @[data] through the filter registered for @[content_type] and
//! indexes the resulting fields with @[index_document].
//!
//! @param db
//!   The database the document is written to.
//! @param uri
//!   The URI of the document.
//! @param language
//!   Optional language of the document.
//! @param data
//!   The raw document, handed to the filter as is.
//! @param content_type
//!   Used to select the filter through @[Search.get_filter] and also
//!   passed on to the filter.
//! @param headers
//!   Optional headers, passed on to the filter.
//! @param default_charset
//!   Optional default charset, passed on to the filter.
//! @returns
//!   The output object produced by the filter, or @expr{0@} when
//!   @[Search.get_filter] has no filter for @[content_type] (in which
//!   case nothing is indexed).
Search.Filter.Output filter_and_index(Search.Database.Base db,
                                      string|Standards.URI uri,
                                      void|string language,
                                      string|Stdio.File data,
                                      string content_type,
                                      void|mapping headers,
                                      void|string default_charset )
{
  Search.Filter.Base content_filter = Search.get_filter(content_type);

  if (content_filter)
  {
    Search.Filter.Output result =
      content_filter->filter(uri, data, content_type,
                             headers, default_charset);
    index_document(db, uri, language, result->fields);
    return result;
  }

  // No filter is available for this content type.
  return 0;
}
28d6b82001-08-08Martin Nilsson //!
//! Removes a document from @[db] by forwarding the call to
//! @expr{db->remove_document@}.
//!
//! @param db
//!   The database to remove the document from.
//! @param uri
//!   The URI of the document.
//! @param language
//!   Optional language of the document; passed on unchanged.
void remove_document(Search.Database.Base db,
                     string|Standards.URI uri,
                     void|string language)
{
  db->remove_document(uri, language);
}