// Roxen Whitefish main pike module
//
// Copyright © 2000, Roxen IS.

#include "types.h"
//! The placeholder for document metadata.
class Document
{
  //! Textual representation of the document: the uri wrapped as
  //! Search.Document("http://<uri>").
  string _sprintf()
  {
    string description_format = "Search.Document(\"http://%s\")";
    return sprintf(description_format, uri);
  }

  //! Location of the document. _sprintf() prepends "http://" to it,
  //! so it is presumably stored without the scheme -- TODO confirm.
  string uri;

  //! Document title.
  string title;

  //! Document description.
  string description;

  //! Last modification time (presumably a Unix timestamp -- confirm
  //! against the code that fills it in).
  int last_changed;

  //! Document size (unit not visible here; presumably bytes).
  int size;

  //! Content type of the document.
  string content_type;
}
// Maps a MIME type string to the filter instance registered for it.
private mapping filters = ([]);

//! Instantiates every program found in values(Search.Filter.Base) and
//! registers the resulting object in the filters mapping, once for
//! each MIME type listed in its contenttypes. A filter without
//! contenttypes registers nothing. When two filters list the same
//! MIME type, the one instantiated last wins.
void create()
{
  // werror("Loading filters\n");

  foreach(values(Search.Filter.Base), program filter_program)
  {
    Search.Filter.Base instance = filter_program();

    // contenttypes may be unset; treat that as an empty list.
    foreach(instance->contenttypes || ({ }), string mime_type)
      filters[mime_type] = instance;
  }

  // if(!sizeof(filters))
  //   werror("No filters loaded\n");
  // else
  //   werror("Loaded %d filters\n", sizeof(filters));
}
//! Looks up the filter registered for @[mime_type].
//!
//! @returns
//!   The filter object, or 0 when no filter is registered for that
//!   MIME type.
Search.Filter.Base get_filter(string mime_type)
{
  // A missing (or false) entry yields 0, anything else is returned as is.
  return filters[mime_type] || 0;
}
//! Lists every MIME type that currently has a filter registered,
//! i.e. the indices of the filters mapping. The order is unspecified.
array(string) get_filter_mime_types()
{
  array(string) registered_types = indices(filters);
  return registered_types;
}
//! Splits text fragments into words and builds one mapping per word.
//!
//! @param text
//!   The text fragments; each one is split on single spaces.
//! @param context
//!   One type value per fragment, parallel to @[text].
//! @param post_normalization
//!   Called on every raw word. Words that normalize to the empty
//!   string are dropped.
//! @param ranking
//!   Called with the word mapping (holding "word" and "type") to
//!   compute its "rank" entry.
//!
//! @returns
//!   An array of mappings with the indices "word", "type" and "rank",
//!   in input order, or 0 when @[text] and @[context] differ in size.
array(mapping) splitter(array(string) text, array(int) context,
                        function(string:string) post_normalization,
                        function(mapping:int) ranking)
{
  int fragments = sizeof(text);
  if(fragments != sizeof(context))
    return 0;

  array(mapping) result = ({});

  for(int idx = 0; idx < fragments; idx++)
  {
    int word_type = context[idx];

    // Running character offsets inside the fragment. They are only
    // needed by the disabled "offset" entry below.
    int next_offset = 0, word_offset;

    foreach(text[idx] / " ", string raw_word)
    {
      word_offset = next_offset;
      next_offset += sizeof(raw_word) + 1;

      string word = post_normalization(raw_word);
      if(sizeof(word))
      {
        mapping entry = ([
          "word" : word,
          "type" : word_type,
          // "offset":offset[i]+oldinc,
          // This might be destroyed by pre_normalization
        ]);
        entry->rank = ranking(entry);
        result += ({ entry });
      }
    }
  }

  return result;
}

// ---------- Anchor database -------------

//! Stub anchor database: add() stores nothing and get_texts() always
//! returns an empty array.
class Anchor_database
{
  //! Does nothing in this implementation.
  void add(string page, string href, string text)
  {
  }

  //! Always returns an empty array in this implementation.
  array(string) get_texts(string page)
  {
    return ({});
  }
}
// --- Page Ranking Algorithms ------------

//! Computes the ratio of distinct words to the total number of words.
//!
//! @param page_words
//!   The words of a page, duplicates included.
//!
//! @returns
//!   sizeof(distinct words) / sizeof(page_words): 1.0 when every word
//!   is unique, approaching 0.0 as words repeat. Returns 0.0 when
//!   @[page_words] is empty or 0.
float entropy(array(string) page_words)
{
  // Without words the ratio would be 0.0/0.0, which Pike reports as
  // a division by zero error, so an empty page is handled up front.
  if(!page_words || !sizeof(page_words))
    return 0.0;

  // The mapping is used as a set: each distinct word becomes one index.
  mapping(string:int) words = ([]);
  foreach(page_words, string word)
    words[word] = 1;

  return (float)sizeof(words) / (float)sizeof(page_words);
}
// Rank value for each word type constant (the T_* symbols are not
// defined in this part of the file; presumably they come from types.h).
private constant rank_list = ([
  T_TITLE    : 1,
  T_KEYWORDS : 2,
  T_EXT_A    : 3,
  T_H1       : 4,
  T_H2       : 5,
  T_H3       : 6,
  T_DESC     : 7,
  T_H4       : 8,
  T_TH       : 9,
  T_B        : 10,
  T_I        : 11,
  T_A        : 12,
  T_NONE     : 13,
  T_H5       : 14,
  T_H6       : 15,
]);

//! Returns the rank_list value for the "type" entry of @[word].
//! A type that is not listed in rank_list yields 0.
int rank(mapping word)
{
  return rank_list[word["type"]];
}
// A normal page has an entropy value around 0.5, so the result x should
// probably be remapped to abs(x-0.5) or even 1-abs(x-0.5).