// Roxen Whitefish main pike module
//
// Copyright © 2000, Roxen IS.

#include "types.h"
class Document
{
  //! The placeholder for document metadata.

  // Every field starts out as 0 until a caller assigns it.
  string title;
  string description;

  // NOTE(review): presumably a modification timestamp -- confirm the unit
  // against the code that fills it in.
  int last_changed;

  // NOTE(review): presumably the document size in bytes -- confirm.
  int size;

  string content_type;
}
// Lookup table from MIME type string to the filter module handling it.
// Filled in by create() and read by get_filter() and
// get_filter_mime_types().
private mapping filters = ([ ]);
//! Populates the MIME type -> filter table from the members of
//! Search.Filter and reports the outcome on stderr.
//!
//! Every member's @tt{contenttypes@} is walked (members without one are
//! skipped) and each listed MIME type is mapped to that member. If two
//! members list the same MIME type, the one visited last wins.
void create()
{
  werror("Loading filters\n");

  foreach(values(Search.Filter), Search.Filter candidate)
  {
    foreach(candidate->contenttypes || ({ }), string mime_type)
      filters[mime_type] = candidate;
  }

  // The reported number is the count of registered MIME types, which can
  // differ from the number of filter modules.
  int registered = sizeof(filters);
  if(registered)
    werror("Loaded %d filters\n", registered);
  else
    werror("No filters loaded\n");
}
//! Instantiates the filter registered for a MIME type.
//!
//! @param mime_type
//!   The MIME type to look up in the filter table.
//! @returns
//!   A new object created by calling the registered entry's
//!   @tt{Filter@}, or 0 if nothing is registered for @[mime_type].
Filter get_filter(string mime_type)
{
  mixed provider = filters[mime_type];
  return provider ? provider->Filter() : 0;
}
//! Lists the MIME types that currently have a filter registered.
//!
//! @returns
//!   The keys of the filter table; mapping indices come back in no
//!   particular order.
array(string) get_filter_mime_types()
{
  array(string) mime_types = indices(filters);
  return mime_types;
}
//! Splits text fragments into normalized, ranked word records.
//!
//! @param text
//!   The text fragments; each one is split on single space characters.
//! @param context
//!   One type value per entry in @[text]; it is copied into the
//!   @tt{"type"@} field of every word taken from that fragment.
//! @param post_normalization
//!   Applied to every raw word. Words that come back as an empty string
//!   are dropped.
//! @param ranking
//!   Called with the word record (holding @tt{"word"@} and @tt{"type"@});
//!   its result is stored in the record's @tt{"rank"@} field.
//! @returns
//!   The word records in input order, or 0 if @[text] and @[context]
//!   differ in length.
array(mapping) splitter(array(string) text, array(int) context,
                        function(string:string) post_normalization,
                        function(mapping:int) ranking)
{
  if(sizeof(text) != sizeof(context))
    return 0;

  array(mapping) result = ({});

  for(int i = 0; i < sizeof(text); i++)
  {
    // Runs of spaces produce empty raw words; they are still handed to
    // post_normalization and only dropped if the result is empty.
    foreach(text[i] / " ", string word)
    {
      word = post_normalization(word);
      if(!sizeof(word))
        continue;

      // No "offset" field is recorded. The earlier, disabled version was
      //   "offset":offset[i]+oldinc
      // which needs a per-fragment offset array that this function does
      // not receive, and it carried the remark "This might be destroyed
      // by pre_normalization". The character counters that only fed that
      // field have been removed.
      mapping n_word = ([ "word" : word,
                          "type" : context[i] ]);
      n_word->rank = ranking(n_word);
      result += ({ n_word });
    }
  }

  return result;
}

// ---------- Anchor database -------------

//! Placeholder anchor text store: add() discards its arguments and
//! get_texts() always returns an empty array.
class Anchor_database
{
  void add(string page, string href, string text) { }

  array(string) get_texts(string page)
  {
    return ({});
  }
}
// --- Page Ranking Algorithms ------------

//! Computes the ratio of distinct words to total words on a page.
//!
//! @param page_words
//!   All words of the page, duplicates included.
//! @returns
//!   @expr{distinct / total@}, a value in the range (0.0, 1.0] for a
//!   non-empty page, and 0.0 for an empty one.
float entropy(array(string) page_words)
{
  // An empty page would otherwise evaluate 0.0/0.0, which raises a
  // division-by-zero error instead of returning a number.
  if(!sizeof(page_words))
    return 0.0;

  // Collapse duplicates by using the words as mapping indices.
  mapping(string:int) words = ([]);
  foreach(page_words, string word)
    words[word] = 1;

  return (float)sizeof(words) / (float)sizeof(page_words);
}
// Rank value for each word type constant.
// NOTE(review): presumably a lower number marks a more significant
// context (title first, h6 last) -- confirm with the code consuming ranks.
private constant rank_list = ([
  T_TITLE    : 1,
  T_KEYWORDS : 2,
  T_EXT_A    : 3,
  T_H1       : 4,
  T_H2       : 5,
  T_H3       : 6,
  T_DESC     : 7,
  T_H4       : 8,
  T_TH       : 9,
  T_B        : 10,
  T_I        : 11,
  T_A        : 12,
  T_NONE     : 13,
  T_H5       : 14,
  T_H6       : 15
]);

//! Looks up the rank for a word record by its @tt{"type"@} field.
//!
//! @param word
//!   A word record; only @tt{word->type@} is read.
//! @returns
//!   The entry from the rank table for that type.
//!   NOTE(review): a type missing from the table yields 0, which is below
//!   every listed rank -- confirm that this is intended.
int rank(mapping word)
{
  mixed word_type = word->type;
  return rank_list[word_type];
}
// A normal page has an entropy value around 0.5, so the result x of entropy()
// should probably be remapped to abs(x-0.5) or even 1-abs(x-0.5).