// Roxen Whitefish main pike module
//
// Copyright © 2000, Roxen IS.

#include "types.h"

// Registry of loaded filter objects. create() stores each loaded
// filter object here once per entry of its contenttypes member, so the
// indices are MIME type strings.
private mapping filters = ([]);
// Module constructor: loads every "*.pike" file found in the "filters"
// directory next to this source file, instantiates it, and registers
// the resulting object in the filters mapping under each MIME type
// listed in its contenttypes member.
//
// Progress and the final filter count are reported through werror().
// Errors raised while loading an individual filter are not caught and
// therefore abort create(); the catch scaffolding below is kept
// commented out as in the original.
void create()
{
  // Load filters
  werror("Load filters\n");

  // Strip the last path component from __FILE__ to get this file's
  // directory, then point at its "filters" subdirectory.
  array tmp=__FILE__/"/";
  tmp=tmp[0..sizeof(tmp)-2];
  string path=tmp*"/"+"/filters/";

  //  catch {

  // get_dir() returns 0 when the directory is missing or unreadable.
  // glob() does not accept 0 as its second argument, so fall back to an
  // empty listing and let the "No filters loaded" branch report it.
  array(string) f=get_dir( path ) || ({});

  foreach(glob("*.pike",f), string file)
  {
    //    mixed error = catch {
    werror("Try with %s\n", path+file);
    // Casting a path string to object loads and instantiates the program.
    object l=(object)(path+file);
    array(string) mimes = l->contenttypes;
    foreach(mimes, string mime)
      filters[mime]=l;
    //    };
    //    if(error) werror("Failed to load filters/%s\n",file);
  }
  //  };

  if(!sizeof(filters))
    werror("No filters loaded\n");
  else
    werror("Loaded %d filters\n", sizeof(filters));
}

// Rank number for each word context type (the T_* constants,
// presumably defined in types.h -- confirm). The numbers run from 1
// (T_TITLE) to 15 (T_H6); rank() returns them unchanged.
private constant rank_list = ([
  T_TITLE    : 1,
  T_KEYWORDS : 2,
  T_EXT_A    : 3,
  T_H1       : 4,
  T_H2       : 5,
  T_H3       : 6,
  T_DESC     : 7,
  T_H4       : 8,
  T_TH       : 9,
  T_B        : 10,
  T_I        : 11,
  T_A        : 12,
  T_NONE     : 13,
  T_H5       : 14,
  T_H6       : 15
]);
// Looks up the rank number for a word mapping by its "type" entry.
// A type that has no entry in rank_list yields 0.
int rank(mapping word)
{
  mixed context_type = word["type"];
  return rank_list[context_type];
}
// Prototype of the filter interface. The class only declares methods;
// get_filter() returns objects typed as Filter that are created by the
// loaded filter modules. The per-method notes below are inferred from
// the names and signatures only -- confirm against an actual filter.
class Filter
{
  // Presumably replaces the content to be filtered.
  void set_content(string);

  // Presumably returns the anchors found in the content, each as an
  // array of strings.
  array(array(string)) get_anchors();

  // Presumably appends content; the meaning of the int argument is not
  // visible here.
  void add_content(string, int);

  // Returns the filtered content as an array of arrays; the exact
  // layout is defined by the implementing filter.
  array(array) get_filtered_content();

  // Document metadata accessors.
  string get_title();
  string get_keywords();
  string get_description();

  //  string normalization(string);
}
// Returns the result of calling Filter() on the filter object
// registered for mime_type, or 0 when nothing is registered for it.
Filter get_filter(string mime_type)
{
  mixed provider = filters[mime_type];
  if(provider)
    return provider->Filter();
  return 0;
}
// Returns every MIME type that currently has a filter registered.
array(string) get_filter_mime_types()
{
  array(string) registered = indices(filters);
  return registered;
}

// Splits each text chunk on single spaces and builds one mapping per
// non-empty normalized word.
//
// text, context and offset are parallel arrays: text[i] is a chunk,
// context[i] is stored as the "type" of each of its words, and
// offset[i] is added to each word's character position inside the
// chunk. post_normalization is applied to every piece, including empty
// ones; pieces that normalize to an empty string are dropped. ranking
// is called with the word mapping before its "rank" entry exists, and
// its result is stored under "rank".
//
// Returns the word mappings in order of appearance, or 0 when the three
// arrays do not have the same length.
array(mapping) splitter(array(string) text, array(int) context,
                        array(int) offset,
                        function(string:string) post_normalization,
                        function(mapping:int) ranking)
{
  int chunks = sizeof(text);
  if(chunks != sizeof(context) || chunks != sizeof(offset))
    return 0;

  array(mapping) found = ({});

  for(int idx = 0; idx < chunks; idx++)
  {
    // Position of the next piece within the un-normalized chunk.
    int cursor = 0;

    foreach(text[idx] / " ", string piece)
    {
      int start = cursor;
      // +1 accounts for the space that separated the pieces.
      cursor += sizeof(piece) + 1;

      string token = post_normalization(piece);
      if(sizeof(token))
      {
        mapping entry = ([
          "word"   : token,
          "type"   : context[idx],
          // This might be destroyed by pre_normalization
          "offset" : offset[idx] + start,
        ]);
        entry->rank = ranking(entry);
        found += ({ entry });
      }
    }
  }

  return found;
}


// ---------- Anchor database -------------

// Placeholder anchor store: add() discards its arguments and
// get_texts() always returns an empty array.
class Anchor_database
{
  void add(string page, string href, string text)
  {
  }

  array(string) get_texts(string page)
  {
    array(string) none = ({});
    return none;
  }
}
// --- Page Ranking Algorithms ------------

// Returns the ratio of distinct words to total words in page_words:
// 1.0 when every word is unique, approaching 0.0 as words repeat.
// An empty word list returns 0.0 instead of dividing by zero.
float entropy(array(string) page_words)
{
  int total = sizeof(page_words);
  if(!total)
    return 0.0;

  // Using the words as mapping indices collapses duplicates.
  mapping(string:int) words=([]);
  foreach(page_words, string word)
    words[word]=1;

  return (float)sizeof(words)/(float)total;
}

// A normal page has an entropy value around 0.5, so the result x should probably be
// remapped to abs(x-0.5) or even 1-abs(x-0.5)