a23b4c2001-05-25Per Hedbor #include <math.h>
7502512001-05-22Per Hedbor #include "global.h" #include "stralloc.h" #include "global.h"
7a08b42001-05-25Per Hedbor RCSID("$Id: whitefish.c,v 1.17 2001/05/25 16:17:59 per Exp $");
7502512001-05-22Per Hedbor #include "pike_macros.h" #include "interpret.h" #include "program.h" #include "program_id.h" #include "object.h" #include "operators.h"
4ad07e2001-05-22Per Hedbor #include "array.h" #include "module_support.h"
7502512001-05-22Per Hedbor 
0853842001-05-22Per Hedbor #include "config.h"
7502512001-05-22Per Hedbor #include "whitefish.h" #include "resultset.h"
4926f72001-05-22Per Hedbor #include "blob.h"
7502512001-05-22Per Hedbor  /* must be included last */ #include "module_magic.h"
4ad07e2001-05-22Per Hedbor struct tofree { Blob **blobs;
a23b4c2001-05-25Per Hedbor  Blob **tmp;
4ad07e2001-05-22Per Hedbor  int nblobs; struct object *res; };
6aa8ef2001-05-23Per Hedbor static void free_stuff( void *_t )
4ad07e2001-05-22Per Hedbor { struct tofree *t= (struct tofree *)_t; int i; if( t->res ) free_object( t->res ); for( i = 0; i<t->nblobs; i++ ) wf_blob_free( t->blobs[i] );
a23b4c2001-05-25Per Hedbor  free( t->tmp );
4ad07e2001-05-22Per Hedbor  free( t ); }
/*
 * Position of a hit, read from the union member selected by its type:
 * body for HIT_BODY, field for HIT_FIELD, anchor for anything else.
 *
 * The argument is parenthesized so that any expression yielding a hit
 * (for example *ptr) can be passed. It is expanded more than once, so it
 * must not have side effects.
 */
#define OFFSET(X) \
  ((X).type == HIT_BODY ? (X).u.body.pos : \
   (X).type == HIT_FIELD ? (X).u.field.pos : (X).u.anchor.pos)
a23b4c2001-05-25Per Hedbor 
/*
 * Map the distance between two hits to a proximity category 0..7.
 *
 * Each category ends just below the corresponding entry of the bounds
 * table: distances below 2 (including negative ones) give 0, 2..5 give 1,
 * 6..10 give 2, 11..21 give 3, 22..41 give 4, 42..81 give 5, 82..160
 * give 6 and everything from 161 upwards gives 7.
 */
static int _distance_f( int distance )
{
  static const int upper_bound[] = { 2, 6, 11, 22, 42, 82, 161 };
  const int nbounds = (int)(sizeof(upper_bound) / sizeof(upper_bound[0]));
  int category = 0;

  while( category < nbounds && distance >= upper_bound[category] )
    category++;

  return category;
}

/* Proximity column of the ranking matrix for a given distance. */
#define DOFF(X) _distance_f(X)
/*
 * Row of the ranking matrix that a hit belongs to: 0 for HIT_BODY,
 * the field type plus 2 for HIT_FIELD, and 1 for anything else.
 *
 * The argument is parenthesized so that any expression yielding a hit
 * (for example *ptr) can be passed. It is expanded more than once, so it
 * must not have side effects.
 */
#define MOFF(X) \
  ((X).type == HIT_BODY ? 0 : \
   (X).type == HIT_FIELD ? (X).u.field.type + 2 : 1)
a23b4c2001-05-25Per Hedbor  static void handle_hit( Blob **blobs, int nblobs, struct object *res,
5cdc2c2001-05-25Per Hedbor  int docid,
6ac4092001-05-25Johan Schön  double *field_c[66],
a633972001-05-25Per Hedbor  double *prox_c[8], double mc, double mp)
a23b4c2001-05-25Per Hedbor { int i, j, k, end = 0; Hit *hits = malloc( nblobs * sizeof(Hit) ); unsigned char *nhits = malloc( nblobs ); unsigned char *pos = malloc( nblobs );
6ac4092001-05-25Johan Schön  int matrix[66][8];
a23b4c2001-05-25Per Hedbor 
a633972001-05-25Per Hedbor  MEMSET(matrix, 0, sizeof(matrix) );
a23b4c2001-05-25Per Hedbor  MEMSET(hits, 0, nblobs * sizeof(Hit) ); MEMSET(pos, 0, nblobs ); for( i = 0; i<nblobs; i++ ) nhits[i] = wf_blob_nhits( blobs[i] ); for( i = 0; i<nblobs; i++ ) { MEMSET( pos, 0, nblobs ); for( j = 0; j<nhits[i]; j++ ) { hits[i] = wf_blob_hit( blobs[i], j );
cce09d2001-05-25Per Hedbor  matrix[MOFF(hits[i])][0]++;
a633972001-05-25Per Hedbor /* printf("Absolute hit %d -> %d\n", hits[i].raw, MOFF(hits[i]) ); */
a23b4c2001-05-25Per Hedbor  /* forward the other positions */ for( k = 0; k<nblobs; k++ )
a633972001-05-25Per Hedbor  if( k != i && pos[ k ] < nhits[ k ] )
a23b4c2001-05-25Per Hedbor  {
cce09d2001-05-25Per Hedbor  while( (hits[k].raw < hits[i].raw) && (pos[ k ] < nhits[ k ]))
a23b4c2001-05-25Per Hedbor  hits[k] = wf_blob_hit( blobs[k], pos[k]++ );
07a8a02001-05-25Per Hedbor  if( (pos[ k ] < nhits[ k ]) && hits[k].type == hits[i].type )
a633972001-05-25Per Hedbor  { /* printf("Pair hit %d-%d: %x -> [%d][%d]\n", */ /* OFFSET(hits[i]), OFFSET(hits[k]), */ /* hits[i].raw, */ /* MOFF(hits[i]), */ matrix[MOFF(hits[i])][DOFF(OFFSET(hits[k])-OFFSET(hits[i]))]+=2; }
a23b4c2001-05-25Per Hedbor  } } } /* Now we have our nice matrix. Time to do some multiplication */
5cdc2c2001-05-25Per Hedbor 
a633972001-05-25Per Hedbor /* printf("matrix:\n"); */ /* for( i = 0; i<3; i++ ) */ /* { */ /* for( j = 0; j<8; j++ ) */ /* printf( "%4d ", matrix[i][j] ); */ /* printf("\n"); */ /* } */
5cdc2c2001-05-25Per Hedbor  { double accum = 0.0, fc, pc; int accum_i;
6ac4092001-05-25Johan Schön  for( i = 0; i<66; i++ )
a633972001-05-25Per Hedbor  if( (fc = (*field_c)[i]) != 0.0 )
5cdc2c2001-05-25Per Hedbor  for( j = 0; j<8; j++ )
a633972001-05-25Per Hedbor  if( (pc = (*prox_c)[j]) != 0.0 ) accum += (matrix[i][j]*fc*pc) / (mc*mp);
5cdc2c2001-05-25Per Hedbor  /* Limit */ if( accum > 32000.0 ) accum = 32000.0;
a633972001-05-25Per Hedbor  accum_i = (int)(accum *100 );
5cdc2c2001-05-25Per Hedbor  if( accum_i > 0 ) wf_resultset_add( res, docid, accum_i ); }
a23b4c2001-05-25Per Hedbor }
4ad07e2001-05-22Per Hedbor static struct object *low_do_query_merge( Blob **blobs, int nblobs,
6ac4092001-05-25Johan Schön  double field_c[66],
5cdc2c2001-05-25Per Hedbor  double prox_c[8] )
4ad07e2001-05-22Per Hedbor { struct object *res = wf_resultset_new(); struct tofree *__f = malloc( sizeof( struct tofree ) );
a633972001-05-25Per Hedbor  double max_c=0.0, max_p=0.0;
4ad07e2001-05-22Per Hedbor  ONERROR e;
a23b4c2001-05-25Per Hedbor  int i, j, end=0; Blob **tmp; tmp = malloc( nblobs * sizeof( Blob *) );
4ad07e2001-05-22Per Hedbor  __f->res = res; __f->blobs = blobs; __f->nblobs = nblobs;
a23b4c2001-05-25Per Hedbor  __f->tmp = tmp;
4ad07e2001-05-22Per Hedbor  SET_ONERROR( e, free_stuff, __f );
3693062001-05-22Per Hedbor 
a633972001-05-25Per Hedbor  for( i = 0; i<66; i++ ) if( field_c[i] > max_c ) max_c = field_c[i]; for( i = 0; i<8; i++ ) if( prox_c[i] > max_p ) max_p = prox_c[i];
a23b4c2001-05-25Per Hedbor 
a633972001-05-25Per Hedbor  if( max_p != 0.0 && max_c != 0.0 )
a23b4c2001-05-25Per Hedbor  {
a633972001-05-25Per Hedbor  /* Time to do the real work. :-) */ for( i = 0; i<nblobs; i++ ) /* Forward to first element */ wf_blob_next( blobs[i] ); /* Main loop: Find the smallest element in the blob array. */ while( !end ) { unsigned int min = 0x7ffffff;
a23b4c2001-05-25Per Hedbor 
a633972001-05-25Per Hedbor  for( i = 0; i<nblobs; i++ ) if( !blobs[i]->eof && ((unsigned int)blobs[i]->docid) < min ) min = blobs[i]->docid;
3693062001-05-22Per Hedbor 
a633972001-05-25Per Hedbor  if( min == 0x7ffffff ) break; /* printf( "hit in %d: ", min ); */
a23b4c2001-05-25Per Hedbor 
a633972001-05-25Per Hedbor  for( j = 0, i = 0; i < nblobs; i++ ) if( blobs[i]->docid == min && !blobs[i]->eof ) { tmp[j++] = blobs[i]; /* printf( "%8x ", blobs[i]->word ); */ } /* printf( "\n"); */ handle_hit( tmp, j, res, min, &field_c, &prox_c, max_c, max_p );
a23b4c2001-05-25Per Hedbor 
a633972001-05-25Per Hedbor  /* Step the 'min' blobs */ for( i = 0; i<j; i++ ) wf_blob_next( tmp[i] ); }
a23b4c2001-05-25Per Hedbor  } /* Free workarea and return the result. */
3693062001-05-22Per Hedbor 
4ad07e2001-05-22Per Hedbor  UNSET_ONERROR( e ); __f->res = 0; free_stuff( __f ); return res; }
7502512001-05-22Per Hedbor 
4926f72001-05-22Per Hedbor static void f_do_query_merge( INT32 args )
6ac4092001-05-25Johan Schön /*! @decl ResultSet do_query_merge( array(int) words, @
4926f72001-05-22Per Hedbor  *! array(int) field_coefficients, @ *! array(int) proximity_coefficients, @
6ac4092001-05-25Johan Schön  *! function(int:string) blobfeeder) *! @[words]
4926f72001-05-22Per Hedbor  *! *! Arrays of word ids. Note that the order is significant *! for the ranking. *! *! @[field_coefficients] *! *! An array of ranking coefficients for the different fields.
6ac4092001-05-25Johan Schön  *! In the range of [0x0000-0xffff]. The array (always) has 66
4926f72001-05-22Per Hedbor  *! elements: *! *! Index Coefficient for field *! ----- ---------------------
6ac4092001-05-25Johan Schön  *! 0 body *! 1 anchor *! 2..65 Special field 0..63
4926f72001-05-22Per Hedbor  *! *! @[proximity_coefficients] *! *! An array of ranking coefficients for the different *! proximity categories. Always has 8 elements, in the range *! of [0x0000-0xffff]. *! *! Index Meaning *! ----- ------- *! 0 spread: 0 (Perfect hit) *! 1 spread: 1-5 *! 2 spread: 6-10 *! 3 spread: 11-20 *! 4 spread: 21-40 *! 5 spread: 41-80 *! 6 spread: 81-160 *! 7 spread: 161- *! *! The 'spread' value should be defined somehow. *! *! @[blobfeeder] *! *! This function returns a Pike string containing the word hits *! for a certain word_id. Call repeatedly until it returns 0. */ {
5cdc2c2001-05-25Per Hedbor  double proximity_coefficients[8];
6ac4092001-05-25Johan Schön  double field_coefficients[66];
4ad07e2001-05-22Per Hedbor  int numblobs, i; Blob **blobs; struct svalue *cb;
a23b4c2001-05-25Per Hedbor  struct object *res;
4ad07e2001-05-22Per Hedbor  struct array *_words, *_field, *_prox; /* 1: Get all arguments. */
6ac4092001-05-25Johan Schön  get_all_args( "do_query_merge", args, "%a%a%a%*",
7ce8552001-05-22Per Hedbor  &_words, &_field, &_prox, &cb);
4ad07e2001-05-22Per Hedbor 
6ac4092001-05-25Johan Schön  if( _field->size != 66 ) Pike_error("Illegal size of field_coefficients array (expected 66)\n" );
4ad07e2001-05-22Per Hedbor  if( _prox->size != 8 ) Pike_error("Illegal size of proximity_coefficients array (expected 8)\n" ); numblobs = _words->size; if( !numblobs ) { struct object *o = wf_resultset_new( ); pop_n_elems( args ); push_object( o ); return; } blobs = malloc( sizeof(Blob *) * numblobs ); for( i = 0; i<numblobs; i++ ) blobs[i] = wf_blob_new( cb, _words->item[i].u.integer ); for( i = 0; i<8; i++ )
a633972001-05-25Per Hedbor  proximity_coefficients[i] = (double)_prox->item[i].u.integer;
4ad07e2001-05-22Per Hedbor 
6ac4092001-05-25Johan Schön  for( i = 0; i<66; i++ )
a633972001-05-25Per Hedbor  field_coefficients[i] = (double)_field->item[i].u.integer;
4926f72001-05-22Per Hedbor 
a23b4c2001-05-25Per Hedbor  res = low_do_query_merge(blobs,numblobs, field_coefficients, proximity_coefficients ); pop_n_elems( args ); push_object( res );
4926f72001-05-22Per Hedbor }
7502512001-05-22Per Hedbor void pike_module_init(void) { init_resultset_program();
20c5972001-05-23Per Hedbor  init_blob_program();
4926f72001-05-22Per Hedbor  add_function( "do_query_merge", f_do_query_merge, "function(array(int),array(int),array(int)"
7ce8552001-05-22Per Hedbor  ",function(int:string):object)",
4926f72001-05-22Per Hedbor  0 );
7502512001-05-22Per Hedbor } void pike_module_exit(void) {
0853842001-05-22Per Hedbor  exit_resultset_program();
20c5972001-05-23Per Hedbor  exit_blob_program();
7502512001-05-22Per Hedbor }