Main Page | File List

score.c

00001 #include <stdio.h>
00002 #include <stdlib.h>
00003 #include <string.h>
00004 #include <math.h>
00005 #include <gmp.h>
00006 #include <mpfr.h>
00007 #include "score.h"
00008 #include <db.h> 
00009 #include "store.h"
00010 
// Handle of the temporary scoring database: created and opened by
// score_initScoring(), closed by score_closedb().
static DB *dbtemp=NULL;
// Number of "bad" and "good" pages in the permanent database; loaded by
// score_initScoring() from the "~~nombre-pages~~" entry (index 0 = bad, 1 = good).
static unsigned int BAD, GOOD;
// Parameters for scoring:
//   X   - probability assigned to a token that has no good/bad counts at all
//   S   - weight given to X when it is blended with the observed ratio
//   DEV - only tokens whose probability is < DEV or > 1-DEV enter the final score
static double X = 0.4;
static double S = 2;
static double DEV = 0.4;
// The value returned (by address) by function score_getChi2
mpfr_t sum;
// Buffer returned by score_getScore() when a token is not found (both entries 0)
unsigned int valeur[2];
// Number of distinct tokens in the temporary database (used for verbose mode = 2)
unsigned int tokens = 0;
00025 
00026 /*
00027 * Initialize variables for scoring (number of "good" and "bad" pages) and open temporary database for scoring. 
00028 */
00029 void score_initScoring(void)
00030 {
00031   int ret;
00032   u_int32_t flags;
00033   unsigned int* tmp = 0;
00034 
00035   flags = DB_CREATE|DB_TRUNCATE;        
00036     ret = db_create(&dbtemp, NULL, 0);
00037     if (ret != 0){
00038     fprintf(stderr, "Error while creating database\n");
00039     exit(0);
00040     }
00041     
00042     dbtemp->set_cachesize(dbtemp,0,(unsigned int)270384*270384,1);
00043     ret = dbtemp->open(dbtemp,       
00044                        NULL,       
00045                        NULL, 
00046                        0,       
00047                        DB_BTREE,   
00048                        flags,     
00049                        0);
00050     
00051     if(ret != 0){
00052       fprintf(stderr, "Error while opening basetemp");
00053     }
00054 
00055   tmp = store_getScore("~~nombre-pages~~");
00056   BAD = tmp[0];
00057   GOOD = tmp[1];
00058 }
00059 
00060 /*
00061 * Display the score of a page and more (depending on verbose mode).
00062 * @param verbose (1 = "simple verbose mode", 2 = "extra verbose mode")
00063 */
00064 void score_getProbability(int verbose)
00065 {
00066   unsigned int tabstats[10], nbtokens=0;
00067   char*  tabtokens[tokens];
00068   unsigned int tabnbap[tokens];
00069   unsigned int tabgood[tokens];
00070   unsigned int tabbad[tokens];
00071   double tabfw[tokens];
00072   char *tabsign[tokens];
00073   int    tabind[tokens];
00074 
00075   double graham=0, fw=0;
00076   mpfr_t *ret, h, s, res;
00077   unsigned int* tmp;
00078   unsigned int bad, good, frequency;
00079   double sommeLog = 0, sommemLog=0;
00080   unsigned int intervalle = 0;
00081   int retb;
00082   int i=0,j=0;
00083   for (i=0;i<10;i++) tabstats[i]=0;
00084   i=0;
00085 
00086   DBT key, data;
00087   DBC *cursorp;
00088   
00089   dbtemp->cursor(dbtemp, NULL, &cursorp, 0);  
00090   memset(&key, 0, sizeof(DBT));
00091   memset(&data, 0, sizeof(DBT));
00092   
00093   while ((retb = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
00094     tmp = store_getScore(key.data);
00095     bad = tmp[0];
00096     good = tmp[1];
00097     tmp = score_getScore(key.data);
00098     frequency = tmp[0];
00099     cursorp->c_del(cursorp, 0);
00100     if (bad+good != 0){
00101       graham = (1.0*bad/BAD)/((1.0*bad/BAD)+(1.0*good/GOOD));
00102       fw = (S*X + (bad+good)*graham)/(S+bad+good);
00103     }else{
00104       graham = 0.0;
00105       fw = X;
00106     }
00107     if (fw<DEV || fw>1-DEV){
00108       intervalle  = intervalle + frequency;
00109       sommeLog = sommeLog + (log(fw)*frequency);
00110       sommemLog = sommemLog + (log(1-fw)*frequency);
00111       tabsign[i] = "+";
00112     }else tabsign[i] = "-";
00113     if(verbose == 1){
00114       int indice = (int)floor(fw*10);
00115       tabstats[indice] += frequency;
00116       nbtokens += frequency;
00117     }
00118     if(verbose == 2){
00119       tabtokens[i] = NULL;
00120       tabtokens[i] = malloc(sizeof(char)*(strlen(key.data)+1)); 
00121       strcpy(tabtokens[i], key.data);
00122       tabnbap[i] = frequency;
00123       tabgood[i] = good;
00124       tabbad[i] = bad;
00125       tabfw[i] = fw;
00126       tabind[i] = i;
00127       i++;
00128       nbtokens += frequency;
00129     }
00130   }
00131   if(verbose == 2){
00132     triRapide(tabfw,0, tokens-1, tabind);
00133     printf("%-45s %7s %10s %7s %7s %9s \n", "tokens", "nbapp", "total", "good", "bad", "proba");
00134     for(i=0; i<tokens; i++){
00135       printf("%-45s %7u %10u %7u %7u %9f %s\n", tabtokens[tabind[i]], tabnbap[tabind[i]],tabgood[tabind[i]]+tabbad[tabind[i]], tabgood[tabind[i]], tabbad[tabind[i]], tabfw[i], tabsign[tabind[i]]);
00136     }
00137     printf("nbtokens : %d\n", nbtokens);
00138   }
00139 
00140 
00141   mpfr_init(h);
00142   mpfr_init(s);
00143   mpfr_init(res);
00144   
00145   ret = score_getChi2(-2*sommeLog, 2*intervalle);
00146   mpfr_set(h, *ret, GMP_RNDN);
00147   ret = score_getChi2(-2*sommemLog, 2*intervalle);
00148   mpfr_set(s, *ret, GMP_RNDN);
00149 
00150   mpfr_neg(s, s, GMP_RNDN);
00151   mpfr_add(res, h, s, GMP_RNDN);
00152   mpfr_add_ui(res, res, 1, GMP_RNDN);
00153   mpfr_div_ui(res, res, 2, GMP_RNDN);
00154 
00155   if (cursorp != NULL) 
00156     cursorp->c_close(cursorp); 
00157 
00158   if(verbose == 1){
00159     for (i=0; i<10; i++){
00160       printf("%-3d-%3d     %4d : ", i*10, (i+1)*10, tabstats[i]);
00161       for(j=0; j<(int)floor(((double)tabstats[i]/nbtokens)*100); j++){
00162         printf("#");
00163       }
00164       printf("\n");
00165     }
00166     printf("nbtokens : %d\n", nbtokens);
00167   }
00168 
00169   printf("%f\n", mpfr_get_d (res, GMP_RNDN));
00170 }
00171 
00172 /*
00173 * Calulate and returns the inverse Chi square value of the combined probabilities. 
00174 * @param chi the sum of logarithms
00175 * @param df the degrees of liberty
00176 * @return sum the inverse Chi square value
00177 */
00178 mpfr_t* score_getChi2(double chi, unsigned int df)
00179 {
00180   mpfr_t term, temp, tmp ,m, mm;
00181   unsigned int i=1;
00182   
00183   mpfr_init(sum);
00184   mpfr_init(term);
00185   mpfr_init(temp);
00186   mpfr_init(tmp);
00187   mpfr_init(m);
00188   mpfr_init(mm);
00189 
00190   mpfr_set_d(m, chi / 2.0, GMP_RNDN);
00191   mpfr_neg(mm,m, GMP_RNDD);
00192   mpfr_exp(tmp, mm, GMP_RNDN);
00193   mpfr_set(sum, tmp, GMP_RNDN);
00194   mpfr_set(term, tmp, GMP_RNDN);
00195   for(i=1;i<=df/2;i++){
00196     mpfr_div_ui(temp, m, i, GMP_RNDN);
00197     mpfr_mul(term, term, temp, GMP_RNDN);
00198     mpfr_add(sum, sum, term, GMP_RNDN);
00199   }
00200 
00201   if ( mpfr_cmp_ui(sum,1) > 0){
00202     mpfr_set_d(sum, 1.0, GMP_RNDN);
00203     return (&sum);
00204   }else{ 
00205     return (&sum);
00206   }
00207 }
00208 
00209 /*
00210 * Get the number of occurences of a token in the temporary scoring database.
00211 * @param token the token to get the score from
00212 * @return data.data the number of occurences of the token
00213 */
00214 unsigned int* score_getScore(char *token)
00215 {     
00216   int ret;
00217   DBT key, data;
00218   
00219   memset(&key, 0, sizeof(DBT));
00220   memset(&data, 0, sizeof(DBT));    
00221  
00222   key.data = token;
00223   key.size = strlen(token)+1;
00224   
00225   ret = dbtemp->get(dbtemp, NULL, &key, &data, 0);
00226   
00227   if(ret == DB_NOTFOUND){
00228     valeur[0] = 0;
00229     valeur[1] = 0;
00230     return (valeur);
00231   }else
00232     return (to_int((unsigned char*)data.data));
00233 }
00234 
00235 
00236 /*
00237 * Store the given token in the the temporary scoring database
00238 * @param token the token to store
00239 * @param type the type of token (tag, word, biword or domain)
00240 */
00241 void score_storeToken(char *token, int type)
00242 {
00243  int ret; 
00244  DBT key, data;
00245  unsigned int *tmp; 
00246  unsigned char *val=NULL;     
00247  char *debut = NULL;
00248  char *chaine = NULL;
00249 
00250  tokens +=1 ;
00251 
00252  switch(type){
00253  case TAGS :    
00254    debut = malloc(sizeof(char)*(strlen("tag:")+1));
00255    strcpy(debut, "tag:");
00256    break;
00257  case WORDS :
00258    debut = malloc(sizeof(char)*(strlen("word:")+1));
00259    strcpy(debut, "word:");
00260    break;
00261  case BIWORDS :
00262    debut = malloc(sizeof(char)*(strlen("biword:")+1));
00263    strcpy(debut, "biword:");
00264    break;
00265  case DOMAINS :
00266    debut = malloc(sizeof(char)*(strlen("domain:")+1));
00267    strcpy(debut, "domain:");
00268    break;
00269  default :
00270    debut =  malloc(sizeof(char)*(strlen("")+1));
00271    strcpy(debut, "");
00272    break;
00273  }
00274 
00275  memset(&key, 0, sizeof(DBT));
00276  memset(&data, 0, sizeof(DBT));    
00277 
00278  chaine = malloc(sizeof(char)*(strlen(token)+strlen(debut)+1));
00279  strcpy(chaine, debut);
00280  strcat(chaine, token);
00281  
00282  key.data = chaine;
00283  key.size = strlen(chaine)+1;
00284  
00285  val = to_hex(1, 0);
00286 
00287  data.data = val;
00288  data.size = 4; 
00289   
00290  ret = dbtemp->put(dbtemp, NULL, &key, &data, DB_NOOVERWRITE);
00291  if (ret == DB_KEYEXIST) {
00292    dbtemp->get(dbtemp, NULL, &key, &data, 0);
00293    tmp = to_int(data.data);
00294    val = to_hex(tmp[0]+1, 0);
00295    data.data = val;
00296    data.size = 4;
00297    tokens -= 1;
00298    dbtemp->put(dbtemp, NULL, &key, &data, 0);
00299  }
00300  
00301  if (debut != NULL) {free(debut); debut=NULL;}
00302  else printf("var debut NULL in store_storeToken");
00303  if(chaine != NULL) {free(chaine); chaine=NULL;}
00304  else printf("var chaine NULL in store_storeToken\n");
00305 }
00306 
00307 /*
00308 * Close the temporary scoring database.
00309 */
00310 void score_closedb(void)
00311 {
00312   if (dbtemp != NULL)
00313     dbtemp->close(dbtemp, 0); 
00314 }
00315 
00316 /*
00317 * Print a mpfr_t value on the standard output.
00318 * @param string the string to display
00319 * @param var the variable to display
00320 */
00321 void print_mpfr(char *string, mpfr_t var)
00322 {
00323   printf ("%s", string);
00324   mpfr_out_str (stdout, 10, 0, var, GMP_RNDN);
00325   putchar ('\n');
00326 }
00327 
/*
* Function to sort an array by quicksort (used for extra verbose mode).
* The array is sorted in place in increasing order and every exchange is
* mirrored in tind, so tind ends up holding the original position of each
* sorted element.
*
* Only the smaller side of each partition is handled recursively, the larger
* one is handled by the loop: the recursion depth stays logarithmic even when
* the input is already sorted.
*
* @param t the array to sort
* @param debut index of the first element of the array
* @param fin index of the last element of the array
* @param tind array of the indexes of the array
*/
void triRapide(double *t,int debut, int fin, int *tind) {

  while(debut<fin) {

    int placePivot;

    partitionner(t, debut, fin, &placePivot, tind);
    if(placePivot-debut < fin-placePivot) {
      triRapide(t, debut, placePivot-1, tind);
      debut = placePivot+1;
    } else {
      triRapide(t, placePivot+1, fin, tind);
      fin = placePivot-1;
    }
  }
}
00346 
/*
* Partition t[debut..fin] around the value of t[debut].
* Elements strictly smaller than the pivot are moved before it; the pivot's
* final index is written to *pPosition. Every exchange in t is mirrored in
* tind.
* @param t the array being sorted
* @param debut index of the first element of the range (holds the pivot)
* @param fin index of the last element of the range
* @param pPosition receives the final index of the pivot
* @param tind array of indexes permuted together with t
*/
void partitionner(double *t, int debut, int fin, int *pPosition, int *tind) {

  const double pivot = t[debut];
  int frontier = debut;   /* last slot known to hold a value below the pivot */
  int scan = debut+1;

  while(scan<=fin) {
    if(t[scan]<pivot) {
      ++frontier;
      echanger(&t[frontier],&t[scan]);
      echangerInt(&tind[frontier],&tind[scan]);
    }
    ++scan;
  }

  /* Drop the pivot between the two groups. */
  echanger(&t[debut],&t[frontier]);
  echangerInt(&tind[debut],&tind[frontier]);
  *pPosition = frontier;
}
00365 
/*
* Exchange the two double values pointed to by px and py.
* @param px address of the first value
* @param py address of the second value
*/
void echanger(double *px, double *py) {

  const double first = *px;
  const double second = *py;

  *px = second;
  *py = first;
}
00376 
/*
* Exchange the two int values pointed to by px and py.
* @param px address of the first value
* @param py address of the second value
*/
void echangerInt(int *px, int *py) {

  const int first = *px;
  const int second = *py;

  *px = second;
  *py = first;
}
00387 

Generated on Tue May 31 14:22:44 2005 for filterFlex by  doxygen 1.3.9.1