00001 #include <stdio.h>
00002 #include <stdlib.h>
00003 #include <string.h>
00004 #include <math.h>
00005 #include <gmp.h>
00006 #include <mpfr.h>
00007 #include "score.h"
00008 #include <db.h>
00009 #include "store.h"
00010
00011
00012 static DB *dbtemp=NULL;
00013
00014 static unsigned int BAD, GOOD;
00015
00016 static double X = 0.4;
00017 static double S = 2;
00018 static double DEV = 0.4;
00019
00020 mpfr_t sum;
00021
00022 unsigned int valeur[2];
00023
00024 unsigned int tokens = 0;
00025
00026
00027
00028
00029 void score_initScoring(void)
00030 {
00031 int ret;
00032 u_int32_t flags;
00033 unsigned int* tmp = 0;
00034
00035 flags = DB_CREATE|DB_TRUNCATE;
00036 ret = db_create(&dbtemp, NULL, 0);
00037 if (ret != 0){
00038 fprintf(stderr, "Error while creating database\n");
00039 exit(0);
00040 }
00041
00042 dbtemp->set_cachesize(dbtemp,0,(unsigned int)270384*270384,1);
00043 ret = dbtemp->open(dbtemp,
00044 NULL,
00045 NULL,
00046 0,
00047 DB_BTREE,
00048 flags,
00049 0);
00050
00051 if(ret != 0){
00052 fprintf(stderr, "Error while opening basetemp");
00053 }
00054
00055 tmp = store_getScore("~~nombre-pages~~");
00056 BAD = tmp[0];
00057 GOOD = tmp[1];
00058 }
00059
00060
00061
00062
00063
00064 void score_getProbability(int verbose)
00065 {
00066 unsigned int tabstats[10], nbtokens=0;
00067 char* tabtokens[tokens];
00068 unsigned int tabnbap[tokens];
00069 unsigned int tabgood[tokens];
00070 unsigned int tabbad[tokens];
00071 double tabfw[tokens];
00072 char *tabsign[tokens];
00073 int tabind[tokens];
00074
00075 double graham=0, fw=0;
00076 mpfr_t *ret, h, s, res;
00077 unsigned int* tmp;
00078 unsigned int bad, good, frequency;
00079 double sommeLog = 0, sommemLog=0;
00080 unsigned int intervalle = 0;
00081 int retb;
00082 int i=0,j=0;
00083 for (i=0;i<10;i++) tabstats[i]=0;
00084 i=0;
00085
00086 DBT key, data;
00087 DBC *cursorp;
00088
00089 dbtemp->cursor(dbtemp, NULL, &cursorp, 0);
00090 memset(&key, 0, sizeof(DBT));
00091 memset(&data, 0, sizeof(DBT));
00092
00093 while ((retb = cursorp->c_get(cursorp, &key, &data, DB_NEXT)) == 0) {
00094 tmp = store_getScore(key.data);
00095 bad = tmp[0];
00096 good = tmp[1];
00097 tmp = score_getScore(key.data);
00098 frequency = tmp[0];
00099 cursorp->c_del(cursorp, 0);
00100 if (bad+good != 0){
00101 graham = (1.0*bad/BAD)/((1.0*bad/BAD)+(1.0*good/GOOD));
00102 fw = (S*X + (bad+good)*graham)/(S+bad+good);
00103 }else{
00104 graham = 0.0;
00105 fw = X;
00106 }
00107 if (fw<DEV || fw>1-DEV){
00108 intervalle = intervalle + frequency;
00109 sommeLog = sommeLog + (log(fw)*frequency);
00110 sommemLog = sommemLog + (log(1-fw)*frequency);
00111 tabsign[i] = "+";
00112 }else tabsign[i] = "-";
00113 if(verbose == 1){
00114 int indice = (int)floor(fw*10);
00115 tabstats[indice] += frequency;
00116 nbtokens += frequency;
00117 }
00118 if(verbose == 2){
00119 tabtokens[i] = NULL;
00120 tabtokens[i] = malloc(sizeof(char)*(strlen(key.data)+1));
00121 strcpy(tabtokens[i], key.data);
00122 tabnbap[i] = frequency;
00123 tabgood[i] = good;
00124 tabbad[i] = bad;
00125 tabfw[i] = fw;
00126 tabind[i] = i;
00127 i++;
00128 nbtokens += frequency;
00129 }
00130 }
00131 if(verbose == 2){
00132 triRapide(tabfw,0, tokens-1, tabind);
00133 printf("%-45s %7s %10s %7s %7s %9s \n", "tokens", "nbapp", "total", "good", "bad", "proba");
00134 for(i=0; i<tokens; i++){
00135 printf("%-45s %7u %10u %7u %7u %9f %s\n", tabtokens[tabind[i]], tabnbap[tabind[i]],tabgood[tabind[i]]+tabbad[tabind[i]], tabgood[tabind[i]], tabbad[tabind[i]], tabfw[i], tabsign[tabind[i]]);
00136 }
00137 printf("nbtokens : %d\n", nbtokens);
00138 }
00139
00140
00141 mpfr_init(h);
00142 mpfr_init(s);
00143 mpfr_init(res);
00144
00145 ret = score_getChi2(-2*sommeLog, 2*intervalle);
00146 mpfr_set(h, *ret, GMP_RNDN);
00147 ret = score_getChi2(-2*sommemLog, 2*intervalle);
00148 mpfr_set(s, *ret, GMP_RNDN);
00149
00150 mpfr_neg(s, s, GMP_RNDN);
00151 mpfr_add(res, h, s, GMP_RNDN);
00152 mpfr_add_ui(res, res, 1, GMP_RNDN);
00153 mpfr_div_ui(res, res, 2, GMP_RNDN);
00154
00155 if (cursorp != NULL)
00156 cursorp->c_close(cursorp);
00157
00158 if(verbose == 1){
00159 for (i=0; i<10; i++){
00160 printf("%-3d-%3d %4d : ", i*10, (i+1)*10, tabstats[i]);
00161 for(j=0; j<(int)floor(((double)tabstats[i]/nbtokens)*100); j++){
00162 printf("#");
00163 }
00164 printf("\n");
00165 }
00166 printf("nbtokens : %d\n", nbtokens);
00167 }
00168
00169 printf("%f\n", mpfr_get_d (res, GMP_RNDN));
00170 }
00171
00172
00173
00174
00175
00176
00177
00178 mpfr_t* score_getChi2(double chi, unsigned int df)
00179 {
00180 mpfr_t term, temp, tmp ,m, mm;
00181 unsigned int i=1;
00182
00183 mpfr_init(sum);
00184 mpfr_init(term);
00185 mpfr_init(temp);
00186 mpfr_init(tmp);
00187 mpfr_init(m);
00188 mpfr_init(mm);
00189
00190 mpfr_set_d(m, chi / 2.0, GMP_RNDN);
00191 mpfr_neg(mm,m, GMP_RNDD);
00192 mpfr_exp(tmp, mm, GMP_RNDN);
00193 mpfr_set(sum, tmp, GMP_RNDN);
00194 mpfr_set(term, tmp, GMP_RNDN);
00195 for(i=1;i<=df/2;i++){
00196 mpfr_div_ui(temp, m, i, GMP_RNDN);
00197 mpfr_mul(term, term, temp, GMP_RNDN);
00198 mpfr_add(sum, sum, term, GMP_RNDN);
00199 }
00200
00201 if ( mpfr_cmp_ui(sum,1) > 0){
00202 mpfr_set_d(sum, 1.0, GMP_RNDN);
00203 return (&sum);
00204 }else{
00205 return (&sum);
00206 }
00207 }
00208
00209
00210
00211
00212
00213
00214 unsigned int* score_getScore(char *token)
00215 {
00216 int ret;
00217 DBT key, data;
00218
00219 memset(&key, 0, sizeof(DBT));
00220 memset(&data, 0, sizeof(DBT));
00221
00222 key.data = token;
00223 key.size = strlen(token)+1;
00224
00225 ret = dbtemp->get(dbtemp, NULL, &key, &data, 0);
00226
00227 if(ret == DB_NOTFOUND){
00228 valeur[0] = 0;
00229 valeur[1] = 0;
00230 return (valeur);
00231 }else
00232 return (to_int((unsigned char*)data.data));
00233 }
00234
00235
00236
00237
00238
00239
00240
00241 void score_storeToken(char *token, int type)
00242 {
00243 int ret;
00244 DBT key, data;
00245 unsigned int *tmp;
00246 unsigned char *val=NULL;
00247 char *debut = NULL;
00248 char *chaine = NULL;
00249
00250 tokens +=1 ;
00251
00252 switch(type){
00253 case TAGS :
00254 debut = malloc(sizeof(char)*(strlen("tag:")+1));
00255 strcpy(debut, "tag:");
00256 break;
00257 case WORDS :
00258 debut = malloc(sizeof(char)*(strlen("word:")+1));
00259 strcpy(debut, "word:");
00260 break;
00261 case BIWORDS :
00262 debut = malloc(sizeof(char)*(strlen("biword:")+1));
00263 strcpy(debut, "biword:");
00264 break;
00265 case DOMAINS :
00266 debut = malloc(sizeof(char)*(strlen("domain:")+1));
00267 strcpy(debut, "domain:");
00268 break;
00269 default :
00270 debut = malloc(sizeof(char)*(strlen("")+1));
00271 strcpy(debut, "");
00272 break;
00273 }
00274
00275 memset(&key, 0, sizeof(DBT));
00276 memset(&data, 0, sizeof(DBT));
00277
00278 chaine = malloc(sizeof(char)*(strlen(token)+strlen(debut)+1));
00279 strcpy(chaine, debut);
00280 strcat(chaine, token);
00281
00282 key.data = chaine;
00283 key.size = strlen(chaine)+1;
00284
00285 val = to_hex(1, 0);
00286
00287 data.data = val;
00288 data.size = 4;
00289
00290 ret = dbtemp->put(dbtemp, NULL, &key, &data, DB_NOOVERWRITE);
00291 if (ret == DB_KEYEXIST) {
00292 dbtemp->get(dbtemp, NULL, &key, &data, 0);
00293 tmp = to_int(data.data);
00294 val = to_hex(tmp[0]+1, 0);
00295 data.data = val;
00296 data.size = 4;
00297 tokens -= 1;
00298 dbtemp->put(dbtemp, NULL, &key, &data, 0);
00299 }
00300
00301 if (debut != NULL) {free(debut); debut=NULL;}
00302 else printf("var debut NULL in store_storeToken");
00303 if(chaine != NULL) {free(chaine); chaine=NULL;}
00304 else printf("var chaine NULL in store_storeToken\n");
00305 }
00306
00307
00308
00309
00310 void score_closedb(void)
00311 {
00312 if (dbtemp != NULL)
00313 dbtemp->close(dbtemp, 0);
00314 }
00315
00316
00317
00318
00319
00320
00321 void print_mpfr(char *string, mpfr_t var)
00322 {
00323 printf ("%s", string);
00324 mpfr_out_str (stdout, 10, 0, var, GMP_RNDN);
00325 putchar ('\n');
00326 }
00327
00328
00329
00330
00331
00332
00333
00334
/*
 * Quicksort t[debut..fin] (both bounds inclusive) into increasing order.
 *
 * Every swap performed on t is mirrored on tind, so when tind initially
 * holds 0, 1, 2, ... it ends up mapping each sorted position to its
 * original index.
 *
 * Nothing happens when debut >= fin.
 */
void triRapide(double *t,int debut, int fin, int *tind) {
    int gauche = debut;

    /* The left part is sorted recursively; the right part is handled by
     * the next loop iteration. */
    while (gauche < fin) {
        int pivot;

        partitionner(t, gauche, fin, &pivot, tind);
        triRapide(t, gauche, pivot - 1, tind);
        gauche = pivot + 1;
    }
}
00346
/*
 * Partition t[debut..fin] (inclusive) around the pivot value t[debut].
 *
 * On return the pivot sits at index *pPosition, every element before it is
 * strictly smaller than the pivot and every element after it is greater or
 * equal. Each swap in t is replayed on tind.
 */
void partitionner(double *t, int debut, int fin, int *pPosition, int *tind) {
    const double pivot = t[debut];
    int frontiere = debut;   /* last index known to hold a value < pivot */
    int k = debut + 1;

    while (k <= fin) {
        if (t[k] < pivot) {
            ++frontiere;
            echanger(&t[frontiere], &t[k]);
            echangerInt(&tind[frontiere], &tind[k]);
        }
        ++k;
    }

    /* Move the pivot between the two parts. */
    echanger(&t[debut], &t[frontiere]);
    echangerInt(&tind[debut], &tind[frontiere]);

    *pPosition = frontiere;
}
00365
/*
 * Exchange the two doubles pointed to by px and py.
 * Passing the same address twice leaves the value unchanged.
 */
void echanger(double *px, double *py) {
    const double ancien_y = *py;

    *py = *px;
    *px = ancien_y;
}
00376
/*
 * Exchange the two ints pointed to by px and py.
 * Passing the same address twice leaves the value unchanged.
 */
void echangerInt(int *px, int *py) {
    const int ancien_y = *py;

    *py = *px;
    *px = ancien_y;
}
00387