#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// NOTE(review): the original listed five #include directives whose header
// names were lost in extraction. The three above cover every library call
// made in the code below (printf/fopen/fscanf, exit/qsort/random,
// strlen/strcpy). Re-add the other two if anything else depends on them.

// -------------------------------------------------------------------
//  CLASS DEFINITIONS
// -------------------------------------------------------------------

// Returns a freshly allocated copy of s, including its terminating NUL,
// or NULL when s is NULL.  The caller owns the returned buffer.
static char *copy_cstr(const char *s)
{
  if (s == NULL) return NULL;
  char *dup = new char[strlen(s) + 1];
  strcpy(dup, s);
  return dup;
}

// Dense float matrix stored column by column: element (y,x) lives at
// M[y + x*dimy].
//   name  - optional row names
//   namex - optional column names, used when the matrix is rectangular and
//           the column names differ from the row names
class matrix
{
 public:
  float *M;
  int dimx, dimy;
  char **name;
  char **namex;

  void set(int y,int x,float val) { M[y+x*dimy]=val; }
  float get(int y,int x) {return M[y+x*dimy];}

  // Allocates a zero-filled y-by-x matrix.  When either dimension is not
  // positive the matrix stays empty (M==NULL, dimx==dimy==0).
  matrix(int y=0,int x=0)
  {
    dimx=0; dimy=0; M=NULL; name=NULL; namex=NULL;
    if (x>0 && y>0)
    {
      M=new float[(size_t)x*(size_t)y]();
      dimx=x; dimy=y;
    }
  }
};

typedef int* label;    // potentially hierarchical labels

// One labelled item.  `index` is the item's row/column in the score matrix,
// `Y` holds one or two integer labels and `name` is an owned copy of the
// item's name (or NULL).
class query
{
 public:
  int index;
  char *name;
  label Y;             // Y is the label

  // Single-label item.
  query(int indx=0, int lab=0, char *nam=NULL)
  {
    index=indx;
    Y=new int[1]; Y[0]=lab;
    name=copy_cstr(nam);   // copy includes the NUL terminator
  }

  // Two-label item.
  query(int indx, int lab, int lab2, char *nam)
  {
    index=indx;
    Y=new int[2]; Y[0]=lab; Y[1]=lab2;
    name=copy_cstr(nam);
  }
};

// A list of `dim` query pointers; `labsz` is the number of labels carried
// by each entry.
class database
{
 public:
  query **entry;
  int dim;
  int labsz;

  database(int di=0)
  {
    dim=di; entry=NULL; labsz=0;
    if(dim>0) { entry=new query*[dim]; }
  }
};

// Result of ranking a database against one query: entry[i] is the i-th
// ranked item, score[i] its matrix value and correct[i] is 1 when it has
// the same label as the query.
class ranking
{
 public:
  query **entry;
  float *score;
  int *correct;
  int dim;

  ranking(int sz=0)
  {
    entry=NULL; score=NULL; correct=NULL;
    dim=0;                 // defined even when nothing is allocated
    if(sz>0)
    {
      entry=new query*[sz];
      score=new float[sz];
      correct=new int[sz];
      dim=sz;
    }
  }
};

// -------------------------------------------------------------------
//  LOADING FUNCTIONS
// -------------------------------------------------------------------

// Reads a dimy-by-dimx matrix of floats from the text file `fname`.
//   skip_lines - when >0, leading lines starting with '#' and the first
//                line after them are discarded before reading data
//   skip_name  - when 0, every row starts with a name token that is stored
//                in the returned matrix's `name` array
//   verbose    - 1 prints every 100th row name, >1 also prints every value
//   realx      - when >0, the real number of value columns in the file;
//                the surplus columns of each row are read and discarded
// Values exactly equal to 10000 get a random offset in [0,1] added.
// Exits the process when the file cannot be opened.  Reading stops at the
// first row that cannot be read completely; unread cells stay 0 and unread
// names stay NULL.
matrix* load_matrix(char *fname,int skip_lines, int skip_name, int dimx, int dimy, int verbose, int realx=0)
{
  enum { LINE_CAP = 100000 };

  FILE *f=fopen(fname,"r");
  if (f==NULL) {printf("ERROR: File %s not found!\n",fname); exit(EXIT_FAILURE);}
  printf("Loading %s...\n",fname);
  printf("[skip %d lines, skip_name=%d ",skip_lines,skip_name);
  printf(" ; assuming %d rows, %d cols]\n",dimy,dimx);
  if (dimx<0 || dimy<0)
  {
    printf("ERROR: negative matrix dimensions\n");
    exit(EXIT_FAILURE);
  }

  float *K=new float[(size_t)dimx*(size_t)dimy]();
  char **name=new char*[dimy]();
  char *line=new char[LINE_CAP];

  if (skip_lines>0)
  {
    // Drop '#' comment lines plus the first non-comment line; stop at EOF
    // instead of spinning on a stale buffer.
    while (fgets(line,LINE_CAP,f)!=NULL && line[0]=='#') { }
    // NOTE(review): the body of this loop was lost in extraction; skipping
    // the remaining skip_lines-1 lines is the presumed intent - confirm.
    for (int s=1;s<skip_lines;s++)
      if (fgets(line,LINE_CAP,f)==NULL) break;
  }

  bool truncated=false;
  for (int i=0;i<dimy && !truncated;i++)
  {
    if (!skip_name)
    {
      // Field width keeps an over-long token from overrunning `line`.
      if (fscanf(f,"%99999s",line)<1) break;
      name[i]=copy_cstr(line);
      if (verbose && (i%100)==0) printf("%d : %s\n",i,name[i]);
    }
    for (int j=0;j<dimx;j++)
    {
      float val;
      if (fscanf(f,"%f",&val)<1)
      {
        printf("ERROR: matrix data shorter (%d) than expected length (%d)",i,dimy);
        truncated=true;    // every later read would fail the same way
        break;
      }
      if (verbose>1) printf("%f ",val);
      // add randomization to ROC-50 scores if hit threshold of 10,000
      if (val==10000.0f) { val=10000.0f+((float)random())/((float)RAND_MAX); }
      K[(size_t)i+(size_t)j*(size_t)dimy]=val;
    }
    if (truncated) break;
    if (realx>0)   // if meant to skip some columns, keep reading!
    {
      // NOTE(review): loop bound reconstructed as realx-dimx - confirm.
      float ignored;
      for (int j=0;j<realx-dimx;j++)
        if (fscanf(f,"%f",&ignored)<1) break;
    }
    if (verbose>1) printf("\n");
  }
  delete[] line;

  matrix *M=new matrix;
  M->dimx=dimx; M->dimy=dimy;
  M->M=K; M->name=name; M->namex=name;
  printf("[loaded.]\n");
  fclose(f);
  return M;
}

// Reads a label file with one record per line into a new database.
//   indx   - non-zero when each record starts with an integer index
//   name   - non-zero when each record carries a name token
//   lab_sz - number of integer labels per record (at most 100)
// Exits the process when the file cannot be opened.  If a record cannot be
// parsed, loading stops there and the database's dim is the number of
// records actually loaded.
// NOTE(review): the record-reading loop was lost in extraction.  The column
// order (index, name, labels), the use of the record position as index when
// indx==0, and the lab_sz==1 test that picks the one-label constructor are
// reconstructions - confirm against a real label file.
database* load_label_file(char *fname, int indx, int name, int lab_sz)
{
  enum { NAME_CAP = 255, MAX_LABELS = 100 };
  if (lab_sz>MAX_LABELS)
  {
    printf("ERROR: at most %d label columns are supported\n",(int)MAX_LABELS);
    exit(EXIT_FAILURE);
  }

  FILE *f=fopen(fname,"r");
  if (f==NULL) {printf("ERROR: File %s not found!\n",fname); exit(EXIT_FAILURE);}

  // Count records: one per newline, plus a final line that lacks one.
  int sz=0;
  int ch, prev='\n';
  while ((ch=fgetc(f))!=EOF)
  {
    if (ch=='\n') sz++;
    prev=ch;
  }
  if (prev!='\n') sz++;
  fclose(f);

  printf("Loading %d labels in file %s...\n",sz,fname);
  database *D=new database(sz);
  D->labsz=lab_sz;

  f=fopen(fname,"r");
  if (f==NULL) {printf("ERROR: File not found!\n"); exit(EXIT_FAILURE); }

  int loaded=0;
  for (int i=0;i<sz;i++)
  {
    char nam[NAME_CAP]=" ";
    int Y[MAX_LABELS]={0};
    int index=i;
    bool ok=true;

    if (indx && fscanf(f,"%d",&index)<1) ok=false;
    // Width is NAME_CAP-1 so the token always fits in nam.
    if (ok && name && fscanf(f,"%254s",nam)<1) ok=false;
    for (int j=0;ok && j<lab_sz;j++)
      if (fscanf(f,"%d",&Y[j])<1) ok=false;
    if (!ok) break;        // e.g. trailing blank lines

    if (lab_sz==1) D->entry[i]=new query(index,Y[0],nam);
    else D->entry[i]=new query(index,Y[0],Y[1],nam);
    loaded++;
  }
  fclose(f);
  D->dim=loaded;           // never expose unfilled entry slots

  printf("[Done.]\n");
  return D;
}

// -------------------------------------------------------------------
//  EVALUATION FUNCTIONS
// -------------------------------------------------------------------

// qsort comparator: orders records by the float stored at their start,
// smallest first.
// NOTE(review): the "less than" branch and final return were lost in
// extraction and are restored here in the conventional form.
int compar(const void *a, const void *b)
{
  const float *a1=(const float*) a;
  const float *b1=(const float*) b;
  if (a1[0]>b1[0]) return 1;
  if (a1[0]<b1[0]) return -1;
  return 0;
}

// Returns 1 when both queries have the same first label, else 0.
// NOTE(review): the signature was lost in extraction; the name and the
// parameter names are taken from the surviving body and call site.
int same_label(query *Q, query *Q2)
{
  return ((Q->Y[0])==(Q2->Y[0]));
}

// One ranking candidate.  The score must stay the first member so that
// compar() can order these records.
struct scored_entry
{
  float score;
  int pos;                 // position of the candidate in D->entry
};

// Returns the entries of D ranked by their value in row Q->index of M
// (smallest first).  When D->labsz==2 (homology problem), entries that
// share the query's Y[1] (fold) but differ in Y[0] (superfamily) are left
// out of the ranking; Q must then carry two labels as well.
// Exits the process when Q->index is negative (lookup by name is not
// implemented) or when an index falls outside M.
// The caller owns the returned ranking; its entry pointers alias D.
// NOTE(review): the original mapped each sorted slot back to D->entry
// through code lost in extraction; here each candidate remembers its
// database position, so a slot always refers to the entry whose score it
// holds.
ranking* rank_query_via_matrix(query *Q,matrix *M, database *D)
{
  int q=Q->index;          // query index
  if (q<0)                 // have to find via name instead.
  {
    printf("ERROR: find query with name, not implemented yet.");
    exit(EXIT_FAILURE);
  }
  if (q>=M->dimy)
  {
    printf("ERROR: query index %d outside matrix with %d rows\n",q,M->dimy);
    exit(EXIT_FAILURE);
  }

  // Sized by the database, not by the matrix width: this buffer holds one
  // slot per candidate entry.
  scored_entry *cand=new scored_entry[D->dim>0 ? D->dim : 1];
  int length=0;            // number of candidates actually considered
  for (int i=0;i<D->dim;i++)
  {
    query *Q2=D->entry[i];
    if (D->labsz==2                // if homology problem
        && Q->Y[1]==Q2->Y[1]       // same fold
        && Q->Y[0]!=Q2->Y[0])      // but different superfamily: ignore
      continue;

    int qv=Q2->index;
    if (qv<0 || qv>=M->dimx)
    {
      printf("ERROR: entry index %d outside matrix with %d cols\n",qv,M->dimx);
      exit(EXIT_FAILURE);
    }
    cand[length].score=M->M[(size_t)q+(size_t)qv*(size_t)M->dimy];
    cand[length].pos=i;
    length++;
  }

  qsort(cand,(size_t)length,sizeof(scored_entry),&compar);

  ranking *R=new ranking(length);
  R->dim=length;
  for (int i=0;i<length;i++)
  {
    query *Q2=D->entry[cand[i].pos];
    R->entry[i]=Q2;
    R->score[i]=cand[i].score;
    R->correct[i]=same_label(Q,Q2);   // see if two queries have same label
  }
  delete[] cand;
  return R;
}

// Scores a ranking by the area under its ROC curve, counting only the
// first `cutoff` false positives (cutoff=50 gives ROC-50).
// NOTE(review): everything after the local declarations was lost in
// extraction.  This body is the conventional ROC-n computation: walk the
// ranking, add the number of positives seen so far at every false
// positive, and normalise by (positives * false positives counted).  It
// returns 0 when there are no positives and 1 when no false positive is
// reached.  Callers describe the result as an "error"; confirm whether the
// original returned this score or 1 minus it, and how it normalised.
float calc_roc_of_rank(ranking *R,float cutoff=10000000000.0f)
{
  int numtot=R->dim;
  int *score=R->correct;
  float area=0;
  float height=0;
  float fps=0;
  float numpos=0;

  for (int i=0;i<numtot;i++)
    if (score[i]) numpos++;

  for (int i=0;i<numtot && fps<cutoff;i++)
  {
    if (score[i]) height++;
    else { fps++; area+=height; }
  }

  if (numpos==0) return 0;
  if (fps==0) return 1;
  return area/(numpos*fps);
}

// Ranks D against every query in Qs and prints one line per query:
// index, name and the value of calc_roc_of_rank.  Returns a Qs->dim-by-2
// matrix whose column 0 holds that value and column 1 the query's first
// label; the row names alias the query names.
// NOTE(review): the signature was lost in extraction; parameter names come
// from the surviving body, the types from how each parameter is used.
matrix* rank_queries_via_matrix(database *Qs, matrix *M, database *D, float cutoff)
{
  int count=Qs->dim>0 ? Qs->dim : 0;
  matrix *m=new matrix(count,2);
  m->name=new char*[count];
  for (int i=0;i<count;i++)
  {
    query *Q=Qs->entry[i];
    ranking *R=rank_query_via_matrix(Q,M,D);
    m->set(i,0,calc_roc_of_rank(R,cutoff));
    m->set(i,1,Q->Y[0]);
    m->name[i]=Q->name;
    // print index & roc-50 error rate
    printf("%d %s %f\n",Q->index,Q->name,m->get(i,0));

    // The ranking is only needed for the score; release it (its entry
    // pointers belong to D and are left alone).
    delete[] R->entry;
    delete[] R->score;
    delete[] R->correct;
    delete R;
  }
  return m;
}

// -------------------------------------------------------------------
//  MAIN FUNCTION CALL
// -------------------------------------------------------------------
// Program entry point.
//   argv[1] - score matrix file (read as a 7329 x 7329 matrix whose rows
//             start with a name)
//   argv[2] - label file of the database that is ranked
//   argv[3] - label file of the queries to evaluate
// Prints one line per query via rank_queries_via_matrix.  Returns 0 on
// success and 1 when too few arguments are given.
int main(int argc, char *argv[])
{
  enum { MATRIX_DIM = 7329 };   // fixed size of the score matrix

  // Three file arguments are read below, so argv[3] must exist.
  if (argc<4)
  {
    printf("Usage: eval <matrix_file> <label_file> <query_label_file>\n");
    printf(" Calculates ROC-50 error, prints to stdout, format: <index> <name> <roc50>.\n");
    return 1;
  }

  matrix *M=load_matrix(argv[1],0,0,MATRIX_DIM,MATRIX_DIM,0);
  database *D=load_label_file(argv[2],1,1,2);
  database *Dtst=load_label_file(argv[3],1,1,2);
  rank_queries_via_matrix(Dtst,M,D,50);
  return 0;
}