COOPY » Guide  version 0.6.5
/home/paulfitz/cvs/coopy_scm/coopy/src/libcoopy_core/MeasureMan.cpp
Go to the documentation of this file.
00001 #include <coopy/MeasureMan.h>
00002 #include <coopy/Viterbi.h>
00003 
00004 using namespace coopy::store;
00005 using namespace coopy::cmp;
00006 
00007 using namespace std;
00008 
00009 #include <math.h>
00010 
00011 #define dbg2_printf if(coopy_is_verbose()) printf
00012 
00013 static float costify(float x) {
00014   //if (x<0.01) x = 0.01;
00015   //return 1/x;
00016   return -x;
00017 }
00018 
00019 void MeasureMan::setup() {
00020   main.setup(main_pass);
00021   anorm.setup(anorm_pass);
00022   bnorm.setup(bnorm_pass);
00023 }
00024 
00025 void MeasureMan::compare() {
00026   compareCore();
00027   if (coopy_is_verbose()) {
00028     if (main_pass.bsel.height()!=main_pass.asel.height()) {
00029       for (int j=0; j<main_pass.bsel.height(); j++) {
00030         int i = main_pass.bsel.cell(0,j);
00031         if (i>=0) {
00032           dbg_printf("%d -> %d ", i, j);
00033           for (int x=0; x<4&&x<main_pass.a.width(); x++) {
00034             if (i<main_pass.a.height()) {
00035               dbg_printf("%s ", main_pass.a.cellString(x,i).c_str());
00036             }
00037           }
00038           dbg_printf("// ");
00039           for (int x=0; x<4&&x<main_pass.b.width(); x++) {
00040             if (j<main_pass.b.height()) {
00041               dbg_printf("%s ", main_pass.b.cellString(x,j).c_str());
00042             }
00043           }
00044           dbg_printf("\n");
00045         }
00046       }
00047     }
00048   }
00049 }
00050 
00051 void MeasureMan::compareCore() {
00052   int rem = -1;
00053   int ctrl = 0;
00054   int ctrlMax = main.getCtrlMax();
00055   bool allOrNothing = (flags.trust_ids && rowLike);
00056 
00057   int remaining = 0;
00058   for (int j=0; j<main_pass.bsel.height(); j++) {
00059     if (main_pass.bsel.cell(0,j)==-1) {
00060       remaining++;
00061     }
00062   }
00063   if (remaining==0) {
00064     dbg_printf("MeasureMan::compare finished\n");
00065     return;
00066   }
00067 
00068   for (int i=0; i<20; i++) {
00069     dbg_printf("\n====================================================\n");
00070     dbg_printf("== MeasureMan::compare pass %d\n", i);
00071     compare1(ctrl);
00072     int processed = 0;
00073     remaining = 0;
00074     for (int j=0; j<main_pass.bsel.height(); j++) {
00075       if (main_pass.bsel.cell(0,j)==-1) {
00076         remaining++;
00077       } else {
00078         processed++;
00079       }
00080     }
00081     if (remaining == 0) {
00082       dbg_printf("Everything allocated\n");
00083       break;
00084     }
00085     dbg_printf("Not everything allocated, %d processed, %d remain (a-total %d b-total %d)\n",
00086                processed,
00087                remaining,
00088                main_pass.asel.height(),
00089                main_pass.bsel.height());
00090     if (remaining<=(main_pass.bsel.height()-main_pass.asel.height())) {
00091       dbg_printf("No more could be allocated\n");
00092       break;
00093     }
00094     if (processed>remaining&&sampled) {
00095       dbg_printf("Good enough, sampling only, quitting\n");
00096       break;
00097     }
00098 
00099     if (rem==remaining) {
00100       dbg_printf("No progress\n");
00101       if (ctrl<ctrlMax) {
00102         dbg_printf("Increasing desperation\n");
00103         if (allOrNothing) {
00104           dbg_printf(" ... but we've been told to trust ids\n");
00105           break;
00106         }
00107         ctrl++;
00108         if (ctrl>ctrlMax) ctrl = ctrlMax;
00109       } else {
00110         break;
00111       }
00112     }
00113     rem = remaining;
00114   }
00115 }
00116 
00117 
00118 void MeasureMan::compare1(int ctrl) {
00119   Stat astat, bstat;
00120   dbg_printf("== Control level: %d, checking means...\n",ctrl);  
00121   main.measure(main_pass,ctrl);
00122 
00123   anorm_pass.asel = main_pass.asel;
00124   anorm_pass.bsel = main_pass.asel;
00125   anorm.measure(anorm_pass,ctrl);
00126   //dbg_printf("Checking [local] statistics\n");
00127   astat = anorm_pass.flatten();
00128   bnorm_pass.asel = main_pass.bsel;
00129   bnorm_pass.bsel = main_pass.bsel;
00130   bnorm.measure(bnorm_pass,ctrl);
00131   //dbg_printf("Checking [remote] statistics\n");
00132   bstat = bnorm_pass.flatten();
00133 
00134   if (bstat.valid && astat.valid) {
00135     // make bnorm look like anorm, statistically
00136     if (bstat.stddev>0.001) {
00137       bnorm_pass.match.offset(-bstat.mean);
00138       bnorm_pass.match.rescale(astat.stddev/bstat.stddev);
00139       bnorm_pass.match.offset(+astat.mean);
00140     }
00141   }
00142     
00143 
00144   SparseFloatSheet match = main_pass.match;
00145   bool flip = match.width()<match.height();
00146 
00147   IntSheet asel;
00148   IntSheet bsel;
00149   DataSheet& a = flip?main_pass.b:main_pass.a;
00150   DataSheet& b = flip?main_pass.a:main_pass.b;
00151   int match_width = flip?match.height():match.width();
00152   int match_height = flip?match.width():match.height();
00153 
00154   bool repeatNeeded = false;
00155   map<int,int> stateNode, stateNodePrev;
00156   int stateNodeCount = 0;
00157   int stateNodePower = 1;
00158 
00159   do {
00160     asel = flip?main_pass.bsel:main_pass.asel;
00161     bsel = flip?main_pass.asel:main_pass.bsel;
00162 
00163     repeatNeeded = false;
00164 
00165     Viterbi v;
00166     /*
00167       There are match.width() = W nodes
00168 
00169       stride = W+1
00170 
00171       state interpretation:
00172       0: an unmatching state
00173       x from 1 to W: "matches against remote column x-1, no preexisting-nodes" 
00174                   (or lingering without match)
00175     */
00176     int column_stride = match_width+1;
00177     int history_stride = stateNodePower;
00178     vector<int> history_offset;
00179     for (int i=0; i<match_width; i++) {
00180       if (stateNode.find(i)!=stateNode.end()) {
00181         history_offset.push_back(stateNode[i]);
00182       } else {
00183         history_offset.push_back(0);
00184       }
00185     }
00186     v.setSize(column_stride*history_stride,match_height);
00187     for (int i=0; i<match_height; i++) {
00188       const set<int>& idx0 = flip?match.getCellsInCol(i-1):match.getCellsInRow(i-1);
00189       const set<int>& idx1 = flip?match.getCellsInCol(i):match.getCellsInRow(i);
00190       const set<int> *pidx0 = &idx0;
00191       v.beginTransitions();
00192       for (int k=0; k<history_stride; k++) {
00193         int ksrc = k*column_stride;
00194         for (set<int>::const_iterator it1=idx1.begin(); it1!=idx1.end(); 
00195              it1++) {
00196           int i1 = (*it1);
00197           int offset = history_offset[i1];
00198           if (offset&k) continue; // cannot transition here - was there already
00199           int kdest = (k+offset)*column_stride;
00200           
00201           float c = costify(flip?match.cell(i,i1):match.cell(i1,i));
00202           double mod = 0.1*log(1+fabs((double)(i-i1)))/log(2);
00203           for (set<int>::const_iterator it0=pidx0->begin(); it0!=pidx0->end(); 
00204                it0++) {
00205             int i0 = (*it0);
00206             if (i0!=i1) {
00207               int offset0 = 0;
00208               //int offset0 = history_offset[i0];
00209               //if (offset0&k) continue;
00210               v.addTransition(ksrc+i0+1,
00211                               kdest+offset0*column_stride+i1+1,c+mod);
00212             }
00213             //printf("%d: %d to %d (diff %g) costs %g\n", i, i0+1, i1+1, mod, c+mod);
00214           }
00215           v.addTransition(ksrc+0,kdest+i1+1,c);
00216         }
00217         for (set<int>::const_iterator it0=pidx0->begin(); it0!=pidx0->end(); 
00218              it0++) {
00219           int i0 = (*it0);
00220           int offset0 = 0;
00221           //int offset0 = history_offset[i0];
00222           //if (offset0&k) continue;
00223           v.addTransition(ksrc+i0+1,ksrc+offset0*column_stride+0,0);
00224         }
00225         v.addTransition(ksrc+0,ksrc+0,0);
00226       }
00227       v.endTransitions();
00228     }
00229 
00230     if (_csv_verbose) {
00231       dbg_printf("Viterbi calculation: ");
00232       for (int i=0; i<v.length(); i++) {
00233         int idx = v(i)%column_stride;
00234         int k = v(i)/column_stride;
00235         if (k>0) {
00236           dbg_printf("%d:",k);
00237         }
00238         if (idx>0) {
00239           dbg_printf("%d ",idx-1);
00240         } else {
00241           dbg_printf(". ");
00242         }
00243       }
00244       dbg_printf(" (cost %g)\n", v.getCost());
00245       //v.showPath();
00246     }
00247 
00248     stateNodePrev = stateNode;
00249     for (int y=0; y<match_height; y++) {
00250       if (bsel.cell(0,y)==-1) {
00251         int bestIndex = v(y)%column_stride-1;
00252         double bestValue = 0;
00253         if (bestIndex>=0) {
00254           bestValue = flip?match.cell(y,bestIndex):match.cell(bestIndex,y);
00255         }
00256         double ref = bnorm_pass.match.cell(0,flip?bestIndex:y);
00257         double ref2 = anorm_pass.match.cell(0,flip?y:bestIndex);
00258         if (ref2<ref) ref = ref2;
00259         bool ok = false;
00260         if (bestValue>ref/4 && bestIndex>=0 && ref>0.01) {
00261           ok = true;
00262         }
00263         if (!ok) {
00264           if (bestIndex>=0 && y>=0) {
00265             // let's examine the best match in detail
00266             dbg2_printf("Detail check... %d->%d\n",y,bestIndex);
00267             if (cellLength(a)==cellLength(b)) {
00268               int misses = 0;
00269               for (int kk=0; kk<cellLength(a); kk++) {
00270                 if (cell(a,kk,bestIndex)!=cell(b,kk,y)) {
00271                   misses++;
00272                 }
00273               }
00274               dbg2_printf("Detailed examination gives %d misses for best match\n", 
00275                           misses);
00276               if (misses==0) {
00277                 // perfect match, let it through
00278                 ok = true;
00279               }
00280             }
00281           }
00282         }
00283         if (ok) {
00284        
00285           dbg2_printf("%d->%d, remote unit %d maps to local unit %d (%d %g : %g)\n",
00286                       y,bestIndex,y,bestIndex,
00287                       bestIndex, bestValue, ref);
00288           dbg2_printf("  [remote/local] %s %s\n", cell(b,0,y).c_str(), cell(a,0,bestIndex).c_str());
00289         
00290           if (asel.cell(0,bestIndex)!=-1 && asel.cell(0,bestIndex)!=y) {
00291             dbg2_printf("COLLISION! Unavailable match: %d (%d,%d)\n",
00292                         bestIndex, y, asel.cell(0,bestIndex));
00293             // want to say    asel[bestIndex] = y
00294             // but currently  asel[bestIndex] = y'
00295 
00296             if (stateNode.find(bestIndex)==stateNode.end()) {
00297               stateNode[bestIndex] = stateNodePower;
00298               stateNodeCount++;
00299               stateNodePower *= 2;
00300 
00301               if (stateNodeCount<4) {
00302                 repeatNeeded = true;
00303               } else {
00304                 //fprintf(stderr,"Warning: there's a lot of ambiguity\n");
00305                 repeatNeeded = false;
00306               }
00307             } else {
00308               if (stateNodePrev.find(bestIndex)!=stateNodePrev.end()) {
00309                 dbg_printf("Catastrophe in MeasureMan\n");
00310                 //exit(1);
00311               }
00312             }
00313 
00314           } else {
00315             bsel.cell(0,y) = bestIndex;
00316             asel.cell(0,bestIndex) = y;
00317           }
00318         }
00319         if (!ok) {
00320           dbg2_printf("%d->?, do not know what to make of remote unit %d (%d %g %g : %g)\n",
00321                       y, y, bestIndex, bestValue, -1.0, ref);
00322           dbg2_printf("  [remote] %s\n", cell(b,0,y).c_str());
00323           if (bestIndex>=0) {
00324             dbg2_printf("  [local] [MISS] %s\n", cell(a,0,bestIndex).c_str());
00325           }
00326         }
00327       }
00328     }
00329     if (repeatNeeded) {
00330       dbg_printf("====================================\n");
00331       dbg_printf("====================================\n");
00332       dbg_printf("====================================\n");
00333       dbg_printf("====================================\n");
00334       dbg_printf("Repeating with more state\n");
00335     }
00336   } while (repeatNeeded);
00337   main_pass.asel = flip?bsel:asel;
00338   main_pass.bsel = flip?asel:bsel;
00339 }
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Defines