COOPY » Guide
version 0.6.5
|
00001 #include <coopy/MeasureMan.h> 00002 #include <coopy/Viterbi.h> 00003 00004 using namespace coopy::store; 00005 using namespace coopy::cmp; 00006 00007 using namespace std; 00008 00009 #include <math.h> 00010 00011 #define dbg2_printf if(coopy_is_verbose()) printf 00012 00013 static float costify(float x) { 00014 //if (x<0.01) x = 0.01; 00015 //return 1/x; 00016 return -x; 00017 } 00018 00019 void MeasureMan::setup() { 00020 main.setup(main_pass); 00021 anorm.setup(anorm_pass); 00022 bnorm.setup(bnorm_pass); 00023 } 00024 00025 void MeasureMan::compare() { 00026 compareCore(); 00027 if (coopy_is_verbose()) { 00028 if (main_pass.bsel.height()!=main_pass.asel.height()) { 00029 for (int j=0; j<main_pass.bsel.height(); j++) { 00030 int i = main_pass.bsel.cell(0,j); 00031 if (i>=0) { 00032 dbg_printf("%d -> %d ", i, j); 00033 for (int x=0; x<4&&x<main_pass.a.width(); x++) { 00034 if (i<main_pass.a.height()) { 00035 dbg_printf("%s ", main_pass.a.cellString(x,i).c_str()); 00036 } 00037 } 00038 dbg_printf("// "); 00039 for (int x=0; x<4&&x<main_pass.b.width(); x++) { 00040 if (j<main_pass.b.height()) { 00041 dbg_printf("%s ", main_pass.b.cellString(x,j).c_str()); 00042 } 00043 } 00044 dbg_printf("\n"); 00045 } 00046 } 00047 } 00048 } 00049 } 00050 00051 void MeasureMan::compareCore() { 00052 int rem = -1; 00053 int ctrl = 0; 00054 int ctrlMax = main.getCtrlMax(); 00055 bool allOrNothing = (flags.trust_ids && rowLike); 00056 00057 int remaining = 0; 00058 for (int j=0; j<main_pass.bsel.height(); j++) { 00059 if (main_pass.bsel.cell(0,j)==-1) { 00060 remaining++; 00061 } 00062 } 00063 if (remaining==0) { 00064 dbg_printf("MeasureMan::compare finished\n"); 00065 return; 00066 } 00067 00068 for (int i=0; i<20; i++) { 00069 dbg_printf("\n====================================================\n"); 00070 dbg_printf("== MeasureMan::compare pass %d\n", i); 00071 compare1(ctrl); 00072 int processed = 0; 00073 remaining = 0; 00074 for (int j=0; j<main_pass.bsel.height(); j++) { 00075 if (main_pass.bsel.cell(0,j)==-1) { 00076 remaining++; 00077 } else { 00078 processed++; 00079 } 00080 } 00081 if (remaining == 0) { 00082 dbg_printf("Everything allocated\n"); 00083 break; 00084 } 00085 dbg_printf("Not everything allocated, %d processed, %d remain (a-total %d b-total %d)\n", 00086 processed, 00087 remaining, 00088 main_pass.asel.height(), 00089 main_pass.bsel.height()); 00090 if (remaining<=(main_pass.bsel.height()-main_pass.asel.height())) { 00091 dbg_printf("No more could be allocated\n"); 00092 break; 00093 } 00094 if (processed>remaining&&sampled) { 00095 dbg_printf("Good enough, sampling only, quitting\n"); 00096 break; 00097 } 00098 00099 if (rem==remaining) { 00100 dbg_printf("No progress\n"); 00101 if (ctrl<ctrlMax) { 00102 dbg_printf("Increasing desperation\n"); 00103 if (allOrNothing) { 00104 dbg_printf(" ... but we've been told to trust ids\n"); 00105 break; 00106 } 00107 ctrl++; 00108 if (ctrl>ctrlMax) ctrl = ctrlMax; 00109 } else { 00110 break; 00111 } 00112 } 00113 rem = remaining; 00114 } 00115 } 00116 00117 00118 void MeasureMan::compare1(int ctrl) { 00119 Stat astat, bstat; 00120 dbg_printf("== Control level: %d, checking means...\n",ctrl); 00121 main.measure(main_pass,ctrl); 00122 00123 anorm_pass.asel = main_pass.asel; 00124 anorm_pass.bsel = main_pass.asel; 00125 anorm.measure(anorm_pass,ctrl); 00126 //dbg_printf("Checking [local] statistics\n"); 00127 astat = anorm_pass.flatten(); 00128 bnorm_pass.asel = main_pass.bsel; 00129 bnorm_pass.bsel = main_pass.bsel; 00130 bnorm.measure(bnorm_pass,ctrl); 00131 //dbg_printf("Checking [remote] statistics\n"); 00132 bstat = bnorm_pass.flatten(); 00133 00134 if (bstat.valid && astat.valid) { 00135 // make bnorm look like anorm, statistically 00136 if (bstat.stddev>0.001) { 00137 bnorm_pass.match.offset(-bstat.mean); 00138 bnorm_pass.match.rescale(astat.stddev/bstat.stddev); 00139 bnorm_pass.match.offset(+astat.mean); 00140 } 00141 } 00142 00143 00144 SparseFloatSheet match = main_pass.match; 00145 bool flip = match.width()<match.height(); 00146 00147 IntSheet asel; 00148 IntSheet bsel; 00149 DataSheet& a = flip?main_pass.b:main_pass.a; 00150 DataSheet& b = flip?main_pass.a:main_pass.b; 00151 int match_width = flip?match.height():match.width(); 00152 int match_height = flip?match.width():match.height(); 00153 00154 bool repeatNeeded = false; 00155 map<int,int> stateNode, stateNodePrev; 00156 int stateNodeCount = 0; 00157 int stateNodePower = 1; 00158 00159 do { 00160 asel = flip?main_pass.bsel:main_pass.asel; 00161 bsel = flip?main_pass.asel:main_pass.bsel; 00162 00163 repeatNeeded = false; 00164 00165 Viterbi v; 00166 /* 00167 There are match.width() = W nodes 00168 00169 stride = W+1 00170 00171 state interpretation: 00172 0: an unmatching state 00173 x from 1 to W: "matches against remote column x-1, no preexisting-nodes" 00174 (or lingering without match) 00175 */ 00176 int column_stride = match_width+1; 00177 int history_stride = stateNodePower; 00178 vector<int> history_offset; 00179 for (int i=0; i<match_width; i++) { 00180 if (stateNode.find(i)!=stateNode.end()) { 00181 history_offset.push_back(stateNode[i]); 00182 } else { 00183 history_offset.push_back(0); 00184 } 00185 } 00186 v.setSize(column_stride*history_stride,match_height); 00187 for (int i=0; i<match_height; i++) { 00188 const set<int>& idx0 = flip?match.getCellsInCol(i-1):match.getCellsInRow(i-1); 00189 const set<int>& idx1 = flip?match.getCellsInCol(i):match.getCellsInRow(i); 00190 const set<int> *pidx0 = &idx0; 00191 v.beginTransitions(); 00192 for (int k=0; k<history_stride; k++) { 00193 int ksrc = k*column_stride; 00194 for (set<int>::const_iterator it1=idx1.begin(); it1!=idx1.end(); 00195 it1++) { 00196 int i1 = (*it1); 00197 int offset = history_offset[i1]; 00198 if (offset&k) continue; // cannot transition here - was there already 00199 int kdest = (k+offset)*column_stride; 00200 00201 float c = costify(flip?match.cell(i,i1):match.cell(i1,i)); 00202 double mod = 0.1*log(1+fabs((double)(i-i1)))/log(2); 00203 for (set<int>::const_iterator it0=pidx0->begin(); it0!=pidx0->end(); 00204 it0++) { 00205 int i0 = (*it0); 00206 if (i0!=i1) { 00207 int offset0 = 0; 00208 //int offset0 = history_offset[i0]; 00209 //if (offset0&k) continue; 00210 v.addTransition(ksrc+i0+1, 00211 kdest+offset0*column_stride+i1+1,c+mod); 00212 } 00213 //printf("%d: %d to %d (diff %g) costs %g\n", i, i0+1, i1+1, mod, c+mod); 00214 } 00215 v.addTransition(ksrc+0,kdest+i1+1,c); 00216 } 00217 for (set<int>::const_iterator it0=pidx0->begin(); it0!=pidx0->end(); 00218 it0++) { 00219 int i0 = (*it0); 00220 int offset0 = 0; 00221 //int offset0 = history_offset[i0]; 00222 //if (offset0&k) continue; 00223 v.addTransition(ksrc+i0+1,ksrc+offset0*column_stride+0,0); 00224 } 00225 v.addTransition(ksrc+0,ksrc+0,0); 00226 } 00227 v.endTransitions(); 00228 } 00229 00230 if (_csv_verbose) { 00231 dbg_printf("Viterbi calculation: "); 00232 for (int i=0; i<v.length(); i++) { 00233 int idx = v(i)%column_stride; 00234 int k = v(i)/column_stride; 00235 if (k>0) { 00236 dbg_printf("%d:",k); 00237 } 00238 if (idx>0) { 00239 dbg_printf("%d ",idx-1); 00240 } else { 00241 dbg_printf(". "); 00242 } 00243 } 00244 dbg_printf(" (cost %g)\n", v.getCost()); 00245 //v.showPath(); 00246 } 00247 00248 stateNodePrev = stateNode; 00249 for (int y=0; y<match_height; y++) { 00250 if (bsel.cell(0,y)==-1) { 00251 int bestIndex = v(y)%column_stride-1; 00252 double bestValue = 0; 00253 if (bestIndex>=0) { 00254 bestValue = flip?match.cell(y,bestIndex):match.cell(bestIndex,y); 00255 } 00256 double ref = bnorm_pass.match.cell(0,flip?bestIndex:y); 00257 double ref2 = anorm_pass.match.cell(0,flip?y:bestIndex); 00258 if (ref2<ref) ref = ref2; 00259 bool ok = false; 00260 if (bestValue>ref/4 && bestIndex>=0 && ref>0.01) { 00261 ok = true; 00262 } 00263 if (!ok) { 00264 if (bestIndex>=0 && y>=0) { 00265 // let's examine the best match in detail 00266 dbg2_printf("Detail check... %d->%d\n",y,bestIndex); 00267 if (cellLength(a)==cellLength(b)) { 00268 int misses = 0; 00269 for (int kk=0; kk<cellLength(a); kk++) { 00270 if (cell(a,kk,bestIndex)!=cell(b,kk,y)) { 00271 misses++; 00272 } 00273 } 00274 dbg2_printf("Detailed examination gives %d misses for best match\n", 00275 misses); 00276 if (misses==0) { 00277 // perfect match, let it through 00278 ok = true; 00279 } 00280 } 00281 } 00282 } 00283 if (ok) { 00284 00285 dbg2_printf("%d->%d, remote unit %d maps to local unit %d (%d %g : %g)\n", 00286 y,bestIndex,y,bestIndex, 00287 bestIndex, bestValue, ref); 00288 dbg2_printf(" [remote/local] %s %s\n", cell(b,0,y).c_str(), cell(a,0,bestIndex).c_str()); 00289 00290 if (asel.cell(0,bestIndex)!=-1 && asel.cell(0,bestIndex)!=y) { 00291 dbg2_printf("COLLISION! Unavailable match: %d (%d,%d)\n", 00292 bestIndex, y, asel.cell(0,bestIndex)); 00293 // want to say asel[bestIndex] = y 00294 // but currently asel[bestIndex] = y' 00295 00296 if (stateNode.find(bestIndex)==stateNode.end()) { 00297 stateNode[bestIndex] = stateNodePower; 00298 stateNodeCount++; 00299 stateNodePower *= 2; 00300 00301 if (stateNodeCount<4) { 00302 repeatNeeded = true; 00303 } else { 00304 //fprintf(stderr,"Warning: there's a lot of ambiguity\n"); 00305 repeatNeeded = false; 00306 } 00307 } else { 00308 if (stateNodePrev.find(bestIndex)!=stateNodePrev.end()) { 00309 dbg_printf("Catastrophe in MeasureMan\n"); 00310 //exit(1); 00311 } 00312 } 00313 00314 } else { 00315 bsel.cell(0,y) = bestIndex; 00316 asel.cell(0,bestIndex) = y; 00317 } 00318 } 00319 if (!ok) { 00320 dbg2_printf("%d->?, do not know what to make of remote unit %d (%d %g %g : %g)\n", 00321 y, y, bestIndex, bestValue, -1.0, ref); 00322 dbg2_printf(" [remote] %s\n", cell(b,0,y).c_str()); 00323 if (bestIndex>=0) { 00324 dbg2_printf(" [local] [MISS] %s\n", cell(a,0,bestIndex).c_str()); 00325 } 00326 } 00327 } 00328 } 00329 if (repeatNeeded) { 00330 dbg_printf("====================================\n"); 00331 dbg_printf("====================================\n"); 00332 dbg_printf("====================================\n"); 00333 dbg_printf("====================================\n"); 00334 dbg_printf("Repeating with more state\n"); 00335 } 00336 } while (repeatNeeded); 00337 main_pass.asel = flip?bsel:asel; 00338 main_pass.bsel = flip?asel:bsel; 00339 }