COOPY » Guide
version 0.6.5
|
00001 #include <stdio.h> 00002 #include <stdlib.h> 00003 #include <ctype.h> 00004 00005 #include <coopy/SheetCompare.h> 00006 #include <coopy/CsvSheet.h> 00007 #include <coopy/OrderResult.h> 00008 #include <coopy/MeasurePass.h> 00009 #include <coopy/RowMan.h> 00010 #include <coopy/ColMan.h> 00011 #include <coopy/MeasureMan.h> 00012 #include <coopy/Merger.h> 00013 #include <coopy/SchemaSniffer.h> 00014 #include <coopy/Compare.h> 00015 00016 #include <string> 00017 #include <map> 00018 00019 using namespace coopy::store; 00020 using namespace coopy::cmp; 00021 00022 using namespace std; 00023 00024 namespace coopy { 00025 namespace cmp { 00026 class FastMatch; 00027 class RedundancyMatch; 00028 } 00029 } 00030 00031 class coopy::cmp::FastMatch { 00032 public: 00033 MeasurePass& pass; 00034 NameSniffer *local_names; 00035 NameSniffer *remote_names; 00036 std::string local_hash; 00037 std::string remote_hash; 00038 00039 FastMatch(MeasurePass& pass) : pass(pass) { 00040 //local_names = remote_names = NULL; 00041 local_names = &pass.va.meta; 00042 remote_names = &pass.vb.meta; 00043 local_hash = pass.va.sha1; 00044 remote_hash = pass.vb.sha1; 00045 } 00046 00047 void match(bool rowLike, const CompareFlags& flags) { 00048 00049 // We check if the two things being compared are identical. 00050 // If so, that's easy! 00051 00052 // add matches for easy cases here 00053 if (pass.a.width()==pass.b.width() && 00054 pass.a.height()==pass.b.height()) { 00055 bool fail = false; 00056 00057 if (local_hash=="") { 00058 for (int r=0; r<pass.a.height() && !fail; r++) { 00059 for (int c=0; c<pass.a.width(); c++) { 00060 if (pass.a.cellSummary(c,r)!=pass.b.cellSummary(c,r)) { 00061 dbg_printf("FastMatch::match mismatch at (%d,%d): [%s] vs [%s]\n", 00062 c,r, 00063 pass.a.cellSummary(c,r).toString().c_str(), 00064 pass.b.cellSummary(c,r).toString().c_str()); 00065 fail = true; 00066 break; 00067 } 00068 } 00069 } 00070 } else { 00071 fail = (local_hash!=remote_hash); 00072 } 00073 00074 if (!fail) { 00075 // sheets are identical! 00076 dbg_printf("FastMatch::match identical sheets found (%d units)\n", 00077 pass.asel.height()); 00078 for (int i=0; i<pass.asel.height(); i++) { 00079 pass.asel.cell(0,i) = i; 00080 pass.bsel.cell(0,i) = i; 00081 } 00082 return; 00083 } else { 00084 dbg_printf("FastMatch::match sheets same size (%dx%d), but differ in content\n", 00085 pass.a.width(),pass.a.height()); 00086 } 00087 } else { 00088 dbg_printf("FastMatch::match sheets differ in size, %dx%d vs %dx%d\n", 00089 pass.a.width(),pass.a.height(), 00090 pass.b.width(),pass.b.height()); 00091 } 00092 00093 // do we just trust the column names? 00094 if (!rowLike) { 00095 if (flags.trust_column_names) { 00096 dbg_printf("FastMatch would like to trust columns\n"); 00097 if (local_names!=NULL && remote_names!=NULL) { 00098 vector<string> ln = local_names->suggestNames(); 00099 vector<string> rn = remote_names->suggestNames(); 00100 map<string,int> lni, rni; 00101 for (int i=0; i<(int)ln.size(); i++) { 00102 lni[ln[i]] = i; 00103 } 00104 for (int i=0; i<(int)rn.size(); i++) { 00105 rni[rn[i]] = i; 00106 } 00107 dbg_printf("FastMatch trusting columns\n"); 00108 for (int i=0; i<pass.asel.height(); i++) { 00109 string name = ln[i]; 00110 if (rni.find(name)!=rni.end()) { 00111 pass.asel.cell(0,i) = rni[name]; 00112 } 00113 } 00114 for (int i=0; i<pass.bsel.height(); i++) { 00115 string name = rn[i]; 00116 if (lni.find(name)!=lni.end()) { 00117 pass.bsel.cell(0,i) = lni[name]; 00118 } 00119 } 00120 return; 00121 } 00122 } 00123 } 00124 00125 00126 // Non identical eh? Well, maybe we've been told to trust 00127 // some identifying columns. 00128 00129 if (local_names!=NULL && remote_names!=NULL) { 00130 if (local_names->hasSubset()&&remote_names->hasSubset()) { 00131 // Great! No need to do anything elaborate. We've probably 00132 // already wasted too much time sucking data into memory, 00133 // oh well... 00134 00135 // process subset here... 00136 } 00137 } 00138 } 00139 }; 00140 00141 00142 static string encodeKey(DataSheet& sheet, int x, int y, int len) { 00143 string result = ""; 00144 for (int i=x; i<x+len; i++) { 00145 if (i>x) { 00146 result += "__"; 00147 } 00148 result += sheet.cellSummary(i,y).toString(); 00149 } 00150 return result; 00151 } 00152 00153 00154 void SheetCompare::doRowMapping(OrderResult& p2l_row_order, 00155 OrderResult& p2r_row_order, 00156 const OrderResult& p2l_col_order, 00157 const OrderResult& p2r_col_order, 00158 const CompareFlags& flags, 00159 const CompareFlags& eflags, 00160 SheetView& vpivot, 00161 SheetView& vlocal, 00162 SheetView& vremote, 00163 bool approx) { 00165 // PIVOT to LOCAL row mapping 00166 00167 dbg_printf("SheetCompare::compare pivot <-> local rows\n"); 00168 00169 bool valueBasedPivot = (flags.mapping==NULL); 00170 00171 if (valueBasedPivot) { 00172 IdentityOrderResult id; 00173 MeasurePass p2l_row_pass_local(vpivot,vlocal); 00174 MeasurePass p2l_row_pass_norm1(vpivot,vpivot); 00175 MeasurePass p2l_row_pass_norm2(vlocal,vlocal); 00176 00177 IntSheet p2l_p = p2l_col_order.allA2b(); 00178 IntSheet p2l_l = p2l_col_order.allB2a(); 00179 for (int i=0; i<p2l_p.height(); i++) { 00180 if (p2l_p.cell(0,i)>=0) { 00181 p2l_p.cell(0,i) = i; 00182 } 00183 } 00184 for (int i=0; i<p2l_l.height(); i++) { 00185 if (p2l_l.cell(0,i)>=0) { 00186 p2l_l.cell(0,i) = i; 00187 } 00188 } 00189 00190 OrderResult p2l_1, p2l_2; 00191 p2l_1.setup(p2l_p,p2l_p); 00192 p2l_2.setup(p2l_l,p2l_l); 00193 00194 if (p2l_p.height()>0 || p2l_l.height()>0) { 00195 COOPY_ASSERT(p2l_p.height()==vpivot.sheet.width()); 00196 COOPY_ASSERT(p2l_l.height()==vlocal.sheet.width()); 00197 } 00198 00199 CombinedRowMan p2l_row_local(eflags,p2l_col_order,vlocal.sheet.height()); 00200 CombinedRowMan p2l_row_norm1(eflags,p2l_1,vpivot.sheet.height()); 00201 CombinedRowMan p2l_row_norm2(eflags,p2l_2,vlocal.sheet.height()); 00202 00203 MeasureMan p2l_row_man(p2l_row_local,p2l_row_pass_local, 00204 p2l_row_norm1,p2l_row_pass_norm1, 00205 p2l_row_norm2,p2l_row_pass_norm2, 00206 1, 00207 eflags, 00208 approx); 00209 00210 p2l_row_man.setup(); 00211 FastMatch p2l_row_fast_match(p2l_row_pass_local); 00212 //p2l_row_fast_match.local_names = &pivot_names; 00213 //p2l_row_fast_match.remote_names = &local_names; 00214 //p2l_row_fast_match.local_hash = pivot_hash; 00215 //p2l_row_fast_match.remote_hash = local_hash; 00216 p2l_row_fast_match.match(true,eflags); 00217 p2l_row_man.compare(); 00218 00220 // PIVOT to REMOTE row mapping 00221 00222 dbg_printf("SheetCompare::compare pivot <-> remote rows\n"); 00223 00224 MeasurePass p2r_row_pass_local(vpivot,vremote); 00225 MeasurePass p2r_row_pass_norm1(vpivot,vpivot); 00226 MeasurePass p2r_row_pass_norm2(vremote,vremote); 00227 00228 IntSheet p2r_p = p2r_col_order.allA2b(); 00229 IntSheet p2r_r = p2r_col_order.allB2a(); 00230 for (int i=0; i<p2r_p.height(); i++) { 00231 if (p2r_p.cell(0,i)>=0) { 00232 p2r_p.cell(0,i) = i; 00233 } 00234 } 00235 for (int i=0; i<p2r_r.height(); i++) { 00236 if (p2r_r.cell(0,i)>=0) { 00237 p2r_r.cell(0,i) = i; 00238 } 00239 } 00240 00241 OrderResult p2r_1, p2r_2; 00242 p2r_1.setup(p2r_p,p2r_p); 00243 p2r_2.setup(p2r_r,p2r_r); 00244 00245 if (p2r_p.height()>0 || p2r_r.height()>0) { 00246 COOPY_ASSERT(p2r_p.height()==vpivot.sheet.width()); 00247 COOPY_ASSERT(p2r_r.height()==vremote.sheet.width()); 00248 } 00249 00250 CombinedRowMan p2r_row_local(eflags,p2r_col_order,vremote.sheet.height()); 00251 CombinedRowMan p2r_row_norm1(eflags,p2r_1,vpivot.sheet.height()); 00252 CombinedRowMan p2r_row_norm2(eflags,p2r_2,vremote.sheet.height()); 00253 00254 MeasureMan p2r_row_man(p2r_row_local,p2r_row_pass_local, 00255 p2r_row_norm1,p2r_row_pass_norm1, 00256 p2r_row_norm2,p2r_row_pass_norm2, 00257 1, 00258 eflags, 00259 approx); 00260 00261 p2r_row_man.setup(); 00262 FastMatch p2r_row_fast_match(p2r_row_pass_local); 00263 //p2r_row_fast_match.local_names = &pivot_names; 00264 //p2r_row_fast_match.remote_names = &remote_names; 00265 //p2r_row_fast_match.local_hash = pivot_hash; 00266 //p2r_row_fast_match.remote_hash = remote_hash; 00267 p2r_row_fast_match.match(true,eflags); 00268 p2r_row_man.compare(); 00269 00270 p2l_row_order = p2l_row_pass_local.getOrder(); 00271 p2r_row_order = p2r_row_pass_local.getOrder(); 00272 } else { 00273 00274 // set up links using mapping 00275 00276 bool local_pivot = flags.pivot_sides_with_local; 00277 00278 DataSheet& mapping = *(flags.mapping); 00279 int n = mapping.width()/2; 00280 map<string,int> local_index, remote_index, pivot_index; 00281 IntSheet l2p, r2p, p2l, p2r; 00282 DataSheet& pivot = vpivot.sheet; 00283 DataSheet& local = vlocal.sheet; 00284 DataSheet& remote = vremote.sheet; 00285 l2p.resize(1,local.height(),-1); 00286 r2p.resize(1,remote.height(),-1); 00287 p2l.resize(1,pivot.height(),-1); 00288 p2r.resize(1,pivot.height(),-1); 00289 for (int i=0; i<local.height(); i++) { 00290 // assume 0 offsetting - not true in general, need to fix 00291 string k = encodeKey(local,0,i,n); 00292 local_index[k] = i; 00293 } 00294 for (int i=0; i<remote.height(); i++) { 00295 string k = encodeKey(remote,0,i,n); 00296 remote_index[k] = i; 00297 } 00298 for (int i=0; i<pivot.height(); i++) { 00299 string k = encodeKey(pivot,0,i,n); 00300 pivot_index[k] = i; 00301 } 00302 00303 for (int i=0; i<mapping.height(); i++) { 00304 string klocal = encodeKey(mapping,0,i,n); 00305 string kremote = encodeKey(mapping,n,i,n); 00306 int l = -1; 00307 int r = -1; 00308 int p = -1; 00309 map<string,int>::iterator it = local_index.find(klocal); 00310 if (it!=local_index.end()) { 00311 l = it->second; 00312 } 00313 it = remote_index.find(kremote); 00314 if (it!=remote_index.end()) { 00315 r = it->second; 00316 } 00317 it = pivot_index.find(local_pivot?klocal:kremote); 00318 if (it!=pivot_index.end()) { 00319 p = it->second; 00320 } 00321 if (p!=-1) { 00322 if (l!=-1) { 00323 l2p.cell(0,l) = p; 00324 p2l.cell(0,p) = l; 00325 } 00326 if (r!=-1) { 00327 r2p.cell(0,r) = p; 00328 p2r.cell(0,p) = r; 00329 } 00330 } 00331 } 00332 p2l_row_order.setup(p2l,l2p); 00333 p2r_row_order.setup(p2r,r2p); 00334 } 00335 } 00336 00337 00338 void SheetCompare::doColMapping(const OrderResult& p2l_row_order, 00339 const OrderResult& p2r_row_order, 00340 OrderResult& p2l_col_order, 00341 OrderResult& p2r_col_order, 00342 const CompareFlags& flags, 00343 const CompareFlags& eflags, 00344 SheetView& vpivot, 00345 SheetView& vlocal, 00346 SheetView& vremote) { 00347 IdentityOrderResult id; 00348 00350 // PIVOT to LOCAL column mapping 00351 00352 dbg_printf("SheetCompare::compare pivot <-> local columns\n"); 00353 00354 MeasurePass p2l_col_pass_local(vpivot,vlocal); 00355 MeasurePass p2l_col_pass_norm1(vpivot,vpivot); 00356 MeasurePass p2l_col_pass_norm2(vlocal,vlocal); 00357 00358 ColMan p2l_col_local(p2l_row_order); 00359 ColMan p2l_col_norm1(id); 00360 ColMan p2l_col_norm2(id); 00361 00362 MeasureMan p2l_col_man(p2l_col_local,p2l_col_pass_local, 00363 p2l_col_norm1,p2l_col_pass_norm1, 00364 p2l_col_norm2,p2l_col_pass_norm2, 00365 0, 00366 eflags); 00367 00368 p2l_col_man.setup(); 00369 FastMatch p2l_col_fast_match(p2l_col_pass_local); 00370 p2l_col_fast_match.match(false,eflags); 00371 p2l_col_man.compare(); 00372 00373 00375 // PIVOT to REMOTE column mapping 00376 00377 dbg_printf("SheetCompare::compare pivot <-> remote columns\n"); 00378 00379 MeasurePass p2r_col_pass_local(vpivot,vremote); 00380 MeasurePass p2r_col_pass_norm1(vpivot,vpivot); 00381 MeasurePass p2r_col_pass_norm2(vremote,vremote); 00382 00383 ColMan p2r_col_local(p2r_row_order); 00384 ColMan p2r_col_norm1(id); 00385 ColMan p2r_col_norm2(id); 00386 00387 MeasureMan p2r_col_man(p2r_col_local,p2r_col_pass_local, 00388 p2r_col_norm1,p2r_col_pass_norm1, 00389 p2r_col_norm2,p2r_col_pass_norm2, 00390 0, 00391 eflags); 00392 00393 p2r_col_man.setup(); 00394 FastMatch p2r_col_fast_match(p2r_col_pass_local); 00395 //p2r_col_fast_match.local_hash = pivot_hash; 00396 //p2r_col_fast_match.remote_hash = remote_hash; 00397 p2r_col_fast_match.match(false,eflags); 00398 p2r_col_man.compare(); 00399 00400 p2l_col_order = p2l_col_pass_local.getOrder(); 00401 p2r_col_order = p2r_col_pass_local.getOrder(); 00402 } 00403 00404 int SheetCompare::compare(DataSheet& _pivot, DataSheet& _local, 00405 DataSheet& _remote, 00406 Patcher& output, const CompareFlags& flags, 00407 const char *output_name) { 00408 DataSheet *ppivot = &_pivot; 00409 DataSheet *plocal = &_local; 00410 DataSheet *premote = &_remote; 00411 00412 SchemaSniffer spivot(_pivot,NULL,true); 00413 SchemaSniffer slocal(_local,NULL,true); 00414 SchemaSniffer sremote(_remote,NULL,true); 00415 PolySheet dpivot, dlocal, dremote; 00416 bool appleOrange = false; 00417 dbg_printf("SheetCompare::compare local external names? %s\n", 00418 _local.hasExternalColumnNames()?"yes":"no"); 00419 dbg_printf("SheetCompare::compare remote external names? %s\n", 00420 _remote.hasExternalColumnNames()?"yes":"no"); 00421 dbg_printf("SheetCompare::compare pivot external names? %s\n", 00422 _pivot.hasExternalColumnNames()?"yes":"no"); 00423 00424 if (flags.assume_header || 00425 _local.hasExternalColumnNames()!=_remote.hasExternalColumnNames() || 00426 _local.hasExternalColumnNames()!=_pivot.hasExternalColumnNames()) { 00427 appleOrange = true; 00428 spivot.sniff(); 00429 slocal.sniff(); 00430 sremote.sniff(); 00431 spivot.resniff(slocal); 00432 spivot.resniff(sremote); 00433 slocal.resniff(spivot); 00434 slocal.resniff(sremote); 00435 sremote.resniff(spivot); 00436 sremote.resniff(slocal); 00437 00438 dbg_printf("SheetCompare::compare wrapping sheets\n"); 00439 00440 dlocal = PolySheet(&_local,false); 00441 dlocal.setSchema(slocal.suggestSchema(),false); 00442 dlocal.setMeta(); 00443 dlocal.hideHeaders(); 00444 plocal = &dlocal; 00445 //printf("LOCAL IS %s [%s]\n", plocal->toString().c_str(), 00446 //slocal.suggestSchema()->toString().c_str()); 00447 00448 dremote = PolySheet(&_remote,false); 00449 dremote.setSchema(sremote.suggestSchema(),false); 00450 dremote.setMeta(); 00451 dremote.hideHeaders(); 00452 premote = &dremote; 00453 //printf("REMOTE IS %s\n", premote->toString().c_str()); 00454 00455 dpivot = PolySheet(&_pivot,false); 00456 dpivot.setSchema(spivot.suggestSchema(),false); 00457 dpivot.setMeta(); 00458 dpivot.hideHeaders(); 00459 ppivot = &dpivot; 00460 } 00461 00462 DataSheet& pivot = *ppivot; 00463 DataSheet& local = *plocal; 00464 DataSheet& remote = *premote; 00465 00466 if (output_name) { 00467 bool ok = output.setSheet(output_name); 00468 if (!ok) { 00469 fprintf(stderr,"Output format rejected sheet \"%s\"\n", output_name); 00470 return -1; 00471 } 00472 } 00473 output.metaHint(local); 00474 00475 NameSniffer pivot_names(pivot,flags,false); 00476 NameSniffer local_names(local,flags,false); 00477 NameSniffer remote_names(remote,flags,false); 00478 00479 CompareFlags eflags = flags; 00480 00481 if (!local.isSequential()) { 00482 eflags.use_order = false; 00483 } 00484 if (!remote.isSequential()) { 00485 eflags.use_order = false; 00486 } 00487 00488 if (eflags.trust_ids || eflags.bias_ids || eflags.trust_column_names) { 00489 local_names.sniff(); 00490 remote_names.sniff(); 00491 pivot_names.sniff(); 00492 } 00493 00494 bool id_based = false; 00495 if (eflags.trust_ids||eflags.bias_ids) { 00496 id_based = true; 00497 dbg_printf("Checking IDs\n"); 00498 bool ok = local_names.subset(eflags.ids); 00499 ok = ok && remote_names.subset(eflags.ids); 00500 ok = ok && pivot_names.subset(eflags.ids); 00501 if (!ok) { 00502 slocal.sniff(); 00503 SheetSchema *schema = slocal.suggestSchema(); 00504 COOPY_ASSERT(schema!=NULL); 00505 std::string sname = schema->getSheetName(); 00506 if (sname=="") sname = coopy_get_default_table_name(); 00507 fprintf(stderr,"*** Not all ID columns found for %s\n", 00508 sname.c_str()); 00509 dbg_printf("*** Not all ID columns found for %s\n", 00510 sname.c_str()); 00511 eflags.trust_ids = false; 00512 } 00513 } 00514 00515 if (eflags.offload_to_sql_when_possible) { 00516 if (!output.wantLinks()) { 00517 void *local_db = local.getDatabase(); 00518 void *remote_db = remote.getDatabase(); 00519 void *pivot_db = pivot.getDatabase(); 00520 if (local_db!=NULL && local_db == remote_db && local_db == pivot_db) { 00521 int result = homogeneousCompare(pivot,local,remote,output,flags); 00522 if (result==0) return result; 00523 } 00524 } 00525 } 00526 00527 std::string local_hash = local.getHash(true); 00528 std::string remote_hash = remote.getHash(true); 00529 std::string pivot_hash = pivot.getHash(true); 00530 00531 SheetView vpivot(pivot,pivot_names,pivot_hash); 00532 SheetView vlocal(local,local_names,local_hash); 00533 SheetView vremote(remote,remote_names,remote_hash); 00534 00535 OrderResult p2l_row_order; 00536 OrderResult p2r_row_order; 00537 OrderResult p2l_col_order; 00538 OrderResult p2r_col_order; 00539 00541 // ROW MAPPING from PIVOT to LOCAL and REMOTE 00542 dbg_printf("SheetCompare::compare row mapping\n"); 00543 00544 doRowMapping(p2l_row_order,p2r_row_order, 00545 p2l_col_order,p2r_col_order, 00546 flags,eflags, 00547 vpivot,vlocal,vremote,!id_based); 00548 00550 // COLUMN MAPPING from PIVOT to LOCAL and REMOTE 00551 dbg_printf("SheetCompare::compare row mapping\n"); 00552 00553 doColMapping(p2l_row_order,p2r_row_order, 00554 p2l_col_order,p2r_col_order, 00555 flags,eflags, 00556 vpivot,vlocal,vremote); 00557 00558 if (!id_based) { 00559 // worth repeating row mapping, now we have columns 00560 dbg_printf("SheetCompare::compare review row mapping\n"); 00561 00562 doRowMapping(p2l_row_order,p2r_row_order, 00563 p2l_col_order,p2r_col_order, 00564 flags,eflags, 00565 vpivot,vlocal,vremote,false); 00566 } 00567 00569 // Integrate results 00570 00571 dbg_printf("SheetCompare::compare integrate\n"); 00572 00573 Merger merger; 00574 MergerState state(pivot,local,remote, 00575 p2l_row_order, 00576 p2r_row_order, 00577 p2l_col_order, 00578 p2r_col_order, 00579 output, 00580 eflags, 00581 local_names, 00582 remote_names); 00583 state.allIdentical = (pivot_hash == local_hash) && 00584 (pivot_hash == remote_hash) && 00585 (pivot_hash != ""); 00586 00587 bool ok = merger.merge(state); 00588 00589 dbg_printf("SheetCompare::compare done\n"); 00590 00591 return ok?0:-1; 00592 } 00593 00594 00595 void SheetCompare::setVerbose(bool verbose) { 00596 _csv_verbose = verbose; 00597 } 00598 00599 00600 int SheetCompare::homogeneousCompare(coopy::store::DataSheet& pivot, 00601 coopy::store::DataSheet& local, 00602 coopy::store::DataSheet& remote, 00603 Patcher& output, 00604 const CompareFlags& flags) { 00605 Compare *cmp = pivot.getComparisonMethod(); 00606 if (cmp==NULL) return -1; 00607 return cmp->compare(pivot,local,remote,output,flags); 00608 } 00609