COOPY » Guide  version 0.6.5
/home/paulfitz/cvs/coopy_scm/coopy/src/libcoopy_core/SheetCompare.cpp
Go to the documentation of this file.
00001 #include <stdio.h>
00002 #include <stdlib.h>
00003 #include <ctype.h>
00004 
00005 #include <coopy/SheetCompare.h>
00006 #include <coopy/CsvSheet.h>
00007 #include <coopy/OrderResult.h>
00008 #include <coopy/MeasurePass.h>
00009 #include <coopy/RowMan.h>
00010 #include <coopy/ColMan.h>
00011 #include <coopy/MeasureMan.h>
00012 #include <coopy/Merger.h>
00013 #include <coopy/SchemaSniffer.h>
00014 #include <coopy/Compare.h>
00015 
00016 #include <string>
00017 #include <map>
00018 
00019 using namespace coopy::store;
00020 using namespace coopy::cmp;
00021 
00022 using namespace std;
00023 
00024 namespace coopy {
00025   namespace cmp {
00026     class FastMatch;
00027     class RedundancyMatch;
00028   }
00029 }
00030 
00031 class coopy::cmp::FastMatch {
00032 public:
00033   MeasurePass& pass;
00034   NameSniffer *local_names;
00035   NameSniffer *remote_names;
00036   std::string local_hash;
00037   std::string remote_hash;
00038 
00039   FastMatch(MeasurePass& pass) : pass(pass) {
00040     //local_names = remote_names = NULL;
00041     local_names = &pass.va.meta;
00042     remote_names = &pass.vb.meta;
00043     local_hash = pass.va.sha1;
00044     remote_hash = pass.vb.sha1;
00045   }
00046 
00047   void match(bool rowLike, const CompareFlags& flags) {
00048 
00049     // We check if the two things being compared are identical.
00050     // If so, that's easy!
00051 
00052     // add matches for easy cases here
00053     if (pass.a.width()==pass.b.width() &&
00054         pass.a.height()==pass.b.height()) {
00055       bool fail = false;
00056 
00057       if (local_hash=="") {
00058         for (int r=0; r<pass.a.height() && !fail; r++) {
00059           for (int c=0; c<pass.a.width(); c++) {
00060             if (pass.a.cellSummary(c,r)!=pass.b.cellSummary(c,r)) {
00061               dbg_printf("FastMatch::match mismatch at (%d,%d): [%s] vs [%s]\n",
00062                          c,r,
00063                          pass.a.cellSummary(c,r).toString().c_str(),
00064                          pass.b.cellSummary(c,r).toString().c_str());
00065               fail = true;
00066               break;
00067             }
00068           }
00069         }
00070       } else {
00071         fail = (local_hash!=remote_hash);
00072       }
00073 
00074       if (!fail) {
00075         // sheets are identical!
00076         dbg_printf("FastMatch::match identical sheets found (%d units)\n",
00077                    pass.asel.height());
00078         for (int i=0; i<pass.asel.height(); i++) {
00079           pass.asel.cell(0,i) = i;
00080           pass.bsel.cell(0,i) = i;
00081         }
00082         return;
00083       } else {
00084         dbg_printf("FastMatch::match sheets same size (%dx%d), but differ in content\n",
00085                    pass.a.width(),pass.a.height());
00086       }
00087     } else {
00088       dbg_printf("FastMatch::match sheets differ in size, %dx%d vs %dx%d\n",
00089                  pass.a.width(),pass.a.height(),
00090                  pass.b.width(),pass.b.height());
00091     }
00092 
00093     // do we just trust the column names?
00094     if (!rowLike) {
00095       if (flags.trust_column_names) {
00096         dbg_printf("FastMatch would like to trust columns\n");
00097         if (local_names!=NULL && remote_names!=NULL) {
00098           vector<string> ln = local_names->suggestNames();
00099           vector<string> rn = remote_names->suggestNames();
00100           map<string,int> lni, rni;
00101           for (int i=0; i<(int)ln.size(); i++) {
00102             lni[ln[i]] = i;
00103           }
00104           for (int i=0; i<(int)rn.size(); i++) {
00105             rni[rn[i]] = i;
00106           }
00107           dbg_printf("FastMatch trusting columns\n");
00108           for (int i=0; i<pass.asel.height(); i++) {
00109             string name = ln[i];
00110             if (rni.find(name)!=rni.end()) {
00111               pass.asel.cell(0,i) = rni[name];
00112             }
00113           }
00114           for (int i=0; i<pass.bsel.height(); i++) {
00115             string name = rn[i];
00116             if (lni.find(name)!=lni.end()) {
00117               pass.bsel.cell(0,i) = lni[name];
00118             }
00119           }
00120           return;
00121         }
00122       }
00123     }
00124 
00125 
00126     // Non identical eh?  Well, maybe we've been told to trust
00127     // some identifying columns.
00128 
00129     if (local_names!=NULL && remote_names!=NULL) {
00130       if (local_names->hasSubset()&&remote_names->hasSubset()) {
00131         // Great!  No need to do anything elaborate.  We've probably
00132         // already wasted too much time sucking data into memory,
00133         // oh well...
00134         
00135         // process subset here...
00136       }
00137     }
00138   }
00139 };
00140 
00141 
00142 static string encodeKey(DataSheet& sheet, int x, int y, int len) {
00143   string result = "";
00144   for (int i=x; i<x+len; i++) {
00145     if (i>x) {
00146     result += "__";
00147     }
00148     result += sheet.cellSummary(i,y).toString();
00149   }
00150   return result;
00151 }
00152 
00153 
00154 void SheetCompare::doRowMapping(OrderResult& p2l_row_order,
00155                                 OrderResult& p2r_row_order,
00156                                 const OrderResult& p2l_col_order,
00157                                 const OrderResult& p2r_col_order,
00158                                 const CompareFlags& flags,
00159                                 const CompareFlags& eflags,
00160                                 SheetView& vpivot,
00161                                 SheetView& vlocal,
00162                                 SheetView& vremote,
00163                                 bool approx) {
00165   // PIVOT to LOCAL row mapping
00166 
00167   dbg_printf("SheetCompare::compare pivot <-> local rows\n");
00168 
00169   bool valueBasedPivot = (flags.mapping==NULL);
00170 
00171   if (valueBasedPivot) {
00172     IdentityOrderResult id;
00173     MeasurePass p2l_row_pass_local(vpivot,vlocal);
00174     MeasurePass p2l_row_pass_norm1(vpivot,vpivot);
00175     MeasurePass p2l_row_pass_norm2(vlocal,vlocal);
00176 
00177     IntSheet p2l_p = p2l_col_order.allA2b();
00178     IntSheet p2l_l = p2l_col_order.allB2a();
00179     for (int i=0; i<p2l_p.height(); i++) {
00180       if (p2l_p.cell(0,i)>=0) {
00181         p2l_p.cell(0,i) = i;
00182       }
00183     }
00184     for (int i=0; i<p2l_l.height(); i++) {
00185       if (p2l_l.cell(0,i)>=0) {
00186         p2l_l.cell(0,i) = i;
00187       }
00188     }
00189 
00190     OrderResult p2l_1, p2l_2;
00191     p2l_1.setup(p2l_p,p2l_p);
00192     p2l_2.setup(p2l_l,p2l_l);
00193 
00194     if (p2l_p.height()>0 || p2l_l.height()>0) {
00195       COOPY_ASSERT(p2l_p.height()==vpivot.sheet.width());
00196       COOPY_ASSERT(p2l_l.height()==vlocal.sheet.width());
00197     }
00198 
00199     CombinedRowMan p2l_row_local(eflags,p2l_col_order,vlocal.sheet.height());
00200     CombinedRowMan p2l_row_norm1(eflags,p2l_1,vpivot.sheet.height());
00201     CombinedRowMan p2l_row_norm2(eflags,p2l_2,vlocal.sheet.height());
00202     
00203     MeasureMan p2l_row_man(p2l_row_local,p2l_row_pass_local,
00204                            p2l_row_norm1,p2l_row_pass_norm1,
00205                            p2l_row_norm2,p2l_row_pass_norm2,
00206                            1,
00207                            eflags,
00208                            approx);
00209     
00210     p2l_row_man.setup();
00211     FastMatch p2l_row_fast_match(p2l_row_pass_local);
00212     //p2l_row_fast_match.local_names = &pivot_names;
00213     //p2l_row_fast_match.remote_names = &local_names;
00214     //p2l_row_fast_match.local_hash = pivot_hash;
00215     //p2l_row_fast_match.remote_hash = local_hash;
00216     p2l_row_fast_match.match(true,eflags);
00217     p2l_row_man.compare();
00218     
00220     // PIVOT to REMOTE row mapping
00221     
00222     dbg_printf("SheetCompare::compare pivot <-> remote rows\n");
00223     
00224     MeasurePass p2r_row_pass_local(vpivot,vremote);
00225     MeasurePass p2r_row_pass_norm1(vpivot,vpivot);
00226     MeasurePass p2r_row_pass_norm2(vremote,vremote);
00227 
00228     IntSheet p2r_p = p2r_col_order.allA2b();
00229     IntSheet p2r_r = p2r_col_order.allB2a();
00230     for (int i=0; i<p2r_p.height(); i++) {
00231       if (p2r_p.cell(0,i)>=0) {
00232         p2r_p.cell(0,i) = i;
00233       }
00234     }
00235     for (int i=0; i<p2r_r.height(); i++) {
00236       if (p2r_r.cell(0,i)>=0) {
00237         p2r_r.cell(0,i) = i;
00238       }
00239     }
00240 
00241     OrderResult p2r_1, p2r_2;
00242     p2r_1.setup(p2r_p,p2r_p);
00243     p2r_2.setup(p2r_r,p2r_r);
00244 
00245     if (p2r_p.height()>0 || p2r_r.height()>0) {
00246       COOPY_ASSERT(p2r_p.height()==vpivot.sheet.width());
00247       COOPY_ASSERT(p2r_r.height()==vremote.sheet.width());
00248     }
00249     
00250     CombinedRowMan p2r_row_local(eflags,p2r_col_order,vremote.sheet.height());
00251     CombinedRowMan p2r_row_norm1(eflags,p2r_1,vpivot.sheet.height());
00252     CombinedRowMan p2r_row_norm2(eflags,p2r_2,vremote.sheet.height());
00253     
00254     MeasureMan p2r_row_man(p2r_row_local,p2r_row_pass_local,
00255                            p2r_row_norm1,p2r_row_pass_norm1,
00256                            p2r_row_norm2,p2r_row_pass_norm2,
00257                            1,
00258                            eflags,
00259                            approx);
00260     
00261     p2r_row_man.setup();
00262     FastMatch p2r_row_fast_match(p2r_row_pass_local);
00263     //p2r_row_fast_match.local_names = &pivot_names;
00264     //p2r_row_fast_match.remote_names = &remote_names;
00265     //p2r_row_fast_match.local_hash = pivot_hash;
00266     //p2r_row_fast_match.remote_hash = remote_hash;
00267     p2r_row_fast_match.match(true,eflags);
00268     p2r_row_man.compare();
00269     
00270     p2l_row_order = p2l_row_pass_local.getOrder();
00271     p2r_row_order = p2r_row_pass_local.getOrder();
00272   } else {
00273 
00274     // set up links using mapping
00275 
00276     bool local_pivot  = flags.pivot_sides_with_local;
00277 
00278     DataSheet& mapping = *(flags.mapping);
00279     int n = mapping.width()/2;
00280     map<string,int> local_index, remote_index, pivot_index;
00281     IntSheet l2p, r2p, p2l, p2r;
00282     DataSheet& pivot = vpivot.sheet;
00283     DataSheet& local = vlocal.sheet;
00284     DataSheet& remote = vremote.sheet;
00285     l2p.resize(1,local.height(),-1);
00286     r2p.resize(1,remote.height(),-1);
00287     p2l.resize(1,pivot.height(),-1);
00288     p2r.resize(1,pivot.height(),-1);
00289     for (int i=0; i<local.height(); i++) {
00290       // assume 0 offsetting - not true in general, need to fix
00291       string k = encodeKey(local,0,i,n);
00292       local_index[k] = i;
00293     }
00294     for (int i=0; i<remote.height(); i++) {
00295       string k = encodeKey(remote,0,i,n);
00296       remote_index[k] = i;
00297     }
00298     for (int i=0; i<pivot.height(); i++) {
00299       string k = encodeKey(pivot,0,i,n);
00300       pivot_index[k] = i;
00301     }
00302 
00303     for (int i=0; i<mapping.height(); i++) {
00304       string klocal = encodeKey(mapping,0,i,n);
00305       string kremote = encodeKey(mapping,n,i,n);
00306       int l = -1;
00307       int r = -1;
00308       int p = -1;
00309       map<string,int>::iterator it = local_index.find(klocal);
00310       if (it!=local_index.end()) {
00311         l = it->second;
00312       }
00313       it = remote_index.find(kremote);
00314       if (it!=remote_index.end()) {
00315         r = it->second;
00316       }
00317       it = pivot_index.find(local_pivot?klocal:kremote);
00318       if (it!=pivot_index.end()) {
00319         p = it->second;
00320       }
00321       if (p!=-1) {
00322         if (l!=-1) {
00323           l2p.cell(0,l) = p;
00324           p2l.cell(0,p) = l;
00325         }
00326         if (r!=-1) {
00327           r2p.cell(0,r) = p;
00328           p2r.cell(0,p) = r;
00329         }
00330       }
00331     }
00332     p2l_row_order.setup(p2l,l2p);
00333     p2r_row_order.setup(p2r,r2p);
00334   }
00335 }
00336 
00337 
00338 void SheetCompare::doColMapping(const OrderResult& p2l_row_order,
00339                                 const OrderResult& p2r_row_order,
00340                                 OrderResult& p2l_col_order,
00341                                 OrderResult& p2r_col_order,
00342                                 const CompareFlags& flags,
00343                                 const CompareFlags& eflags,
00344                                 SheetView& vpivot,
00345                                 SheetView& vlocal,
00346                                 SheetView& vremote) {
00347   IdentityOrderResult id;
00348 
00350   // PIVOT to LOCAL column mapping
00351 
00352   dbg_printf("SheetCompare::compare pivot <-> local columns\n");
00353 
00354   MeasurePass p2l_col_pass_local(vpivot,vlocal);
00355   MeasurePass p2l_col_pass_norm1(vpivot,vpivot);
00356   MeasurePass p2l_col_pass_norm2(vlocal,vlocal);
00357 
00358   ColMan p2l_col_local(p2l_row_order);
00359   ColMan p2l_col_norm1(id);
00360   ColMan p2l_col_norm2(id);
00361 
00362   MeasureMan p2l_col_man(p2l_col_local,p2l_col_pass_local,
00363                          p2l_col_norm1,p2l_col_pass_norm1,
00364                          p2l_col_norm2,p2l_col_pass_norm2,
00365                          0,
00366                          eflags);
00367 
00368   p2l_col_man.setup();
00369   FastMatch p2l_col_fast_match(p2l_col_pass_local);
00370   p2l_col_fast_match.match(false,eflags);
00371   p2l_col_man.compare();
00372 
00373 
00375   // PIVOT to REMOTE column mapping
00376 
00377   dbg_printf("SheetCompare::compare pivot <-> remote columns\n");
00378 
00379   MeasurePass p2r_col_pass_local(vpivot,vremote);
00380   MeasurePass p2r_col_pass_norm1(vpivot,vpivot);
00381   MeasurePass p2r_col_pass_norm2(vremote,vremote);
00382 
00383   ColMan p2r_col_local(p2r_row_order);
00384   ColMan p2r_col_norm1(id);
00385   ColMan p2r_col_norm2(id);
00386 
00387   MeasureMan p2r_col_man(p2r_col_local,p2r_col_pass_local,
00388                          p2r_col_norm1,p2r_col_pass_norm1,
00389                          p2r_col_norm2,p2r_col_pass_norm2,
00390                          0,
00391                          eflags);
00392 
00393   p2r_col_man.setup();
00394   FastMatch p2r_col_fast_match(p2r_col_pass_local);
00395   //p2r_col_fast_match.local_hash = pivot_hash;
00396   //p2r_col_fast_match.remote_hash = remote_hash;
00397   p2r_col_fast_match.match(false,eflags);
00398   p2r_col_man.compare();
00399 
00400   p2l_col_order = p2l_col_pass_local.getOrder();
00401   p2r_col_order = p2r_col_pass_local.getOrder();
00402 }
00403 
00404 int SheetCompare::compare(DataSheet& _pivot, DataSheet& _local, 
00405                           DataSheet& _remote,
00406                           Patcher& output, const CompareFlags& flags,
00407                           const char *output_name) {
00408   DataSheet *ppivot = &_pivot;
00409   DataSheet *plocal = &_local;
00410   DataSheet *premote = &_remote;
00411 
00412   SchemaSniffer spivot(_pivot,NULL,true);
00413   SchemaSniffer slocal(_local,NULL,true);
00414   SchemaSniffer sremote(_remote,NULL,true);
00415   PolySheet dpivot, dlocal, dremote;
00416   bool appleOrange = false;
00417   dbg_printf("SheetCompare::compare local external names? %s\n", 
00418              _local.hasExternalColumnNames()?"yes":"no");
00419   dbg_printf("SheetCompare::compare remote external names? %s\n", 
00420              _remote.hasExternalColumnNames()?"yes":"no");
00421   dbg_printf("SheetCompare::compare pivot external names? %s\n", 
00422              _pivot.hasExternalColumnNames()?"yes":"no");
00423 
00424   if (flags.assume_header ||
00425       _local.hasExternalColumnNames()!=_remote.hasExternalColumnNames() ||
00426       _local.hasExternalColumnNames()!=_pivot.hasExternalColumnNames()) {
00427     appleOrange = true;
00428     spivot.sniff();
00429     slocal.sniff();
00430     sremote.sniff();
00431     spivot.resniff(slocal);
00432     spivot.resniff(sremote);
00433     slocal.resniff(spivot);
00434     slocal.resniff(sremote);
00435     sremote.resniff(spivot);
00436     sremote.resniff(slocal);
00437 
00438     dbg_printf("SheetCompare::compare wrapping sheets\n");
00439 
00440     dlocal = PolySheet(&_local,false);
00441     dlocal.setSchema(slocal.suggestSchema(),false);
00442     dlocal.setMeta();
00443     dlocal.hideHeaders();
00444     plocal = &dlocal;
00445     //printf("LOCAL IS %s [%s]\n", plocal->toString().c_str(),
00446     //slocal.suggestSchema()->toString().c_str());
00447 
00448     dremote = PolySheet(&_remote,false);
00449     dremote.setSchema(sremote.suggestSchema(),false);
00450     dremote.setMeta();
00451     dremote.hideHeaders();
00452     premote = &dremote;
00453     //printf("REMOTE IS %s\n", premote->toString().c_str());
00454 
00455     dpivot = PolySheet(&_pivot,false);
00456     dpivot.setSchema(spivot.suggestSchema(),false);
00457     dpivot.setMeta();
00458     dpivot.hideHeaders();
00459     ppivot = &dpivot;
00460   }
00461 
00462   DataSheet& pivot = *ppivot;
00463   DataSheet& local = *plocal;
00464   DataSheet& remote = *premote;
00465 
00466   if (output_name) {
00467     bool ok = output.setSheet(output_name);
00468     if (!ok) {
00469       fprintf(stderr,"Output format rejected sheet \"%s\"\n", output_name);
00470       return -1;
00471     }
00472   }
00473   output.metaHint(local);
00474 
00475   NameSniffer pivot_names(pivot,flags,false);
00476   NameSniffer local_names(local,flags,false);
00477   NameSniffer remote_names(remote,flags,false);
00478 
00479   CompareFlags eflags = flags;
00480 
00481   if (!local.isSequential()) {
00482     eflags.use_order = false;
00483   }
00484   if (!remote.isSequential()) {
00485     eflags.use_order = false;
00486   }
00487 
00488   if (eflags.trust_ids || eflags.bias_ids || eflags.trust_column_names) {
00489     local_names.sniff();
00490     remote_names.sniff();
00491     pivot_names.sniff();
00492   }
00493 
00494   bool id_based = false;
00495   if (eflags.trust_ids||eflags.bias_ids) {
00496     id_based = true;
00497     dbg_printf("Checking IDs\n");
00498     bool ok = local_names.subset(eflags.ids);
00499     ok = ok && remote_names.subset(eflags.ids);
00500     ok = ok && pivot_names.subset(eflags.ids);
00501     if (!ok) {
00502       slocal.sniff();
00503       SheetSchema *schema = slocal.suggestSchema();
00504       COOPY_ASSERT(schema!=NULL);
00505       std::string sname = schema->getSheetName();
00506       if (sname=="") sname = coopy_get_default_table_name();
00507       fprintf(stderr,"*** Not all ID columns found for %s\n", 
00508               sname.c_str());
00509       dbg_printf("*** Not all ID columns found for %s\n", 
00510                  sname.c_str());
00511       eflags.trust_ids = false;
00512     }
00513   }
00514 
00515   if (eflags.offload_to_sql_when_possible) {
00516     if (!output.wantLinks()) {
00517       void *local_db = local.getDatabase();
00518       void *remote_db = remote.getDatabase();
00519       void *pivot_db = pivot.getDatabase();
00520       if (local_db!=NULL && local_db == remote_db && local_db == pivot_db) {
00521         int result = homogeneousCompare(pivot,local,remote,output,flags);
00522         if (result==0) return result;
00523       }
00524     }
00525   }
00526 
00527   std::string local_hash = local.getHash(true);
00528   std::string remote_hash = remote.getHash(true);
00529   std::string pivot_hash = pivot.getHash(true);
00530 
00531   SheetView vpivot(pivot,pivot_names,pivot_hash);
00532   SheetView vlocal(local,local_names,local_hash);
00533   SheetView vremote(remote,remote_names,remote_hash);
00534 
00535   OrderResult p2l_row_order;
00536   OrderResult p2r_row_order;
00537   OrderResult p2l_col_order;
00538   OrderResult p2r_col_order;
00539 
00541   // ROW MAPPING from PIVOT to LOCAL and REMOTE
00542   dbg_printf("SheetCompare::compare row mapping\n");
00543 
00544   doRowMapping(p2l_row_order,p2r_row_order,
00545                p2l_col_order,p2r_col_order,
00546                flags,eflags,
00547                vpivot,vlocal,vremote,!id_based);
00548 
00550   // COLUMN MAPPING from PIVOT to LOCAL and REMOTE
00551   dbg_printf("SheetCompare::compare row mapping\n");
00552 
00553   doColMapping(p2l_row_order,p2r_row_order,
00554                p2l_col_order,p2r_col_order,
00555                flags,eflags,
00556                vpivot,vlocal,vremote);
00557 
00558   if (!id_based) {
00559     // worth repeating row mapping, now we have columns
00560     dbg_printf("SheetCompare::compare review row mapping\n");
00561 
00562     doRowMapping(p2l_row_order,p2r_row_order,
00563                  p2l_col_order,p2r_col_order,
00564                  flags,eflags,
00565                  vpivot,vlocal,vremote,false);
00566   }
00567 
00569   // Integrate results
00570 
00571   dbg_printf("SheetCompare::compare integrate\n");
00572 
00573   Merger merger;
00574   MergerState state(pivot,local,remote,
00575                     p2l_row_order,
00576                     p2r_row_order,
00577                     p2l_col_order,
00578                     p2r_col_order,
00579                     output,
00580                     eflags,
00581                     local_names,
00582                     remote_names);
00583   state.allIdentical = (pivot_hash == local_hash) && 
00584     (pivot_hash == remote_hash) &&
00585     (pivot_hash != "");
00586 
00587   bool ok = merger.merge(state);
00588 
00589   dbg_printf("SheetCompare::compare done\n");
00590 
00591   return ok?0:-1;
00592 }
00593 
00594 
00595 void SheetCompare::setVerbose(bool verbose) {
00596   _csv_verbose = verbose;
00597 }
00598 
00599 
00600 int SheetCompare::homogeneousCompare(coopy::store::DataSheet& pivot, 
00601                                      coopy::store::DataSheet& local, 
00602                                      coopy::store::DataSheet& remote, 
00603                                      Patcher& output, 
00604                                      const CompareFlags& flags) {
00605   Compare *cmp = pivot.getComparisonMethod();
00606   if (cmp==NULL) return -1;
00607   return cmp->compare(pivot,local,remote,output,flags);
00608 }
00609 
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Defines