COOPY » Guide  version 0.6.5
/home/paulfitz/cvs/coopy_scm/coopy/src/libcoopy_core/DataStat.cpp
Go to the documentation of this file.
00001 #include <math.h>
00002 
00003 #include <coopy/DataStat.h>
00004 #include <coopy/CsvWrite.h>
00005 
00006 #include <map>
00007 
00008 using namespace std;
00009 using namespace coopy::store;
00010 using namespace coopy::cmp;
00011 
00012 void DataStat::evaluate(const DataSheet& sheet) {
00013   CompareFlags flags;
00014   evaluate2(sheet,flags);
00015   /*
00016   int hh = 50;
00017   if (sheet.height()<hh) hh = sheet.height();
00018   clear();
00019   col.clear();
00020   if (sheet.width()==0) return;
00021   if (hh==0) return;
00022   oddness.resize(sheet.width(),hh,0);
00023   oddness_accum.resize(1,hh,0);
00024   for (int j=0; j<sheet.width(); j++) {
00025     col.push_back(DataColumn(sheet,j,hh));
00026     ct.push_back(ColumnType());
00027   }
00028   for (int j=0; j<sheet.width(); j++) {
00029     col[j].evaluate();
00030   }
00031   for (int j=0; j<sheet.width(); j++) {
00032     DataColumn& c = col[j];
00033     Nature n = c.getNature();
00034     vector<float> cmp;
00035     float tot = 0;
00036     float tot2 = 0;
00037     for (int i=0; i<hh; i++) {
00038       float r = n.compare(sheet.cellString(j,i).c_str());
00039       cmp.push_back(r);
00040       tot += r;
00041       tot2 += r*r;
00042     }
00043     float mean = tot;
00044     float dev = 1;
00045     if (hh>0) {
00046       mean /= hh;
00047       dev = tot2 / hh - mean*mean;
00048       // occasionally dev is just slightly less than zero due to round-offs
00049       if (dev<0) { 
00050         dev = 0; 
00051       } else {
00052         dev = sqrt(dev);
00053       }
00054     }
00055     if (dev<0.1) dev = 0.1;
00056     for (int i=0; i<hh; i++) {
00057       float v = (cmp[i]-mean)/dev;
00058       v = -v;
00059       if (v<0) v = 0;
00060       if (sheet.cellString(j,i)=="") {
00061         v = 0;
00062       }
00063       oddness.cell(j,i) = v;
00064       oddness_accum.cell(0,i) += v;
00065     }
00066   }
00067 
00068   oddness_accum.normalize(-1,-1,1);
00069 
00070   //CsvFile::write(oddness,"-");
00071   //CsvFile::write(oddness_accum,"oddness_accum.csv");
00072 
00073   int top = -1;
00074   int evidence = 0;
00075   for (int i=0; i<hh; i++) {
00076     float v = oddness_accum.cell(0,i);
00077     if (v>=1) {
00078       if (i<10 && i<=hh*0.25+2) {
00079         top = i;
00080       } else {
00081         top = -1;
00082       }
00083     }
00084     if (v<0.5) {
00085       evidence++;
00086     }
00087   }
00088 
00089   if (top>=0) {
00090     // correct type checking
00091     for (int j=0; j<sheet.width(); j++) {
00092       DataColumn& c = col[j];
00093       c.unevaluate(top);
00094     }
00095   }
00096 
00097   for (int j=0; j<sheet.width(); j++) {
00098     DataColumn& c = col[j];
00099     Nature n = c.getNature();
00100     ColumnType mct;
00101     if (n.couldBeInteger()) {
00102       mct.setType("INTEGER");
00103     }
00104     ct[j] = mct;
00105   }
00106 
00107   if (top>=0) {
00108     dbg_printf("Header guess: [%d]", top);
00109     for (int j=0; j<sheet.width(); j++) {
00110       dbg_printf(" [%s]", sheet.cellString(j,top).c_str());
00111     }
00112     dbg_printf("\n");
00113   } else {
00114     dbg_printf("Cannot guess header\n");
00115   }
00116   rowDivider = top;
00117   */
00118 }
00119 
00120 
00121 void DataStat::evaluate2(const coopy::store::DataSheet& sheet,
00122                          const coopy::cmp::CompareFlags& flags) {
00123   int hh = 50;
00124   if (sheet.height()<hh) hh = sheet.height();
00125   clear();
00126   col.clear();
00127   if (sheet.width()==0) return;
00128   if (hh==0) return;
00129   int top = -1;
00130   for (int j=0; j<sheet.width(); j++) {
00131     col.push_back(DataColumn(sheet,j,hh));
00132     ct.push_back(ColumnType());
00133   }
00134   for (int j=0; j<sheet.width(); j++) {
00135     col[j].evaluate();
00136   }
00137 
00138   if (flags.ids.size()>0) {
00139     //printf("CHECKING IDS\n");
00140     map<string,int> ids;
00141     for (int i=0; i<(int)flags.ids.size(); i++) {
00142       ids[flags.ids[i]] = 1;
00143     }
00144     for (int i=0; i<hh; i++) {
00145       int found = 0;
00146       for (int j=0; j<sheet.width(); j++) {
00147         string txt = sheet.cellString(j,i);
00148         if (ids.find(txt)!=ids.end()) {
00149           //printf("FOUND %s\n", txt.c_str());
00150           found++;
00151         }
00152       }
00153       if (found==(int)flags.ids.size()) {
00154         //printf("GOT IT on %d\n", i);
00155         top = i;
00156         break;
00157       }
00158     }
00159   } else {
00160 
00161     oddness.resize(sheet.width(),hh,0);
00162     oddness_accum.resize(1,hh,0);
00163     
00164     for (int j=0; j<sheet.width(); j++) {
00165       DataColumn& c = col[j];
00166       Nature n = c.getNature();
00167       //n.show();
00168       vector<float> cmp;
00169       float tot = 0;
00170       float tot2 = 0;
00171       for (int i=0; i<hh; i++) {
00172         float r = n.compare(sheet.cellString(j,i).c_str(),true,i);
00173         cmp.push_back(r);
00174         tot += r;
00175         tot2 += r*r;
00176       }
00177       float mean = tot;
00178       float dev = 1;
00179       if (hh>0) {
00180         mean /= hh;
00181         dev = tot2 / hh - mean*mean;
00182         // occasionally dev is just slightly less than zero due to round-offs
00183         if (dev<0) { 
00184           dev = 0; 
00185         } else {
00186           dev = sqrt(dev);
00187         }
00188       }
00189       if (dev<0.1) dev = 0.1;
00190       for (int i=0; i<hh; i++) {
00191         float v = (cmp[i]-mean)/dev;
00192         v = -v;
00193         if (v<0) v = 0;
00194         if (sheet.cellString(j,i)=="") {
00195           v = 0;
00196         }
00197         oddness.cell(j,i) = v;
00198       }
00199     }
00200   
00201     bool have_null = false;
00202     for (int i=0; i<hh; i++) {
00203       oddness_accum.cell(0,i) = 0;
00204       if (!have_null) {
00205         for (int j=0; j<sheet.width(); j++) {
00206           float v = oddness.cell(j,i);
00207           //oddness_accum.cell(0,i) += v/sheet.width();
00208           if (oddness_accum.cell(0,i)<v) {
00209             oddness_accum.cell(0,i) = v;
00210           }
00211           SheetCell c = sheet.cellSummary(j,i);
00212           if (c.escaped) {
00213             have_null = true;
00214             oddness_accum.cell(0,i) = 0;
00215             break;
00216           }
00217         }
00218       }
00219     }
00220 
00221     //oddness_accum.normalize(-1,-1,1);
00222 
00223     /*
00224       Property p;
00225       p.put("file","-");
00226       p.put("header",-1);
00227       CsvFile::write(oddness,p);
00228       if (oddness_accum.height()>1) {
00229       p.put("file","-");
00230       CsvFile::write(oddness_accum,p);
00231       }
00232     */
00233 
00234     /*
00235     for (int i=0; i<hh; i++) {
00236       float v = oddness_accum.cell(0,i);
00237       printf(">>> %g\n", v);
00238     }
00239     */
00240     
00241     int evidence = 0;
00242     float peak = 0;
00243     for (int i=0; i<hh; i++) {
00244       float v = oddness_accum.cell(0,i)*((hh-i)/(float)hh);
00245       if (v>=0.99&&v>peak*0.75) {
00246         if (i<10 && i<=hh*0.25+2) {
00247           top = i;
00248           peak = v;
00249         } else {
00250           top = -1;
00251         }
00252       }
00253       if (v<0.5) {
00254         evidence++;
00255       }
00256     }
00257 
00258     if (top>=0) {
00259       // correct type checking
00260       for (int j=0; j<sheet.width(); j++) {
00261         DataColumn& c = col[j];
00262         c.unevaluate(top);
00263       }
00264     }
00265   }
00266 
00267   for (int j=0; j<sheet.width(); j++) {
00268     DataColumn& c = col[j];
00269     Nature n = c.getNature();
00270     ColumnType mct;
00271     if (n.couldBeInteger()) {
00272       mct.setType("INTEGER");
00273     }
00274     ct[j] = mct;
00275   }
00276 
00277   if (top>=0) {
00278     dbg_printf("Header guess: [%d]", top);
00279     for (int j=0; j<sheet.width(); j++) {
00280       dbg_printf(" [%s]", sheet.cellString(j,top).c_str());
00281     }
00282     dbg_printf("\n");
00283   } else {
00284     dbg_printf("Cannot guess header\n");
00285   }
00286   rowDivider = top;
00287 }
00288 
00289 
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Defines