COOPY » Guide
version 0.6.5
|
00001 #include <math.h> 00002 00003 #include <coopy/DataStat.h> 00004 #include <coopy/CsvWrite.h> 00005 00006 #include <map> 00007 00008 using namespace std; 00009 using namespace coopy::store; 00010 using namespace coopy::cmp; 00011 00012 void DataStat::evaluate(const DataSheet& sheet) { 00013 CompareFlags flags; 00014 evaluate2(sheet,flags); 00015 /* 00016 int hh = 50; 00017 if (sheet.height()<hh) hh = sheet.height(); 00018 clear(); 00019 col.clear(); 00020 if (sheet.width()==0) return; 00021 if (hh==0) return; 00022 oddness.resize(sheet.width(),hh,0); 00023 oddness_accum.resize(1,hh,0); 00024 for (int j=0; j<sheet.width(); j++) { 00025 col.push_back(DataColumn(sheet,j,hh)); 00026 ct.push_back(ColumnType()); 00027 } 00028 for (int j=0; j<sheet.width(); j++) { 00029 col[j].evaluate(); 00030 } 00031 for (int j=0; j<sheet.width(); j++) { 00032 DataColumn& c = col[j]; 00033 Nature n = c.getNature(); 00034 vector<float> cmp; 00035 float tot = 0; 00036 float tot2 = 0; 00037 for (int i=0; i<hh; i++) { 00038 float r = n.compare(sheet.cellString(j,i).c_str()); 00039 cmp.push_back(r); 00040 tot += r; 00041 tot2 += r*r; 00042 } 00043 float mean = tot; 00044 float dev = 1; 00045 if (hh>0) { 00046 mean /= hh; 00047 dev = tot2 / hh - mean*mean; 00048 // occasionally dev is just slightly less than zero due to round-offs 00049 if (dev<0) { 00050 dev = 0; 00051 } else { 00052 dev = sqrt(dev); 00053 } 00054 } 00055 if (dev<0.1) dev = 0.1; 00056 for (int i=0; i<hh; i++) { 00057 float v = (cmp[i]-mean)/dev; 00058 v = -v; 00059 if (v<0) v = 0; 00060 if (sheet.cellString(j,i)=="") { 00061 v = 0; 00062 } 00063 oddness.cell(j,i) = v; 00064 oddness_accum.cell(0,i) += v; 00065 } 00066 } 00067 00068 oddness_accum.normalize(-1,-1,1); 00069 00070 //CsvFile::write(oddness,"-"); 00071 //CsvFile::write(oddness_accum,"oddness_accum.csv"); 00072 00073 int top = -1; 00074 int evidence = 0; 00075 for (int i=0; i<hh; i++) { 00076 float v = oddness_accum.cell(0,i); 00077 if (v>=1) { 00078 if (i<10 && i<=hh*0.25+2) { 00079 top = i; 00080 } else { 00081 top = -1; 00082 } 00083 } 00084 if (v<0.5) { 00085 evidence++; 00086 } 00087 } 00088 00089 if (top>=0) { 00090 // correct type checking 00091 for (int j=0; j<sheet.width(); j++) { 00092 DataColumn& c = col[j]; 00093 c.unevaluate(top); 00094 } 00095 } 00096 00097 for (int j=0; j<sheet.width(); j++) { 00098 DataColumn& c = col[j]; 00099 Nature n = c.getNature(); 00100 ColumnType mct; 00101 if (n.couldBeInteger()) { 00102 mct.setType("INTEGER"); 00103 } 00104 ct[j] = mct; 00105 } 00106 00107 if (top>=0) { 00108 dbg_printf("Header guess: [%d]", top); 00109 for (int j=0; j<sheet.width(); j++) { 00110 dbg_printf(" [%s]", sheet.cellString(j,top).c_str()); 00111 } 00112 dbg_printf("\n"); 00113 } else { 00114 dbg_printf("Cannot guess header\n"); 00115 } 00116 rowDivider = top; 00117 */ 00118 } 00119 00120 00121 void DataStat::evaluate2(const coopy::store::DataSheet& sheet, 00122 const coopy::cmp::CompareFlags& flags) { 00123 int hh = 50; 00124 if (sheet.height()<hh) hh = sheet.height(); 00125 clear(); 00126 col.clear(); 00127 if (sheet.width()==0) return; 00128 if (hh==0) return; 00129 int top = -1; 00130 for (int j=0; j<sheet.width(); j++) { 00131 col.push_back(DataColumn(sheet,j,hh)); 00132 ct.push_back(ColumnType()); 00133 } 00134 for (int j=0; j<sheet.width(); j++) { 00135 col[j].evaluate(); 00136 } 00137 00138 if (flags.ids.size()>0) { 00139 //printf("CHECKING IDS\n"); 00140 map<string,int> ids; 00141 for (int i=0; i<(int)flags.ids.size(); i++) { 00142 ids[flags.ids[i]] = 1; 00143 } 00144 for (int i=0; i<hh; i++) { 00145 int found = 0; 00146 for (int j=0; j<sheet.width(); j++) { 00147 string txt = sheet.cellString(j,i); 00148 if (ids.find(txt)!=ids.end()) { 00149 //printf("FOUND %s\n", txt.c_str()); 00150 found++; 00151 } 00152 } 00153 if (found==(int)flags.ids.size()) { 00154 //printf("GOT IT on %d\n", i); 00155 top = i; 00156 break; 00157 } 00158 } 00159 } else { 00160 00161 oddness.resize(sheet.width(),hh,0); 00162 oddness_accum.resize(1,hh,0); 00163 00164 for (int j=0; j<sheet.width(); j++) { 00165 DataColumn& c = col[j]; 00166 Nature n = c.getNature(); 00167 //n.show(); 00168 vector<float> cmp; 00169 float tot = 0; 00170 float tot2 = 0; 00171 for (int i=0; i<hh; i++) { 00172 float r = n.compare(sheet.cellString(j,i).c_str(),true,i); 00173 cmp.push_back(r); 00174 tot += r; 00175 tot2 += r*r; 00176 } 00177 float mean = tot; 00178 float dev = 1; 00179 if (hh>0) { 00180 mean /= hh; 00181 dev = tot2 / hh - mean*mean; 00182 // occasionally dev is just slightly less than zero due to round-offs 00183 if (dev<0) { 00184 dev = 0; 00185 } else { 00186 dev = sqrt(dev); 00187 } 00188 } 00189 if (dev<0.1) dev = 0.1; 00190 for (int i=0; i<hh; i++) { 00191 float v = (cmp[i]-mean)/dev; 00192 v = -v; 00193 if (v<0) v = 0; 00194 if (sheet.cellString(j,i)=="") { 00195 v = 0; 00196 } 00197 oddness.cell(j,i) = v; 00198 } 00199 } 00200 00201 bool have_null = false; 00202 for (int i=0; i<hh; i++) { 00203 oddness_accum.cell(0,i) = 0; 00204 if (!have_null) { 00205 for (int j=0; j<sheet.width(); j++) { 00206 float v = oddness.cell(j,i); 00207 //oddness_accum.cell(0,i) += v/sheet.width(); 00208 if (oddness_accum.cell(0,i)<v) { 00209 oddness_accum.cell(0,i) = v; 00210 } 00211 SheetCell c = sheet.cellSummary(j,i); 00212 if (c.escaped) { 00213 have_null = true; 00214 oddness_accum.cell(0,i) = 0; 00215 break; 00216 } 00217 } 00218 } 00219 } 00220 00221 //oddness_accum.normalize(-1,-1,1); 00222 00223 /* 00224 Property p; 00225 p.put("file","-"); 00226 p.put("header",-1); 00227 CsvFile::write(oddness,p); 00228 if (oddness_accum.height()>1) { 00229 p.put("file","-"); 00230 CsvFile::write(oddness_accum,p); 00231 } 00232 */ 00233 00234 /* 00235 for (int i=0; i<hh; i++) { 00236 float v = oddness_accum.cell(0,i); 00237 printf(">>> %g\n", v); 00238 } 00239 */ 00240 00241 int evidence = 0; 00242 float peak = 0; 00243 for (int i=0; i<hh; i++) { 00244 float v = oddness_accum.cell(0,i)*((hh-i)/(float)hh); 00245 if (v>=0.99&&v>peak*0.75) { 00246 if (i<10 && i<=hh*0.25+2) { 00247 top = i; 00248 peak = v; 00249 } else { 00250 top = -1; 00251 } 00252 } 00253 if (v<0.5) { 00254 evidence++; 00255 } 00256 } 00257 00258 if (top>=0) { 00259 // correct type checking 00260 for (int j=0; j<sheet.width(); j++) { 00261 DataColumn& c = col[j]; 00262 c.unevaluate(top); 00263 } 00264 } 00265 } 00266 00267 for (int j=0; j<sheet.width(); j++) { 00268 DataColumn& c = col[j]; 00269 Nature n = c.getNature(); 00270 ColumnType mct; 00271 if (n.couldBeInteger()) { 00272 mct.setType("INTEGER"); 00273 } 00274 ct[j] = mct; 00275 } 00276 00277 if (top>=0) { 00278 dbg_printf("Header guess: [%d]", top); 00279 for (int j=0; j<sheet.width(); j++) { 00280 dbg_printf(" [%s]", sheet.cellString(j,top).c_str()); 00281 } 00282 dbg_printf("\n"); 00283 } else { 00284 dbg_printf("Cannot guess header\n"); 00285 } 00286 rowDivider = top; 00287 } 00288 00289