// COOPY » Guide
// version 0.6.5
00001 #include <coopy/NameSniffer.h> 00002 #include <coopy/DataStat.h> 00003 #include <coopy/Stringer.h> 00004 00005 #include <map> 00006 00007 using namespace coopy::store; 00008 using namespace coopy::cmp; 00009 using namespace std; 00010 00011 void NameSniffer::sniff(int suggest) { 00012 if (sniffed) return; 00013 sniffed = true; 00014 embed = false; 00015 fake = false; 00016 canUseTop = false; 00017 names.clear(); 00018 ct.clear(); 00019 00020 SheetSchema *schema = sheet.getSchema(); 00021 00022 /* 00023 if (schema==NULL) { 00024 schema = sheet.getMeta(); 00025 if (schema!=NULL) { 00026 printf("Hey! I found a schema in getMeta.\n"); 00027 } 00028 } 00029 */ 00030 /* 00031 else { 00032 SheetSchema *schema2 = sheet.getMeta(); 00033 if (schema2!=NULL) { 00034 dbg_printf("Hey! I found two schema %ld\n", 00035 (long int)(&sheet.tail_const())); 00036 dbg_printf("schema 1 %s\n", schema->toString().c_str()); 00037 dbg_printf("schema 2 %s\n", schema2->toString().c_str()); 00038 } 00039 } 00040 */ 00041 00042 div = suggest; 00043 if (schema!=NULL) { 00044 //printf("Working with %s\n", schema->toString().c_str()); 00045 div = schema->headerHeight()-1; 00046 fake = schema->isGuess(); 00047 if (schema->getColumnCount()==0 && schema->headerHeight()>0) { 00048 // minimal schema, not complete 00049 dbg_printf("Sniffing... minimal schema!\n"); 00050 div = schema->headerHeight()-1; 00051 } else { 00052 dbg_printf("Sniffing... found schema! 
%s\n", schema->toString().c_str()); 00053 if (sheet.width()!=schema->getColumnCount()) { 00054 dbg_printf("Problem detecting schema\n"); 00055 dbg_printf(" table has %d columns\n", sheet.width()); 00056 dbg_printf(" schema has %d columns\n", schema->getColumnCount()); 00057 for (int i=0; i<schema->getColumnCount(); i++) { 00058 dbg_printf(" Column %d: %s\n", i, schema->getColumnInfo(i).getName().c_str()); 00059 } 00060 dbg_printf("Table contents:\n%s", sheet.toString().c_str()); 00061 } else { 00062 COOPY_ASSERT(sheet.width()==schema->getColumnCount()); 00063 for (int i=0; i<sheet.width(); i++) { 00064 ColumnInfo info = schema->getColumnInfo(i); 00065 if (!info.hasName()) { 00066 names.clear(); 00067 ct.clear(); 00068 break; 00069 } 00070 names.push_back(info.getName()); 00071 ct.push_back(info.getColumnType()); 00072 } 00073 if (names.size()>0) { 00074 dbg_printf("Found names in schema (%d)\n", names.size()); 00075 /* 00076 if (schema->headerHeight()>0) { 00077 dbg_printf("Also, embedded\n"); 00078 div = schema->headerHeight()-1; 00079 embed = true; 00080 } 00081 */ 00082 if (!embed && sheet.width()==names.size() && 00083 sheet.height()>=1 && sheet.getDatabase()==NULL) { 00084 bool ok = true; 00085 for (int i=0; i<(int)names.size(); i++) { 00086 if (sheet.cellString(i,0)!=names[i]) { 00087 ok = false; 00088 break; 00089 } 00090 } 00091 if (ok) { 00092 dbg_printf("Also, embedded, it seems\n"); 00093 div = 0; 00094 embed = true; 00095 } 00096 } 00097 while (ct.size()<names.size()) { 00098 ct.push_back(ColumnType()); 00099 } 00100 return; 00101 } 00102 } 00103 } 00104 } else { 00105 dbg_printf("Full sniff\n"); 00106 } 00107 00108 dbg_printf("NON-SCHEMA sniff\n"); 00109 if (div<0) { 00110 DataStat stat; 00111 stat.evaluate2(sheet,flags); 00112 div = stat.getRowDivider(); 00113 ct = stat.suggestTypes(); 00114 } 00115 if (div<0 && sheet.height()==2) { 00116 int low = 0; 00117 int high = 0; 00118 for (int i=0; i<sheet.width(); i++) { 00119 string x = 
sheet.cellString(i,0); 00120 for (int j=0; j<(int)x.length(); j++) { 00121 char ch = x[j]; 00122 if (ch>='a'&&ch<='z') { 00123 low++; 00124 } 00125 if (ch>='A'&&ch<='Z') { 00126 high++; 00127 } 00128 } 00129 } 00130 if (low==0 && high>0) { 00131 low = 0; 00132 high = 0; 00133 for (int i=0; i<sheet.width(); i++) { 00134 string x = sheet.cellString(i,1); 00135 for (int j=0; j<(int)x.length(); j++) { 00136 char ch = x[j]; 00137 if (ch>='a'&&ch<='z') { 00138 low++; 00139 } 00140 if (ch>='A'&&ch<='Z') { 00141 high++; 00142 } 00143 } 00144 } 00145 if (low>0) { 00146 div = 0; 00147 dbg_printf("Detected two-liner table.\n"); 00148 } 00149 } 00150 } 00151 00152 00153 int adiv = div; 00154 if (div<0) { 00155 // no obvious header 00156 fake = true; 00157 while (ct.size()<names.size()) { 00158 ct.push_back(ColumnType()); 00159 } 00160 if (sheet.height()<1) { 00161 return; 00162 } 00163 adiv = 0; 00164 canUseTop = true; 00165 return; 00166 } 00167 00168 // try header line 00169 names.clear(); 00170 map<string,int> nameCheck; 00171 bool failure = false; 00172 string lastName = ""; 00173 for (int i=0; i<sheet.width(); i++) { 00174 string name = sheet.cellString(i,adiv); 00175 if (name=="") { 00176 string below = ""; 00177 if (sheet.height()>adiv+1) { 00178 below = sheet.cellString(i,adiv+1); 00179 } 00180 if (below!="") { 00181 dbg_printf("Reject header, blank name\n"); 00182 failure = true; 00183 break; 00184 } 00185 } else { 00186 lastName = name; 00187 } 00188 if (name=="") { 00189 name = lastName + "_"; 00190 lastName = name; 00191 } 00192 00193 if (nameCheck.find(name)!=nameCheck.end()) { 00194 dbg_printf("Reject header, repeated name %s\n", name.c_str()); 00195 failure = true; 00196 break; 00197 } 00198 nameCheck[name] = 1; 00199 names.push_back(name); 00200 } 00201 if (failure) { 00202 names.clear(); 00203 ct.clear(); 00204 fake = true; 00205 canUseTop = false; 00206 } 00207 div = adiv; 00208 00209 while (ct.size()<names.size()) { 00210 ct.push_back(ColumnType()); 00211 } 
00212 if (!failure) { 00213 embed = true; 00214 } 00215 } 00216 00217 00218 std::string NameSniffer::suggestColumnName(int col) const { 00219 if (names.size()>0) { 00220 return names[col]; 00221 } 00222 return Stringer::getSpreadsheetColumnName(col); 00223 } 00224 00225 00226 bool NameSniffer::subset(std::vector<std::string>& ext) { 00227 subset_index.clear(); 00228 for (int i=0; i<(int)ext.size(); i++) { 00229 for (int j=0; j<(int)names.size(); j++) { 00230 if (ext[i]==names[j]) { 00231 dbg_printf("pos %d %s\n", j, names[j].c_str()); 00232 subset_index.push_back(j); 00233 break; 00234 } 00235 } 00236 if (ext[i].size()==1) { 00237 int j = ext[i][0]-'A'; 00238 if (j>=0&&j<=26) { 00239 dbg_printf("pos %d %s\n", j, names[j].c_str()); 00240 subset_index.push_back(j); 00241 break; 00242 } 00243 } 00244 } 00245 return subset_index.size()==ext.size(); 00246 } 00247 00248 00249 bool NameSniffer::resniff(NameSniffer& alt) { 00250 if (sheet.height()>5) return false; 00251 if (sheet.height()==0) return false; 00252 if (!fake) return false; 00253 if (alt.isFake()) return false; 00254 int ct = 0; 00255 std::vector<std::string> onames = alt.suggestNames(); 00256 int at = -1; 00257 for (int k=0; k<(int)onames.size(); k++) { 00258 bool done = false; 00259 for (int i=0; i<sheet.width() && !done && at!=-2; i++) { 00260 for (int j=0; j<sheet.height(); j++) { 00261 if (sheet.cellString(i,j) == onames[k]) { 00262 ct++; 00263 if (at>=-1) { 00264 at = j; 00265 } else if (at!=j) { 00266 at = -2; 00267 } 00268 done = true; 00269 break; 00270 } 00271 } 00272 } 00273 } 00274 if (at>=0) { 00275 sniffed = false; 00276 sniff(at); 00277 return true; 00278 } 00279 return false; 00280 } 00281 00282