COOPY » Guide  version 0.6.5
/home/paulfitz/cvs/coopy_scm/coopy/src/libcoopy_core/NameSniffer.cpp
Go to the documentation of this file.
00001 #include <coopy/NameSniffer.h>
00002 #include <coopy/DataStat.h>
00003 #include <coopy/Stringer.h>
00004 
00005 #include <map>
00006 
00007 using namespace coopy::store;
00008 using namespace coopy::cmp;
00009 using namespace std;
00010 
00011 void NameSniffer::sniff(int suggest) {
00012   if (sniffed) return;
00013   sniffed = true;
00014   embed = false;
00015   fake = false;
00016   canUseTop = false;
00017   names.clear();
00018   ct.clear();
00019 
00020   SheetSchema *schema = sheet.getSchema();
00021 
00022   /*
00023   if (schema==NULL) {
00024     schema = sheet.getMeta();
00025     if (schema!=NULL) {
00026       printf("Hey! I found a schema in getMeta.\n");
00027     }
00028   } 
00029   */
00030   /*
00031   else {
00032     SheetSchema *schema2 = sheet.getMeta();
00033     if (schema2!=NULL) {
00034       dbg_printf("Hey! I found two schema %ld\n",
00035                  (long int)(&sheet.tail_const()));
00036       dbg_printf("schema 1 %s\n", schema->toString().c_str());
00037       dbg_printf("schema 2 %s\n", schema2->toString().c_str());
00038     }
00039   }
00040   */
00041 
00042   div = suggest;
00043   if (schema!=NULL) {
00044     //printf("Working with %s\n", schema->toString().c_str());
00045     div = schema->headerHeight()-1;
00046     fake = schema->isGuess();
00047     if (schema->getColumnCount()==0 && schema->headerHeight()>0) {
00048       // minimal schema, not complete
00049       dbg_printf("Sniffing... minimal schema!\n");
00050       div = schema->headerHeight()-1;
00051     } else {
00052       dbg_printf("Sniffing... found schema! %s\n", schema->toString().c_str());
00053       if (sheet.width()!=schema->getColumnCount()) {
00054         dbg_printf("Problem detecting schema\n");
00055         dbg_printf("  table has %d columns\n", sheet.width());
00056         dbg_printf("  schema has %d columns\n", schema->getColumnCount());
00057         for (int i=0; i<schema->getColumnCount(); i++) {
00058           dbg_printf("    Column %d: %s\n", i, schema->getColumnInfo(i).getName().c_str());
00059         }
00060         dbg_printf("Table contents:\n%s", sheet.toString().c_str());
00061       } else {
00062         COOPY_ASSERT(sheet.width()==schema->getColumnCount());
00063         for (int i=0; i<sheet.width(); i++) {
00064           ColumnInfo info = schema->getColumnInfo(i);
00065           if (!info.hasName()) {
00066             names.clear();
00067             ct.clear();
00068             break;
00069           }
00070           names.push_back(info.getName());
00071           ct.push_back(info.getColumnType());
00072         }
00073         if (names.size()>0) {
00074           dbg_printf("Found names in schema (%d)\n", names.size());
00075           /*
00076           if (schema->headerHeight()>0) {
00077             dbg_printf("Also, embedded\n");
00078             div = schema->headerHeight()-1;
00079             embed = true;
00080           }
00081           */
00082           if (!embed && sheet.width()==names.size() && 
00083               sheet.height()>=1 && sheet.getDatabase()==NULL) {
00084             bool ok = true;
00085             for (int i=0; i<(int)names.size(); i++) {
00086               if (sheet.cellString(i,0)!=names[i]) {
00087                 ok = false;
00088                 break;
00089               }
00090             }
00091             if (ok) {
00092               dbg_printf("Also, embedded, it seems\n");
00093               div = 0;
00094               embed = true;
00095             }
00096           }
00097           while (ct.size()<names.size()) {
00098             ct.push_back(ColumnType());
00099           }
00100           return;
00101         }
00102       }
00103     }
00104   } else {
00105     dbg_printf("Full sniff\n");
00106   }
00107 
00108   dbg_printf("NON-SCHEMA sniff\n");
00109   if (div<0) {
00110     DataStat stat;
00111     stat.evaluate2(sheet,flags);
00112     div = stat.getRowDivider();
00113     ct = stat.suggestTypes();
00114   }
00115   if (div<0 && sheet.height()==2) {
00116     int low = 0;
00117     int high = 0;
00118     for (int i=0; i<sheet.width(); i++) {
00119       string x = sheet.cellString(i,0);
00120       for (int j=0; j<(int)x.length(); j++) {
00121         char ch = x[j];
00122         if (ch>='a'&&ch<='z') {
00123           low++;
00124         }
00125         if (ch>='A'&&ch<='Z') {
00126           high++;
00127         }
00128       }
00129     }
00130     if (low==0 && high>0) {
00131       low = 0;
00132       high = 0;
00133       for (int i=0; i<sheet.width(); i++) {
00134         string x = sheet.cellString(i,1);
00135         for (int j=0; j<(int)x.length(); j++) {
00136           char ch = x[j];
00137           if (ch>='a'&&ch<='z') {
00138             low++;
00139           }
00140           if (ch>='A'&&ch<='Z') {
00141             high++;
00142           }
00143         }
00144       }
00145       if (low>0) {
00146         div = 0;
00147         dbg_printf("Detected two-liner table.\n");
00148       }
00149     }
00150   }
00151 
00152 
00153   int adiv = div;
00154   if (div<0) {
00155     // no obvious header
00156     fake = true;
00157     while (ct.size()<names.size()) {
00158       ct.push_back(ColumnType());
00159     }
00160     if (sheet.height()<1) {
00161       return;
00162     }
00163     adiv = 0;
00164     canUseTop = true;
00165     return;
00166   }
00167 
00168   // try header line
00169   names.clear();
00170   map<string,int> nameCheck;
00171   bool failure = false;
00172   string lastName = "";
00173   for (int i=0; i<sheet.width(); i++) {
00174     string name = sheet.cellString(i,adiv);
00175     if (name=="") {
00176       string below = "";
00177       if (sheet.height()>adiv+1) {
00178         below = sheet.cellString(i,adiv+1);
00179       }
00180       if (below!="") {
00181         dbg_printf("Reject header, blank name\n");
00182         failure = true;
00183         break;
00184       }
00185     } else {
00186       lastName = name;
00187     }
00188     if (name=="") {
00189       name = lastName + "_";
00190       lastName = name;
00191     }
00192 
00193     if (nameCheck.find(name)!=nameCheck.end()) {
00194       dbg_printf("Reject header, repeated name %s\n", name.c_str());
00195       failure = true;
00196       break;
00197     }
00198     nameCheck[name] = 1;
00199     names.push_back(name);
00200   }
00201   if (failure) {
00202     names.clear();
00203     ct.clear();
00204     fake = true;
00205     canUseTop = false;
00206   } 
00207   div = adiv;
00208   
00209   while (ct.size()<names.size()) {
00210     ct.push_back(ColumnType());
00211   }
00212   if (!failure) {
00213     embed = true;
00214   }
00215 }
00216 
00217 
00218 std::string NameSniffer::suggestColumnName(int col) const {
00219   if (names.size()>0) {
00220     return names[col];
00221   }
00222   return Stringer::getSpreadsheetColumnName(col);
00223 }
00224 
00225 
00226 bool NameSniffer::subset(std::vector<std::string>& ext) {
00227   subset_index.clear();
00228   for (int i=0; i<(int)ext.size(); i++) {
00229     for (int j=0; j<(int)names.size(); j++) {
00230       if (ext[i]==names[j]) {
00231         dbg_printf("pos %d %s\n", j, names[j].c_str());
00232         subset_index.push_back(j);
00233         break;
00234       }
00235     }
00236     if (ext[i].size()==1) {
00237       int j = ext[i][0]-'A';
00238       if (j>=0&&j<=26) {
00239         dbg_printf("pos %d %s\n", j, names[j].c_str());
00240         subset_index.push_back(j);
00241         break;
00242       }
00243     }
00244   }
00245   return subset_index.size()==ext.size();
00246 }
00247 
00248 
00249 bool NameSniffer::resniff(NameSniffer& alt) {
00250   if (sheet.height()>5) return false;
00251   if (sheet.height()==0) return false;
00252   if (!fake) return false;
00253   if (alt.isFake()) return false;
00254   int ct = 0;
00255   std::vector<std::string> onames = alt.suggestNames();
00256   int at = -1;
00257   for (int k=0; k<(int)onames.size(); k++) {
00258     bool done = false;
00259     for (int i=0; i<sheet.width() && !done && at!=-2; i++) {
00260       for (int j=0; j<sheet.height(); j++) {
00261         if (sheet.cellString(i,j) == onames[k]) {
00262           ct++;
00263           if (at>=-1) {
00264             at = j;
00265           } else if (at!=j) {
00266             at = -2;
00267           }
00268           done = true;
00269           break;
00270         }
00271       }
00272     }
00273   }
00274   if (at>=0) {
00275     sniffed = false;
00276     sniff(at);
00277     return true;
00278   }
00279   return false;
00280 }
00281 
00282 
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Defines