COOPY » Guide
version 0.6.5
|
00001 #include <coopy/SheetStyle.h> 00002 00003 #include <ctype.h> 00004 00005 #include <string> 00006 #include <map> 00007 00008 using namespace std; 00009 using namespace coopy::store; 00010 00011 const SheetStyle SheetStyle::defaultStyle; 00012 00013 bool SheetStyle::setFromFilename(const char *fname) { 00014 string name = fname; 00015 delim = ","; 00016 if (name.length()>=4) { 00017 string ext = name.substr(name.length()-4); 00018 for (size_t i=0; i<ext.length(); i++) { 00019 ext[i] = tolower(ext[i]); 00020 } 00021 if (ext==".tsv") { 00022 delim = "\t"; 00023 } else if (ext==".ssv") { 00024 delim = ";"; 00025 } else if (ext==".csv") { 00026 delim = ","; 00027 } else if (ext==".wsv") { 00028 delim = " "; 00029 } else if (ext==".list") { 00030 delim = ","; 00031 } else { 00032 return false; 00033 } 00034 } else { 00035 return false; 00036 } 00037 return true; 00038 } 00039 00040 void SheetStyle::setFromProperty(const Property& config) { 00041 if (config.check("delimiter")) { 00042 delim = config.get("delimiter").asString(); 00043 } 00044 if (config.check("null_token")) { 00045 nullToken = config.get("null_token").asString(); 00046 } 00047 if (config.check("have_null")) { 00048 haveNull = config.get("have_null").asInt()!=0; 00049 } 00050 if (config.check("avoid_collision")) { 00051 quoteCollision = config.get("avoid_collision").asInt()!=0; 00052 } 00053 if (config.check("mark_header")) { 00054 markHeader = config.get("mark_header").asInt()!=0; 00055 } 00056 } 00057 00058 class SeparatorHistory { 00059 public: 00060 int ct; 00061 int best_ct; 00062 int best; 00063 map<int,int> votes; 00064 char ch; 00065 00066 SeparatorHistory(char ch) : ch(ch) { 00067 ct = 0; 00068 best = -1; 00069 best_ct = 0; 00070 } 00071 00072 void bump() { 00073 ct++; 00074 } 00075 00076 void add() { 00077 if (votes.find(ct)==votes.end()) { 00078 votes[ct] = 1; 00079 } 00080 votes[ct]++; 00081 if (votes[ct]>best_ct) { 00082 best_ct = votes[ct]; 00083 best = ct; 00084 } 00085 ct = 0; 00086 } 00087 }; 00088 00089 #define SEPARATOR_COMMA 0 00090 #define SEPARATOR_TAB 1 00091 #define SEPARATOR_SEMICOLON 2 00092 #define SEPARATOR_COUNT 3 00093 00094 void SheetStyle::setFromInspection(const char *buffer, int len) { 00095 SeparatorHistory history[SEPARATOR_COUNT] = { ',', '\t', ';' }; 00096 int rows = 0; 00097 bool quoted = false; 00098 bool content = false; 00099 for (int i=0; i<=len; i++) { 00100 char ch = '\n'; 00101 if (i<len) { 00102 ch = buffer[i]; 00103 } 00104 if (ch=='\"') { 00105 quoted = !quoted; 00106 } else if (!quoted) { 00107 if (ch!='\n'&&ch!='\r') { 00108 content = true; 00109 } 00110 if (ch=='\n') { 00111 rows++; 00112 if (content) { 00113 for (int j=0; j<SEPARATOR_COUNT; j++) { 00114 history[j].add(); 00115 } 00116 content = false; 00117 } 00118 } else { 00119 for (int j=0; j<SEPARATOR_COUNT; j++) { 00120 if (ch==history[j].ch) { 00121 history[j].bump(); 00122 } 00123 } 00124 } 00125 } 00126 } 00127 00128 delim = ","; 00129 int best_ct = 0; 00130 for (int j=0; j<SEPARATOR_COUNT; j++) { 00131 if (history[j].best!=0) { 00132 if (history[j].best_ct>best_ct) { 00133 best_ct = history[j].best_ct; 00134 delim = string() + history[j].ch; 00135 } 00136 } 00137 } 00138 } 00139