// Fennel: /home/pub/open/dev/fennel/flatfile/FlatFileParser.cpp Source File (original) (raw)

00001 00002 00003 00004 00005 00006 00007 00008 00009 00010 00011 00012 00013 00014 00015 00016 00017 00018 00019 00020 00021 00022 00023 #include "fennel/common/CommonPreamble.h" 00024 #include "fennel/flatfile/FlatFileParser.h" 00025 00026 FENNEL_BEGIN_CPPFILE("$Id: //open/dev/fennel/flatfile/FlatFileParser.cpp#1 $"); 00027 00028 const char SPACE_CHAR = ' '; 00029 00030 void FlatFileColumnParseResult::setResult( 00031 FlatFileColumnParseResult::DelimiterType type, char *buffer, uint size) 00032 { 00033 this->type = type; 00034 this->size = size; 00035 00036 next = buffer + size; 00037 switch (type) { 00038 case NO_DELIM: 00039 case MAX_LENGTH: 00040 break; 00041 case FlatFileColumnParseResult::FIELD_DELIM: 00042 case FlatFileColumnParseResult::ROW_DELIM: 00043 next++; 00044 break; 00045 default: 00046 permAssert(false); 00047 } 00048 } 00049 00050 FlatFileRowDescriptor::FlatFileRowDescriptor() : 00051 std::vector<FlatFileColumnDescriptor>() 00052 { 00053 bounded = true; 00054 } 00055 00056 void FlatFileRowDescriptor::setUnbounded() 00057 { 00058 bounded = false; 00059 } 00060 00061 bool FlatFileRowDescriptor::isBounded() const 00062 { 00063 return bounded; 00064 } 00065 00066 FlatFileRowParseResult::FlatFileRowParseResult() 00067 { 00068 reset(); 00069 } 00070 00071 void FlatFileRowParseResult::reset() 00072 { 00073 status = NO_STATUS; 00074 current = next = NULL; 00075 nRowDelimsRead = 0; 00076 } 00077 00078 FlatFileParser::FlatFileParser( 00079 char fieldDelim, char rowDelim, char quote, char escape, bool doTrim) 00080 { 00081 this->fieldDelim = fieldDelim; 00082 this->rowDelim = rowDelim; 00083 this->quote = quote; 00084 this->escape = escape; 00085 this->doTrim = doTrim; 00086 00087 fixed = (fieldDelim == 0); 00088 if (fixed) { 00089 assert(quote == 0); 00090 assert(escape == 0); 00091 } 00092 } 00093 00094 void FlatFileParser::scanRow( 00095 const char *buffer, 00096 int size, 00097 const FlatFileRowDescriptor &columns, 00098 FlatFileRowParseResult &result) 
00099 { 00100 assert(size >= 0); 00101 const char *row = buffer; 00102 uint offset = 0; 00103 FlatFileColumnParseResult columnResult; 00104 00105 result.status = FlatFileRowParseResult::NO_STATUS; 00106 bool bounded = columns.isBounded(); 00107 bool lenient = columns.isLenient(); 00108 bool mapped = columns.isMapped(); 00109 bool strict = (bounded && (!lenient)); 00110 00111 uint maxColumns = columns.getMaxColumns(); 00112 uint resultColumns = columns.size(); 00113 if (bounded) { 00114 result.resize(resultColumns); 00115 for (uint i = 0; i < resultColumns; i++) { 00116 result.setNull(i); 00117 } 00118 } else { 00119 result.clear(); 00120 } 00121 00122
00123
00124
00125
00126 const char *nonDelim = scanRowDelim(row, size, false); 00127 offset = nonDelim - row; 00128 00129 bool done = false; 00130 bool rowDelim = false; 00131 for (uint i = 0; i < maxColumns; i++) { 00132 uint maxLength = columns.getMaxLength(i); 00133 scanColumn( 00134 row + offset, 00135 size - offset, 00136 maxLength, 00137 columnResult); 00138 switch (columnResult.type) { 00139 case FlatFileColumnParseResult::NO_DELIM: 00140 result.status = FlatFileRowParseResult::INCOMPLETE_COLUMN; 00141 done = true; 00142 break; 00143 case FlatFileColumnParseResult::ROW_DELIM: 00144 if (strict && (i+1 != columns.size())) { 00145 if (i == 0) { 00146 result.status = FlatFileRowParseResult::NO_COLUMN_DELIM; 00147 } else { 00148 result.status = FlatFileRowParseResult::TOO_FEW_COLUMNS; 00149 } 00150 } 00151 done = true; 00152 rowDelim = true; 00153 break; 00154 case FlatFileColumnParseResult::MAX_LENGTH: 00155 case FlatFileColumnParseResult::FIELD_DELIM: 00156 if (strict && (i+1 == columns.size())) { 00157 result.status = FlatFileRowParseResult::TOO_MANY_COLUMNS; 00158 done = true; 00159 } 00160 break; 00161 default: 00162 permAssert(false); 00163 } 00164 if (bounded) { 00165 int target = mapped ? 
columns.getMap(i) : i; 00166 if (target >= 0) { 00167 assert (target < resultColumns); 00168 result.setColumn(target, offset, columnResult.size); 00169 } 00170 } else { 00171 result.addColumn(offset, columnResult.size); 00172 } 00173 offset = columnResult.next - row; 00174 if (done) { 00175 break; 00176 } 00177 } 00178 result.current = const_cast<char *>(row); 00179 result.next = const_cast<char *>( 00180 scanRowEnd( 00181 columnResult.next, 00182 buffer + size - columnResult.next, 00183 rowDelim, 00184 result)); 00185 } 00186 00187 const char *FlatFileParser::scanRowEnd( 00188 const char *buffer, 00189 int size, 00190 bool rowDelim, 00191 FlatFileRowParseResult &result) 00192 { 00193 const char *read = buffer; 00194 const char *end = buffer + size; 00195 switch (result.status) { 00196 case FlatFileRowParseResult::INCOMPLETE_COLUMN: 00197 case FlatFileRowParseResult::ROW_TOO_LARGE: 00198 assert(read == end); 00199 return read; 00200 default: 00201 break; 00202 } 00203 00204
00205
00206 if (!rowDelim) { 00207 read = scanRowDelim(read, end - read, true); 00208 if (read == end) { 00209 return read; 00210 } 00211 } 00212 result.nRowDelimsRead++; 00213 00214
00215 read = scanRowDelim(read, end - read, false); 00216 return read; 00217 } 00218 00219 const char *FlatFileParser::scanRowDelim( 00220 const char *buffer, 00221 int size, 00222 bool search) 00223 { 00224 const char *read = buffer; 00225 const char *end = buffer + size; 00226 while (read < end) { 00227 if (isRowDelim(*read) == search) { 00228 break; 00229 } else { 00230 read++; 00231 } 00232 } 00233 return read; 00234 } 00235 00236 bool FlatFileParser::isRowDelim(char c) 00237 { 00238 assert(rowDelim != '\r'); 00239 return (rowDelim == '\n') ? (c == '\r' || c == '\n') : (c == rowDelim); 00240 } 00241 00242 void FlatFileParser::scanColumn( 00243 const char *buffer, 00244 uint size, 00245 uint maxLength, 00246 FlatFileColumnParseResult &result) 00247 { 00248 if (fixed) { 00249 return scanFixedColumn(buffer, size, maxLength, result); 00250 } 00251 00252 assert(buffer != NULL); 00253 const char *read = buffer; 00254 const char *end = buffer + size; 00255 00256
00257 if (doTrim) { 00258 while (read < end && SPACE_CHAR == *read) { 00259 read++; 00260 } 00261 } 00262 00263 bool quoted = (read < end && *read == quote); 00264 bool quoteEscape = (quoted && quote == escape); 00265 00266 FlatFileColumnParseResult::DelimiterType type = 00267 FlatFileColumnParseResult::NO_DELIM; 00268 if (quoted) { 00269 read++; 00270 } 00271 while (read < end) { 00272 if (*read == quote) { 00273 read++; 00274 if (quoteEscape) { 00275
00276
00277 if (read == end) { 00278 break; 00279 } 00280 if (*read == quote) { 00281
00282
00283 read++; 00284 continue; 00285 } 00286 } 00287 if (quoted) { 00288
00289 quoteEscape = quoted = false; 00290 } 00291 } else if (*read == escape) { 00292 read++; 00293
00294 if (read == end) { 00295 break; 00296 } 00297 read++; 00298 } else if (quoted) { 00299 read++; 00300 } else if (*read == fieldDelim) { 00301 type = FlatFileColumnParseResult::FIELD_DELIM; 00302 break; 00303 } else if (isRowDelim(*read)) { 00304 type = FlatFileColumnParseResult::ROW_DELIM; 00305 break; 00306 } else { 00307 read++; 00308 } 00309 } 00310 00311 uint resultSize = read - buffer; 00312 result.setResult(type, const_cast<char *>(buffer), resultSize); 00313 } 00314 00315 void FlatFileParser::scanFixedColumn( 00316 const char *buffer, 00317 uint size, 00318 uint maxLength, 00319 FlatFileColumnParseResult &result) 00320 { 00321 assert(buffer != NULL); 00322 const char *read = buffer; 00323 const char *end = buffer + size; 00324 uint remaining = maxLength; 00325 00326 FlatFileColumnParseResult::DelimiterType type = 00327 FlatFileColumnParseResult::NO_DELIM; 00328 while (read < end && remaining > 0) { 00329 if (isRowDelim(*read)) { 00330 type = FlatFileColumnParseResult::ROW_DELIM; 00331 break; 00332 } 00333 read++; 00334 remaining--; 00335 } 00336 00337
00338
00339 if (type == FlatFileColumnParseResult::NO_DELIM && read < end) { 00340 if (isRowDelim(*read)) { 00341 type = FlatFileColumnParseResult::ROW_DELIM; 00342 } else if (remaining == 0) { 00343 type = FlatFileColumnParseResult::MAX_LENGTH; 00344 } 00345 } 00346 00347 uint resultSize = read - buffer; 00348 result.setResult(type, const_cast<char *>(buffer), resultSize); 00349 } 00350 00351 void FlatFileParser::stripQuoting( 00352 FlatFileRowParseResult &rowResult, 00353 bool trim) 00354 { 00355 int nFields = rowResult.getReadCount(); 00356 00357 if (rowResult.strippedSizes.size() < nFields) { 00358 rowResult.strippedSizes.resize(nFields); 00359 } 00360 00361 for (uint i = 0; i < nFields; i++) { 00362 char *value = rowResult.getColumn(i); 00363 uint newSize = 0; 00364 if (value != NULL) { 00365 uint oldSize = rowResult.getRawColumnSize(i); 00366 newSize = stripQuoting(value, oldSize, trim); 00367 } 00368 rowResult.strippedSizes[i] = newSize; 00369 } 00370 } 00371 00372 uint FlatFileParser::stripQuoting( 00373 char *buffer, uint sizeIn, bool untrimmed) 00374 { 00375 assert(buffer != NULL); 00376 if (sizeIn == 0) { 00377 return 0; 00378 } 00379 int size = untrimmed ? trim(buffer, sizeIn) : sizeIn; 00380 bool quoted = false; 00381 char *read = buffer; 00382 char *end = buffer + size; 00383 char *write = buffer; 00384 00385 if (*buffer == quote) { 00386 quoted = true; 00387 read++; 00388 } 00389 bool quoteEscape = (quoted && quote == escape); 00390 while (read < end) { 00391 if (quoteEscape && *read == quote) { 00392 read++; 00393 if ((read < end) && (*read == quote)) { 00394
00395 *write++ = *read++; 00396 } else { 00397
00398 break; 00399 } 00400 } else if (quoted && *read == quote) { 00401 break; 00402 } else if (*read == escape) { 00403 read++; 00404 if (read < end) { 00405 *write++ = *read++; 00406 } 00407 } else { 00408 *write++ = *read++; 00409 } 00410 } 00411 return write - buffer; 00412 } 00413 00414 uint FlatFileParser::trim(char *buffer, uint size) 00415 { 00416 assert(buffer != NULL); 00417 if (size == 0) { 00418 return 0; 00419 } 00420 char *read = buffer; 00421 char *write = buffer; 00422 char *end = buffer + size; 00423 00424 while (read < end && *read == ' ') { 00425 read++; 00426 } 00427 end--; 00428 while (end >= read && *end == ' ') { 00429 end--; 00430 } 00431 end++; 00432 while (read < end) { 00433 *write++ = *read++; 00434 } 00435 return write - buffer; 00436 } 00437 00438 FENNEL_END_CPPFILE("$Id: //open/dev/fennel/flatfile/FlatFileParser.cpp#1 $"); 00439 00440