libcsv.c 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643
  1. /*
  2. libcsv - parse and write csv data
  3. Copyright (C) 2008 Robert Gamble
  4. This library is free software; you can redistribute it and/or
  5. modify it under the terms of the GNU Lesser General Public
  6. License as published by the Free Software Foundation; either
  7. version 2.1 of the License, or (at your option) any later version.
  8. This library is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. Lesser General Public License for more details.
  12. You should have received a copy of the GNU Lesser General Public
  13. License along with this library; if not, write to the Free Software
  14. Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  15. */
  /*
   * SIZE_MAX is provided by <stdint.h>, which only exists from C99 onwards.
   * Include it when the compiler reports C99 or later; otherwise (or if the
   * header did not supply the macro) fall back to the largest size_t value.
   */
  #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
  #include <stdint.h>
  #endif
  #ifndef SIZE_MAX
  #define SIZE_MAX ((size_t)-1) /* C89 doesn't have stdint.h or SIZE_MAX */
  #endif
  21. #include "libcsv.h"
  /* Library version string. */
  #define VERSION "3.0.3"

  /*
   * Parser states stored in the pstate field of the parser.
   *
   *   ROW_NOT_BEGUN           No field has been encountered yet for this row.
   *   FIELD_NOT_BEGUN         Fields have been seen in this row, but we are
   *                           not currently inside one.
   *   FIELD_BEGUN             We are inside a field.
   *   FIELD_MIGHT_HAVE_ENDED  A quote character was seen inside a quoted
   *                           field: either the field has ended or the quote
   *                           is a literal one.
   */
  enum
  {
      ROW_NOT_BEGUN = 0,
      FIELD_NOT_BEGUN = 1,
      FIELD_BEGUN = 2,
      FIELD_MIGHT_HAVE_ENDED = 3
  };

  /* Default number of bytes by which the entry buffer is grown. */
  #define MEM_BLK_SIZE 128
  /*
   * Helper macros shared by csv_parse() and csv_fini().
   *
   * They deliberately operate on the caller's local variables `quoted`,
   * `pstate`, `spaces`, `entry_pos`, `cb1`, `cb2` and `data`, which must be in
   * scope at the point of expansion.  Every macro argument is parenthesized so
   * that any pointer expression (not just a plain identifier) can be passed.
   *
   * SUBMIT_FIELD(p)
   *   Ends the current field: for unquoted fields the trailing spaces counted
   *   in `spaces` are dropped, a '\0' is stored after the data when
   *   CSV_APPEND_NULL is set, and cb1 (if non-NULL) receives the field.  With
   *   CSV_EMPTY_IS_NULL an empty unquoted field is reported as a NULL pointer.
   *   Afterwards the state becomes FIELD_NOT_BEGUN and the per-field counters
   *   are cleared.
   *
   * SUBMIT_ROW(p, c)
   *   Ends the current row: cb2 (if non-NULL) receives `c`, the state becomes
   *   ROW_NOT_BEGUN and the per-field counters are cleared.  `p` is unused but
   *   kept so existing call sites stay valid.
   *
   * SUBMIT_CHAR(p, c)
   *   Appends `c` to the entry buffer and advances `entry_pos`.  The caller is
   *   responsible for making sure the buffer has room.
   */
  #define SUBMIT_FIELD(p) \
      do \
      { \
          if (!quoted) \
              entry_pos -= spaces; \
          if ((p)->options & CSV_APPEND_NULL) \
              ((p)->entry_buf[entry_pos]) = '\0'; \
          if (cb1 && ((p)->options & CSV_EMPTY_IS_NULL) && !quoted && entry_pos == 0) \
              cb1(NULL, entry_pos, data); \
          else if (cb1) \
              cb1((p)->entry_buf, entry_pos, data); \
          pstate = FIELD_NOT_BEGUN; \
          entry_pos = quoted = spaces = 0; \
      } while (0)
  #define SUBMIT_ROW(p, c) \
      do \
      { \
          if (cb2) \
              cb2((c), data); \
          pstate = ROW_NOT_BEGUN; \
          entry_pos = quoted = spaces = 0; \
      } while (0)
  #define SUBMIT_CHAR(p, c) ((p)->entry_buf[entry_pos++] = (c))
  /*
   * Human-readable messages returned by csv_strerror(), indexed by status
   * code.  csv_strerror() uses the CSV_EINVALID slot as its fallback, which is
   * presumably the final entry (confirm against the values in libcsv.h).
   */
  static char *csv_errors[] = {
      "success",
      "error parsing data while strict checking enabled",
      "memory exhausted while increasing buffer size",
      "data size too large",
      "invalid status code"
  };
  67. int csv_error(struct csv_parser *p)
  68. {
  69. /* Return the current status of the parser */
  70. return p->status;
  71. }
  72. char *csv_strerror(int status)
  73. {
  74. /* Return a textual description of status */
  75. if (status >= CSV_EINVALID || status < 0)
  76. return csv_errors[CSV_EINVALID];
  77. else
  78. return csv_errors[status];
  79. }
  80. int csv_get_opts(struct csv_parser *p)
  81. {
  82. /* Return the currently set options of parser */
  83. if (p == NULL)
  84. return -1;
  85. return p->options;
  86. }
  87. int csv_set_opts(struct csv_parser *p, unsigned char options)
  88. {
  89. /* Set the options */
  90. if (p == NULL)
  91. return -1;
  92. p->options = options;
  93. return 0;
  94. }
  95. int csv_init(struct csv_parser *p, unsigned char options)
  96. {
  97. /* Initialize a csv_parser object returns 0 on success, -1 on error */
  98. if (p == NULL)
  99. return -1;
  100. p->entry_buf = NULL;
  101. p->pstate = ROW_NOT_BEGUN;
  102. p->quoted = 0;
  103. p->spaces = 0;
  104. p->entry_pos = 0;
  105. p->entry_size = 0;
  106. p->status = 0;
  107. p->options = options;
  108. p->quote_char = CSV_QUOTE;
  109. p->delim_char = CSV_COMMA;
  110. p->is_space = NULL;
  111. p->is_term = NULL;
  112. p->blk_size = MEM_BLK_SIZE;
  113. p->malloc_func = NULL;
  114. p->realloc_func = realloc;
  115. p->free_func = free;
  116. return 0;
  117. }
  118. void csv_free(struct csv_parser *p)
  119. {
  120. /* Free the entry_buffer of csv_parser object */
  121. if (p == NULL)
  122. return;
  123. if (p->entry_buf)
  124. p->free_func(p->entry_buf);
  125. p->entry_buf = NULL;
  126. p->entry_size = 0;
  127. return;
  128. }
  129. int csv_fini(struct csv_parser *p, void (*cb1)(void *, size_t, void *), void (*cb2)(int c, void *), void *data)
  130. {
  131. /* Finalize parsing. Needed, for example, when file does not end in a newline */
  132. int quoted = p->quoted;
  133. int pstate = p->pstate;
  134. size_t spaces = p->spaces;
  135. size_t entry_pos = p->entry_pos;
  136. /* Liu2guang add MDK5 warring */
  137. pstate = pstate;
  138. if (p == NULL)
  139. return -1;
  140. if (p->pstate == FIELD_BEGUN && p->quoted && p->options & CSV_STRICT && p->options & CSV_STRICT_FINI)
  141. {
  142. /* Current field is quoted, no end-quote was seen, and CSV_STRICT_FINI is set */
  143. p->status = CSV_EPARSE;
  144. return -1;
  145. }
  146. switch (p->pstate)
  147. {
  148. case FIELD_MIGHT_HAVE_ENDED:
  149. p->entry_pos -= p->spaces + 1; /* get rid of spaces and original quote */
  150. /* Fall-through */
  151. case FIELD_NOT_BEGUN:
  152. case FIELD_BEGUN:
  153. quoted = p->quoted, pstate = p->pstate;
  154. spaces = p->spaces, entry_pos = p->entry_pos;
  155. SUBMIT_FIELD(p);
  156. SUBMIT_ROW(p, -1);
  157. case ROW_NOT_BEGUN: /* Already ended properly */
  158. ;
  159. }
  160. /* Reset parser */
  161. p->spaces = p->quoted = p->entry_pos = p->status = 0;
  162. p->pstate = ROW_NOT_BEGUN;
  163. return 0;
  164. }
  165. void csv_set_delim(struct csv_parser *p, unsigned char c)
  166. {
  167. /* Set the delimiter */
  168. if (p)
  169. p->delim_char = c;
  170. }
  171. void csv_set_quote(struct csv_parser *p, unsigned char c)
  172. {
  173. /* Set the quote character */
  174. if (p)
  175. p->quote_char = c;
  176. }
  177. unsigned char csv_get_delim(struct csv_parser *p)
  178. {
  179. /* Get the delimiter */
  180. return p->delim_char;
  181. }
  182. unsigned char csv_get_quote(struct csv_parser *p)
  183. {
  184. /* Get the quote character */
  185. return p->quote_char;
  186. }
  187. void csv_set_space_func(struct csv_parser *p, int (*f)(unsigned char))
  188. {
  189. /* Set the space function */
  190. if (p)
  191. p->is_space = f;
  192. }
  193. void csv_set_term_func(struct csv_parser *p, int (*f)(unsigned char))
  194. {
  195. /* Set the term function */
  196. if (p)
  197. p->is_term = f;
  198. }
  199. void csv_set_realloc_func(struct csv_parser *p, void *(*f)(void *, size_t))
  200. {
  201. /* Set the realloc function used to increase buffer size */
  202. if (p && f)
  203. p->realloc_func = f;
  204. }
  205. void csv_set_free_func(struct csv_parser *p, void (*f)(void *))
  206. {
  207. /* Set the free function used to free the buffer */
  208. if (p && f)
  209. p->free_func = f;
  210. }
  211. void csv_set_blk_size(struct csv_parser *p, size_t size)
  212. {
  213. /* Set the block size used to increment buffer size */
  214. if (p)
  215. p->blk_size = size;
  216. }
  217. size_t csv_get_buffer_size(struct csv_parser *p)
  218. {
  219. /* Get the size of the entry buffer */
  220. if (p)
  221. return p->entry_size;
  222. return 0;
  223. }
  224. static int csv_increase_buffer(struct csv_parser *p)
  225. {
  226. /* Increase the size of the entry buffer. Attempt to increase size by
  227. * p->blk_size, if this is larger than SIZE_MAX try to increase current
  228. * buffer size to SIZE_MAX. If allocation fails, try to allocate halve
  229. * the size and try again until successful or increment size is zero.
  230. */
  231. size_t to_add = p->blk_size;
  232. void *vp;
  233. if (p->entry_size >= SIZE_MAX - to_add)
  234. to_add = SIZE_MAX - p->entry_size;
  235. if (!to_add)
  236. {
  237. p->status = CSV_ETOOBIG;
  238. return -1;
  239. }
  240. while ((vp = p->realloc_func(p->entry_buf, p->entry_size + to_add)) == NULL)
  241. {
  242. to_add /= 2;
  243. if (!to_add)
  244. {
  245. p->status = CSV_ENOMEM;
  246. return -1;
  247. }
  248. }
  249. /* Update entry buffer pointer and entry_size if successful */
  250. p->entry_buf = vp;
  251. p->entry_size += to_add;
  252. return 0;
  253. }
  254. size_t csv_parse(struct csv_parser *p, const void *s, size_t len, void (*cb1)(void *, size_t, void *), void (*cb2)(int c, void *), void *data)
  255. {
  256. unsigned const char *us = s; /* Access input data as array of unsigned char */
  257. unsigned char c; /* The character we are currently processing */
  258. size_t pos = 0; /* The number of characters we have processed in this call */
  259. /* Store key fields into local variables for performance */
  260. unsigned char delim = p->delim_char;
  261. unsigned char quote = p->quote_char;
  262. int (*is_space)(unsigned char) = p->is_space;
  263. int (*is_term)(unsigned char) = p->is_term;
  264. int quoted = p->quoted;
  265. int pstate = p->pstate;
  266. size_t spaces = p->spaces;
  267. size_t entry_pos = p->entry_pos;
  268. if (!p->entry_buf && pos < len)
  269. {
  270. /* Buffer hasn't been allocated yet and len > 0 */
  271. if (csv_increase_buffer(p) != 0)
  272. {
  273. p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
  274. return pos;
  275. }
  276. }
  277. while (pos < len)
  278. {
  279. /* Check memory usage, increase buffer if neccessary */
  280. if (entry_pos == ((p->options & CSV_APPEND_NULL) ? p->entry_size - 1 : p->entry_size))
  281. {
  282. if (csv_increase_buffer(p) != 0)
  283. {
  284. p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
  285. return pos;
  286. }
  287. }
  288. c = us[pos++];
  289. switch (pstate)
  290. {
  291. case ROW_NOT_BEGUN:
  292. case FIELD_NOT_BEGUN:
  293. if ((is_space ? is_space(c) : c == CSV_SPACE || c == CSV_TAB) && c != delim) /* Space or Tab */
  294. {
  295. continue;
  296. }
  297. else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) /* Carriage Return or Line Feed */
  298. {
  299. if (pstate == FIELD_NOT_BEGUN)
  300. {
  301. SUBMIT_FIELD(p);
  302. SUBMIT_ROW(p, (unsigned char)c);
  303. }
  304. else /* ROW_NOT_BEGUN */
  305. {
  306. /* Don't submit empty rows by default */
  307. if (p->options & CSV_REPALL_NL)
  308. {
  309. SUBMIT_ROW(p, (unsigned char)c);
  310. }
  311. }
  312. continue;
  313. }
  314. else if (c == delim) /* Comma */
  315. {
  316. SUBMIT_FIELD(p);
  317. break;
  318. }
  319. else if (c == quote) /* Quote */
  320. {
  321. pstate = FIELD_BEGUN;
  322. quoted = 1;
  323. }
  324. else /* Anything else */
  325. {
  326. pstate = FIELD_BEGUN;
  327. quoted = 0;
  328. SUBMIT_CHAR(p, c);
  329. }
  330. break;
  331. case FIELD_BEGUN:
  332. if (c == quote) /* Quote */
  333. {
  334. if (quoted)
  335. {
  336. SUBMIT_CHAR(p, c);
  337. pstate = FIELD_MIGHT_HAVE_ENDED;
  338. }
  339. else
  340. {
  341. /* STRICT ERROR - double quote inside non-quoted field */
  342. if (p->options & CSV_STRICT)
  343. {
  344. p->status = CSV_EPARSE;
  345. p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
  346. return pos - 1;
  347. }
  348. SUBMIT_CHAR(p, c);
  349. spaces = 0;
  350. }
  351. }
  352. else if (c == delim) /* Comma */
  353. {
  354. if (quoted)
  355. {
  356. SUBMIT_CHAR(p, c);
  357. }
  358. else
  359. {
  360. SUBMIT_FIELD(p);
  361. }
  362. }
  363. else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) /* Carriage Return or Line Feed */
  364. {
  365. if (!quoted)
  366. {
  367. SUBMIT_FIELD(p);
  368. SUBMIT_ROW(p, (unsigned char)c);
  369. }
  370. else
  371. {
  372. SUBMIT_CHAR(p, c);
  373. }
  374. }
  375. else if (!quoted && (is_space ? is_space(c) : c == CSV_SPACE || c == CSV_TAB)) /* Tab or space for non-quoted field */
  376. {
  377. SUBMIT_CHAR(p, c);
  378. spaces++;
  379. }
  380. else /* Anything else */
  381. {
  382. SUBMIT_CHAR(p, c);
  383. spaces = 0;
  384. }
  385. break;
  386. case FIELD_MIGHT_HAVE_ENDED:
  387. /* This only happens when a quote character is encountered in a quoted field */
  388. if (c == delim) /* Comma */
  389. {
  390. entry_pos -= spaces + 1; /* get rid of spaces and original quote */
  391. SUBMIT_FIELD(p);
  392. }
  393. else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) /* Carriage Return or Line Feed */
  394. {
  395. entry_pos -= spaces + 1; /* get rid of spaces and original quote */
  396. SUBMIT_FIELD(p);
  397. SUBMIT_ROW(p, (unsigned char)c);
  398. }
  399. else if (is_space ? is_space(c) : c == CSV_SPACE || c == CSV_TAB) /* Space or Tab */
  400. {
  401. SUBMIT_CHAR(p, c);
  402. spaces++;
  403. }
  404. else if (c == quote) /* Quote */
  405. {
  406. if (spaces)
  407. {
  408. /* STRICT ERROR - unescaped double quote */
  409. if (p->options & CSV_STRICT)
  410. {
  411. p->status = CSV_EPARSE;
  412. p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
  413. return pos - 1;
  414. }
  415. spaces = 0;
  416. SUBMIT_CHAR(p, c);
  417. }
  418. else
  419. {
  420. /* Two quotes in a row */
  421. pstate = FIELD_BEGUN;
  422. }
  423. }
  424. else /* Anything else */
  425. {
  426. /* STRICT ERROR - unescaped double quote */
  427. if (p->options & CSV_STRICT)
  428. {
  429. p->status = CSV_EPARSE;
  430. p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
  431. return pos - 1;
  432. }
  433. pstate = FIELD_BEGUN;
  434. spaces = 0;
  435. SUBMIT_CHAR(p, c);
  436. }
  437. break;
  438. default:
  439. break;
  440. }
  441. }
  442. p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
  443. return pos;
  444. }
  /*
   * Write `src_size` bytes from `src` to `dest` as a quoted CSV field: the
   * data is wrapped in double quotes and every embedded double quote is
   * doubled.
   *
   * At most `dest_size` bytes are stored; a NULL `dest` is treated as a
   * zero-sized destination.  No terminating '\0' is written.
   *
   * Returns the number of bytes the complete field occupies (capped at
   * SIZE_MAX), which may exceed `dest_size` when the output was truncated.
   * Returns 0 when `src` is NULL.
   */
  size_t csv_write(void *dest, size_t dest_size, const void *src, size_t src_size)
  {
      unsigned char *out = dest;
      const unsigned char *in = src;
      size_t needed = 1; /* the opening quote is always counted */
      size_t i;

      if (src == NULL)
          return 0;

      if (dest == NULL)
          dest_size = 0;

      if (dest_size > 0)
          *out++ = '"';

      for (i = 0; i < src_size; i++)
      {
          const unsigned char ch = in[i];

          if (ch == '"')
          {
              /* Escape an embedded quote by emitting it twice */
              if (needed < dest_size)
                  *out++ = '"';
              if (needed != SIZE_MAX)
                  needed++;
          }

          if (needed < dest_size)
              *out++ = ch;
          if (needed != SIZE_MAX)
              needed++;
      }

      /* Closing quote */
      if (needed < dest_size)
          *out = '"';
      if (needed != SIZE_MAX)
          needed++;

      return needed;
  }
  /*
   * Write `src_size` bytes from `src` to the stream `fp` as a quoted CSV
   * field: the data is wrapped in double quotes and every embedded double
   * quote is doubled.
   *
   * Returns 0 on success and also when `fp` or `src` is NULL (nothing is
   * written in that case).  Returns EOF as soon as a write fails.
   */
  int csv_fwrite(FILE *fp, const void *src, size_t src_size)
  {
      const unsigned char *cursor = src;
      const unsigned char *stop;

      if (fp == NULL || src == NULL)
          return 0;

      if (fputc('"', fp) == EOF)
          return EOF;

      for (stop = cursor + src_size; cursor != stop; cursor++)
      {
          /* Escape an embedded quote by emitting it twice */
          if (*cursor == '"' && fputc('"', fp) == EOF)
              return EOF;

          if (fputc(*cursor, fp) == EOF)
              return EOF;
      }

      return (fputc('"', fp) == EOF) ? EOF : 0;
  }
  /*
   * Write `src_size` bytes from `src` to `dest` as a quoted CSV field using
   * `quote` as the quote character: the data is wrapped in `quote` and every
   * embedded occurrence of `quote` is doubled.
   *
   * At most `dest_size` bytes are stored; a NULL `dest` is treated as a
   * zero-sized destination.  No terminating '\0' is written.
   *
   * Returns the number of bytes the complete field occupies (capped at
   * SIZE_MAX), which may exceed `dest_size` when the output was truncated.
   * Returns 0 when `src` is NULL.
   */
  size_t csv_write2(void *dest, size_t dest_size, const void *src, size_t src_size, unsigned char quote)
  {
      unsigned char *out = dest;
      const unsigned char *in = src;
      size_t needed = 1; /* the opening quote is always counted */
      size_t i;

      if (src == NULL)
          return 0;

      if (dest == NULL)
          dest_size = 0;

      if (dest_size > 0)
          *out++ = quote;

      for (i = 0; i < src_size; i++)
      {
          const unsigned char ch = in[i];

          if (ch == quote)
          {
              /* Escape an embedded quote character by emitting it twice */
              if (needed < dest_size)
                  *out++ = quote;
              if (needed != SIZE_MAX)
                  needed++;
          }

          if (needed < dest_size)
              *out++ = ch;
          if (needed != SIZE_MAX)
              needed++;
      }

      /* Closing quote */
      if (needed < dest_size)
          *out = quote;
      if (needed != SIZE_MAX)
          needed++;

      return needed;
  }
  /*
   * Write `src_size` bytes from `src` to the stream `fp` as a quoted CSV
   * field using `quote` as the quote character: the data is wrapped in
   * `quote` and every embedded occurrence of `quote` is doubled.
   *
   * Returns 0 on success and also when `fp` or `src` is NULL (nothing is
   * written in that case).  Returns EOF as soon as a write fails.
   */
  int csv_fwrite2(FILE *fp, const void *src, size_t src_size, unsigned char quote)
  {
      const unsigned char *cursor = src;
      const unsigned char *stop;

      if (fp == NULL || src == NULL)
          return 0;

      if (fputc(quote, fp) == EOF)
          return EOF;

      for (stop = cursor + src_size; cursor != stop; cursor++)
      {
          /* Escape an embedded quote character by emitting it twice */
          if (*cursor == quote && fputc(quote, fp) == EOF)
              return EOF;

          if (fputc(*cursor, fp) == EOF)
              return EOF;
      }

      return (fputc(quote, fp) == EOF) ? EOF : 0;
  }