base_xml.c 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460
  1. #include "fitz.h"
  2. struct parser
  3. {
  4. fz_xml *head;
  5. fz_context *ctx;
  6. };
  /* One name="value" pair on an element; pairs form a singly linked list. */
  struct attribute
  {
      /* Attribute name, NUL-terminated; longer names are truncated when stored. */
      char name[40];
      /* Heap-allocated value string, or NULL until a value has been stored. */
      char *value;
      /* Next attribute of the same element, or NULL at the end of the list. */
      struct attribute *next;
  };
  13. struct fz_xml_s
  14. {
  15. char name[40];
  16. char *text;
  17. struct attribute *atts;
  18. fz_xml *up, *down, *next;
  19. };
  /*
   * Write n space characters to stdout.
   * A zero or negative n writes nothing (a countdown loop would otherwise run
   * through the whole negative range and overflow).
   */
  static inline void indent(int n)
  {
      int i;

      for (i = 0; i < n; i++)
          putchar(' ');
  }
  24. void fz_debug_xml(fz_xml *item, int level)
  25. {
  26. while (item) {
  27. if (item->text) {
  28. printf("%s\n", item->text);
  29. } else {
  30. struct attribute *att;
  31. indent(level);
  32. printf("<%s", item->name);
  33. for (att = item->atts; att; att = att->next)
  34. printf(" %s=\"%s\"", att->name, att->value);
  35. if (item->down) {
  36. printf(">\n");
  37. fz_debug_xml(item->down, level + 1);
  38. indent(level);
  39. printf("</%s>\n", item->name);
  40. }
  41. else {
  42. printf("/>\n");
  43. }
  44. }
  45. item = item->next;
  46. }
  47. }
  48. fz_xml *fz_xml_next(fz_xml *item)
  49. {
  50. return item->next;
  51. }
  52. fz_xml *fz_xml_down(fz_xml *item)
  53. {
  54. return item->down;
  55. }
  56. char *fz_xml_text(fz_xml *item)
  57. {
  58. return item->text;
  59. }
  60. char *fz_xml_tag(fz_xml *item)
  61. {
  62. return item->name;
  63. }
  64. char *fz_xml_att(fz_xml *item, const char *name)
  65. {
  66. struct attribute *att;
  67. for (att = item->atts; att; att = att->next)
  68. if (!strcmp(att->name, name))
  69. return att->value;
  70. return NULL;
  71. }
  72. static void xml_free_attribute(fz_context *ctx, struct attribute *att)
  73. {
  74. while (att) {
  75. struct attribute *next = att->next;
  76. if (att->value)
  77. fz_free(ctx, att->value);
  78. fz_free(ctx, att);
  79. att = next;
  80. }
  81. }
  82. void fz_free_xml(fz_context *ctx, fz_xml *item)
  83. {
  84. while (item)
  85. {
  86. fz_xml *next = item->next;
  87. if (item->text)
  88. fz_free(ctx, item->text);
  89. if (item->atts)
  90. xml_free_attribute(ctx, item->atts);
  91. if (item->down)
  92. fz_free_xml(ctx, item->down);
  93. fz_free(ctx, item);
  94. item = next;
  95. }
  96. }
  97. void fz_detach_xml(fz_xml *node)
  98. {
  99. if (node->up)
  100. node->up->down = NULL;
  101. }
  /*
   * Decode the entity or character reference that starts at a[0] (an '&').
   *
   * Recognised forms are the five predefined entities (&lt; &gt; &amp; &apos;
   * &quot;) and numeric references, decimal (&#DDD;) or hexadecimal (&#xHHH;).
   * On success the code point is stored in *c and the number of input bytes
   * consumed is returned.
   *
   * Anything else is treated as a literal character: *c receives a[0] and 1 is
   * returned. That covers unknown names, a missing ';', a reference with no
   * digits, a sign or whitespace after '#', and values outside 1..0x10FFFF
   * (code point 0 would otherwise embed a NUL and truncate the output string).
   *
   * a must be NUL-terminated; the comparisons short-circuit, so nothing past
   * the terminator is read.
   */
  static int xml_parse_entity(int *c, char *a)
  {
      if (a[1] == '#') {
          int base = 10;
          char *digits = a + 2;
          char *p;
          long value = 0;

          if (*digits == 'x') {
              base = 16;
              digits++;
          }

          for (p = digits; ; p++) {
              int d;

              if (*p >= '0' && *p <= '9')
                  d = *p - '0';
              else if (base == 16 && *p >= 'a' && *p <= 'f')
                  d = *p - 'a' + 10;
              else if (base == 16 && *p >= 'A' && *p <= 'F')
                  d = *p - 'A' + 10;
              else
                  break;

              /* Once out of range, stop accumulating so value cannot overflow. */
              if (value <= 0x10FFFF)
                  value = value * base + d;
          }

          if (p > digits && *p == ';' && value > 0 && value <= 0x10FFFF) {
              *c = (int)value;
              return (int)(p - a) + 1;
          }
      }
      else if (a[1] == 'l' && a[2] == 't' && a[3] == ';') {
          *c = '<';
          return 4;
      }
      else if (a[1] == 'g' && a[2] == 't' && a[3] == ';') {
          *c = '>';
          return 4;
      }
      else if (a[1] == 'a' && a[2] == 'm' && a[3] == 'p' && a[4] == ';') {
          *c = '&';
          return 5;
      }
      else if (a[1] == 'a' && a[2] == 'p' && a[3] == 'o' && a[4] == 's' && a[5] == ';') {
          *c = '\'';
          return 6;
      }
      else if (a[1] == 'q' && a[2] == 'u' && a[3] == 'o' && a[4] == 't' && a[5] == ';') {
          *c = '"';
          return 6;
      }

      /* Not a recognised reference: pass the leading character through as-is. */
      *c = *a;
      return 1;
  }
  /*
   * Return 1 if c may appear in an element or attribute name, else 0.
   * Accepted: ASCII letters and digits plus '.', '-', '_' and ':'.
   */
  static inline int isname(int c)
  {
      if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
          return 1;
      if (c >= '0' && c <= '9')
          return 1;

      switch (c) {
      case '.':
      case '-':
      case '_':
      case ':':
          return 1;
      default:
          return 0;
      }
  }
  /* Return 1 if c is a space, carriage return, line feed or tab; else 0. */
  static inline int iswhite(int c)
  {
      switch (c) {
      case ' ':
      case '\r':
      case '\n':
      case '\t':
          return 1;
      default:
          return 0;
      }
  }
  147. static void xml_emit_open_tag(struct parser *parser, char *a, char *b)
  148. {
  149. fz_xml *head, *tail;
  150. head = fz_malloc_struct(parser->ctx, fz_xml);
  151. if (b - a > sizeof(head->name) - 1)
  152. b = a + sizeof(head->name) - 1;
  153. memcpy(head->name, a, b - a);
  154. head->name[b - a] = 0;
  155. head->atts = NULL;
  156. head->text = NULL;
  157. head->up = parser->head;
  158. head->down = NULL;
  159. head->next = NULL;
  160. if (!parser->head->down) {
  161. parser->head->down = head;
  162. }
  163. else {
  164. tail = parser->head->down;
  165. while (tail->next)
  166. tail = tail->next;
  167. tail->next = head;
  168. }
  169. parser->head = head;
  170. }
  171. static void xml_emit_att_name(struct parser *parser, char *a, char *b)
  172. {
  173. fz_xml *head = parser->head;
  174. struct attribute *att;
  175. att = fz_malloc_struct(parser->ctx, struct attribute);
  176. if (b - a > sizeof(att->name) - 1)
  177. b = a + sizeof(att->name) - 1;
  178. memcpy(att->name, a, b - a);
  179. att->name[b - a] = 0;
  180. att->value = NULL;
  181. att->next = head->atts;
  182. head->atts = att;
  183. }
  184. static void xml_emit_att_value(struct parser *parser, char *a, char *b)
  185. {
  186. fz_xml *head = parser->head;
  187. struct attribute *att = head->atts;
  188. char *s;
  189. int c;
  190. /* entities are all longer than UTFmax so runetochar is safe */
  191. s = att->value = fz_malloc(parser->ctx, b - a + 1);
  192. while (a < b) {
  193. if (*a == '&') {
  194. a += xml_parse_entity(&c, a);
  195. s += fz_runetochar(s, c);
  196. }
  197. else {
  198. *s++ = *a++;
  199. }
  200. }
  201. *s = 0;
  202. }
  203. static void xml_emit_close_tag(struct parser *parser)
  204. {
  205. if (parser->head->up)
  206. parser->head = parser->head->up;
  207. }
  208. static void xml_emit_text(struct parser *parser, char *a, char *b)
  209. {
  210. static char *empty = "";
  211. fz_xml *head;
  212. char *s;
  213. int c;
  214. /* Skip all-whitespace text nodes */
  215. for (s = a; s < b; s++)
  216. if (!iswhite(*s))
  217. break;
  218. if (s == b)
  219. return;
  220. xml_emit_open_tag(parser, empty, empty);
  221. head = parser->head;
  222. /* entities are all longer than UTFmax so runetochar is safe */
  223. s = head->text = fz_malloc(parser->ctx, b - a + 1);
  224. while (a < b) {
  225. if (*a == '&') {
  226. a += xml_parse_entity(&c, a);
  227. s += fz_runetochar(s, c);
  228. }
  229. else {
  230. *s++ = *a++;
  231. }
  232. }
  233. *s = 0;
  234. xml_emit_close_tag(parser);
  235. }
  236. static char *xml_parse_document_imp(struct parser *x, char *p)
  237. {
  238. char *mark;
  239. int quote;
  240. parse_text:
  241. mark = p;
  242. while (*p && *p != '<') ++p;
  243. xml_emit_text(x, mark, p);
  244. if (*p == '<') { ++p; goto parse_element; }
  245. return NULL;
  246. parse_element:
  247. if (*p == '/') { ++p; goto parse_closing_element; }
  248. if (*p == '!') { ++p; goto parse_comment; }
  249. if (*p == '?') { ++p; goto parse_processing_instruction; }
  250. while (iswhite(*p)) ++p;
  251. if (isname(*p))
  252. goto parse_element_name;
  253. return "syntax error in element";
  254. parse_comment:
  255. if (*p == '[') goto parse_cdata;
  256. if (*p++ != '-') return "syntax error in comment (<! not followed by --)";
  257. if (*p++ != '-') return "syntax error in comment (<!- not followed by -)";
  258. mark = p;
  259. while (*p) {
  260. if (p[0] == '-' && p[1] == '-' && p[2] == '>') {
  261. p += 3;
  262. goto parse_text;
  263. }
  264. ++p;
  265. }
  266. return "end of data in comment";
  267. parse_cdata:
  268. if (p[1] != 'C' || p[2] != 'D' || p[3] != 'A' || p[4] != 'T' || p[5] != 'A' || p[6] != '[')
  269. return "syntax error in CDATA section";
  270. p += 7;
  271. mark = p;
  272. while (*p) {
  273. if (p[0] == ']' && p[1] == ']' && p[2] == '>') {
  274. p += 3;
  275. goto parse_text;
  276. }
  277. ++p;
  278. }
  279. return "end of data in CDATA section";
  280. parse_processing_instruction:
  281. while (*p) {
  282. if (p[0] == '?' && p[1] == '>') {
  283. p += 2;
  284. goto parse_text;
  285. }
  286. ++p;
  287. }
  288. return "end of data in processing instruction";
  289. parse_closing_element:
  290. while (iswhite(*p)) ++p;
  291. mark = p;
  292. while (isname(*p)) ++p;
  293. while (iswhite(*p)) ++p;
  294. if (*p != '>')
  295. return "syntax error in closing element";
  296. xml_emit_close_tag(x);
  297. ++p;
  298. goto parse_text;
  299. parse_element_name:
  300. mark = p;
  301. while (isname(*p)) ++p;
  302. xml_emit_open_tag(x, mark, p);
  303. if (*p == '>') { ++p; goto parse_text; }
  304. if (p[0] == '/' && p[1] == '>') {
  305. xml_emit_close_tag(x);
  306. p += 2;
  307. goto parse_text;
  308. }
  309. if (iswhite(*p))
  310. goto parse_attributes;
  311. return "syntax error after element name";
  312. parse_attributes:
  313. while (iswhite(*p)) ++p;
  314. if (isname(*p))
  315. goto parse_attribute_name;
  316. if (*p == '>') { ++p; goto parse_text; }
  317. if (p[0] == '/' && p[1] == '>') {
  318. xml_emit_close_tag(x);
  319. p += 2;
  320. goto parse_text;
  321. }
  322. return "syntax error in attributes";
  323. parse_attribute_name:
  324. mark = p;
  325. while (isname(*p)) ++p;
  326. xml_emit_att_name(x, mark, p);
  327. while (iswhite(*p)) ++p;
  328. if (*p == '=') { ++p; goto parse_attribute_value; }
  329. return "syntax error after attribute name";
  330. parse_attribute_value:
  331. while (iswhite(*p)) ++p;
  332. quote = *p++;
  333. if (quote != '"' && quote != '\'')
  334. return "missing quote character";
  335. mark = p;
  336. while (*p && *p != quote) ++p;
  337. if (*p == quote) {
  338. xml_emit_att_value(x, mark, p++);
  339. goto parse_attributes;
  340. }
  341. return "end of data in attribute value";
  342. }
  343. static char *convert_to_utf8(fz_context *doc, unsigned char *s, int n, int *dofree)
  344. {
  345. unsigned char *e = s + n;
  346. char *dst, *d;
  347. int c;
  348. if (s[0] == 0xFE && s[1] == 0xFF) {
  349. s += 2;
  350. dst = d = fz_malloc(doc, n * 2);
  351. while (s + 1 < e) {
  352. c = s[0] << 8 | s[1];
  353. d += fz_runetochar(d, c);
  354. s += 2;
  355. }
  356. *d = 0;
  357. *dofree = 1;
  358. return dst;
  359. }
  360. if (s[0] == 0xFF && s[1] == 0xFE) {
  361. s += 2;
  362. dst = d = fz_malloc(doc, n * 2);
  363. while (s + 1 < e) {
  364. c = s[0] | s[1] << 8;
  365. d += fz_runetochar(d, c);
  366. s += 2;
  367. }
  368. *d = 0;
  369. *dofree = 1;
  370. return dst;
  371. }
  372. *dofree = 0;
  373. if (s[0] == 0xEF && s[1] == 0xBB && s[2] == 0xBF)
  374. return (char*)s+3;
  375. return (char*)s;
  376. }
  377. fz_xml *
  378. fz_parse_xml(fz_context *ctx, unsigned char *s, int n)
  379. {
  380. struct parser parser;
  381. fz_xml root;
  382. char *p, *error;
  383. int dofree;
  384. /* s is already null-terminated (see xps_new_part) */
  385. memset(&root, 0, sizeof(root));
  386. parser.head = &root;
  387. parser.ctx = ctx;
  388. p = convert_to_utf8(ctx, s, n, &dofree);
  389. fz_try(ctx)
  390. {
  391. error = xml_parse_document_imp(&parser, p);
  392. if (error)
  393. fz_throw(ctx, "%s", error);
  394. }
  395. fz_always(ctx)
  396. {
  397. if (dofree)
  398. fz_free(ctx, p);
  399. }
  400. fz_catch(ctx)
  401. {
  402. fz_free_xml(ctx, root.down);
  403. fz_rethrow(ctx);
  404. }
  405. return root.down;
  406. }