pdf_parse.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612
  1. #include "fitz-internal.h"
  2. #include "mupdf-internal.h"
  3. fz_rect *
  4. pdf_to_rect(fz_context *ctx, pdf_obj *array, fz_rect *r)
  5. {
  6. float a = pdf_to_real(pdf_array_get(array, 0));
  7. float b = pdf_to_real(pdf_array_get(array, 1));
  8. float c = pdf_to_real(pdf_array_get(array, 2));
  9. float d = pdf_to_real(pdf_array_get(array, 3));
  10. r->x0 = fz_min(a, c);
  11. r->y0 = fz_min(b, d);
  12. r->x1 = fz_max(a, c);
  13. r->y1 = fz_max(b, d);
  14. return r;
  15. }
  16. fz_matrix *
  17. pdf_to_matrix(fz_context *ctx, pdf_obj *array, fz_matrix *m)
  18. {
  19. m->a = pdf_to_real(pdf_array_get(array, 0));
  20. m->b = pdf_to_real(pdf_array_get(array, 1));
  21. m->c = pdf_to_real(pdf_array_get(array, 2));
  22. m->d = pdf_to_real(pdf_array_get(array, 3));
  23. m->e = pdf_to_real(pdf_array_get(array, 4));
  24. m->f = pdf_to_real(pdf_array_get(array, 5));
  25. return m;
  26. }
  27. /* Convert Unicode/PdfDocEncoding string into utf-8 */
  28. char *
  29. pdf_to_utf8(pdf_document *xref, pdf_obj *src)
  30. {
  31. fz_context *ctx = xref->ctx;
  32. fz_buffer *strmbuf = NULL;
  33. unsigned char *srcptr;
  34. char *dstptr, *dst;
  35. int srclen;
  36. int dstlen = 0;
  37. int ucs;
  38. int i;
  39. fz_var(strmbuf);
  40. fz_try(ctx)
  41. {
  42. if (pdf_is_string(src))
  43. {
  44. srcptr = (unsigned char *) pdf_to_str_buf(src);
  45. srclen = pdf_to_str_len(src);
  46. }
  47. else if (pdf_is_stream(xref, pdf_to_num(src), pdf_to_gen(src)))
  48. {
  49. strmbuf = pdf_load_stream(xref, pdf_to_num(src), pdf_to_gen(src));
  50. srclen = fz_buffer_storage(ctx, strmbuf, (unsigned char **)&srcptr);
  51. }
  52. else
  53. {
  54. srclen = 0;
  55. }
  56. if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
  57. {
  58. for (i = 2; i + 1 < srclen; i += 2)
  59. {
  60. ucs = srcptr[i] << 8 | srcptr[i+1];
  61. dstlen += fz_runelen(ucs);
  62. }
  63. dstptr = dst = fz_malloc(ctx, dstlen + 1);
  64. for (i = 2; i + 1 < srclen; i += 2)
  65. {
  66. ucs = srcptr[i] << 8 | srcptr[i+1];
  67. dstptr += fz_runetochar(dstptr, ucs);
  68. }
  69. }
  70. else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254)
  71. {
  72. for (i = 2; i + 1 < srclen; i += 2)
  73. {
  74. ucs = srcptr[i] | srcptr[i+1] << 8;
  75. dstlen += fz_runelen(ucs);
  76. }
  77. dstptr = dst = fz_malloc(ctx, dstlen + 1);
  78. for (i = 2; i + 1 < srclen; i += 2)
  79. {
  80. ucs = srcptr[i] | srcptr[i+1] << 8;
  81. dstptr += fz_runetochar(dstptr, ucs);
  82. }
  83. }
  84. else
  85. {
  86. for (i = 0; i < srclen; i++)
  87. dstlen += fz_runelen(pdf_doc_encoding[srcptr[i]]);
  88. dstptr = dst = fz_malloc(ctx, dstlen + 1);
  89. for (i = 0; i < srclen; i++)
  90. {
  91. ucs = pdf_doc_encoding[srcptr[i]];
  92. dstptr += fz_runetochar(dstptr, ucs);
  93. }
  94. }
  95. }
  96. fz_always(ctx)
  97. {
  98. fz_drop_buffer(ctx, strmbuf);
  99. }
  100. fz_catch(ctx)
  101. {
  102. fz_rethrow(ctx);
  103. }
  104. *dstptr = '\0';
  105. return dst;
  106. }
  107. /* Convert Unicode/PdfDocEncoding string into ucs-2 */
  108. unsigned short *
  109. pdf_to_ucs2(pdf_document *xref, pdf_obj *src)
  110. {
  111. fz_context *ctx = xref->ctx;
  112. unsigned char *srcptr = (unsigned char *) pdf_to_str_buf(src);
  113. unsigned short *dstptr, *dst;
  114. int srclen = pdf_to_str_len(src);
  115. int i;
  116. if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
  117. {
  118. dstptr = dst = fz_malloc_array(ctx, (srclen - 2) / 2 + 1, sizeof(short));
  119. for (i = 2; i + 1 < srclen; i += 2)
  120. *dstptr++ = srcptr[i] << 8 | srcptr[i+1];
  121. }
  122. else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254)
  123. {
  124. dstptr = dst = fz_malloc_array(ctx, (srclen - 2) / 2 + 1, sizeof(short));
  125. for (i = 2; i + 1 < srclen; i += 2)
  126. *dstptr++ = srcptr[i] | srcptr[i+1] << 8;
  127. }
  128. else
  129. {
  130. dstptr = dst = fz_malloc_array(ctx, srclen + 1, sizeof(short));
  131. for (i = 0; i < srclen; i++)
  132. *dstptr++ = pdf_doc_encoding[srcptr[i]];
  133. }
  134. *dstptr = '\0';
  135. return dst;
  136. }
  137. /* allow to convert to UCS-2 without the need for an fz_context */
  138. /* (buffer must be at least (fz_to_str_len(src) + 1) * 2 bytes in size) */
  139. void
  140. pdf_to_ucs2_buf(unsigned short *buffer, pdf_obj *src)
  141. {
  142. unsigned char *srcptr = (unsigned char *) pdf_to_str_buf(src);
  143. unsigned short *dstptr = buffer;
  144. int srclen = pdf_to_str_len(src);
  145. int i;
  146. if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
  147. {
  148. for (i = 2; i + 1 < srclen; i += 2)
  149. *dstptr++ = srcptr[i] << 8 | srcptr[i+1];
  150. }
  151. else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254)
  152. {
  153. for (i = 2; i + 1 < srclen; i += 2)
  154. *dstptr++ = srcptr[i] | srcptr[i+1] << 8;
  155. }
  156. else
  157. {
  158. for (i = 0; i < srclen; i++)
  159. *dstptr++ = pdf_doc_encoding[srcptr[i]];
  160. }
  161. *dstptr = '\0';
  162. }
  163. /* Convert UCS-2 string into PdfDocEncoding for authentication */
  164. char *
  165. pdf_from_ucs2(pdf_document *xref, unsigned short *src)
  166. {
  167. fz_context *ctx = xref->ctx;
  168. int i, j, len;
  169. char *docstr;
  170. len = 0;
  171. while (src[len])
  172. len++;
  173. docstr = fz_malloc(ctx, len + 1);
  174. for (i = 0; i < len; i++)
  175. {
  176. /* shortcut: check if the character has the same code point in both encodings */
  177. if (0 < src[i] && src[i] < 256 && pdf_doc_encoding[src[i]] == src[i]) {
  178. docstr[i] = src[i];
  179. continue;
  180. }
  181. /* search through pdf_docencoding for the character's code point */
  182. for (j = 0; j < 256; j++)
  183. if (pdf_doc_encoding[j] == src[i])
  184. break;
  185. docstr[i] = j;
  186. /* fail, if a character can't be encoded */
  187. if (!docstr[i])
  188. {
  189. fz_free(ctx, docstr);
  190. return NULL;
  191. }
  192. }
  193. docstr[len] = '\0';
  194. return docstr;
  195. }
  196. pdf_obj *
  197. pdf_to_utf8_name(pdf_document *xref, pdf_obj *src)
  198. {
  199. char *buf = pdf_to_utf8(xref, src);
  200. pdf_obj *dst = pdf_new_name(xref->ctx, buf);
  201. fz_free(xref->ctx, buf);
  202. return dst;
  203. }
  204. pdf_obj *
  205. pdf_parse_array(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf)
  206. {
  207. pdf_obj *ary = NULL;
  208. pdf_obj *obj = NULL;
  209. int a = 0, b = 0, n = 0;
  210. pdf_token tok;
  211. fz_context *ctx = file->ctx;
  212. pdf_obj *op;
  213. fz_var(obj);
  214. ary = pdf_new_array(ctx, 4);
  215. fz_try(ctx)
  216. {
  217. while (1)
  218. {
  219. tok = pdf_lex(file, buf);
  220. if (tok != PDF_TOK_INT && tok != PDF_TOK_R)
  221. {
  222. if (n > 0)
  223. {
  224. obj = pdf_new_int(ctx, a);
  225. pdf_array_push(ary, obj);
  226. pdf_drop_obj(obj);
  227. obj = NULL;
  228. }
  229. if (n > 1)
  230. {
  231. obj = pdf_new_int(ctx, b);
  232. pdf_array_push(ary, obj);
  233. pdf_drop_obj(obj);
  234. obj = NULL;
  235. }
  236. n = 0;
  237. }
  238. if (tok == PDF_TOK_INT && n == 2)
  239. {
  240. obj = pdf_new_int(ctx, a);
  241. pdf_array_push(ary, obj);
  242. pdf_drop_obj(obj);
  243. obj = NULL;
  244. a = b;
  245. n --;
  246. }
  247. switch (tok)
  248. {
  249. case PDF_TOK_CLOSE_ARRAY:
  250. op = ary;
  251. goto end;
  252. case PDF_TOK_INT:
  253. if (n == 0)
  254. a = buf->i;
  255. if (n == 1)
  256. b = buf->i;
  257. n ++;
  258. break;
  259. case PDF_TOK_R:
  260. if (n != 2)
  261. fz_throw(ctx, "cannot parse indirect reference in array");
  262. obj = pdf_new_indirect(ctx, a, b, xref);
  263. pdf_array_push(ary, obj);
  264. pdf_drop_obj(obj);
  265. obj = NULL;
  266. n = 0;
  267. break;
  268. case PDF_TOK_OPEN_ARRAY:
  269. obj = pdf_parse_array(xref, file, buf);
  270. pdf_array_push(ary, obj);
  271. pdf_drop_obj(obj);
  272. obj = NULL;
  273. break;
  274. case PDF_TOK_OPEN_DICT:
  275. obj = pdf_parse_dict(xref, file, buf);
  276. pdf_array_push(ary, obj);
  277. pdf_drop_obj(obj);
  278. obj = NULL;
  279. break;
  280. case PDF_TOK_NAME:
  281. obj = pdf_new_name(ctx, buf->scratch);
  282. pdf_array_push(ary, obj);
  283. pdf_drop_obj(obj);
  284. obj = NULL;
  285. break;
  286. case PDF_TOK_REAL:
  287. obj = pdf_new_real(ctx, buf->f);
  288. pdf_array_push(ary, obj);
  289. pdf_drop_obj(obj);
  290. obj = NULL;
  291. break;
  292. case PDF_TOK_STRING:
  293. obj = pdf_new_string(ctx, buf->scratch, buf->len);
  294. pdf_array_push(ary, obj);
  295. pdf_drop_obj(obj);
  296. obj = NULL;
  297. break;
  298. case PDF_TOK_TRUE:
  299. obj = pdf_new_bool(ctx, 1);
  300. pdf_array_push(ary, obj);
  301. pdf_drop_obj(obj);
  302. obj = NULL;
  303. break;
  304. case PDF_TOK_FALSE:
  305. obj = pdf_new_bool(ctx, 0);
  306. pdf_array_push(ary, obj);
  307. pdf_drop_obj(obj);
  308. obj = NULL;
  309. break;
  310. case PDF_TOK_NULL:
  311. obj = pdf_new_null(ctx);
  312. pdf_array_push(ary, obj);
  313. pdf_drop_obj(obj);
  314. obj = NULL;
  315. break;
  316. default:
  317. fz_throw(ctx, "cannot parse token in array");
  318. }
  319. }
  320. end:
  321. {}
  322. }
  323. fz_catch(ctx)
  324. {
  325. pdf_drop_obj(obj);
  326. pdf_drop_obj(ary);
  327. fz_throw(ctx, "cannot parse array");
  328. }
  329. return op;
  330. }
  331. pdf_obj *
  332. pdf_parse_dict(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf)
  333. {
  334. pdf_obj *dict;
  335. pdf_obj *key = NULL;
  336. pdf_obj *val = NULL;
  337. pdf_token tok;
  338. int a, b;
  339. fz_context *ctx = file->ctx;
  340. dict = pdf_new_dict(ctx, 8);
  341. fz_var(key);
  342. fz_var(val);
  343. fz_try(ctx)
  344. {
  345. while (1)
  346. {
  347. tok = pdf_lex(file, buf);
  348. skip:
  349. if (tok == PDF_TOK_CLOSE_DICT)
  350. break;
  351. /* for BI .. ID .. EI in content streams */
  352. if (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID"))
  353. break;
  354. if (tok != PDF_TOK_NAME)
  355. fz_throw(ctx, "invalid key in dict");
  356. key = pdf_new_name(ctx, buf->scratch);
  357. tok = pdf_lex(file, buf);
  358. switch (tok)
  359. {
  360. case PDF_TOK_OPEN_ARRAY:
  361. val = pdf_parse_array(xref, file, buf);
  362. break;
  363. case PDF_TOK_OPEN_DICT:
  364. val = pdf_parse_dict(xref, file, buf);
  365. break;
  366. case PDF_TOK_NAME: val = pdf_new_name(ctx, buf->scratch); break;
  367. case PDF_TOK_REAL: val = pdf_new_real(ctx, buf->f); break;
  368. case PDF_TOK_STRING: val = pdf_new_string(ctx, buf->scratch, buf->len); break;
  369. case PDF_TOK_TRUE: val = pdf_new_bool(ctx, 1); break;
  370. case PDF_TOK_FALSE: val = pdf_new_bool(ctx, 0); break;
  371. case PDF_TOK_NULL: val = pdf_new_null(ctx); break;
  372. case PDF_TOK_INT:
  373. /* 64-bit to allow for numbers > INT_MAX and overflow */
  374. a = buf->i;
  375. tok = pdf_lex(file, buf);
  376. if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME ||
  377. (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID")))
  378. {
  379. val = pdf_new_int(ctx, a);
  380. pdf_dict_put(dict, key, val);
  381. pdf_drop_obj(val);
  382. val = NULL;
  383. pdf_drop_obj(key);
  384. key = NULL;
  385. goto skip;
  386. }
  387. if (tok == PDF_TOK_INT)
  388. {
  389. b = buf->i;
  390. tok = pdf_lex(file, buf);
  391. if (tok == PDF_TOK_R)
  392. {
  393. val = pdf_new_indirect(ctx, a, b, xref);
  394. break;
  395. }
  396. }
  397. fz_throw(ctx, "invalid indirect reference in dict");
  398. default:
  399. fz_throw(ctx, "unknown token in dict");
  400. }
  401. pdf_dict_put(dict, key, val);
  402. pdf_drop_obj(val);
  403. val = NULL;
  404. pdf_drop_obj(key);
  405. key = NULL;
  406. }
  407. }
  408. fz_catch(ctx)
  409. {
  410. pdf_drop_obj(dict);
  411. pdf_drop_obj(key);
  412. pdf_drop_obj(val);
  413. fz_throw(ctx, "cannot parse dict");
  414. }
  415. return dict;
  416. }
  417. pdf_obj *
  418. pdf_parse_stm_obj(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf)
  419. {
  420. pdf_token tok;
  421. fz_context *ctx = file->ctx;
  422. tok = pdf_lex(file, buf);
  423. switch (tok)
  424. {
  425. case PDF_TOK_OPEN_ARRAY:
  426. return pdf_parse_array(xref, file, buf);
  427. case PDF_TOK_OPEN_DICT:
  428. return pdf_parse_dict(xref, file, buf);
  429. case PDF_TOK_NAME: return pdf_new_name(ctx, buf->scratch); break;
  430. case PDF_TOK_REAL: return pdf_new_real(ctx, buf->f); break;
  431. case PDF_TOK_STRING: return pdf_new_string(ctx, buf->scratch, buf->len); break;
  432. case PDF_TOK_TRUE: return pdf_new_bool(ctx, 1); break;
  433. case PDF_TOK_FALSE: return pdf_new_bool(ctx, 0); break;
  434. case PDF_TOK_NULL: return pdf_new_null(ctx); break;
  435. case PDF_TOK_INT: return pdf_new_int(ctx, buf->i); break;
  436. default: fz_throw(ctx, "unknown token in object stream");
  437. }
  438. return NULL; /* Stupid MSVC */
  439. }
  440. pdf_obj *
  441. pdf_parse_ind_obj(pdf_document *xref,
  442. fz_stream *file, pdf_lexbuf *buf,
  443. int *onum, int *ogen, int *ostmofs)
  444. {
  445. pdf_obj *obj = NULL;
  446. int num = 0, gen = 0, stm_ofs;
  447. pdf_token tok;
  448. int a, b;
  449. fz_context *ctx = file->ctx;
  450. fz_var(obj);
  451. tok = pdf_lex(file, buf);
  452. if (tok != PDF_TOK_INT)
  453. fz_throw(ctx, "expected object number");
  454. num = buf->i;
  455. tok = pdf_lex(file, buf);
  456. if (tok != PDF_TOK_INT)
  457. fz_throw(ctx, "expected generation number (%d ? obj)", num);
  458. gen = buf->i;
  459. tok = pdf_lex(file, buf);
  460. if (tok != PDF_TOK_OBJ)
  461. fz_throw(ctx, "expected 'obj' keyword (%d %d ?)", num, gen);
  462. tok = pdf_lex(file, buf);
  463. switch (tok)
  464. {
  465. case PDF_TOK_OPEN_ARRAY:
  466. obj = pdf_parse_array(xref, file, buf);
  467. break;
  468. case PDF_TOK_OPEN_DICT:
  469. obj = pdf_parse_dict(xref, file, buf);
  470. break;
  471. case PDF_TOK_NAME: obj = pdf_new_name(ctx, buf->scratch); break;
  472. case PDF_TOK_REAL: obj = pdf_new_real(ctx, buf->f); break;
  473. case PDF_TOK_STRING: obj = pdf_new_string(ctx, buf->scratch, buf->len); break;
  474. case PDF_TOK_TRUE: obj = pdf_new_bool(ctx, 1); break;
  475. case PDF_TOK_FALSE: obj = pdf_new_bool(ctx, 0); break;
  476. case PDF_TOK_NULL: obj = pdf_new_null(ctx); break;
  477. case PDF_TOK_INT:
  478. a = buf->i;
  479. tok = pdf_lex(file, buf);
  480. if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ)
  481. {
  482. obj = pdf_new_int(ctx, a);
  483. goto skip;
  484. }
  485. if (tok == PDF_TOK_INT)
  486. {
  487. b = buf->i;
  488. tok = pdf_lex(file, buf);
  489. if (tok == PDF_TOK_R)
  490. {
  491. obj = pdf_new_indirect(ctx, a, b, xref);
  492. break;
  493. }
  494. }
  495. fz_throw(ctx, "expected 'R' keyword (%d %d R)", num, gen);
  496. case PDF_TOK_ENDOBJ:
  497. obj = pdf_new_null(ctx);
  498. goto skip;
  499. default:
  500. fz_throw(ctx, "syntax error in object (%d %d R)", num, gen);
  501. }
  502. fz_try(ctx)
  503. {
  504. tok = pdf_lex(file, buf);
  505. }
  506. fz_catch(ctx)
  507. {
  508. pdf_drop_obj(obj);
  509. fz_throw(ctx, "cannot parse indirect object (%d %d R)", num, gen);
  510. }
  511. skip:
  512. if (tok == PDF_TOK_STREAM)
  513. {
  514. int c = fz_read_byte(file);
  515. while (c == ' ')
  516. c = fz_read_byte(file);
  517. if (c == '\r')
  518. {
  519. c = fz_peek_byte(file);
  520. if (c != '\n')
  521. fz_warn(ctx, "line feed missing after stream begin marker (%d %d R)", num, gen);
  522. else
  523. fz_read_byte(file);
  524. }
  525. stm_ofs = fz_tell(file);
  526. }
  527. else if (tok == PDF_TOK_ENDOBJ)
  528. {
  529. stm_ofs = 0;
  530. }
  531. else
  532. {
  533. fz_warn(ctx, "expected 'endobj' or 'stream' keyword (%d %d R)", num, gen);
  534. stm_ofs = 0;
  535. }
  536. if (onum) *onum = num;
  537. if (ogen) *ogen = gen;
  538. if (ostmofs) *ostmofs = stm_ofs;
  539. return obj;
  540. }