pdf_write.c 58 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334
  1. #include "fitz-internal.h"
  2. #include "mupdf-internal.h"
  3. /* #define DEBUG_LINEARIZATION */
  4. /* #define DEBUG_HEAP_SORT */
  5. /* #define DEBUG_WRITING */
  6. typedef struct pdf_write_options_s pdf_write_options;
  /*
   * Per-page object list used by linearization.
   *
   * Linearization needs to know which objects are used by which page.
   * Each page gets one page_objects record, and the records for all the
   * pages are collected in a page_objects_list.
   *
   * Object numbers are appended to object[] as they are encountered (see
   * page_objects_insert), so while the list is being filled it is
   * unordered and may hold duplicates. Afterwards page_objects_sort
   * heapsorts the array in place and page_objects_dedupe removes the
   * repeats, for a total worst case of n log n time.
   *
   * The max-heap invariant used while sorting is:
   *     entry[n] >= entry[(n+1)*2-1] && entry[n] >= entry[(n+1)*2]
   * or equivalently:
   *     entry[(n-1)>>1] >= entry[n]
   *
   * For a discussion of the heap data structure (and heapsort) see
   * Kingston, "Algorithms and Data Structures".
   */
  typedef struct
  {
      /* Per-page figures; the debug dump prints them as "Number of shared
       * objects", "Page object number", "Number of objects" and
       * "Byte range" (min_ofs -> max_ofs) respectively. */
      int num_shared;
      int page_object_number;
      int num_objects;
      int min_ofs, max_ofs;

      /* Growable list of the object numbers used on this page:
       * cap slots are allocated, the first len are in use. */
      int cap, len;

      /* Must stay last: the allocation is sized as
       * sizeof(page_objects) + (cap-1)*sizeof(int). */
      int object[1];
  } page_objects;
  33. typedef struct {
  34. int cap;
  35. int len;
  36. page_objects *page[1];
  37. } page_objects_list;
  38. struct pdf_write_options_s
  39. {
  40. FILE *out;
  41. int do_ascii;
  42. int do_expand;
  43. int do_garbage;
  44. int do_linear;
  45. int *use_list;
  46. int *ofs_list;
  47. int *gen_list;
  48. int *renumber_map;
  49. int continue_on_error;
  50. int *errors;
  51. /* The following extras are required for linearization */
  52. int *rev_renumber_map;
  53. int *rev_gen_list;
  54. int start;
  55. int first_xref_offset;
  56. int main_xref_offset;
  57. int first_xref_entry_offset;
  58. int file_len;
  59. int hints_shared_offset;
  60. int hintstream_len;
  61. pdf_obj *linear_l;
  62. pdf_obj *linear_h0;
  63. pdf_obj *linear_h1;
  64. pdf_obj *linear_o;
  65. pdf_obj *linear_e;
  66. pdf_obj *linear_n;
  67. pdf_obj *linear_t;
  68. pdf_obj *hints_s;
  69. pdf_obj *hints_length;
  70. int page_count;
  71. page_objects_list *page_object_lists;
  72. };
  /*
   * Flag values stored in use_list[num] for object number num.
   *
   *  use_list[num] == 0               object num is unused.
   *  use_list[num] & USE_CATALOGUE    object num is used by the catalogue.
   *  use_list[num] & USE_PAGE1        object num is used by page 1.
   *  use_list[num] & USE_SHARED       object num is shared between pages.
   *  use_list[num] & USE_PARAMS       object num is the linearisation
   *                                   params object.
   *  use_list[num] & USE_HINTS        object num is sorted as the hint
   *                                   stream (see order_ge).
   *  use_list[num] & USE_PAGE_OBJECT  object num must be the first object
   *                                   in its page.
   *
   * The bits selected by USE_PAGE_MASK hold a page number: when none of
   * the flags above places the object, it is used by page
   * (use_list[num] >> USE_PAGE_SHIFT).
   */
  enum
  {
      USE_PAGE_SHIFT = 7,

      USE_CATALOGUE = 1 << 1,
      USE_PAGE1 = 1 << 2,
      USE_SHARED = 1 << 3,
      USE_PARAMS = 1 << 4,
      USE_HINTS = 1 << 5,
      USE_PAGE_OBJECT = 1 << 6,

      /* Everything from bit USE_PAGE_SHIFT upwards. */
      USE_PAGE_MASK = ~((1 << USE_PAGE_SHIFT) - 1)
  };
  95. /*
  96. * page_objects and page_object_list handling functions
  97. */
  98. static page_objects_list *
  99. page_objects_list_create(fz_context *ctx)
  100. {
  101. page_objects_list *pol = fz_calloc(ctx, 1, sizeof(*pol));
  102. pol->cap = 1;
  103. pol->len = 0;
  104. return pol;
  105. }
  106. static void
  107. page_objects_list_destroy(fz_context *ctx, page_objects_list *pol)
  108. {
  109. int i;
  110. if (!pol)
  111. return;
  112. for (i = 0; i < pol->len; i++)
  113. {
  114. fz_free(ctx, pol->page[i]);
  115. }
  116. fz_free(ctx, pol);
  117. }
  118. static void
  119. page_objects_list_ensure(fz_context *ctx, page_objects_list **pol, int newcap)
  120. {
  121. int oldcap = (*pol)->cap;
  122. if (newcap <= oldcap)
  123. return;
  124. *pol = fz_resize_array(ctx, *pol, 1, sizeof(page_objects_list) + (newcap-1)*sizeof(page_objects *));
  125. memset(&(*pol)->page[oldcap], 0, (newcap-oldcap)*sizeof(page_objects *));
  126. (*pol)->cap = newcap;
  127. }
  128. static page_objects *
  129. page_objects_create(fz_context *ctx)
  130. {
  131. int initial_cap = 8;
  132. page_objects *po = fz_calloc(ctx, 1, sizeof(*po) + (initial_cap-1) * sizeof(int));
  133. po->cap = initial_cap;
  134. po->len = 0;
  135. return po;
  136. }
  137. static void
  138. page_objects_insert(fz_context *ctx, page_objects **ppo, int i)
  139. {
  140. page_objects *po;
  141. /* Make a page_objects if we don't have one */
  142. if (*ppo == NULL)
  143. *ppo = page_objects_create(ctx);
  144. po = *ppo;
  145. /* page_objects insertion: extend the page_objects by 1, and put us on the end */
  146. if (po->len == po->cap)
  147. {
  148. po = fz_resize_array(ctx, po, 1, sizeof(page_objects) + (po->cap*2 - 1)*sizeof(int));
  149. po->cap *= 2;
  150. *ppo = po;
  151. }
  152. po->object[po->len++] = i;
  153. }
  154. static void
  155. page_objects_list_insert(fz_context *ctx, pdf_write_options *opts, int page, int object)
  156. {
  157. page_objects_list_ensure(ctx, &opts->page_object_lists, page+1);
  158. if (opts->page_object_lists->len < page+1)
  159. opts->page_object_lists->len = page+1;
  160. page_objects_insert(ctx, &opts->page_object_lists->page[page], object);
  161. }
  162. static void
  163. page_objects_list_set_page_object(fz_context *ctx, pdf_write_options *opts, int page, int object)
  164. {
  165. page_objects_list_ensure(ctx, &opts->page_object_lists, page+1);
  166. opts->page_object_lists->page[page]->page_object_number = object;
  167. }
  168. static void
  169. page_objects_sort(fz_context *ctx, page_objects *po)
  170. {
  171. int i, j;
  172. int n = po->len;
  173. /* Step 1: Make a heap */
  174. /* Invariant: Valid heap in [0..i), unsorted elements in [i..n) */
  175. for (i = 1; i < n; i++)
  176. {
  177. /* Now bubble backwards to maintain heap invariant */
  178. j = i;
  179. while (j != 0)
  180. {
  181. int tmp;
  182. int k = (j-1)>>1;
  183. if (po->object[k] >= po->object[j])
  184. break;
  185. tmp = po->object[k];
  186. po->object[k] = po->object[j];
  187. po->object[j] = tmp;
  188. j = k;
  189. }
  190. }
  191. /* Step 2: Heap sort */
  192. /* Invariant: valid heap in [0..i), sorted list in [i..n) */
  193. /* Initially: i = n */
  194. for (i = n-1; i > 0; i--)
  195. {
  196. /* Swap the maximum (0th) element from the page_objects into its place
  197. * in the sorted list (position i). */
  198. int tmp = po->object[0];
  199. po->object[0] = po->object[i];
  200. po->object[i] = tmp;
  201. /* Now, the page_objects is invalid because the 0th element is out
  202. * of place. Bubble it until the page_objects is valid. */
  203. j = 0;
  204. while (1)
  205. {
  206. /* Children are k and k+1 */
  207. int k = (j+1)*2-1;
  208. /* If both children out of the page_objects, we're done */
  209. if (k > i-1)
  210. break;
  211. /* If both are in the page_objects, pick the larger one */
  212. if (k < i-1 && po->object[k] < po->object[k+1])
  213. k++;
  214. /* If j is bigger than k (i.e. both of it's children),
  215. * we're done */
  216. if (po->object[j] > po->object[k])
  217. break;
  218. tmp = po->object[k];
  219. po->object[k] = po->object[j];
  220. po->object[j] = tmp;
  221. j = k;
  222. }
  223. }
  224. }
  225. static int
  226. order_ge(int ui, int uj)
  227. {
  228. /*
  229. For linearization, we need to order the sections as follows:
  230. Remaining pages
  231. Shared objects
  232. Objects not associated with any page
  233. (Linearization params)
  234. Catalogue (and other document level objects)
  235. First page
  236. (Primary Hint stream) (*)
  237. Any free objects
  238. Note, this is NOT the same order they appear in
  239. the final file!
  240. The PDF reference gives us the option of putting the hint stream
  241. after the first page, and we take it, for simplicity.
  242. */
  243. /* If the 2 objects are in the same section, then page object comes first. */
  244. if (((ui ^ uj) & ~USE_PAGE_OBJECT) == 0)
  245. return ((ui & USE_PAGE_OBJECT) == 0);
  246. /* Put unused objects last */
  247. else if (ui == 0)
  248. return 1;
  249. else if (uj == 0)
  250. return 0;
  251. /* Put the hint stream before that... */
  252. else if (ui & USE_HINTS)
  253. return 1;
  254. else if (uj & USE_HINTS)
  255. return 0;
  256. /* Put page 1 before that... */
  257. else if (ui & USE_PAGE1)
  258. return 1;
  259. else if (uj & USE_PAGE1)
  260. return 0;
  261. /* Put the calagoue before that... */
  262. else if (ui & USE_CATALOGUE)
  263. return 1;
  264. else if (uj & USE_CATALOGUE)
  265. return 0;
  266. /* Put the linearization params before that... */
  267. else if (ui & USE_PARAMS)
  268. return 1;
  269. else if (uj & USE_PARAMS)
  270. return 0;
  271. /* Put objects not associated with any page (anything
  272. * not touched by the catalogue) before that... */
  273. else if (ui == 0)
  274. return 1;
  275. else if (uj == 0)
  276. return 0;
  277. /* Put shared objects before that... */
  278. else if (ui & USE_SHARED)
  279. return 1;
  280. else if (uj & USE_SHARED)
  281. return 0;
  282. /* And otherwise, order by the page number on which
  283. * they are used. */
  284. return (ui>>USE_PAGE_SHIFT) >= (uj>>USE_PAGE_SHIFT);
  285. }
  /*
   * Indirect heapsort.
   *
   * list holds n indices into val. The indices are rearranged in place
   * so that val[list[0]], val[list[1]], ... is in ascending order
   * according to ge, where ge(a, b) returns nonzero when a sorts at or
   * after b. val itself is never modified.
   *
   * Define DEBUG_HEAP_SORT to trace the list to stderr before sorting,
   * after heap construction and after sorting.
   */
  static void
  heap_sort(int *list, int n, const int *val, int (*ge)(int, int))
  {
      int end, pos;

  #ifdef DEBUG_HEAP_SORT
      fprintf(stderr, "Initially:\n");
      for (pos = 0; pos < n; pos++)
          fprintf(stderr, "%d: %d %x\n", pos, list[pos], val[list[pos]]);
  #endif

      /* Phase 1: build a max-heap.
       * Invariant: list[0..end) is a valid heap, list[end..n) is untouched. */
      for (end = 1; end < n; end++)
      {
          /* Sift the new entry up until its parent is ge it. */
          pos = end;
          while (pos != 0)
          {
              int parent = (pos - 1) >> 1;
              int held;

              if (ge(val[list[parent]], val[list[pos]]))
                  break;
              held = list[parent];
              list[parent] = list[pos];
              list[pos] = held;
              pos = parent;
          }
      }

  #ifdef DEBUG_HEAP_SORT
      fprintf(stderr, "Valid heap:\n");
      for (pos = 0; pos < n; pos++)
      {
          int child = 2 * pos + 1;

          fprintf(stderr, "%d: %d %x ", pos, list[pos], val[list[pos]]);
          if (child < n)
          {
              if (ge(val[list[pos]], val[list[child]]))
                  fprintf(stderr, "OK ");
              else
                  fprintf(stderr, "BAD ");
          }
          if (child + 1 < n)
          {
              if (ge(val[list[pos]], val[list[child + 1]]))
                  fprintf(stderr, "OK\n");
              else
                  fprintf(stderr, "BAD\n");
          }
          else
              fprintf(stderr, "\n");
      }
  #endif

      /* Phase 2: repeatedly move the maximum to the end.
       * Invariant: list[0..end] is a valid heap, list(end..n) is sorted. */
      for (end = n - 1; end > 0; end--)
      {
          /* The root is the largest remaining entry; park it at `end`. */
          int held = list[0];

          list[0] = list[end];
          list[end] = held;

          /* The new root may be out of place: sift it down through the
           * heap, which now only spans list[0..end). */
          pos = 0;
          for (;;)
          {
              int child = 2 * pos + 1;

              /* No children left inside the heap. */
              if (child >= end)
                  break;
              /* Two children: continue with the one that is ge the other. */
              if (child + 1 < end && ge(val[list[child + 1]], val[list[child]]))
                  child++;
              /* Already ge its biggest child: settled. */
              if (ge(val[list[pos]], val[list[child]]))
                  break;
              held = list[child];
              list[child] = list[pos];
              list[pos] = held;
              pos = child;
          }
      }

  #ifdef DEBUG_HEAP_SORT
      fprintf(stderr, "Sorted:\n");
      for (pos = 0; pos < n; pos++)
      {
          fprintf(stderr, "%d: %d %x ", pos, list[pos], val[list[pos]]);
          if (pos + 1 < n)
          {
              if (ge(val[list[pos + 1]], val[list[pos]]))
                  fprintf(stderr, "OK");
              else
                  fprintf(stderr, "BAD");
          }
          fprintf(stderr, "\n");
      }
  #endif
  }
  389. static void
  390. page_objects_dedupe(fz_context *ctx, page_objects *po)
  391. {
  392. int i, j;
  393. int n = po->len-1;
  394. for (i = 0; i < n; i++)
  395. {
  396. if (po->object[i] == po->object[i+1])
  397. break;
  398. }
  399. j = i; /* j points to the last valid one */
  400. i++; /* i points to the first one we haven't looked at */
  401. for (; i < n; i++)
  402. {
  403. if (po->object[j] != po->object[i])
  404. po->object[++j] = po->object[i];
  405. }
  406. po->len = j+1;
  407. }
  408. static void
  409. page_objects_list_sort_and_dedupe(fz_context *ctx, page_objects_list *pol)
  410. {
  411. int i;
  412. int n = pol->len;
  413. for (i = 0; i < n; i++)
  414. {
  415. page_objects_sort(ctx, pol->page[i]);
  416. page_objects_dedupe(ctx, pol->page[i]);
  417. }
  418. }
  #ifdef DEBUG_LINEARIZATION
  /*
   * Debug aid: print every page's object list to stderr, followed by that
   * page's byte range, object counts and page object number. Pages are
   * shown 1-based.
   */
  static void
  page_objects_dump(pdf_write_options *opts)
  {
      page_objects_list *list = opts->page_object_lists;
      int pageno;

      for (pageno = 0; pageno < list->len; pageno++)
      {
          page_objects *po = list->page[pageno];
          int slot;

          fprintf(stderr, "Page %d\n", pageno+1);
          for (slot = 0; slot < po->len; slot++)
          {
              int objnum = po->object[slot];

              fprintf(stderr, "\tObject %d: use=%x\n", objnum, opts->use_list[objnum]);
          }
          fprintf(stderr, "Byte range=%d->%d\n", po->min_ofs, po->max_ofs);
          fprintf(stderr, "Number of objects=%d, Number of shared objects=%d\n", po->num_objects, po->num_shared);
          fprintf(stderr, "Page object number=%d\n", po->page_object_number);
      }
  }

  /*
   * Debug aid: print the use flags and offset of every object in the xref
   * to stderr.
   */
  static void
  objects_dump(pdf_document *xref, pdf_write_options *opts)
  {
      int objnum = 0;

      while (objnum < xref->len)
      {
          fprintf(stderr, "Object %d use=%x offset=%d\n", objnum, opts->use_list[objnum], opts->ofs_list[objnum]);
          objnum++;
      }
  }
  #endif
  449. /*
  450. * Garbage collect objects not reachable from the trailer.
  451. */
  452. static pdf_obj *sweepref(pdf_document *xref, pdf_write_options *opts, pdf_obj *obj)
  453. {
  454. int num = pdf_to_num(obj);
  455. int gen = pdf_to_gen(obj);
  456. fz_context *ctx = xref->ctx;
  457. if (num < 0 || num >= xref->len)
  458. return NULL;
  459. if (opts->use_list[num])
  460. return NULL;
  461. opts->use_list[num] = 1;
  462. /* Bake in /Length in stream objects */
  463. fz_try(ctx)
  464. {
  465. if (pdf_is_stream(xref, num, gen))
  466. {
  467. pdf_obj *len = pdf_dict_gets(obj, "Length");
  468. if (pdf_is_indirect(len))
  469. {
  470. opts->use_list[pdf_to_num(len)] = 0;
  471. len = pdf_resolve_indirect(len);
  472. pdf_dict_puts(obj, "Length", len);
  473. }
  474. }
  475. }
  476. fz_catch(ctx)
  477. {
  478. /* Leave broken */
  479. }
  480. return pdf_resolve_indirect(obj);
  481. }
  482. static void sweepobj(pdf_document *xref, pdf_write_options *opts, pdf_obj *obj)
  483. {
  484. int i;
  485. if (pdf_is_indirect(obj))
  486. obj = sweepref(xref, opts, obj);
  487. if (pdf_is_dict(obj))
  488. {
  489. int n = pdf_dict_len(obj);
  490. for (i = 0; i < n; i++)
  491. sweepobj(xref, opts, pdf_dict_get_val(obj, i));
  492. }
  493. else if (pdf_is_array(obj))
  494. {
  495. int n = pdf_array_len(obj);
  496. for (i = 0; i < n; i++)
  497. sweepobj(xref, opts, pdf_array_get(obj, i));
  498. }
  499. }
  500. /*
  501. * Scan for and remove duplicate objects (slow)
  502. */
  503. static void removeduplicateobjs(pdf_document *xref, pdf_write_options *opts)
  504. {
  505. int num, other;
  506. fz_context *ctx = xref->ctx;
  507. for (num = 1; num < xref->len; num++)
  508. {
  509. /* Only compare an object to objects preceding it */
  510. for (other = 1; other < num; other++)
  511. {
  512. pdf_obj *a, *b;
  513. int differ, newnum, streama, streamb;
  514. if (num == other || !opts->use_list[num] || !opts->use_list[other])
  515. continue;
  516. /*
  517. * Comparing stream objects data contents would take too long.
  518. *
  519. * pdf_is_stream calls pdf_cache_object and ensures
  520. * that the xref table has the objects loaded.
  521. */
  522. fz_try(ctx)
  523. {
  524. streama = pdf_is_stream(xref, num, 0);
  525. streamb = pdf_is_stream(xref, other, 0);
  526. differ = streama || streamb;
  527. if (streama && streamb && opts->do_garbage >= 4)
  528. differ = 0;
  529. }
  530. fz_catch(ctx)
  531. {
  532. /* Assume different */
  533. differ = 1;
  534. }
  535. if (differ)
  536. continue;
  537. a = xref->table[num].obj;
  538. b = xref->table[other].obj;
  539. a = pdf_resolve_indirect(a);
  540. b = pdf_resolve_indirect(b);
  541. if (pdf_objcmp(a, b))
  542. continue;
  543. if (streama && streamb)
  544. {
  545. /* Check to see if streams match too. */
  546. fz_buffer *sa = NULL;
  547. fz_buffer *sb = NULL;
  548. fz_var(sa);
  549. fz_var(sb);
  550. differ = 1;
  551. fz_try(ctx)
  552. {
  553. unsigned char *dataa, *datab;
  554. int lena, lenb;
  555. sa = pdf_load_raw_renumbered_stream(xref, num, 0, num, 0);
  556. sb = pdf_load_raw_renumbered_stream(xref, other, 0, other, 0);
  557. lena = fz_buffer_storage(ctx, sa, &dataa);
  558. lenb = fz_buffer_storage(ctx, sb, &datab);
  559. if (lena == lenb && memcmp(dataa, datab, lena) == 0)
  560. differ = 0;
  561. }
  562. fz_always(ctx)
  563. {
  564. fz_drop_buffer(ctx, sa);
  565. fz_drop_buffer(ctx, sb);
  566. }
  567. fz_catch(ctx)
  568. {
  569. fz_rethrow(ctx);
  570. }
  571. if (differ)
  572. continue;
  573. }
  574. /* Keep the lowest numbered object */
  575. newnum = fz_mini(num, other);
  576. opts->renumber_map[num] = newnum;
  577. opts->renumber_map[other] = newnum;
  578. opts->rev_renumber_map[newnum] = num; /* Either will do */
  579. opts->use_list[fz_maxi(num, other)] = 0;
  580. /* One duplicate was found, do not look for another */
  581. break;
  582. }
  583. }
  584. }
  585. /*
  586. * Renumber objects sequentially so the xref is more compact
  587. *
  588. * This code assumes that any opts->renumber_map[n] <= n for all n.
  589. */
  590. static void compactxref(pdf_document *xref, pdf_write_options *opts)
  591. {
  592. int num, newnum;
  593. /*
  594. * Update renumber_map in-place, clustering all used
  595. * objects together at low object ids. Objects that
  596. * already should be renumbered will have their new
  597. * object ids be updated to reflect the compaction.
  598. */
  599. newnum = 1;
  600. for (num = 1; num < xref->len; num++)
  601. {
  602. /* If it's not used, map it to zero */
  603. if (!opts->use_list[opts->renumber_map[num]])
  604. {
  605. opts->renumber_map[num] = 0;
  606. }
  607. /* If it's not moved, compact it. */
  608. else if (opts->renumber_map[num] == num)
  609. {
  610. opts->rev_renumber_map[newnum] = opts->rev_renumber_map[num];
  611. opts->rev_gen_list[newnum] = opts->rev_gen_list[num];
  612. opts->renumber_map[num] = newnum++;
  613. }
  614. /* Otherwise it's used, and moved. We know that it must have
  615. * moved down, so the place it's moved to will be in the right
  616. * place already. */
  617. else
  618. {
  619. opts->renumber_map[num] = opts->renumber_map[opts->renumber_map[num]];
  620. }
  621. }
  622. }
  623. /*
  624. * Update indirect objects according to renumbering established when
  625. * removing duplicate objects and compacting the xref.
  626. */
  627. static void renumberobj(pdf_document *xref, pdf_write_options *opts, pdf_obj *obj)
  628. {
  629. int i;
  630. fz_context *ctx = xref->ctx;
  631. if (pdf_is_dict(obj))
  632. {
  633. int n = pdf_dict_len(obj);
  634. for (i = 0; i < n; i++)
  635. {
  636. pdf_obj *key = pdf_dict_get_key(obj, i);
  637. pdf_obj *val = pdf_dict_get_val(obj, i);
  638. if (pdf_is_indirect(val))
  639. {
  640. val = pdf_new_indirect(ctx, opts->renumber_map[pdf_to_num(val)], 0, xref);
  641. pdf_dict_put(obj, key, val);
  642. pdf_drop_obj(val);
  643. }
  644. else
  645. {
  646. renumberobj(xref, opts, val);
  647. }
  648. }
  649. }
  650. else if (pdf_is_array(obj))
  651. {
  652. int n = pdf_array_len(obj);
  653. for (i = 0; i < n; i++)
  654. {
  655. pdf_obj *val = pdf_array_get(obj, i);
  656. if (pdf_is_indirect(val))
  657. {
  658. val = pdf_new_indirect(ctx, opts->renumber_map[pdf_to_num(val)], 0, xref);
  659. pdf_array_put(obj, i, val);
  660. pdf_drop_obj(val);
  661. }
  662. else
  663. {
  664. renumberobj(xref, opts, val);
  665. }
  666. }
  667. }
  668. }
  669. static void renumberobjs(pdf_document *xref, pdf_write_options *opts)
  670. {
  671. pdf_xref_entry *oldxref;
  672. int newlen;
  673. int num;
  674. fz_context *ctx = xref->ctx;
  675. int *new_use_list;
  676. new_use_list = fz_calloc(ctx, xref->len+3, sizeof(int));
  677. fz_try(ctx)
  678. {
  679. /* Apply renumber map to indirect references in all objects in xref */
  680. renumberobj(xref, opts, xref->trailer);
  681. for (num = 0; num < xref->len; num++)
  682. {
  683. pdf_obj *obj = xref->table[num].obj;
  684. if (pdf_is_indirect(obj))
  685. {
  686. obj = pdf_new_indirect(ctx, opts->renumber_map[pdf_to_num(obj)], 0, xref);
  687. pdf_update_object(xref, num, obj);
  688. pdf_drop_obj(obj);
  689. }
  690. else
  691. {
  692. renumberobj(xref, opts, obj);
  693. }
  694. }
  695. /* Create new table for the reordered, compacted xref */
  696. oldxref = xref->table;
  697. xref->table = fz_malloc_array(ctx, xref->len + 3, sizeof(pdf_xref_entry));
  698. xref->table[0] = oldxref[0];
  699. /* Move used objects into the new compacted xref */
  700. newlen = 0;
  701. for (num = 1; num < xref->len; num++)
  702. {
  703. if (opts->use_list[num])
  704. {
  705. if (newlen < opts->renumber_map[num])
  706. newlen = opts->renumber_map[num];
  707. xref->table[opts->renumber_map[num]] = oldxref[num];
  708. new_use_list[opts->renumber_map[num]] = opts->use_list[num];
  709. }
  710. else
  711. {
  712. pdf_drop_obj(oldxref[num].obj);
  713. }
  714. }
  715. }
  716. fz_catch(ctx)
  717. {
  718. fz_free(ctx, new_use_list);
  719. fz_rethrow(ctx);
  720. }
  721. fz_free(ctx, oldxref);
  722. fz_free(ctx, opts->use_list);
  723. opts->use_list = new_use_list;
  724. /* Update the used objects count in compacted xref */
  725. xref->len = newlen + 1;
  726. for (num = 1; num < xref->len; num++)
  727. {
  728. opts->renumber_map[num] = num;
  729. }
  730. }
  731. static void page_objects_list_renumber(pdf_write_options *opts)
  732. {
  733. int i, j;
  734. for (i = 0; i < opts->page_object_lists->len; i++)
  735. {
  736. page_objects *po = opts->page_object_lists->page[i];
  737. for (j = 0; j < po->len; j++)
  738. {
  739. po->object[j] = opts->renumber_map[po->object[j]];
  740. }
  741. po->page_object_number = opts->renumber_map[po->page_object_number];
  742. }
  743. }
  744. static void
  745. mark_all(pdf_document *xref, pdf_write_options *opts, pdf_obj *val, int flag, int page)
  746. {
  747. fz_context *ctx = xref->ctx;
  748. if (pdf_obj_mark(val))
  749. return;
  750. fz_try(ctx)
  751. {
  752. if (pdf_is_indirect(val))
  753. {
  754. int num = pdf_to_num(val);
  755. if (flag >= 16 && (opts->use_list[num] & USE_PAGE_MASK))
  756. /* Already used */
  757. opts->use_list[num] |= USE_SHARED;
  758. else
  759. opts->use_list[num] |= flag;
  760. if (page >= 0)
  761. page_objects_list_insert(ctx, opts, page, num);
  762. }
  763. if (pdf_is_dict(val))
  764. {
  765. int i, n = pdf_dict_len(val);
  766. for (i = 0; i < n; i++)
  767. {
  768. mark_all(xref, opts, pdf_dict_get_val(val, i), flag, page);
  769. }
  770. }
  771. else if (pdf_is_array(val))
  772. {
  773. int i, n = pdf_array_len(val);
  774. for (i = 0; i < n; i++)
  775. {
  776. mark_all(xref, opts, pdf_array_get(val, i), flag, page);
  777. }
  778. }
  779. }
  780. fz_always(ctx)
  781. {
  782. pdf_obj_unmark(val);
  783. }
  784. fz_catch(ctx)
  785. {
  786. fz_rethrow(ctx);
  787. }
  788. }
  789. static int
  790. mark_pages(pdf_document *xref, pdf_write_options *opts, pdf_obj *val, int pagenum)
  791. {
  792. fz_context *ctx = xref->ctx;
  793. if (pdf_obj_mark(val))
  794. return pagenum;
  795. fz_try(ctx)
  796. {
  797. if (pdf_is_dict(val))
  798. {
  799. if (!strcmp("Page", pdf_to_name(pdf_dict_gets(val, "Type"))))
  800. {
  801. int num = pdf_to_num(val);
  802. pdf_obj_unmark(val);
  803. mark_all(xref, opts, val, pagenum == 0 ? USE_PAGE1 : (pagenum<<USE_PAGE_SHIFT), pagenum);
  804. page_objects_list_set_page_object(ctx, opts, pagenum, num);
  805. pagenum++;
  806. opts->use_list[num] |= USE_PAGE_OBJECT;
  807. }
  808. else
  809. {
  810. int i, n = pdf_dict_len(val);
  811. for (i = 0; i < n; i++)
  812. {
  813. pdf_obj *key = pdf_dict_get_key(val, i);
  814. pdf_obj *obj = pdf_dict_get_val(val, i);
  815. if (!strcmp("Kids", pdf_to_name(key)))
  816. pagenum = mark_pages(xref, opts, obj, pagenum);
  817. else
  818. mark_all(xref, opts, obj, USE_CATALOGUE, -1);
  819. }
  820. if (pdf_is_indirect(val))
  821. {
  822. int num = pdf_to_num(val);
  823. opts->use_list[num] |= USE_CATALOGUE;
  824. }
  825. }
  826. }
  827. else if (pdf_is_array(val))
  828. {
  829. int i, n = pdf_array_len(val);
  830. for (i = 0; i < n; i++)
  831. {
  832. pagenum = mark_pages(xref, opts, pdf_array_get(val, i), pagenum);
  833. }
  834. if (pdf_is_indirect(val))
  835. {
  836. int num = pdf_to_num(val);
  837. opts->use_list[num] |= USE_CATALOGUE;
  838. }
  839. }
  840. }
  841. fz_always(ctx)
  842. {
  843. pdf_obj_unmark(val);
  844. }
  845. fz_catch(ctx)
  846. {
  847. fz_rethrow(ctx);
  848. }
  849. return pagenum;
  850. }
  851. static void
  852. mark_root(pdf_document *xref, pdf_write_options *opts, pdf_obj *dict)
  853. {
  854. fz_context *ctx = xref->ctx;
  855. int i, n = pdf_dict_len(dict);
  856. if (pdf_obj_mark(dict))
  857. return;
  858. fz_try(ctx)
  859. {
  860. if (pdf_is_indirect(dict))
  861. {
  862. int num = pdf_to_num(dict);
  863. opts->use_list[num] |= USE_CATALOGUE;
  864. }
  865. for (i = 0; i < n; i++)
  866. {
  867. char *key = pdf_to_name(pdf_dict_get_key(dict, i));
  868. pdf_obj *val = pdf_dict_get_val(dict, i);
  869. if (!strcmp("Pages", key))
  870. opts->page_count = mark_pages(xref, opts, val, 0);
  871. else if (!strcmp("Outlines", key))
  872. {
  873. /* FIXME: Look at PageMode to decide whether to
  874. * USE_OTHERPAGES or USE_PAGE1 here. */
  875. if (0 /* PageMode == "Outlines" */)
  876. mark_all(xref, opts, val, USE_PAGE1, -1);
  877. }
  878. else
  879. mark_all(xref, opts, val, USE_CATALOGUE, -1);
  880. }
  881. }
  882. fz_always(ctx)
  883. {
  884. pdf_obj_unmark(dict);
  885. }
  886. fz_catch(ctx)
  887. {
  888. fz_rethrow(ctx);
  889. }
  890. }
  891. static void
  892. mark_trailer(pdf_document *xref, pdf_write_options *opts, pdf_obj *dict)
  893. {
  894. fz_context *ctx = xref->ctx;
  895. int i, n = pdf_dict_len(dict);
  896. if (pdf_obj_mark(dict))
  897. return;
  898. fz_try(ctx)
  899. {
  900. for (i = 0; i < n; i++)
  901. {
  902. char *key = pdf_to_name(pdf_dict_get_key(dict, i));
  903. pdf_obj *val = pdf_dict_get_val(dict, i);
  904. if (!strcmp("Root", key))
  905. mark_root(xref, opts, val);
  906. else
  907. mark_all(xref, opts, val, USE_CATALOGUE, -1);
  908. }
  909. }
  910. fz_always(ctx)
  911. {
  912. pdf_obj_unmark(dict);
  913. }
  914. fz_catch(ctx)
  915. {
  916. fz_rethrow(ctx);
  917. }
  918. }
  919. static void
  920. add_linearization_objs(pdf_document *xref, pdf_write_options *opts)
  921. {
  922. pdf_obj *params_obj = NULL;
  923. pdf_obj *params_ref = NULL;
  924. pdf_obj *hint_obj = NULL;
  925. pdf_obj *hint_ref = NULL;
  926. pdf_obj *o = NULL;
  927. int params_num, hint_num;
  928. fz_context *ctx = xref->ctx;
  929. fz_var(params_obj);
  930. fz_var(params_ref);
  931. fz_var(hint_obj);
  932. fz_var(hint_ref);
  933. fz_var(o);
  934. fz_try(ctx)
  935. {
  936. /* Linearization params */
  937. params_obj = pdf_new_dict(ctx, 10);
  938. params_ref = pdf_new_ref(xref, params_obj);
  939. params_num = pdf_to_num(params_ref);
  940. opts->use_list[params_num] = USE_PARAMS;
  941. opts->renumber_map[params_num] = params_num;
  942. opts->rev_renumber_map[params_num] = params_num;
  943. opts->gen_list[params_num] = 0;
  944. opts->rev_gen_list[params_num] = 0;
  945. pdf_dict_puts_drop(params_obj, "Linearized", pdf_new_real(ctx, 1.0));
  946. opts->linear_l = pdf_new_int(ctx, INT_MIN);
  947. pdf_dict_puts(params_obj, "L", opts->linear_l);
  948. opts->linear_h0 = pdf_new_int(ctx, INT_MIN);
  949. o = pdf_new_array(ctx, 2);
  950. pdf_array_push(o, opts->linear_h0);
  951. opts->linear_h1 = pdf_new_int(ctx, INT_MIN);
  952. pdf_array_push(o, opts->linear_h1);
  953. pdf_dict_puts_drop(params_obj, "H", o);
  954. o = NULL;
  955. opts->linear_o = pdf_new_int(ctx, INT_MIN);
  956. pdf_dict_puts(params_obj, "O", opts->linear_o);
  957. opts->linear_e = pdf_new_int(ctx, INT_MIN);
  958. pdf_dict_puts(params_obj, "E", opts->linear_e);
  959. opts->linear_n = pdf_new_int(ctx, INT_MIN);
  960. pdf_dict_puts(params_obj, "N", opts->linear_n);
  961. opts->linear_t = pdf_new_int(ctx, INT_MIN);
  962. pdf_dict_puts(params_obj, "T", opts->linear_t);
  963. /* Primary hint stream */
  964. hint_obj = pdf_new_dict(ctx, 10);
  965. hint_ref = pdf_new_ref(xref, hint_obj);
  966. hint_num = pdf_to_num(hint_ref);
  967. opts->use_list[hint_num] = USE_HINTS;
  968. opts->renumber_map[hint_num] = hint_num;
  969. opts->rev_renumber_map[hint_num] = hint_num;
  970. opts->gen_list[hint_num] = 0;
  971. opts->rev_gen_list[hint_num] = 0;
  972. pdf_dict_puts_drop(hint_obj, "P", pdf_new_int(ctx, 0));
  973. opts->hints_s = pdf_new_int(ctx, INT_MIN);
  974. pdf_dict_puts(hint_obj, "S", opts->hints_s);
  975. /* FIXME: Do we have thumbnails? Do a T entry */
  976. /* FIXME: Do we have outlines? Do an O entry */
  977. /* FIXME: Do we have article threads? Do an A entry */
  978. /* FIXME: Do we have named destinations? Do a E entry */
  979. /* FIXME: Do we have interactive forms? Do a V entry */
  980. /* FIXME: Do we have document information? Do an I entry */
  981. /* FIXME: Do we have logical structure heirarchy? Do a C entry */
  982. /* FIXME: Do L, Page Label hint table */
  983. pdf_dict_puts_drop(hint_obj, "Filter", pdf_new_name(ctx, "FlateDecode"));
  984. opts->hints_length = pdf_new_int(ctx, INT_MIN);
  985. pdf_dict_puts(hint_obj, "Length", opts->hints_length);
  986. xref->table[hint_num].stm_ofs = -1;
  987. }
  988. fz_always(ctx)
  989. {
  990. pdf_drop_obj(params_obj);
  991. pdf_drop_obj(params_ref);
  992. pdf_drop_obj(hint_ref);
  993. pdf_drop_obj(hint_obj);
  994. pdf_drop_obj(o);
  995. }
  996. fz_catch(ctx)
  997. {
  998. fz_rethrow(ctx);
  999. }
  1000. }
  1001. static void
  1002. lpr_inherit_res_contents(fz_context *ctx, pdf_obj *res, pdf_obj *dict, char *text)
  1003. {
  1004. pdf_obj *o, *r;
  1005. int i, n;
  1006. /* If the parent node doesn't have an entry of this type, give up. */
  1007. o = pdf_dict_gets(dict, text);
  1008. if (!o)
  1009. return;
  1010. /* If the resources dict we are building doesn't have an entry of this
  1011. * type yet, then just copy it (ensuring it's not a reference) */
  1012. r = pdf_dict_gets(res, text);
  1013. if (r == NULL)
  1014. {
  1015. o = pdf_resolve_indirect(o);
  1016. if (pdf_is_dict(o))
  1017. o = pdf_copy_dict(ctx, o);
  1018. else if (pdf_is_array(o))
  1019. o = pdf_copy_array(ctx, o);
  1020. else
  1021. o = NULL;
  1022. if (o)
  1023. pdf_dict_puts(res, text, o);
  1024. return;
  1025. }
  1026. /* Otherwise we need to merge o into r */
  1027. if (pdf_is_dict(o))
  1028. {
  1029. n = pdf_dict_len(o);
  1030. for (i = 0; i < n; i++)
  1031. {
  1032. pdf_obj *key = pdf_dict_get_key(o, i);
  1033. pdf_obj *val = pdf_dict_get_val(o, i);
  1034. if (pdf_dict_gets(res, pdf_to_name(key)))
  1035. continue;
  1036. pdf_dict_puts(res, pdf_to_name(key), val);
  1037. }
  1038. }
  1039. }
  1040. static void
  1041. lpr_inherit_res(fz_context *ctx, pdf_obj *node, int depth, pdf_obj *dict)
  1042. {
  1043. while (1)
  1044. {
  1045. pdf_obj *o;
  1046. node = pdf_dict_gets(node, "Parent");
  1047. depth--;
  1048. if (!node || depth < 0)
  1049. break;
  1050. o = pdf_dict_gets(node, "Resources");
  1051. if (o)
  1052. {
  1053. lpr_inherit_res_contents(ctx, dict, o, "ExtGState");
  1054. lpr_inherit_res_contents(ctx, dict, o, "ColorSpace");
  1055. lpr_inherit_res_contents(ctx, dict, o, "Pattern");
  1056. lpr_inherit_res_contents(ctx, dict, o, "Shading");
  1057. lpr_inherit_res_contents(ctx, dict, o, "XObject");
  1058. lpr_inherit_res_contents(ctx, dict, o, "Font");
  1059. lpr_inherit_res_contents(ctx, dict, o, "ProcSet");
  1060. lpr_inherit_res_contents(ctx, dict, o, "Properties");
  1061. }
  1062. }
  1063. }
  1064. static pdf_obj *
  1065. lpr_inherit(fz_context *ctx, pdf_obj *node, char *text, int depth)
  1066. {
  1067. do
  1068. {
  1069. pdf_obj *o = pdf_dict_gets(node, text);
  1070. if (o)
  1071. return pdf_resolve_indirect(o);
  1072. node = pdf_dict_gets(node, "Parent");
  1073. depth--;
  1074. }
  1075. while (depth >= 0 && node);
  1076. return NULL;
  1077. }
  1078. static int
  1079. lpr(fz_context *ctx, pdf_obj *node, int depth, int page)
  1080. {
  1081. pdf_obj *kids;
  1082. pdf_obj *o = NULL;
  1083. int i, n;
  1084. if (pdf_obj_mark(node))
  1085. return page;
  1086. fz_var(o);
  1087. fz_try(ctx)
  1088. {
  1089. if (!strcmp("Page", pdf_to_name(pdf_dict_gets(node, "Type"))))
  1090. {
  1091. pdf_obj *r; /* r is deliberately not cleaned up */
  1092. /* Copy resources down to the child */
  1093. o = pdf_keep_obj(pdf_dict_gets(node, "Resources"));
  1094. if (!o)
  1095. {
  1096. o = pdf_keep_obj(pdf_new_dict(ctx, 2));
  1097. pdf_dict_puts(node, "Resources", o);
  1098. }
  1099. lpr_inherit_res(ctx, node, depth, o);
  1100. r = lpr_inherit(ctx, node, "MediaBox", depth);
  1101. if (r)
  1102. pdf_dict_puts(node, "MediaBox", r);
  1103. r = lpr_inherit(ctx, node, "CropBox", depth);
  1104. if (r)
  1105. pdf_dict_puts(node, "CropBox", r);
  1106. r = lpr_inherit(ctx, node, "BleedBox", depth);
  1107. if (r)
  1108. pdf_dict_puts(node, "BleedBox", r);
  1109. r = lpr_inherit(ctx, node, "TrimBox", depth);
  1110. if (r)
  1111. pdf_dict_puts(node, "TrimBox", r);
  1112. r = lpr_inherit(ctx, node, "ArtBox", depth);
  1113. if (r)
  1114. pdf_dict_puts(node, "ArtBox", r);
  1115. r = lpr_inherit(ctx, node, "Rotate", depth);
  1116. if (r)
  1117. pdf_dict_puts(node, "Rotate", r);
  1118. page++;
  1119. }
  1120. else
  1121. {
  1122. kids = pdf_dict_gets(node, "Kids");
  1123. n = pdf_array_len(kids);
  1124. for(i = 0; i < n; i++)
  1125. {
  1126. page = lpr(ctx, pdf_array_get(kids, i), depth+1, page);
  1127. }
  1128. pdf_dict_dels(node, "Resources");
  1129. pdf_dict_dels(node, "MediaBox");
  1130. pdf_dict_dels(node, "CropBox");
  1131. pdf_dict_dels(node, "BleedBox");
  1132. pdf_dict_dels(node, "TrimBox");
  1133. pdf_dict_dels(node, "ArtBox");
  1134. pdf_dict_dels(node, "Rotate");
  1135. }
  1136. }
  1137. fz_always(ctx)
  1138. {
  1139. pdf_drop_obj(o);
  1140. }
  1141. fz_catch(ctx)
  1142. {
  1143. fz_rethrow(ctx);
  1144. }
  1145. pdf_obj_unmark(node);
  1146. return page;
  1147. }
  1148. void
  1149. pdf_localise_page_resources(pdf_document *xref)
  1150. {
  1151. fz_context *ctx = xref->ctx;
  1152. if (xref->resources_localised)
  1153. return;
  1154. lpr(ctx, pdf_dict_getp(xref->trailer, "Root/Pages"), 0, 0);
  1155. xref->resources_localised = 1;
  1156. }
  1157. static void
  1158. linearize(pdf_document *xref, pdf_write_options *opts)
  1159. {
  1160. int i;
  1161. int n = xref->len + 2;
  1162. int *reorder;
  1163. int *rev_renumber_map;
  1164. int *rev_gen_list;
  1165. fz_context *ctx = xref->ctx;
  1166. opts->page_object_lists = page_objects_list_create(ctx);
  1167. /* Ensure that every page has local references of its resources */
  1168. /* FIXME: We could 'thin' the resources according to what is actually
  1169. * required for each page, but this would require us to run the page
  1170. * content streams. */
  1171. pdf_localise_page_resources(xref);
  1172. /* Walk the objects for each page, marking which ones are used, where */
  1173. memset(opts->use_list, 0, n * sizeof(int));
  1174. mark_trailer(xref, opts, xref->trailer);
  1175. /* Add new objects required for linearization */
  1176. add_linearization_objs(xref, opts);
  1177. #ifdef DEBUG_WRITING
  1178. fprintf(stderr, "Usage calculated:\n");
  1179. for (i=0; i < xref->len; i++)
  1180. {
  1181. fprintf(stderr, "%d: use=%d\n", i, opts->use_list[i]);
  1182. }
  1183. #endif
  1184. /* Allocate/init the structures used for renumbering the objects */
  1185. reorder = fz_calloc(ctx, n, sizeof(int));
  1186. rev_renumber_map = fz_calloc(ctx, n, sizeof(int));
  1187. rev_gen_list = fz_calloc(ctx, n, sizeof(int));
  1188. for (i = 0; i < n; i++)
  1189. {
  1190. reorder[i] = i;
  1191. }
  1192. /* Heap sort the reordering */
  1193. heap_sort(reorder+1, n-1, opts->use_list, &order_ge);
  1194. #ifdef DEBUG_WRITING
  1195. fprintf(stderr, "Reordered:\n");
  1196. for (i=1; i < xref->len; i++)
  1197. {
  1198. fprintf(stderr, "%d: use=%d\n", i, opts->use_list[reorder[i]]);
  1199. }
  1200. #endif
  1201. /* Find the split point */
  1202. for (i = 1; (opts->use_list[reorder[i]] & USE_PARAMS) == 0; i++);
  1203. opts->start = i;
  1204. /* Roll the reordering into the renumber_map */
  1205. for (i = 0; i < n; i++)
  1206. {
  1207. opts->renumber_map[reorder[i]] = i;
  1208. rev_renumber_map[i] = opts->rev_renumber_map[reorder[i]];
  1209. rev_gen_list[i] = opts->rev_gen_list[reorder[i]];
  1210. }
  1211. fz_free(ctx, opts->rev_renumber_map);
  1212. fz_free(ctx, opts->rev_gen_list);
  1213. opts->rev_renumber_map = rev_renumber_map;
  1214. opts->rev_gen_list = rev_gen_list;
  1215. fz_free(ctx, reorder);
  1216. /* Apply the renumber_map */
  1217. page_objects_list_renumber(opts);
  1218. renumberobjs(xref, opts);
  1219. page_objects_list_sort_and_dedupe(ctx, opts->page_object_lists);
  1220. }
  1221. static void
  1222. update_linearization_params(pdf_document *xref, pdf_write_options *opts)
  1223. {
  1224. int offset;
  1225. pdf_set_int(opts->linear_l, opts->file_len);
  1226. /* Primary hint stream offset (of object, not stream!) */
  1227. pdf_set_int(opts->linear_h0, opts->ofs_list[xref->len-1]);
  1228. /* Primary hint stream length (of object, not stream!) */
  1229. offset = (opts->start == 1 ? opts->main_xref_offset : opts->ofs_list[1] + opts->hintstream_len);
  1230. pdf_set_int(opts->linear_h1, offset - opts->ofs_list[xref->len-1]);
  1231. /* Object number of first pages page object (the first object of page 0) */
  1232. pdf_set_int(opts->linear_o, opts->page_object_lists->page[0]->object[0]);
  1233. /* Offset of end of first page (first page is followed by primary
  1234. * hint stream (object n-1) then remaining pages (object 1...). The
  1235. * primary hint stream counts as part of the first pages data, I think.
  1236. */
  1237. offset = (opts->start == 1 ? opts->main_xref_offset : opts->ofs_list[1] + opts->hintstream_len);
  1238. pdf_set_int(opts->linear_e, offset);
  1239. /* Number of pages in document */
  1240. pdf_set_int(opts->linear_n, opts->page_count);
  1241. /* Offset of first entry in main xref table */
  1242. pdf_set_int(opts->linear_t, opts->first_xref_entry_offset + opts->hintstream_len);
  1243. /* Offset of shared objects hint table in the primary hint stream */
  1244. pdf_set_int(opts->hints_s, opts->hints_shared_offset);
  1245. /* Primary hint stream length */
  1246. pdf_set_int(opts->hints_length, opts->hintstream_len);
  1247. }
  1248. /*
  1249. * Make sure we have loaded objects from object streams.
  1250. */
  1251. static void preloadobjstms(pdf_document *xref)
  1252. {
  1253. pdf_obj *obj;
  1254. int num;
  1255. for (num = 0; num < xref->len; num++)
  1256. {
  1257. if (xref->table[num].type == 'o')
  1258. {
  1259. obj = pdf_load_object(xref, num, 0);
  1260. pdf_drop_obj(obj);
  1261. }
  1262. }
  1263. }
  1264. /*
  1265. * Save streams and objects to the output
  1266. */
  1267. static inline int isbinary(int c)
  1268. {
  1269. if (c == '\n' || c == '\r' || c == '\t')
  1270. return 0;
  1271. return c < 32 || c > 127;
  1272. }
  1273. static int isbinarystream(fz_buffer *buf)
  1274. {
  1275. int i;
  1276. for (i = 0; i < buf->len; i++)
  1277. if (isbinary(buf->data[i]))
  1278. return 1;
  1279. return 0;
  1280. }
  1281. static fz_buffer *hexbuf(fz_context *ctx, unsigned char *p, int n)
  1282. {
  1283. static const char hex[16] = "0123456789abcdef";
  1284. fz_buffer *buf;
  1285. int x = 0;
  1286. buf = fz_new_buffer(ctx, n * 2 + (n / 32) + 2);
  1287. while (n--)
  1288. {
  1289. buf->data[buf->len++] = hex[*p >> 4];
  1290. buf->data[buf->len++] = hex[*p & 15];
  1291. if (++x == 32)
  1292. {
  1293. buf->data[buf->len++] = '\n';
  1294. x = 0;
  1295. }
  1296. p++;
  1297. }
  1298. buf->data[buf->len++] = '>';
  1299. buf->data[buf->len++] = '\n';
  1300. return buf;
  1301. }
  1302. static void addhexfilter(pdf_document *xref, pdf_obj *dict)
  1303. {
  1304. pdf_obj *f, *dp, *newf, *newdp;
  1305. pdf_obj *ahx, *nullobj;
  1306. fz_context *ctx = xref->ctx;
  1307. ahx = pdf_new_name(ctx, "ASCIIHexDecode");
  1308. nullobj = pdf_new_null(ctx);
  1309. newf = newdp = NULL;
  1310. f = pdf_dict_gets(dict, "Filter");
  1311. dp = pdf_dict_gets(dict, "DecodeParms");
  1312. if (pdf_is_name(f))
  1313. {
  1314. newf = pdf_new_array(ctx, 2);
  1315. pdf_array_push(newf, ahx);
  1316. pdf_array_push(newf, f);
  1317. f = newf;
  1318. if (pdf_is_dict(dp))
  1319. {
  1320. newdp = pdf_new_array(ctx, 2);
  1321. pdf_array_push(newdp, nullobj);
  1322. pdf_array_push(newdp, dp);
  1323. dp = newdp;
  1324. }
  1325. }
  1326. else if (pdf_is_array(f))
  1327. {
  1328. pdf_array_insert(f, ahx);
  1329. if (pdf_is_array(dp))
  1330. pdf_array_insert(dp, nullobj);
  1331. }
  1332. else
  1333. f = ahx;
  1334. pdf_dict_puts(dict, "Filter", f);
  1335. if (dp)
  1336. pdf_dict_puts(dict, "DecodeParms", dp);
  1337. pdf_drop_obj(ahx);
  1338. pdf_drop_obj(nullobj);
  1339. pdf_drop_obj(newf);
  1340. pdf_drop_obj(newdp);
  1341. }
  1342. static void copystream(pdf_document *xref, pdf_write_options *opts, pdf_obj *obj_orig, int num, int gen)
  1343. {
  1344. fz_buffer *buf, *tmp;
  1345. pdf_obj *newlen;
  1346. pdf_obj *obj;
  1347. fz_context *ctx = xref->ctx;
  1348. int orig_num = opts->rev_renumber_map[num];
  1349. int orig_gen = opts->rev_gen_list[num];
  1350. buf = pdf_load_raw_renumbered_stream(xref, num, gen, orig_num, orig_gen);
  1351. obj = pdf_copy_dict(ctx, obj_orig);
  1352. if (opts->do_ascii && isbinarystream(buf))
  1353. {
  1354. tmp = hexbuf(ctx, buf->data, buf->len);
  1355. fz_drop_buffer(ctx, buf);
  1356. buf = tmp;
  1357. addhexfilter(xref, obj);
  1358. newlen = pdf_new_int(ctx, buf->len);
  1359. pdf_dict_puts(obj, "Length", newlen);
  1360. pdf_drop_obj(newlen);
  1361. }
  1362. fprintf(opts->out, "%d %d obj\n", num, gen);
  1363. pdf_fprint_obj(opts->out, obj, opts->do_expand == 0);
  1364. fprintf(opts->out, "stream\n");
  1365. fwrite(buf->data, 1, buf->len, opts->out);
  1366. fprintf(opts->out, "endstream\nendobj\n\n");
  1367. fz_drop_buffer(ctx, buf);
  1368. pdf_drop_obj(obj);
  1369. }
  1370. static void expandstream(pdf_document *xref, pdf_write_options *opts, pdf_obj *obj_orig, int num, int gen)
  1371. {
  1372. fz_buffer *buf, *tmp;
  1373. pdf_obj *newlen;
  1374. pdf_obj *obj;
  1375. fz_context *ctx = xref->ctx;
  1376. int orig_num = opts->rev_renumber_map[num];
  1377. int orig_gen = opts->rev_gen_list[num];
  1378. int truncated = 0;
  1379. buf = pdf_load_renumbered_stream(xref, num, gen, orig_num, orig_gen, (opts->continue_on_error ? &truncated : NULL));
  1380. if (truncated && opts->errors)
  1381. (*opts->errors)++;
  1382. obj = pdf_copy_dict(ctx, obj_orig);
  1383. pdf_dict_dels(obj, "Filter");
  1384. pdf_dict_dels(obj, "DecodeParms");
  1385. if (opts->do_ascii && isbinarystream(buf))
  1386. {
  1387. tmp = hexbuf(ctx, buf->data, buf->len);
  1388. fz_drop_buffer(ctx, buf);
  1389. buf = tmp;
  1390. addhexfilter(xref, obj);
  1391. }
  1392. newlen = pdf_new_int(ctx, buf->len);
  1393. pdf_dict_puts(obj, "Length", newlen);
  1394. pdf_drop_obj(newlen);
  1395. fprintf(opts->out, "%d %d obj\n", num, gen);
  1396. pdf_fprint_obj(opts->out, obj, opts->do_expand == 0);
  1397. fprintf(opts->out, "stream\n");
  1398. fwrite(buf->data, 1, buf->len, opts->out);
  1399. fprintf(opts->out, "endstream\nendobj\n\n");
  1400. fz_drop_buffer(ctx, buf);
  1401. pdf_drop_obj(obj);
  1402. }
  1403. static int is_image_filter(char *s)
  1404. {
  1405. if (!strcmp(s, "CCITTFaxDecode") || !strcmp(s, "CCF") ||
  1406. !strcmp(s, "DCTDecode") || !strcmp(s, "DCT") ||
  1407. !strcmp(s, "RunLengthDecode") || !strcmp(s, "RL") ||
  1408. !strcmp(s, "JBIG2Decode") ||
  1409. !strcmp(s, "JPXDecode"))
  1410. return 1;
  1411. return 0;
  1412. }
  1413. static int filter_implies_image(pdf_document *xref, pdf_obj *o)
  1414. {
  1415. if (!o)
  1416. return 0;
  1417. if (pdf_is_name(o))
  1418. return is_image_filter(pdf_to_name(o));
  1419. if (pdf_is_array(o))
  1420. {
  1421. int i, len;
  1422. len = pdf_array_len(o);
  1423. for (i = 0; i < len; i++)
  1424. if (is_image_filter(pdf_to_name(pdf_array_get(o, i))))
  1425. return 1;
  1426. }
  1427. return 0;
  1428. }
  1429. static void writeobject(pdf_document *xref, pdf_write_options *opts, int num, int gen)
  1430. {
  1431. pdf_obj *obj;
  1432. pdf_obj *type;
  1433. fz_context *ctx = xref->ctx;
  1434. fz_try(ctx)
  1435. {
  1436. obj = pdf_load_object(xref, num, gen);
  1437. }
  1438. fz_catch(ctx)
  1439. {
  1440. if (opts->continue_on_error)
  1441. {
  1442. fprintf(opts->out, "%d %d obj\nnull\nendobj\n", num, gen);
  1443. if (opts->errors)
  1444. (*opts->errors)++;
  1445. fz_warn(ctx, "%s", fz_caught(ctx));
  1446. return;
  1447. }
  1448. else
  1449. fz_rethrow(ctx);
  1450. }
  1451. /* skip ObjStm and XRef objects */
  1452. if (pdf_is_dict(obj))
  1453. {
  1454. type = pdf_dict_gets(obj, "Type");
  1455. if (pdf_is_name(type) && !strcmp(pdf_to_name(type), "ObjStm"))
  1456. {
  1457. opts->use_list[num] = 0;
  1458. pdf_drop_obj(obj);
  1459. return;
  1460. }
  1461. if (pdf_is_name(type) && !strcmp(pdf_to_name(type), "XRef"))
  1462. {
  1463. opts->use_list[num] = 0;
  1464. pdf_drop_obj(obj);
  1465. return;
  1466. }
  1467. }
  1468. if (!pdf_is_stream(xref, num, gen))
  1469. {
  1470. fprintf(opts->out, "%d %d obj\n", num, gen);
  1471. pdf_fprint_obj(opts->out, obj, opts->do_expand == 0);
  1472. fprintf(opts->out, "endobj\n\n");
  1473. }
  1474. else if (xref->table[num].stm_ofs < 0 && xref->table[num].stm_buf == NULL)
  1475. {
  1476. fprintf(opts->out, "%d %d obj\n", num, gen);
  1477. pdf_fprint_obj(opts->out, obj, opts->do_expand == 0);
  1478. fprintf(opts->out, "stream\nendstream\nendobj\n\n");
  1479. }
  1480. else
  1481. {
  1482. int dontexpand = 0;
  1483. if (opts->do_expand != 0 && opts->do_expand != fz_expand_all)
  1484. {
  1485. pdf_obj *o;
  1486. if ((o = pdf_dict_gets(obj, "Type"), !strcmp(pdf_to_name(o), "XObject")) &&
  1487. (o = pdf_dict_gets(obj, "Subtype"), !strcmp(pdf_to_name(o), "Image")))
  1488. dontexpand = !(opts->do_expand & fz_expand_images);
  1489. if (o = pdf_dict_gets(obj, "Type"), !strcmp(pdf_to_name(o), "Font"))
  1490. dontexpand = !(opts->do_expand & fz_expand_fonts);
  1491. if (o = pdf_dict_gets(obj, "Type"), !strcmp(pdf_to_name(o), "FontDescriptor"))
  1492. dontexpand = !(opts->do_expand & fz_expand_fonts);
  1493. if ((o = pdf_dict_gets(obj, "Length1")) != NULL)
  1494. dontexpand = !(opts->do_expand & fz_expand_fonts);
  1495. if ((o = pdf_dict_gets(obj, "Length2")) != NULL)
  1496. dontexpand = !(opts->do_expand & fz_expand_fonts);
  1497. if ((o = pdf_dict_gets(obj, "Length3")) != NULL)
  1498. dontexpand = !(opts->do_expand & fz_expand_fonts);
  1499. if (o = pdf_dict_gets(obj, "Subtype"), !strcmp(pdf_to_name(o), "Type1C"))
  1500. dontexpand = !(opts->do_expand & fz_expand_fonts);
  1501. if (o = pdf_dict_gets(obj, "Subtype"), !strcmp(pdf_to_name(o), "CIDFontType0C"))
  1502. dontexpand = !(opts->do_expand & fz_expand_fonts);
  1503. if (o = pdf_dict_gets(obj, "Filter"), filter_implies_image(xref, o))
  1504. dontexpand = !(opts->do_expand & fz_expand_images);
  1505. if (pdf_dict_gets(obj, "Width") != NULL && pdf_dict_gets(obj, "Height") != NULL)
  1506. dontexpand = !(opts->do_expand & fz_expand_images);
  1507. }
  1508. fz_try(ctx)
  1509. {
  1510. if (opts->do_expand && !dontexpand && !pdf_is_jpx_image(ctx, obj))
  1511. expandstream(xref, opts, obj, num, gen);
  1512. else
  1513. copystream(xref, opts, obj, num, gen);
  1514. }
  1515. fz_catch(ctx)
  1516. {
  1517. if (opts->continue_on_error)
  1518. {
  1519. fprintf(opts->out, "%d %d obj\nnull\nendobj\n", num, gen);
  1520. if (opts->errors)
  1521. (*opts->errors)++;
  1522. fz_warn(ctx, "%s", fz_caught(ctx));
  1523. }
  1524. else
  1525. {
  1526. pdf_drop_obj(obj);
  1527. fz_rethrow(ctx);
  1528. }
  1529. }
  1530. }
  1531. pdf_drop_obj(obj);
  1532. }
  1533. static void writexref(pdf_document *xref, pdf_write_options *opts, int from, int to, int first, int main_xref_offset, int startxref)
  1534. {
  1535. pdf_obj *trailer = NULL;
  1536. pdf_obj *obj;
  1537. pdf_obj *nobj = NULL;
  1538. int num;
  1539. fz_context *ctx = xref->ctx;
  1540. fprintf(opts->out, "xref\n%d %d\n", from, to - from);
  1541. opts->first_xref_entry_offset = ftell(opts->out);
  1542. for (num = from; num < to; num++)
  1543. {
  1544. if (opts->use_list[num])
  1545. fprintf(opts->out, "%010d %05d n \n", opts->ofs_list[num], opts->gen_list[num]);
  1546. else
  1547. fprintf(opts->out, "%010d %05d f \n", opts->ofs_list[num], opts->gen_list[num]);
  1548. }
  1549. fprintf(opts->out, "\n");
  1550. fz_var(trailer);
  1551. fz_var(nobj);
  1552. fz_try(ctx)
  1553. {
  1554. trailer = pdf_new_dict(ctx, 5);
  1555. nobj = pdf_new_int(ctx, to);
  1556. pdf_dict_puts(trailer, "Size", nobj);
  1557. pdf_drop_obj(nobj);
  1558. nobj = NULL;
  1559. if (first)
  1560. {
  1561. obj = pdf_dict_gets(xref->trailer, "Info");
  1562. if (obj)
  1563. pdf_dict_puts(trailer, "Info", obj);
  1564. obj = pdf_dict_gets(xref->trailer, "Root");
  1565. if (obj)
  1566. pdf_dict_puts(trailer, "Root", obj);
  1567. obj = pdf_dict_gets(xref->trailer, "ID");
  1568. if (obj)
  1569. pdf_dict_puts(trailer, "ID", obj);
  1570. }
  1571. if (main_xref_offset != 0)
  1572. {
  1573. nobj = pdf_new_int(ctx, main_xref_offset);
  1574. pdf_dict_puts(trailer, "Prev", nobj);
  1575. pdf_drop_obj(nobj);
  1576. nobj = NULL;
  1577. }
  1578. }
  1579. fz_always(ctx)
  1580. {
  1581. pdf_drop_obj(nobj);
  1582. }
  1583. fz_catch(ctx)
  1584. {
  1585. fz_rethrow(ctx);
  1586. }
  1587. fprintf(opts->out, "trailer\n");
  1588. pdf_fprint_obj(opts->out, trailer, opts->do_expand == 0);
  1589. fprintf(opts->out, "\n");
  1590. pdf_drop_obj(trailer);
  1591. fprintf(opts->out, "startxref\n%d\n%%%%EOF\n", startxref);
  1592. }
  1593. static void
  1594. padto(FILE *file, int target)
  1595. {
  1596. int pos = ftell(file);
  1597. assert(pos <= target);
  1598. while (pos < target)
  1599. {
  1600. fputc('\n', file);
  1601. pos++;
  1602. }
  1603. }
  1604. static void
  1605. dowriteobject(pdf_document *xref, pdf_write_options *opts, int num, int pass)
  1606. {
  1607. if (xref->table[num].type == 'f')
  1608. opts->gen_list[num] = xref->table[num].gen;
  1609. if (xref->table[num].type == 'n')
  1610. opts->gen_list[num] = xref->table[num].gen;
  1611. if (xref->table[num].type == 'o')
  1612. opts->gen_list[num] = 0;
  1613. /* If we are renumbering, then make sure all generation numbers are
  1614. * zero (except object 0 which must be free, and have a gen number of
  1615. * 65535). Changing the generation numbers (and indeed object numbers)
  1616. * will break encryption - so only do this if we are renumbering
  1617. * anyway. */
  1618. if (opts->do_garbage >= 2)
  1619. opts->gen_list[num] = (num == 0 ? 65535 : 0);
  1620. if (opts->do_garbage && !opts->use_list[num])
  1621. return;
  1622. if (xref->table[num].type == 'n' || xref->table[num].type == 'o')
  1623. {
  1624. if (pass > 0)
  1625. padto(opts->out, opts->ofs_list[num]);
  1626. opts->ofs_list[num] = ftell(opts->out);
  1627. writeobject(xref, opts, num, opts->gen_list[num]);
  1628. }
  1629. else
  1630. opts->use_list[num] = 0;
  1631. }
  1632. static void
  1633. writeobjects(pdf_document *xref, pdf_write_options *opts, int pass)
  1634. {
  1635. int num;
  1636. fprintf(opts->out, "%%PDF-%d.%d\n", xref->version / 10, xref->version % 10);
  1637. fprintf(opts->out, "%%\316\274\341\277\246\n\n");
  1638. dowriteobject(xref, opts, opts->start, pass);
  1639. if (opts->do_linear)
  1640. {
  1641. /* Write first xref */
  1642. if (pass == 0)
  1643. opts->first_xref_offset = ftell(opts->out);
  1644. else
  1645. padto(opts->out, opts->first_xref_offset);
  1646. writexref(xref, opts, opts->start, xref->len, 1, opts->main_xref_offset, 0);
  1647. }
  1648. for (num = opts->start+1; num < xref->len; num++)
  1649. dowriteobject(xref, opts, num, pass);
  1650. if (opts->do_linear && pass == 1)
  1651. {
  1652. int offset = (opts->start == 1 ? opts->main_xref_offset : opts->ofs_list[1] + opts->hintstream_len);
  1653. padto(opts->out, offset);
  1654. }
  1655. for (num = 1; num < opts->start; num++)
  1656. {
  1657. if (pass == 1)
  1658. opts->ofs_list[num] += opts->hintstream_len;
  1659. dowriteobject(xref, opts, num, pass);
  1660. }
  1661. }
  1662. static int
  1663. my_log2(int x)
  1664. {
  1665. int i = 0;
  1666. if (x <= 0)
  1667. return 0;
  1668. while ((1<<i) <= x && (1<<i) > 0)
  1669. i++;
  1670. if ((1<<i) <= 0)
  1671. return 0;
  1672. return i;
  1673. }
  1674. static void
  1675. make_page_offset_hints(pdf_document *xref, pdf_write_options *opts, fz_buffer *buf)
  1676. {
  1677. fz_context *ctx = xref->ctx;
  1678. int i, j;
  1679. int min_objs_per_page, max_objs_per_page;
  1680. int min_page_length, max_page_length;
  1681. int objs_per_page_bits;
  1682. int min_shared_object, max_shared_object;
  1683. int max_shared_object_refs;
  1684. int min_shared_length, max_shared_length;
  1685. page_objects **pop = &opts->page_object_lists->page[0];
  1686. int page_len_bits, shared_object_bits, shared_object_id_bits;
  1687. int shared_length_bits;
  1688. min_shared_object = xref->len;
  1689. max_shared_object = 1;
  1690. min_shared_length = opts->file_len;
  1691. max_shared_length = 0;
  1692. for (i=1; i < xref->len; i++)
  1693. {
  1694. int min, max, page;
  1695. min = opts->ofs_list[i];
  1696. if (i == opts->start-1 || (opts->start == 1 && i == xref->len-1))
  1697. max = opts->main_xref_offset;
  1698. else if (i == xref->len-1)
  1699. max = opts->ofs_list[1];
  1700. else
  1701. max = opts->ofs_list[i+1];
  1702. assert(max > min);
  1703. if (opts->use_list[i] & USE_SHARED)
  1704. {
  1705. page = -1;
  1706. if (i < min_shared_object)
  1707. min_shared_object = i;
  1708. if (i > max_shared_object)
  1709. max_shared_object = i;
  1710. if (min_shared_length > max - min)
  1711. min_shared_length = max - min;
  1712. if (max_shared_length < max - min)
  1713. max_shared_length = max - min;
  1714. }
  1715. else if (opts->use_list[i] & (USE_CATALOGUE | USE_HINTS | USE_PARAMS))
  1716. page = -1;
  1717. else if (opts->use_list[i] & USE_PAGE1)
  1718. {
  1719. page = 0;
  1720. if (min_shared_length > max - min)
  1721. min_shared_length = max - min;
  1722. if (max_shared_length < max - min)
  1723. max_shared_length = max - min;
  1724. }
  1725. else if (opts->use_list[i] == 0)
  1726. page = -1;
  1727. else
  1728. page = opts->use_list[i]>>USE_PAGE_SHIFT;
  1729. if (page >= 0)
  1730. {
  1731. pop[page]->num_objects++;
  1732. if (pop[page]->min_ofs > min)
  1733. pop[page]->min_ofs = min;
  1734. if (pop[page]->max_ofs < max)
  1735. pop[page]->max_ofs = max;
  1736. }
  1737. }
  1738. min_objs_per_page = max_objs_per_page = pop[0]->num_objects;
  1739. min_page_length = max_page_length = pop[0]->max_ofs - pop[0]->min_ofs;
  1740. for (i=1; i < opts->page_count; i++)
  1741. {
  1742. int tmp;
  1743. if (min_objs_per_page > pop[i]->num_objects)
  1744. min_objs_per_page = pop[i]->num_objects;
  1745. if (max_objs_per_page < pop[i]->num_objects)
  1746. max_objs_per_page = pop[i]->num_objects;
  1747. tmp = pop[i]->max_ofs - pop[i]->min_ofs;
  1748. if (tmp < min_page_length)
  1749. min_page_length = tmp;
  1750. if (tmp > max_page_length)
  1751. max_page_length = tmp;
  1752. }
  1753. for (i=0; i < opts->page_count; i++)
  1754. {
  1755. int count = 0;
  1756. int j;
  1757. page_objects *po = opts->page_object_lists->page[i];
  1758. for (j = 0; j < po->len; j++)
  1759. {
  1760. if (i == 0 && opts->use_list[po->object[j]] & USE_PAGE1)
  1761. count++;
  1762. else if (i != 0 && opts->use_list[po->object[j]] & USE_SHARED)
  1763. count++;
  1764. }
  1765. po->num_shared = count;
  1766. if (i == 0 || count > max_shared_object_refs)
  1767. max_shared_object_refs = count;
  1768. }
  1769. if (min_shared_object > max_shared_object)
  1770. min_shared_object = max_shared_object = 0;
  1771. /* Table F.3 - Header */
  1772. /* Header Item 1: Least number of objects in a page */
  1773. fz_write_buffer_bits(ctx, buf, min_objs_per_page, 32);
  1774. /* Header Item 2: Location of first pages page object */
  1775. fz_write_buffer_bits(ctx, buf, opts->ofs_list[pop[0]->page_object_number], 32);
  1776. /* Header Item 3: Number of bits required to represent the difference
  1777. * between the greatest and least number of objects in a page. */
  1778. objs_per_page_bits = my_log2(max_objs_per_page - min_objs_per_page);
  1779. fz_write_buffer_bits(ctx, buf, objs_per_page_bits, 16);
  1780. /* Header Item 4: Least length of a page. */
  1781. fz_write_buffer_bits(ctx, buf, min_page_length, 32);
  1782. /* Header Item 5: Number of bits needed to represent the difference
  1783. * between the greatest and least length of a page. */
  1784. page_len_bits = my_log2(max_page_length - min_page_length);
  1785. fz_write_buffer_bits(ctx, buf, page_len_bits, 16);
  1786. /* Header Item 6: Least offset to start of content stream (Acrobat
  1787. * sets this to always be 0) */
  1788. fz_write_buffer_bits(ctx, buf, 0, 32);
  1789. /* Header Item 7: Number of bits needed to represent the difference
  1790. * between the greatest and least offset to content stream (Acrobat
  1791. * sets this to always be 0) */
  1792. fz_write_buffer_bits(ctx, buf, 0, 16);
  1793. /* Header Item 8: Least content stream length. (Acrobat
  1794. * sets this to always be 0) */
  1795. fz_write_buffer_bits(ctx, buf, 0, 32);
  1796. /* Header Item 9: Number of bits needed to represent the difference
  1797. * between the greatest and least content stream length (Acrobat
  1798. * sets this to always be the same as item 5) */
  1799. fz_write_buffer_bits(ctx, buf, page_len_bits, 16);
  1800. /* Header Item 10: Number of bits needed to represent the greatest
  1801. * number of shared object references. */
  1802. shared_object_bits = my_log2(max_shared_object_refs);
  1803. fz_write_buffer_bits(ctx, buf, shared_object_bits, 16);
  1804. /* Header Item 11: Number of bits needed to represent the greatest
  1805. * shared object identifier. */
  1806. shared_object_id_bits = my_log2(max_shared_object - min_shared_object + pop[0]->num_shared);
  1807. fz_write_buffer_bits(ctx, buf, shared_object_id_bits, 16);
  1808. /* Header Item 12: Number of bits needed to represent the numerator
  1809. * of the fractions. We always send 0. */
  1810. fz_write_buffer_bits(ctx, buf, 0, 16);
  1811. /* Header Item 13: Number of bits needed to represent the denominator
  1812. * of the fractions. We always send 0. */
  1813. fz_write_buffer_bits(ctx, buf, 0, 16);
  1814. /* Table F.4 - Page offset hint table (per page) */
  1815. /* Item 1: A number that, when added to the least number of objects
  1816. * on a page, gives the number of objects in the page. */
  1817. for (i = 0; i < opts->page_count; i++)
  1818. {
  1819. fz_write_buffer_bits(ctx, buf, pop[i]->num_objects - min_objs_per_page, objs_per_page_bits);
  1820. }
  1821. fz_write_buffer_pad(ctx, buf);
  1822. /* Item 2: A number that, when added to the least page length, gives
  1823. * the length of the page in bytes. */
  1824. for (i = 0; i < opts->page_count; i++)
  1825. {
  1826. fz_write_buffer_bits(ctx, buf, pop[i]->max_ofs - pop[i]->min_ofs - min_page_length, page_len_bits);
  1827. }
  1828. fz_write_buffer_pad(ctx, buf);
  1829. /* Item 3: The number of shared objects referenced from the page. */
  1830. for (i = 0; i < opts->page_count; i++)
  1831. {
  1832. fz_write_buffer_bits(ctx, buf, pop[i]->num_shared, shared_object_bits);
  1833. }
  1834. fz_write_buffer_pad(ctx, buf);
  1835. /* Item 4: Shared object id for each shared object ref in every page.
  1836. * Spec says "not for page 1", but acrobat does send page 1's - all
  1837. * as zeros. */
  1838. for (i = 0; i < opts->page_count; i++)
  1839. {
  1840. for (j = 0; j < pop[i]->len; j++)
  1841. {
  1842. int o = pop[i]->object[j];
  1843. if (i == 0 && opts->use_list[o] & USE_PAGE1)
  1844. fz_write_buffer_bits(ctx, buf, 0 /* o - pop[0]->page_object_number */, shared_object_id_bits);
  1845. if (i != 0 && opts->use_list[o] & USE_SHARED)
  1846. fz_write_buffer_bits(ctx, buf, o - min_shared_object + pop[0]->num_shared, shared_object_id_bits);
  1847. }
  1848. }
  1849. fz_write_buffer_pad(ctx, buf);
  1850. /* Item 5: Numerator of fractional position for each shared object reference. */
  1851. /* We always send 0 in 0 bits */
  1852. /* Item 6: A number that, when added to the least offset to the start
  1853. * of the content stream (F.3 Item 6), gives the offset in bytes of
  1854. * start of the pages content stream object relative to the beginning
  1855. * of the page. Always 0 in 0 bits. */
  1856. /* Item 7: A number that, when added to the least content stream length
  1857. * (F.3 Item 8), gives the length of the pages content stream object.
  1858. * Always == Item 2 as least content stream length = least page stream
  1859. * length.
  1860. */
  1861. for (i = 0; i < opts->page_count; i++)
  1862. {
  1863. fz_write_buffer_bits(ctx, buf, pop[i]->max_ofs - pop[i]->min_ofs - min_page_length, page_len_bits);
  1864. }
  1865. /* Pad, and then do shared object hint table */
  1866. fz_write_buffer_pad(ctx, buf);
  1867. opts->hints_shared_offset = buf->len;
  1868. /* Table F.5: */
  1869. /* Header Item 1: Object number of the first object in the shared
  1870. * objects section. */
  1871. fz_write_buffer_bits(ctx, buf, min_shared_object, 32);
  1872. /* Header Item 2: Location of first object in the shared objects
  1873. * section. */
  1874. fz_write_buffer_bits(ctx, buf, opts->ofs_list[min_shared_object], 32);
  1875. /* Header Item 3: The number of shared object entries for the first
  1876. * page. */
  1877. fz_write_buffer_bits(ctx, buf, pop[0]->num_shared, 32);
  1878. /* Header Item 4: The number of shared object entries for the shared
  1879. * objects section + first page. */
  1880. fz_write_buffer_bits(ctx, buf, max_shared_object - min_shared_object + pop[0]->num_shared, 32);
  1881. /* Header Item 5: The number of bits needed to represent the greatest
  1882. * number of objects in a shared object group (Always 0). */
  1883. fz_write_buffer_bits(ctx, buf, 0, 16);
  1884. /* Header Item 6: The least length of a shared object group in bytes. */
  1885. fz_write_buffer_bits(ctx, buf, min_shared_length, 32);
  1886. /* Header Item 7: The number of bits required to represent the
  1887. * difference between the greatest and least length of a shared object
  1888. * group. */
  1889. shared_length_bits = my_log2(max_shared_length - min_shared_length);
  1890. fz_write_buffer_bits(ctx, buf, shared_length_bits, 16);
  1891. /* Table F.6 */
  1892. /* Item 1: Shared object group length (page 1 objects) */
  1893. for (j = 0; j < pop[0]->len; j++)
  1894. {
  1895. int o = pop[0]->object[j];
  1896. int min, max;
  1897. min = opts->ofs_list[o];
  1898. if (o == opts->start-1)
  1899. max = opts->main_xref_offset;
  1900. else if (o < xref->len-1)
  1901. max = opts->ofs_list[o+1];
  1902. else
  1903. max = opts->ofs_list[1];
  1904. if (opts->use_list[o] & USE_PAGE1)
  1905. fz_write_buffer_bits(ctx, buf, max - min - min_shared_length, shared_length_bits);
  1906. }
  1907. /* Item 1: Shared object group length (shared objects) */
  1908. for (i = min_shared_object; i <= max_shared_object; i++)
  1909. {
  1910. int min, max;
  1911. min = opts->ofs_list[i];
  1912. if (i == opts->start-1)
  1913. max = opts->main_xref_offset;
  1914. else if (i < xref->len-1)
  1915. max = opts->ofs_list[i+1];
  1916. else
  1917. max = opts->ofs_list[1];
  1918. fz_write_buffer_bits(ctx, buf, max - min - min_shared_length, shared_length_bits);
  1919. }
  1920. fz_write_buffer_pad(ctx, buf);
  1921. /* Item 2: MD5 presence flags */
  1922. for (i = max_shared_object - min_shared_object + pop[0]->num_shared; i > 0; i--)
  1923. {
  1924. fz_write_buffer_bits(ctx, buf, 0, 1);
  1925. }
  1926. fz_write_buffer_pad(ctx, buf);
  1927. /* Item 3: MD5 sums (not present) */
  1928. fz_write_buffer_pad(ctx, buf);
  1929. /* Item 4: Number of objects in the group (not present) */
  1930. }
  1931. static void
  1932. make_hint_stream(pdf_document *xref, pdf_write_options *opts)
  1933. {
  1934. fz_context *ctx = xref->ctx;
  1935. fz_buffer *buf = fz_new_buffer(ctx, 100);
  1936. fz_try(ctx)
  1937. {
  1938. make_page_offset_hints(xref, opts, buf);
  1939. pdf_update_stream(xref, xref->len-1, buf);
  1940. opts->hintstream_len = buf->len;
  1941. fz_drop_buffer(ctx, buf);
  1942. }
  1943. fz_catch(ctx)
  1944. {
  1945. fz_drop_buffer(ctx, buf);
  1946. fz_rethrow(ctx);
  1947. }
  1948. }
  #ifdef DEBUG_WRITING
  /*
   * Debug aid: for every xref slot print one line to stderr of the form
   * "<object number>@<ofs_list entry>: use=<use_list entry>".
   */
  static void dump_object_details(pdf_document *xref, pdf_write_options *opts)
  {
      int num = 0;

      while (num < xref->len)
      {
          fprintf(stderr, "%d@%d: use=%d\n", num, opts->ofs_list[num], opts->use_list[num]);
          num++;
      }
  }
  #endif
  1959. void pdf_write_document(pdf_document *xref, char *filename, fz_write_options *fz_opts)
  1960. {
  1961. int lastfree;
  1962. int num;
  1963. pdf_write_options opts = { 0 };
  1964. fz_context *ctx;
  1965. if (!xref)
  1966. return;
  1967. ctx = xref->ctx;
  1968. opts.out = fopen(filename, "wb");
  1969. if (!opts.out)
  1970. fz_throw(ctx, "cannot open output file '%s'", filename);
  1971. fz_try(ctx)
  1972. {
  1973. opts.do_expand = fz_opts ? fz_opts->do_expand : 0;
  1974. opts.do_garbage = fz_opts ? fz_opts->do_garbage : 0;
  1975. opts.do_ascii = fz_opts ? fz_opts->do_ascii: 0;
  1976. opts.do_linear = fz_opts ? fz_opts->do_linear: 0;
  1977. opts.start = 0;
  1978. opts.main_xref_offset = INT_MIN;
  1979. /* We deliberately make these arrays long enough to cope with
  1980. * 1 to n access rather than 0..n-1, and add space for 2 new
  1981. * extra entries that may be required for linearization. */
  1982. opts.use_list = fz_malloc_array(ctx, xref->len + 3, sizeof(int));
  1983. opts.ofs_list = fz_malloc_array(ctx, xref->len + 3, sizeof(int));
  1984. opts.gen_list = fz_calloc(ctx, xref->len + 3, sizeof(int));
  1985. opts.renumber_map = fz_malloc_array(ctx, xref->len + 3, sizeof(int));
  1986. opts.rev_renumber_map = fz_malloc_array(ctx, xref->len + 3, sizeof(int));
  1987. opts.rev_gen_list = fz_malloc_array(ctx, xref->len + 3, sizeof(int));
  1988. opts.continue_on_error = fz_opts->continue_on_error;
  1989. opts.errors = fz_opts->errors;
  1990. for (num = 0; num < xref->len; num++)
  1991. {
  1992. opts.use_list[num] = 0;
  1993. opts.ofs_list[num] = 0;
  1994. opts.renumber_map[num] = num;
  1995. opts.rev_renumber_map[num] = num;
  1996. opts.rev_gen_list[num] = xref->table[num].gen;
  1997. }
  1998. /* Make sure any objects hidden in compressed streams have been loaded */
  1999. preloadobjstms(xref);
  2000. /* Sweep & mark objects from the trailer */
  2001. if (opts.do_garbage >= 1)
  2002. sweepobj(xref, &opts, xref->trailer);
  2003. else
  2004. for (num = 0; num < xref->len; num++)
  2005. opts.use_list[num] = 1;
  2006. /* Coalesce and renumber duplicate objects */
  2007. if (opts.do_garbage >= 3)
  2008. removeduplicateobjs(xref, &opts);
  2009. /* Compact xref by renumbering and removing unused objects */
  2010. if (opts.do_garbage >= 2 || opts.do_linear)
  2011. compactxref(xref, &opts);
  2012. /* Make renumbering affect all indirect references and update xref */
  2013. if (opts.do_garbage >= 2 || opts.do_linear)
  2014. renumberobjs(xref, &opts);
  2015. if (opts.do_linear)
  2016. {
  2017. linearize(xref, &opts);
  2018. }
  2019. writeobjects(xref, &opts, 0);
  2020. #ifdef DEBUG_WRITING
  2021. dump_object_details(xref, &opts);
  2022. #endif
  2023. /* Construct linked list of free object slots */
  2024. lastfree = 0;
  2025. for (num = 0; num < xref->len; num++)
  2026. {
  2027. if (!opts.use_list[num])
  2028. {
  2029. opts.gen_list[num]++;
  2030. opts.ofs_list[lastfree] = num;
  2031. lastfree = num;
  2032. }
  2033. }
  2034. if (opts.do_linear)
  2035. {
  2036. opts.main_xref_offset = ftell(opts.out);
  2037. writexref(xref, &opts, 0, opts.start, 0, 0, opts.first_xref_offset);
  2038. opts.file_len = ftell(opts.out);
  2039. make_hint_stream(xref, &opts);
  2040. opts.file_len += opts.hintstream_len;
  2041. opts.main_xref_offset += opts.hintstream_len;
  2042. update_linearization_params(xref, &opts);
  2043. fseek(opts.out, 0, 0);
  2044. writeobjects(xref, &opts, 1);
  2045. padto(opts.out, opts.main_xref_offset);
  2046. writexref(xref, &opts, 0, opts.start, 0, 0, opts.first_xref_offset);
  2047. }
  2048. else
  2049. {
  2050. opts.first_xref_offset = ftell(opts.out);
  2051. writexref(xref, &opts, 0, xref->len, 1, 0, opts.first_xref_offset);
  2052. }
  2053. xref->dirty = 0;
  2054. }
  2055. fz_always(ctx)
  2056. {
  2057. #ifdef DEBUG_LINEARIZATION
  2058. page_objects_dump(&opts);
  2059. objects_dump(xref, &opts);
  2060. #endif
  2061. fz_free(ctx, opts.use_list);
  2062. fz_free(ctx, opts.ofs_list);
  2063. fz_free(ctx, opts.gen_list);
  2064. fz_free(ctx, opts.renumber_map);
  2065. fz_free(ctx, opts.rev_renumber_map);
  2066. fz_free(ctx, opts.rev_gen_list);
  2067. pdf_drop_obj(opts.linear_l);
  2068. pdf_drop_obj(opts.linear_h0);
  2069. pdf_drop_obj(opts.linear_h1);
  2070. pdf_drop_obj(opts.linear_o);
  2071. pdf_drop_obj(opts.linear_e);
  2072. pdf_drop_obj(opts.linear_n);
  2073. pdf_drop_obj(opts.linear_t);
  2074. pdf_drop_obj(opts.hints_s);
  2075. pdf_drop_obj(opts.hints_length);
  2076. page_objects_list_destroy(ctx, opts.page_object_lists);
  2077. fclose(opts.out);
  2078. }
  2079. fz_catch(ctx)
  2080. {
  2081. fz_rethrow(ctx);
  2082. }
  2083. }