/* dev_text.c -- text extraction device, plus HTML, XML and plain-text output of the extracted page. */
  1. #include "fitz-internal.h"
  2. #define LINE_DIST 0.9f
  3. #define SPACE_DIST 0.2f
  4. #define SPACE_MAX_DIST 15.0f
  5. #define PARAGRAPH_DIST 0.5f
  6. #include <ft2build.h>
  7. #include FT_FREETYPE_H
  8. #include FT_ADVANCES_H
  9. typedef struct fz_text_device_s fz_text_device;
  10. struct fz_text_device_s
  11. {
  12. fz_text_sheet *sheet;
  13. fz_text_page *page;
  14. fz_text_line cur_line;
  15. fz_text_span cur_span;
  16. fz_point point;
  17. int lastchar;
  18. };
  19. fz_text_sheet *
  20. fz_new_text_sheet(fz_context *ctx)
  21. {
  22. fz_text_sheet *sheet = fz_malloc(ctx, sizeof *sheet);
  23. sheet->maxid = 0;
  24. sheet->style = NULL;
  25. return sheet;
  26. }
  27. void
  28. fz_free_text_sheet(fz_context *ctx, fz_text_sheet *sheet)
  29. {
  30. fz_text_style *style = sheet->style;
  31. while (style)
  32. {
  33. fz_text_style *next = style->next;
  34. fz_drop_font(ctx, style->font);
  35. fz_free(ctx, style);
  36. style = next;
  37. }
  38. fz_free(ctx, sheet);
  39. }
  40. static fz_text_style *
  41. fz_lookup_text_style_imp(fz_context *ctx, fz_text_sheet *sheet,
  42. float size, fz_font *font, int wmode, int script)
  43. {
  44. fz_text_style *style;
  45. for (style = sheet->style; style; style = style->next)
  46. {
  47. if (style->font == font &&
  48. style->size == size &&
  49. style->wmode == wmode &&
  50. style->script == script) /* FIXME: others */
  51. {
  52. return style;
  53. }
  54. }
  55. /* Better make a new one and add it to our list */
  56. style = fz_malloc(ctx, sizeof *style);
  57. style->id = sheet->maxid++;
  58. style->font = fz_keep_font(ctx, font);
  59. style->size = size;
  60. style->wmode = wmode;
  61. style->script = script;
  62. style->next = sheet->style;
  63. sheet->style = style;
  64. return style;
  65. }
  66. static fz_text_style *
  67. fz_lookup_text_style(fz_context *ctx, fz_text_sheet *sheet, fz_text *text, const fz_matrix *ctm,
  68. fz_colorspace *colorspace, float *color, float alpha, fz_stroke_state *stroke)
  69. {
  70. float size = 1.0f;
  71. fz_font *font = text ? text->font : NULL;
  72. int wmode = text ? text->wmode : 0;
  73. if (ctm && text)
  74. {
  75. fz_matrix tm = text->trm;
  76. fz_matrix trm;
  77. tm.e = 0;
  78. tm.f = 0;
  79. fz_concat(&trm, &tm, ctm);
  80. size = fz_matrix_expansion(&trm);
  81. }
  82. return fz_lookup_text_style_imp(ctx, sheet, size, font, wmode, 0);
  83. }
  84. fz_text_page *
  85. fz_new_text_page(fz_context *ctx, const fz_rect *mediabox)
  86. {
  87. fz_text_page *page = fz_malloc(ctx, sizeof(*page));
  88. page->mediabox = *mediabox;
  89. page->len = 0;
  90. page->cap = 0;
  91. page->blocks = NULL;
  92. return page;
  93. }
  94. void
  95. fz_free_text_page(fz_context *ctx, fz_text_page *page)
  96. {
  97. fz_text_block *block;
  98. fz_text_line *line;
  99. fz_text_span *span;
  100. for (block = page->blocks; block < page->blocks + page->len; block++)
  101. {
  102. for (line = block->lines; line < block->lines + block->len; line++)
  103. {
  104. for (span = line->spans; span < line->spans + line->len; span++)
  105. {
  106. fz_free(ctx, span->text);
  107. }
  108. fz_free(ctx, line->spans);
  109. }
  110. fz_free(ctx, block->lines);
  111. }
  112. fz_free(ctx, page->blocks);
  113. fz_free(ctx, page);
  114. }
  115. static void
  116. append_char(fz_context *ctx, fz_text_span *span, int c, fz_rect bbox)
  117. {
  118. if (span->len == span->cap)
  119. {
  120. int new_cap = fz_maxi(64, span->cap * 2);
  121. span->text = fz_resize_array(ctx, span->text, new_cap, sizeof(*span->text));
  122. span->cap = new_cap;
  123. }
  124. fz_union_rect(&span->bbox, &bbox);
  125. span->text[span->len].c = c;
  126. span->text[span->len].bbox = bbox;
  127. span->len++;
  128. }
  129. static void
  130. init_span(fz_context *ctx, fz_text_span *span, fz_text_style *style)
  131. {
  132. span->style = style;
  133. span->bbox = fz_empty_rect;
  134. span->len = span->cap = 0;
  135. span->text = NULL;
  136. }
  137. static void
  138. append_span(fz_context *ctx, fz_text_line *line, fz_text_span *span)
  139. {
  140. if (span->len == 0)
  141. return;
  142. if (line->len == line->cap)
  143. {
  144. int new_cap = fz_maxi(8, line->cap * 2);
  145. line->spans = fz_resize_array(ctx, line->spans, new_cap, sizeof(*line->spans));
  146. line->cap = new_cap;
  147. }
  148. fz_union_rect(&line->bbox, &span->bbox);
  149. line->spans[line->len++] = *span;
  150. }
  151. static void
  152. init_line(fz_context *ctx, fz_text_line *line)
  153. {
  154. line->bbox = fz_empty_rect;
  155. line->len = line->cap = 0;
  156. line->spans = NULL;
  157. }
  158. static void
  159. append_line(fz_context *ctx, fz_text_block *block, fz_text_line *line)
  160. {
  161. if (block->len == block->cap)
  162. {
  163. int new_cap = fz_maxi(16, block->cap * 2);
  164. block->lines = fz_resize_array(ctx, block->lines, new_cap, sizeof *block->lines);
  165. block->cap = new_cap;
  166. }
  167. fz_union_rect(&block->bbox, &line->bbox);
  168. block->lines[block->len++] = *line;
  169. }
  170. static fz_text_block *
  171. lookup_block_for_line(fz_context *ctx, fz_text_page *page, fz_text_line *line)
  172. {
  173. float size = line->len > 0 && line->spans[0].len > 0 ? line->spans[0].style->size : 1;
  174. int i;
  175. for (i = 0; i < page->len; i++)
  176. {
  177. fz_text_block *block = page->blocks + i;
  178. float w = block->bbox.x1 - block->bbox.x0;
  179. float dx = line->bbox.x0 - block->bbox.x0;
  180. float dy = line->bbox.y0 - block->bbox.y1;
  181. if (dy > -size * 1.5f && dy < size * PARAGRAPH_DIST)
  182. if (line->bbox.x0 <= block->bbox.x1 && line->bbox.x1 >= block->bbox.x0)
  183. if (fz_abs(dx) < w / 2)
  184. return block;
  185. }
  186. if (page->len == page->cap)
  187. {
  188. int new_cap = fz_maxi(16, page->cap * 2);
  189. page->blocks = fz_resize_array(ctx, page->blocks, new_cap, sizeof(*page->blocks));
  190. page->cap = new_cap;
  191. }
  192. page->blocks[page->len].bbox = fz_empty_rect;
  193. page->blocks[page->len].len = 0;
  194. page->blocks[page->len].cap = 0;
  195. page->blocks[page->len].lines = NULL;
  196. return &page->blocks[page->len++];
  197. }
  198. static void
  199. insert_line(fz_context *ctx, fz_text_page *page, fz_text_line *line)
  200. {
  201. if (line->len == 0)
  202. return;
  203. append_line(ctx, lookup_block_for_line(ctx, page, line), line);
  204. }
  205. static fz_rect
  206. fz_split_bbox(fz_rect bbox, int i, int n)
  207. {
  208. float w = (bbox.x1 - bbox.x0) / n;
  209. float x0 = bbox.x0;
  210. bbox.x0 = x0 + i * w;
  211. bbox.x1 = x0 + (i + 1) * w;
  212. return bbox;
  213. }
  214. static void
  215. fz_flush_text_line(fz_context *ctx, fz_text_device *dev, fz_text_style *style)
  216. {
  217. append_span(ctx, &dev->cur_line, &dev->cur_span);
  218. insert_line(ctx, dev->page, &dev->cur_line);
  219. init_span(ctx, &dev->cur_span, style);
  220. init_line(ctx, &dev->cur_line);
  221. }
  222. static void
  223. fz_add_text_char_imp(fz_context *ctx, fz_text_device *dev, fz_text_style *style, int c, fz_rect bbox)
  224. {
  225. if (!dev->cur_span.style)
  226. dev->cur_span.style = style;
  227. if (style != dev->cur_span.style)
  228. {
  229. append_span(ctx, &dev->cur_line, &dev->cur_span);
  230. init_span(ctx, &dev->cur_span, style);
  231. }
  232. append_char(ctx, &dev->cur_span, c, bbox);
  233. }
  234. static void
  235. fz_add_text_char(fz_context *ctx, fz_text_device *dev, fz_text_style *style, int c, fz_rect bbox)
  236. {
  237. switch (c)
  238. {
  239. case -1: /* ignore when one unicode character maps to multiple glyphs */
  240. break;
  241. case 0xFB00: /* ff */
  242. fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 0, 2));
  243. fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 1, 2));
  244. break;
  245. case 0xFB01: /* fi */
  246. fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 0, 2));
  247. fz_add_text_char_imp(ctx, dev, style, 'i', fz_split_bbox(bbox, 1, 2));
  248. break;
  249. case 0xFB02: /* fl */
  250. fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 0, 2));
  251. fz_add_text_char_imp(ctx, dev, style, 'l', fz_split_bbox(bbox, 1, 2));
  252. break;
  253. case 0xFB03: /* ffi */
  254. fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 0, 3));
  255. fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 1, 3));
  256. fz_add_text_char_imp(ctx, dev, style, 'i', fz_split_bbox(bbox, 2, 3));
  257. break;
  258. case 0xFB04: /* ffl */
  259. fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 0, 3));
  260. fz_add_text_char_imp(ctx, dev, style, 'f', fz_split_bbox(bbox, 1, 3));
  261. fz_add_text_char_imp(ctx, dev, style, 'l', fz_split_bbox(bbox, 2, 3));
  262. break;
  263. case 0xFB05: /* long st */
  264. case 0xFB06: /* st */
  265. fz_add_text_char_imp(ctx, dev, style, 's', fz_split_bbox(bbox, 0, 2));
  266. fz_add_text_char_imp(ctx, dev, style, 't', fz_split_bbox(bbox, 1, 2));
  267. break;
  268. default:
  269. fz_add_text_char_imp(ctx, dev, style, c, bbox);
  270. break;
  271. }
  272. }
  273. static void
  274. fz_text_extract(fz_context *ctx, fz_text_device *dev, fz_text *text, const fz_matrix *ctm, fz_text_style *style)
  275. {
  276. fz_point *pen = &dev->point;
  277. fz_font *font = text->font;
  278. FT_Face face = font->ft_face;
  279. fz_matrix tm = text->trm;
  280. fz_matrix trm;
  281. float size;
  282. float adv;
  283. fz_rect rect;
  284. fz_point dir, ndir;
  285. fz_point delta, ndelta;
  286. float dist, dot;
  287. float ascender = 1;
  288. float descender = 0;
  289. int multi;
  290. int i, j, err;
  291. if (text->len == 0)
  292. return;
  293. if (font->ft_face)
  294. {
  295. fz_lock(ctx, FZ_LOCK_FREETYPE);
  296. err = FT_Set_Char_Size(font->ft_face, 64, 64, 72, 72);
  297. if (err)
  298. fz_warn(ctx, "freetype set character size: %s", ft_error_string(err));
  299. ascender = (float)face->ascender / face->units_per_EM;
  300. descender = (float)face->descender / face->units_per_EM;
  301. fz_unlock(ctx, FZ_LOCK_FREETYPE);
  302. }
  303. else if (font->t3procs && !fz_is_empty_rect(&font->bbox))
  304. {
  305. ascender = font->bbox.y1;
  306. descender = font->bbox.y0;
  307. }
  308. rect = fz_empty_rect;
  309. if (text->wmode == 0)
  310. {
  311. dir.x = 1;
  312. dir.y = 0;
  313. }
  314. else
  315. {
  316. dir.x = 0;
  317. dir.y = 1;
  318. }
  319. tm.e = 0;
  320. tm.f = 0;
  321. fz_concat(&trm, &tm, ctm);
  322. fz_transform_vector(&dir, &trm);
  323. dist = sqrtf(dir.x * dir.x + dir.y * dir.y);
  324. ndir.x = dir.x / dist;
  325. ndir.y = dir.y / dist;
  326. size = fz_matrix_expansion(&trm);
  327. for (i = 0; i < text->len; i++)
  328. {
  329. /* Calculate new pen location and delta */
  330. tm.e = text->items[i].x;
  331. tm.f = text->items[i].y;
  332. fz_concat(&trm, &tm, ctm);
  333. delta.x = pen->x - trm.e;
  334. delta.y = pen->y - trm.f;
  335. if (pen->x == -1 && pen->y == -1)
  336. delta.x = delta.y = 0;
  337. dist = sqrtf(delta.x * delta.x + delta.y * delta.y);
  338. /* Add space and newlines based on pen movement */
  339. if (dist > 0)
  340. {
  341. ndelta.x = delta.x / dist;
  342. ndelta.y = delta.y / dist;
  343. dot = ndelta.x * ndir.x + ndelta.y * ndir.y;
  344. if (fabsf(dot) > 0.9995f && dist > size * SPACE_DIST && dist < size * SPACE_MAX_DIST)
  345. {
  346. if (dev->lastchar != ' ')
  347. {
  348. fz_rect spacerect;
  349. spacerect.x0 = -0.2f;
  350. spacerect.y0 = descender;
  351. spacerect.x1 = 0;
  352. spacerect.y1 = ascender;
  353. fz_transform_rect(&spacerect, &trm);
  354. fz_add_text_char(ctx, dev, style, ' ', spacerect);
  355. dev->lastchar = ' ';
  356. }
  357. }
  358. else if (dist > size * LINE_DIST)
  359. {
  360. fz_flush_text_line(ctx, dev, style);
  361. dev->lastchar = ' ';
  362. }
  363. }
  364. /* Calculate bounding box and new pen position based on font metrics */
  365. if (font->ft_face)
  366. {
  367. FT_Fixed ftadv = 0;
  368. int mask = FT_LOAD_NO_BITMAP | FT_LOAD_NO_HINTING | FT_LOAD_IGNORE_TRANSFORM;
  369. /* TODO: freetype returns broken vertical metrics */
  370. /* if (text->wmode) mask |= FT_LOAD_VERTICAL_LAYOUT; */
  371. fz_lock(ctx, FZ_LOCK_FREETYPE);
  372. err = FT_Set_Char_Size(font->ft_face, 64, 64, 72, 72);
  373. if (err)
  374. fz_warn(ctx, "freetype set character size: %s", ft_error_string(err));
  375. FT_Get_Advance(font->ft_face, text->items[i].gid, mask, &ftadv);
  376. adv = ftadv / 65536.0f;
  377. fz_unlock(ctx, FZ_LOCK_FREETYPE);
  378. rect.x0 = 0;
  379. rect.y0 = descender;
  380. rect.x1 = adv;
  381. rect.y1 = ascender;
  382. }
  383. else
  384. {
  385. adv = font->t3widths[text->items[i].gid];
  386. rect.x0 = 0;
  387. rect.y0 = descender;
  388. rect.x1 = adv;
  389. rect.y1 = ascender;
  390. }
  391. fz_transform_rect(&rect, &trm);
  392. pen->x = trm.e + dir.x * adv;
  393. pen->y = trm.f + dir.y * adv;
  394. /* Check for one glyph to many char mapping */
  395. for (j = i + 1; j < text->len; j++)
  396. if (text->items[j].gid >= 0)
  397. break;
  398. multi = j - i;
  399. if (multi == 1)
  400. {
  401. fz_add_text_char(ctx, dev, style, text->items[i].ucs, rect);
  402. }
  403. else
  404. {
  405. for (j = 0; j < multi; j++)
  406. {
  407. fz_rect part = fz_split_bbox(rect, j, multi);
  408. fz_add_text_char(ctx, dev, style, text->items[i + j].ucs, part);
  409. }
  410. i += j - 1;
  411. }
  412. dev->lastchar = text->items[i].ucs;
  413. }
  414. }
  415. static void
  416. fz_text_fill_text(fz_device *dev, fz_text *text, const fz_matrix *ctm,
  417. fz_colorspace *colorspace, float *color, float alpha)
  418. {
  419. fz_text_device *tdev = dev->user;
  420. fz_text_style *style;
  421. style = fz_lookup_text_style(dev->ctx, tdev->sheet, text, ctm, colorspace, color, alpha, NULL);
  422. fz_text_extract(dev->ctx, tdev, text, ctm, style);
  423. }
  424. static void
  425. fz_text_stroke_text(fz_device *dev, fz_text *text, fz_stroke_state *stroke, const fz_matrix *ctm,
  426. fz_colorspace *colorspace, float *color, float alpha)
  427. {
  428. fz_text_device *tdev = dev->user;
  429. fz_text_style *style;
  430. style = fz_lookup_text_style(dev->ctx, tdev->sheet, text, ctm, colorspace, color, alpha, stroke);
  431. fz_text_extract(dev->ctx, tdev, text, ctm, style);
  432. }
  433. static void
  434. fz_text_clip_text(fz_device *dev, fz_text *text, const fz_matrix *ctm, int accumulate)
  435. {
  436. fz_text_device *tdev = dev->user;
  437. fz_text_style *style;
  438. style = fz_lookup_text_style(dev->ctx, tdev->sheet, text, ctm, NULL, NULL, 0, NULL);
  439. fz_text_extract(dev->ctx, tdev, text, ctm, style);
  440. }
  441. static void
  442. fz_text_clip_stroke_text(fz_device *dev, fz_text *text, fz_stroke_state *stroke, const fz_matrix *ctm)
  443. {
  444. fz_text_device *tdev = dev->user;
  445. fz_text_style *style;
  446. style = fz_lookup_text_style(dev->ctx, tdev->sheet, text, ctm, NULL, NULL, 0, stroke);
  447. fz_text_extract(dev->ctx, tdev, text, ctm, style);
  448. }
  449. static void
  450. fz_text_ignore_text(fz_device *dev, fz_text *text, const fz_matrix *ctm)
  451. {
  452. fz_text_device *tdev = dev->user;
  453. fz_text_style *style;
  454. style = fz_lookup_text_style(dev->ctx, tdev->sheet, text, ctm, NULL, NULL, 0, NULL);
  455. fz_text_extract(dev->ctx, tdev, text, ctm, style);
  456. }
  457. static void
  458. fz_text_free_user(fz_device *dev)
  459. {
  460. fz_context *ctx = dev->ctx;
  461. fz_text_device *tdev = dev->user;
  462. append_span(ctx, &tdev->cur_line, &tdev->cur_span);
  463. insert_line(ctx, tdev->page, &tdev->cur_line);
  464. /* TODO: smart sorting of blocks in reading order */
  465. /* TODO: unicode NFC normalization */
  466. /* TODO: bidi logical reordering */
  467. fz_free(dev->ctx, tdev);
  468. }
  469. fz_device *
  470. fz_new_text_device(fz_context *ctx, fz_text_sheet *sheet, fz_text_page *page)
  471. {
  472. fz_device *dev;
  473. fz_text_device *tdev = fz_malloc_struct(ctx, fz_text_device);
  474. tdev->sheet = sheet;
  475. tdev->page = page;
  476. tdev->point.x = -1;
  477. tdev->point.y = -1;
  478. tdev->lastchar = ' ';
  479. init_line(ctx, &tdev->cur_line);
  480. init_span(ctx, &tdev->cur_span, NULL);
  481. dev = fz_new_device(ctx, tdev);
  482. dev->hints = FZ_IGNORE_IMAGE | FZ_IGNORE_SHADE;
  483. dev->free_user = fz_text_free_user;
  484. dev->fill_text = fz_text_fill_text;
  485. dev->stroke_text = fz_text_stroke_text;
  486. dev->clip_text = fz_text_clip_text;
  487. dev->clip_stroke_text = fz_text_clip_stroke_text;
  488. dev->ignore_text = fz_text_ignore_text;
  489. return dev;
  490. }
  491. /* XML, HTML and plain-text output */
  492. static int font_is_bold(fz_font *font)
  493. {
  494. FT_Face face = font->ft_face;
  495. if (face && (face->style_flags & FT_STYLE_FLAG_BOLD))
  496. return 1;
  497. if (strstr(font->name, "Bold"))
  498. return 1;
  499. return 0;
  500. }
  501. static int font_is_italic(fz_font *font)
  502. {
  503. FT_Face face = font->ft_face;
  504. if (face && (face->style_flags & FT_STYLE_FLAG_ITALIC))
  505. return 1;
  506. if (strstr(font->name, "Italic") || strstr(font->name, "Oblique"))
  507. return 1;
  508. return 0;
  509. }
  510. static void
  511. fz_print_style_begin(fz_output *out, fz_text_style *style)
  512. {
  513. int script = style->script;
  514. fz_printf(out, "<span class=\"s%d\">", style->id);
  515. while (script-- > 0)
  516. fz_printf(out, "<sup>");
  517. while (++script < 0)
  518. fz_printf(out, "<sub>");
  519. }
  520. static void
  521. fz_print_style_end(fz_output *out, fz_text_style *style)
  522. {
  523. int script = style->script;
  524. while (script-- > 0)
  525. fz_printf(out, "</sup>");
  526. while (++script < 0)
  527. fz_printf(out, "</sub>");
  528. fz_printf(out, "</span>");
  529. }
  530. static void
  531. fz_print_style(fz_output *out, fz_text_style *style)
  532. {
  533. char *s = strchr(style->font->name, '+');
  534. s = s ? s + 1 : style->font->name;
  535. fz_printf(out, "span.s%d{font-family:\"%s\";font-size:%gpt;",
  536. style->id, s, style->size);
  537. if (font_is_italic(style->font))
  538. fz_printf(out, "font-style:italic;");
  539. if (font_is_bold(style->font))
  540. fz_printf(out, "font-weight:bold;");
  541. fz_printf(out, "}\n");
  542. }
  543. void
  544. fz_print_text_sheet(fz_context *ctx, fz_output *out, fz_text_sheet *sheet)
  545. {
  546. fz_text_style *style;
  547. for (style = sheet->style; style; style = style->next)
  548. fz_print_style(out, style);
  549. }
  550. void
  551. fz_print_text_page_html(fz_context *ctx, fz_output *out, fz_text_page *page)
  552. {
  553. int block_n, line_n, span_n, ch_n;
  554. fz_text_style *style = NULL;
  555. fz_text_block *block;
  556. fz_text_line *line;
  557. fz_text_span *span;
  558. fz_printf(out, "<div class=\"page\">\n");
  559. for (block_n = 0; block_n < page->len; block_n++)
  560. {
  561. block = &page->blocks[block_n];
  562. fz_printf(out, "<div class=\"block\"><p>\n");
  563. for (line_n = 0; line_n < block->len; line_n++)
  564. {
  565. line = &block->lines[line_n];
  566. fz_printf(out, "<span>");
  567. style = NULL;
  568. for (span_n = 0; span_n < line->len; span_n++)
  569. {
  570. span = &line->spans[span_n];
  571. if (style != span->style)
  572. {
  573. if (style)
  574. fz_print_style_end(out, style);
  575. fz_print_style_begin(out, span->style);
  576. style = span->style;
  577. }
  578. for (ch_n = 0; ch_n < span->len; ch_n++)
  579. {
  580. fz_text_char *ch = &span->text[ch_n];
  581. if (ch->c == '<')
  582. fz_printf(out, "&lt;");
  583. else if (ch->c == '>')
  584. fz_printf(out, "&gt;");
  585. else if (ch->c == '&')
  586. fz_printf(out, "&amp;");
  587. else if (ch->c >= 32 && ch->c <= 127)
  588. fz_printf(out, "%c", ch->c);
  589. else
  590. fz_printf(out, "&#x%x;", ch->c);
  591. }
  592. }
  593. if (style)
  594. fz_print_style_end(out, style);
  595. fz_printf(out, "</span>\n");
  596. }
  597. fz_printf(out, "</p></div>\n");
  598. }
  599. fz_printf(out, "</div>\n");
  600. }
  601. void
  602. fz_print_text_page_xml(fz_context *ctx, fz_output *out, fz_text_page *page)
  603. {
  604. fz_text_block *block;
  605. fz_text_line *line;
  606. fz_text_span *span;
  607. fz_text_char *ch;
  608. char *s;
  609. fz_printf(out, "<page>\n");
  610. for (block = page->blocks; block < page->blocks + page->len; block++)
  611. {
  612. fz_printf(out, "<block bbox=\"%g %g %g %g\">\n",
  613. block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
  614. for (line = block->lines; line < block->lines + block->len; line++)
  615. {
  616. fz_printf(out, "<line bbox=\"%g %g %g %g\">\n",
  617. line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1);
  618. for (span = line->spans; span < line->spans + line->len; span++)
  619. {
  620. fz_text_style *style = span->style;
  621. s = strchr(style->font->name, '+');
  622. s = s ? s + 1 : style->font->name;
  623. fz_printf(out, "<span bbox=\"%g %g %g %g\" font=\"%s\" size=\"%g\">\n",
  624. span->bbox.x0, span->bbox.y0, span->bbox.x1, span->bbox.y1,
  625. s, style->size);
  626. for (ch = span->text; ch < span->text + span->len; ch++)
  627. {
  628. fz_printf(out, "<char bbox=\"%g %g %g %g\" c=\"",
  629. ch->bbox.x0, ch->bbox.y0, ch->bbox.x1, ch->bbox.y1);
  630. switch (ch->c)
  631. {
  632. case '<': fz_printf(out, "&lt;"); break;
  633. case '>': fz_printf(out, "&gt;"); break;
  634. case '&': fz_printf(out, "&amp;"); break;
  635. case '"': fz_printf(out, "&quot;"); break;
  636. case '\'': fz_printf(out, "&apos;"); break;
  637. default:
  638. if (ch->c >= 32 && ch->c <= 127)
  639. fz_printf(out, "%c", ch->c);
  640. else
  641. fz_printf(out, "&#x%x;", ch->c);
  642. break;
  643. }
  644. fz_printf(out, "\"/>\n");
  645. }
  646. fz_printf(out, "</span>\n");
  647. }
  648. fz_printf(out, "</line>\n");
  649. }
  650. fz_printf(out, "</block>\n");
  651. }
  652. fz_printf(out, "</page>\n");
  653. }
  654. void
  655. fz_print_text_page(fz_context *ctx, fz_output *out, fz_text_page *page)
  656. {
  657. fz_text_block *block;
  658. fz_text_line *line;
  659. fz_text_span *span;
  660. fz_text_char *ch;
  661. char utf[10];
  662. int i, n;
  663. for (block = page->blocks; block < page->blocks + page->len; block++)
  664. {
  665. for (line = block->lines; line < block->lines + block->len; line++)
  666. {
  667. for (span = line->spans; span < line->spans + line->len; span++)
  668. {
  669. for (ch = span->text; ch < span->text + span->len; ch++)
  670. {
  671. n = fz_runetochar(utf, ch->c);
  672. for (i = 0; i < n; i++)
  673. fz_printf(out, "%c", utf[i]);
  674. }
  675. }
  676. fz_printf(out, "\n");
  677. }
  678. fz_printf(out, "\n");
  679. }
  680. }
  681. typedef struct line_height_s
  682. {
  683. float height;
  684. int count;
  685. fz_text_style *style;
  686. } line_height;
  687. typedef struct line_heights_s
  688. {
  689. fz_context *ctx;
  690. int cap;
  691. int len;
  692. line_height *lh;
  693. } line_heights;
  694. static line_heights *
  695. new_line_heights(fz_context *ctx)
  696. {
  697. line_heights *lh = fz_malloc_struct(ctx, line_heights);
  698. lh->ctx = ctx;
  699. return lh;
  700. }
  701. static void
  702. insert_line_height(line_heights *lh, fz_text_style *style, float height)
  703. {
  704. int i;
  705. /* If we have one already, add it in */
  706. for (i=0; i < lh->cap; i++)
  707. {
  708. /* Match if we are within 5% */
  709. if (lh->lh[i].style == style && lh->lh[i].height * 0.95 <= height && lh->lh[i].height * 1.05 >= height)
  710. {
  711. /* Ensure that the average height is correct */
  712. lh->lh[i].height = (lh->lh[i].height * lh->lh[i].count + height) / (lh->lh[i].count+1);
  713. lh->lh[i].count++;
  714. return;
  715. }
  716. }
  717. /* Otherwise extend (if required) and add it */
  718. if (lh->cap == lh->len)
  719. {
  720. int newcap = (lh->cap ? lh->cap * 2 : 4);
  721. lh->lh = fz_resize_array(lh->ctx, lh->lh, newcap, sizeof(line_height));
  722. lh->cap = newcap;
  723. }
  724. lh->lh[lh->len].count = 1;
  725. lh->lh[lh->len].height = height;
  726. lh->lh[lh->len].style = style;
  727. lh->len++;
  728. }
  729. static void
  730. cull_line_heights(line_heights *lh)
  731. {
  732. int i, j, k;
  733. for (i = 0; i < lh->len; i++)
  734. {
  735. fz_text_style *style = lh->lh[i].style;
  736. int count = lh->lh[i].count;
  737. int max = i;
  738. /* Find the max for this style */
  739. for (j = i+1; j < lh->len; j++)
  740. {
  741. if (lh->lh[j].style == style && lh->lh[j].count > count)
  742. {
  743. max = j;
  744. count = lh->lh[j].count;
  745. }
  746. }
  747. /* Destroy all the ones other than the max */
  748. if (max != i)
  749. {
  750. lh->lh[i].count = count;
  751. lh->lh[i].height = lh->lh[max].height;
  752. lh->lh[max].count = 0;
  753. }
  754. j = i+1;
  755. for (k = j; k < lh->len; k++)
  756. {
  757. if (lh->lh[k].style == style)
  758. {
  759. k++;
  760. }
  761. else
  762. {
  763. lh->lh[j++] = lh->lh[k];
  764. }
  765. }
  766. lh->len = j;
  767. }
  768. }
  769. static float
  770. line_height_for_style(line_heights *lh, fz_text_style *style)
  771. {
  772. int i;
  773. for (i=0; i < lh->len; i++)
  774. {
  775. if (lh->lh[i].style == style)
  776. return lh->lh[i].height;
  777. }
  778. return 0.0; /* Never reached */
  779. }
  780. static void
  781. split_block(fz_context *ctx, fz_text_page *page, int blocknum, int linenum)
  782. {
  783. int split_len;
  784. if (page->len == page->cap)
  785. {
  786. int new_cap = fz_maxi(16, page->cap * 2);
  787. page->blocks = fz_resize_array(ctx, page->blocks, new_cap, sizeof(*page->blocks));
  788. page->cap = new_cap;
  789. }
  790. memmove(page->blocks+blocknum+1, page->blocks+blocknum, (page->len - blocknum)*sizeof(*page->blocks));
  791. page->len++;
  792. split_len = page->blocks[blocknum].len - linenum;
  793. page->blocks[blocknum+1].bbox = page->blocks[blocknum].bbox; /* FIXME! */
  794. page->blocks[blocknum+1].cap = 0;
  795. page->blocks[blocknum+1].len = 0;
  796. page->blocks[blocknum+1].lines = NULL;
  797. page->blocks[blocknum+1].lines = fz_malloc_array(ctx, split_len, sizeof(fz_text_line));
  798. page->blocks[blocknum+1].cap = page->blocks[blocknum+1].len;
  799. page->blocks[blocknum+1].len = split_len;
  800. page->blocks[blocknum].len = linenum;
  801. memcpy(page->blocks[blocknum+1].lines, page->blocks[blocknum].lines + linenum, split_len * sizeof(fz_text_line));
  802. }
  803. void
  804. fz_text_analysis(fz_context *ctx, fz_text_sheet *sheet, fz_text_page *page)
  805. {
  806. fz_text_block *block;
  807. fz_text_line *line;
  808. fz_text_span *span;
  809. fz_text_line *prev_line;
  810. line_heights *lh;
  811. int blocknum;
  812. /* Simple paragraph analysis; look for the most common 'inter line'
  813. * spacing. This will be assumed to be our line spacing. Anything
  814. * more than 25% wider than this will be assumed to be a paragraph
  815. * space. */
  816. /* Step 1: Gather the line height information */
  817. lh = new_line_heights(ctx);
  818. prev_line = NULL;
  819. for (block = page->blocks; block < page->blocks + page->len; block++)
  820. {
  821. for (line = block->lines; line < block->lines + block->len; line++)
  822. {
  823. /* In a line made up of several spans, find the tallest
  824. * span. This line difference will count as being a
  825. * difference in a line of that style. */
  826. fz_text_span *tallest_span = NULL;
  827. float tallest = 0;
  828. float span_height;
  829. for (span = line->spans; span < line->spans + line->len; span++)
  830. {
  831. span_height = span->bbox.y1 - span->bbox.y0;
  832. if (tallest_span == NULL || span_height > tallest)
  833. {
  834. tallest_span = span;
  835. tallest = span_height;
  836. }
  837. }
  838. if (prev_line)
  839. {
  840. /* Should really work on the baseline positions,
  841. * but we don't have that at this stage. */
  842. float line_step = line->bbox.y1 - prev_line->bbox.y1;
  843. if (line_step > 0)
  844. {
  845. insert_line_height(lh, tallest_span->style, line_step);
  846. }
  847. }
  848. prev_line = line;
  849. }
  850. }
  851. /* Step 2: Find the most popular line height for each style */
  852. cull_line_heights(lh);
  853. /* Step 3: Run through the blocks, breaking each block into two if
  854. * the line height isn't right. */
  855. prev_line = NULL;
  856. for (blocknum = 0; blocknum < page->len; blocknum++)
  857. {
  858. block = &page->blocks[blocknum];
  859. for (line = block->lines; line < block->lines + block->len; line++)
  860. {
  861. /* In a line made up of several spans, find the tallest
  862. * span. This line difference will count as being a
  863. * difference in a line of that style. */
  864. fz_text_span *tallest_span = NULL;
  865. float tallest = 0;
  866. float span_height;
  867. for (span = line->spans; span < line->spans + line->len; span++)
  868. {
  869. span_height = span->bbox.y1 - span->bbox.y0;
  870. if (tallest_span == NULL || span_height > tallest)
  871. {
  872. tallest_span = span;
  873. tallest = span_height;
  874. }
  875. }
  876. if (prev_line)
  877. {
  878. float proper_step = line_height_for_style(lh, tallest_span->style);
  879. float line_step = line->bbox.y1 - prev_line->bbox.y1;
  880. if (proper_step * 0.95 > line_step || line_step > proper_step * 1.05)
  881. {
  882. split_block(ctx, page, block - page->blocks, line - block->lines);
  883. prev_line = NULL;
  884. break;
  885. }
  886. }
  887. prev_line = line;
  888. }
  889. }
  890. }