| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612 |
- #include "fitz-internal.h"
- #include "mupdf-internal.h"
- fz_rect *
- pdf_to_rect(fz_context *ctx, pdf_obj *array, fz_rect *r)
- {
- float a = pdf_to_real(pdf_array_get(array, 0));
- float b = pdf_to_real(pdf_array_get(array, 1));
- float c = pdf_to_real(pdf_array_get(array, 2));
- float d = pdf_to_real(pdf_array_get(array, 3));
- r->x0 = fz_min(a, c);
- r->y0 = fz_min(b, d);
- r->x1 = fz_max(a, c);
- r->y1 = fz_max(b, d);
- return r;
- }
- fz_matrix *
- pdf_to_matrix(fz_context *ctx, pdf_obj *array, fz_matrix *m)
- {
- m->a = pdf_to_real(pdf_array_get(array, 0));
- m->b = pdf_to_real(pdf_array_get(array, 1));
- m->c = pdf_to_real(pdf_array_get(array, 2));
- m->d = pdf_to_real(pdf_array_get(array, 3));
- m->e = pdf_to_real(pdf_array_get(array, 4));
- m->f = pdf_to_real(pdf_array_get(array, 5));
- return m;
- }
- /* Convert Unicode/PdfDocEncoding string into utf-8 */
- char *
- pdf_to_utf8(pdf_document *xref, pdf_obj *src)
- {
- fz_context *ctx = xref->ctx;
- fz_buffer *strmbuf = NULL;
- unsigned char *srcptr;
- char *dstptr, *dst;
- int srclen;
- int dstlen = 0;
- int ucs;
- int i;
- fz_var(strmbuf);
- fz_try(ctx)
- {
- if (pdf_is_string(src))
- {
- srcptr = (unsigned char *) pdf_to_str_buf(src);
- srclen = pdf_to_str_len(src);
- }
- else if (pdf_is_stream(xref, pdf_to_num(src), pdf_to_gen(src)))
- {
- strmbuf = pdf_load_stream(xref, pdf_to_num(src), pdf_to_gen(src));
- srclen = fz_buffer_storage(ctx, strmbuf, (unsigned char **)&srcptr);
- }
- else
- {
- srclen = 0;
- }
- if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
- {
- for (i = 2; i + 1 < srclen; i += 2)
- {
- ucs = srcptr[i] << 8 | srcptr[i+1];
- dstlen += fz_runelen(ucs);
- }
- dstptr = dst = fz_malloc(ctx, dstlen + 1);
- for (i = 2; i + 1 < srclen; i += 2)
- {
- ucs = srcptr[i] << 8 | srcptr[i+1];
- dstptr += fz_runetochar(dstptr, ucs);
- }
- }
- else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254)
- {
- for (i = 2; i + 1 < srclen; i += 2)
- {
- ucs = srcptr[i] | srcptr[i+1] << 8;
- dstlen += fz_runelen(ucs);
- }
- dstptr = dst = fz_malloc(ctx, dstlen + 1);
- for (i = 2; i + 1 < srclen; i += 2)
- {
- ucs = srcptr[i] | srcptr[i+1] << 8;
- dstptr += fz_runetochar(dstptr, ucs);
- }
- }
- else
- {
- for (i = 0; i < srclen; i++)
- dstlen += fz_runelen(pdf_doc_encoding[srcptr[i]]);
- dstptr = dst = fz_malloc(ctx, dstlen + 1);
- for (i = 0; i < srclen; i++)
- {
- ucs = pdf_doc_encoding[srcptr[i]];
- dstptr += fz_runetochar(dstptr, ucs);
- }
- }
- }
- fz_always(ctx)
- {
- fz_drop_buffer(ctx, strmbuf);
- }
- fz_catch(ctx)
- {
- fz_rethrow(ctx);
- }
- *dstptr = '\0';
- return dst;
- }
- /* Convert Unicode/PdfDocEncoding string into ucs-2 */
- unsigned short *
- pdf_to_ucs2(pdf_document *xref, pdf_obj *src)
- {
- fz_context *ctx = xref->ctx;
- unsigned char *srcptr = (unsigned char *) pdf_to_str_buf(src);
- unsigned short *dstptr, *dst;
- int srclen = pdf_to_str_len(src);
- int i;
- if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
- {
- dstptr = dst = fz_malloc_array(ctx, (srclen - 2) / 2 + 1, sizeof(short));
- for (i = 2; i + 1 < srclen; i += 2)
- *dstptr++ = srcptr[i] << 8 | srcptr[i+1];
- }
- else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254)
- {
- dstptr = dst = fz_malloc_array(ctx, (srclen - 2) / 2 + 1, sizeof(short));
- for (i = 2; i + 1 < srclen; i += 2)
- *dstptr++ = srcptr[i] | srcptr[i+1] << 8;
- }
- else
- {
- dstptr = dst = fz_malloc_array(ctx, srclen + 1, sizeof(short));
- for (i = 0; i < srclen; i++)
- *dstptr++ = pdf_doc_encoding[srcptr[i]];
- }
- *dstptr = '\0';
- return dst;
- }
- /* allow to convert to UCS-2 without the need for an fz_context */
- /* (buffer must be at least (fz_to_str_len(src) + 1) * 2 bytes in size) */
- void
- pdf_to_ucs2_buf(unsigned short *buffer, pdf_obj *src)
- {
- unsigned char *srcptr = (unsigned char *) pdf_to_str_buf(src);
- unsigned short *dstptr = buffer;
- int srclen = pdf_to_str_len(src);
- int i;
- if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
- {
- for (i = 2; i + 1 < srclen; i += 2)
- *dstptr++ = srcptr[i] << 8 | srcptr[i+1];
- }
- else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254)
- {
- for (i = 2; i + 1 < srclen; i += 2)
- *dstptr++ = srcptr[i] | srcptr[i+1] << 8;
- }
- else
- {
- for (i = 0; i < srclen; i++)
- *dstptr++ = pdf_doc_encoding[srcptr[i]];
- }
- *dstptr = '\0';
- }
- /* Convert UCS-2 string into PdfDocEncoding for authentication */
- char *
- pdf_from_ucs2(pdf_document *xref, unsigned short *src)
- {
- fz_context *ctx = xref->ctx;
- int i, j, len;
- char *docstr;
- len = 0;
- while (src[len])
- len++;
- docstr = fz_malloc(ctx, len + 1);
- for (i = 0; i < len; i++)
- {
- /* shortcut: check if the character has the same code point in both encodings */
- if (0 < src[i] && src[i] < 256 && pdf_doc_encoding[src[i]] == src[i]) {
- docstr[i] = src[i];
- continue;
- }
- /* search through pdf_docencoding for the character's code point */
- for (j = 0; j < 256; j++)
- if (pdf_doc_encoding[j] == src[i])
- break;
- docstr[i] = j;
- /* fail, if a character can't be encoded */
- if (!docstr[i])
- {
- fz_free(ctx, docstr);
- return NULL;
- }
- }
- docstr[len] = '\0';
- return docstr;
- }
- pdf_obj *
- pdf_to_utf8_name(pdf_document *xref, pdf_obj *src)
- {
- char *buf = pdf_to_utf8(xref, src);
- pdf_obj *dst = pdf_new_name(xref->ctx, buf);
- fz_free(xref->ctx, buf);
- return dst;
- }
- pdf_obj *
- pdf_parse_array(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf)
- {
- pdf_obj *ary = NULL;
- pdf_obj *obj = NULL;
- int a = 0, b = 0, n = 0;
- pdf_token tok;
- fz_context *ctx = file->ctx;
- pdf_obj *op;
- fz_var(obj);
- ary = pdf_new_array(ctx, 4);
- fz_try(ctx)
- {
- while (1)
- {
- tok = pdf_lex(file, buf);
- if (tok != PDF_TOK_INT && tok != PDF_TOK_R)
- {
- if (n > 0)
- {
- obj = pdf_new_int(ctx, a);
- pdf_array_push(ary, obj);
- pdf_drop_obj(obj);
- obj = NULL;
- }
- if (n > 1)
- {
- obj = pdf_new_int(ctx, b);
- pdf_array_push(ary, obj);
- pdf_drop_obj(obj);
- obj = NULL;
- }
- n = 0;
- }
- if (tok == PDF_TOK_INT && n == 2)
- {
- obj = pdf_new_int(ctx, a);
- pdf_array_push(ary, obj);
- pdf_drop_obj(obj);
- obj = NULL;
- a = b;
- n --;
- }
- switch (tok)
- {
- case PDF_TOK_CLOSE_ARRAY:
- op = ary;
- goto end;
- case PDF_TOK_INT:
- if (n == 0)
- a = buf->i;
- if (n == 1)
- b = buf->i;
- n ++;
- break;
- case PDF_TOK_R:
- if (n != 2)
- fz_throw(ctx, "cannot parse indirect reference in array");
- obj = pdf_new_indirect(ctx, a, b, xref);
- pdf_array_push(ary, obj);
- pdf_drop_obj(obj);
- obj = NULL;
- n = 0;
- break;
- case PDF_TOK_OPEN_ARRAY:
- obj = pdf_parse_array(xref, file, buf);
- pdf_array_push(ary, obj);
- pdf_drop_obj(obj);
- obj = NULL;
- break;
- case PDF_TOK_OPEN_DICT:
- obj = pdf_parse_dict(xref, file, buf);
- pdf_array_push(ary, obj);
- pdf_drop_obj(obj);
- obj = NULL;
- break;
- case PDF_TOK_NAME:
- obj = pdf_new_name(ctx, buf->scratch);
- pdf_array_push(ary, obj);
- pdf_drop_obj(obj);
- obj = NULL;
- break;
- case PDF_TOK_REAL:
- obj = pdf_new_real(ctx, buf->f);
- pdf_array_push(ary, obj);
- pdf_drop_obj(obj);
- obj = NULL;
- break;
- case PDF_TOK_STRING:
- obj = pdf_new_string(ctx, buf->scratch, buf->len);
- pdf_array_push(ary, obj);
- pdf_drop_obj(obj);
- obj = NULL;
- break;
- case PDF_TOK_TRUE:
- obj = pdf_new_bool(ctx, 1);
- pdf_array_push(ary, obj);
- pdf_drop_obj(obj);
- obj = NULL;
- break;
- case PDF_TOK_FALSE:
- obj = pdf_new_bool(ctx, 0);
- pdf_array_push(ary, obj);
- pdf_drop_obj(obj);
- obj = NULL;
- break;
- case PDF_TOK_NULL:
- obj = pdf_new_null(ctx);
- pdf_array_push(ary, obj);
- pdf_drop_obj(obj);
- obj = NULL;
- break;
- default:
- fz_throw(ctx, "cannot parse token in array");
- }
- }
- end:
- {}
- }
- fz_catch(ctx)
- {
- pdf_drop_obj(obj);
- pdf_drop_obj(ary);
- fz_throw(ctx, "cannot parse array");
- }
- return op;
- }
- pdf_obj *
- pdf_parse_dict(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf)
- {
- pdf_obj *dict;
- pdf_obj *key = NULL;
- pdf_obj *val = NULL;
- pdf_token tok;
- int a, b;
- fz_context *ctx = file->ctx;
- dict = pdf_new_dict(ctx, 8);
- fz_var(key);
- fz_var(val);
- fz_try(ctx)
- {
- while (1)
- {
- tok = pdf_lex(file, buf);
- skip:
- if (tok == PDF_TOK_CLOSE_DICT)
- break;
- /* for BI .. ID .. EI in content streams */
- if (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID"))
- break;
- if (tok != PDF_TOK_NAME)
- fz_throw(ctx, "invalid key in dict");
- key = pdf_new_name(ctx, buf->scratch);
- tok = pdf_lex(file, buf);
- switch (tok)
- {
- case PDF_TOK_OPEN_ARRAY:
- val = pdf_parse_array(xref, file, buf);
- break;
- case PDF_TOK_OPEN_DICT:
- val = pdf_parse_dict(xref, file, buf);
- break;
- case PDF_TOK_NAME: val = pdf_new_name(ctx, buf->scratch); break;
- case PDF_TOK_REAL: val = pdf_new_real(ctx, buf->f); break;
- case PDF_TOK_STRING: val = pdf_new_string(ctx, buf->scratch, buf->len); break;
- case PDF_TOK_TRUE: val = pdf_new_bool(ctx, 1); break;
- case PDF_TOK_FALSE: val = pdf_new_bool(ctx, 0); break;
- case PDF_TOK_NULL: val = pdf_new_null(ctx); break;
- case PDF_TOK_INT:
- /* 64-bit to allow for numbers > INT_MAX and overflow */
- a = buf->i;
- tok = pdf_lex(file, buf);
- if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME ||
- (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID")))
- {
- val = pdf_new_int(ctx, a);
- pdf_dict_put(dict, key, val);
- pdf_drop_obj(val);
- val = NULL;
- pdf_drop_obj(key);
- key = NULL;
- goto skip;
- }
- if (tok == PDF_TOK_INT)
- {
- b = buf->i;
- tok = pdf_lex(file, buf);
- if (tok == PDF_TOK_R)
- {
- val = pdf_new_indirect(ctx, a, b, xref);
- break;
- }
- }
- fz_throw(ctx, "invalid indirect reference in dict");
- default:
- fz_throw(ctx, "unknown token in dict");
- }
- pdf_dict_put(dict, key, val);
- pdf_drop_obj(val);
- val = NULL;
- pdf_drop_obj(key);
- key = NULL;
- }
- }
- fz_catch(ctx)
- {
- pdf_drop_obj(dict);
- pdf_drop_obj(key);
- pdf_drop_obj(val);
- fz_throw(ctx, "cannot parse dict");
- }
- return dict;
- }
- pdf_obj *
- pdf_parse_stm_obj(pdf_document *xref, fz_stream *file, pdf_lexbuf *buf)
- {
- pdf_token tok;
- fz_context *ctx = file->ctx;
- tok = pdf_lex(file, buf);
- switch (tok)
- {
- case PDF_TOK_OPEN_ARRAY:
- return pdf_parse_array(xref, file, buf);
- case PDF_TOK_OPEN_DICT:
- return pdf_parse_dict(xref, file, buf);
- case PDF_TOK_NAME: return pdf_new_name(ctx, buf->scratch); break;
- case PDF_TOK_REAL: return pdf_new_real(ctx, buf->f); break;
- case PDF_TOK_STRING: return pdf_new_string(ctx, buf->scratch, buf->len); break;
- case PDF_TOK_TRUE: return pdf_new_bool(ctx, 1); break;
- case PDF_TOK_FALSE: return pdf_new_bool(ctx, 0); break;
- case PDF_TOK_NULL: return pdf_new_null(ctx); break;
- case PDF_TOK_INT: return pdf_new_int(ctx, buf->i); break;
- default: fz_throw(ctx, "unknown token in object stream");
- }
- return NULL; /* Stupid MSVC */
- }
- pdf_obj *
- pdf_parse_ind_obj(pdf_document *xref,
- fz_stream *file, pdf_lexbuf *buf,
- int *onum, int *ogen, int *ostmofs)
- {
- pdf_obj *obj = NULL;
- int num = 0, gen = 0, stm_ofs;
- pdf_token tok;
- int a, b;
- fz_context *ctx = file->ctx;
- fz_var(obj);
- tok = pdf_lex(file, buf);
- if (tok != PDF_TOK_INT)
- fz_throw(ctx, "expected object number");
- num = buf->i;
- tok = pdf_lex(file, buf);
- if (tok != PDF_TOK_INT)
- fz_throw(ctx, "expected generation number (%d ? obj)", num);
- gen = buf->i;
- tok = pdf_lex(file, buf);
- if (tok != PDF_TOK_OBJ)
- fz_throw(ctx, "expected 'obj' keyword (%d %d ?)", num, gen);
- tok = pdf_lex(file, buf);
- switch (tok)
- {
- case PDF_TOK_OPEN_ARRAY:
- obj = pdf_parse_array(xref, file, buf);
- break;
- case PDF_TOK_OPEN_DICT:
- obj = pdf_parse_dict(xref, file, buf);
- break;
- case PDF_TOK_NAME: obj = pdf_new_name(ctx, buf->scratch); break;
- case PDF_TOK_REAL: obj = pdf_new_real(ctx, buf->f); break;
- case PDF_TOK_STRING: obj = pdf_new_string(ctx, buf->scratch, buf->len); break;
- case PDF_TOK_TRUE: obj = pdf_new_bool(ctx, 1); break;
- case PDF_TOK_FALSE: obj = pdf_new_bool(ctx, 0); break;
- case PDF_TOK_NULL: obj = pdf_new_null(ctx); break;
- case PDF_TOK_INT:
- a = buf->i;
- tok = pdf_lex(file, buf);
- if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ)
- {
- obj = pdf_new_int(ctx, a);
- goto skip;
- }
- if (tok == PDF_TOK_INT)
- {
- b = buf->i;
- tok = pdf_lex(file, buf);
- if (tok == PDF_TOK_R)
- {
- obj = pdf_new_indirect(ctx, a, b, xref);
- break;
- }
- }
- fz_throw(ctx, "expected 'R' keyword (%d %d R)", num, gen);
- case PDF_TOK_ENDOBJ:
- obj = pdf_new_null(ctx);
- goto skip;
- default:
- fz_throw(ctx, "syntax error in object (%d %d R)", num, gen);
- }
- fz_try(ctx)
- {
- tok = pdf_lex(file, buf);
- }
- fz_catch(ctx)
- {
- pdf_drop_obj(obj);
- fz_throw(ctx, "cannot parse indirect object (%d %d R)", num, gen);
- }
- skip:
- if (tok == PDF_TOK_STREAM)
- {
- int c = fz_read_byte(file);
- while (c == ' ')
- c = fz_read_byte(file);
- if (c == '\r')
- {
- c = fz_peek_byte(file);
- if (c != '\n')
- fz_warn(ctx, "line feed missing after stream begin marker (%d %d R)", num, gen);
- else
- fz_read_byte(file);
- }
- stm_ofs = fz_tell(file);
- }
- else if (tok == PDF_TOK_ENDOBJ)
- {
- stm_ofs = 0;
- }
- else
- {
- fz_warn(ctx, "expected 'endobj' or 'stream' keyword (%d %d R)", num, gen);
- stm_ofs = 0;
- }
- if (onum) *onum = num;
- if (ogen) *ogen = gen;
- if (ostmofs) *ostmofs = stm_ofs;
- return obj;
- }
|