llex.c 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690
  1. /*
  2. ** $Id: llex.c,v 2.96 2016/05/02 14:02:12 roberto Exp $
  3. ** Lexical Analyzer
  4. ** See Copyright Notice in lua.h
  5. */
  6. #define llex_c
  7. #define LUA_CORE
  8. #include "lprefix.h"
  9. #include <locale.h>
  10. #include <string.h>
  11. #include "lua.h"
  12. #include "lctype.h"
  13. #include "ldebug.h"
  14. #include "ldo.h"
  15. #include "lgc.h"
  16. #include "llex.h"
  17. #include "lobject.h"
  18. #include "lparser.h"
  19. #include "lstate.h"
  20. #include "lstring.h"
  21. #include "ltable.h"
  22. #include "lzio.h"
  /*
  ** Advance the lexer by one character: read the next character from the
  ** input stream 'ls->z' (through 'zgetc') and store it as the current
  ** character. The macro evaluates to the value just stored.
  ** The parameter is parenthesized so that any pointer expression (for
  ** instance '&state') can be passed safely.
  */
  #define next(ls)	((ls)->current = zgetc((ls)->z))

  /*
  ** True when the current character is a line terminator, that is,
  ** either '\n' or '\r'.
  */
  #define currIsNewline(ls) \
	((ls)->current == '\n' || (ls)->current == '\r')
  /*
  ** Textual form of every multi-character token.  ORDER RESERVED: the
  ** table is indexed with (token - FIRST_RESERVED) by 'luaX_token2str',
  ** and its first NUM_RESERVED entries are the strings that 'luaX_init'
  ** registers as reserved words, so the order here must not change.
  */
  static const char *const luaX_tokens[] = {
    /* words */
    "and", "break", "do", "else", "elseif", "end", "false", "for",
    "function", "goto", "if", "in", "local", "nil", "not", "or",
    "repeat", "return", "then", "true", "until", "while",
    /* multi-character symbols */
    "//", "..", "...", "==", ">=", "<=", "~=", "<<", ">>", "::",
    /* end of stream */
    "<eof>",
    /* tokens printed by category (numerals, names, strings) */
    "<number>", "<integer>", "<name>", "<string>"
  };
  36. #define save_and_next(ls) (save(ls, ls->current), next(ls))
  37. static l_noret lexerror(LexState *ls, const char *msg, int token);
  38. static void save(LexState *ls, int c)
  39. {
  40. Mbuffer *b = ls->buff;
  41. if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b))
  42. {
  43. size_t newsize;
  44. if (luaZ_sizebuffer(b) >= MAX_SIZE / 2)
  45. lexerror(ls, "lexical element too long", 0);
  46. newsize = luaZ_sizebuffer(b) * 2;
  47. luaZ_resizebuffer(ls->L, b, newsize);
  48. }
  49. b->buffer[luaZ_bufflen(b)++] = cast(char, c);
  50. }
  51. void luaX_init(lua_State *L)
  52. {
  53. int i;
  54. TString *e = luaS_newliteral(L, LUA_ENV); /* create env name */
  55. luaC_fix(L, obj2gco(e)); /* never collect this name */
  56. for (i = 0; i < NUM_RESERVED; i++)
  57. {
  58. TString *ts = luaS_new(L, luaX_tokens[i]);
  59. luaC_fix(L, obj2gco(ts)); /* reserved words are never collected */
  60. ts->extra = cast_byte(i + 1); /* reserved word */
  61. }
  62. }
  63. const char *luaX_token2str(LexState *ls, int token)
  64. {
  65. if (token < FIRST_RESERVED) /* single-byte symbols? */
  66. {
  67. lua_assert(token == cast_uchar(token));
  68. return luaO_pushfstring(ls->L, "'%c'", token);
  69. }
  70. else
  71. {
  72. const char *s = luaX_tokens[token - FIRST_RESERVED];
  73. if (token < TK_EOS) /* fixed format (symbols and reserved words)? */
  74. return luaO_pushfstring(ls->L, "'%s'", s);
  75. else /* names, strings, and numerals */
  76. return s;
  77. }
  78. }
  79. static const char *txtToken(LexState *ls, int token)
  80. {
  81. switch (token)
  82. {
  83. case TK_NAME:
  84. case TK_STRING:
  85. case TK_FLT:
  86. case TK_INT:
  87. save(ls, '\0');
  88. return luaO_pushfstring(ls->L, "'%s'", luaZ_buffer(ls->buff));
  89. default:
  90. return luaX_token2str(ls, token);
  91. }
  92. }
  93. static l_noret lexerror(LexState *ls, const char *msg, int token)
  94. {
  95. msg = luaG_addinfo(ls->L, msg, ls->source, ls->linenumber);
  96. if (token)
  97. luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token));
  98. luaD_throw(ls->L, LUA_ERRSYNTAX);
  99. }
  100. l_noret luaX_syntaxerror(LexState *ls, const char *msg)
  101. {
  102. lexerror(ls, msg, ls->t.token);
  103. }
  104. /*
  105. ** creates a new string and anchors it in scanner's table so that
  106. ** it will not be collected until the end of the compilation
  107. ** (by that time it should be anchored somewhere)
  108. */
  109. TString *luaX_newstring(LexState *ls, const char *str, size_t l)
  110. {
  111. lua_State *L = ls->L;
  112. TValue *o; /* entry for 'str' */
  113. TString *ts = luaS_newlstr(L, str, l); /* create new string */
  114. setsvalue2s(L, L->top++, ts); /* temporarily anchor it in stack */
  115. o = luaH_set(L, ls->h, L->top - 1);
  116. if (ttisnil(o)) /* not in use yet? */
  117. {
  118. /* boolean value does not need GC barrier;
  119. table has no metatable, so it does not need to invalidate cache */
  120. setbvalue(o, 1); /* t[string] = true */
  121. luaC_checkGC(L);
  122. }
  123. else /* string already present */
  124. {
  125. ts = tsvalue(keyfromval(o)); /* re-use value previously stored */
  126. }
  127. L->top--; /* remove string from stack */
  128. return ts;
  129. }
  130. /*
  131. ** increment line number and skips newline sequence (any of
  132. ** \n, \r, \n\r, or \r\n)
  133. */
  134. static void inclinenumber(LexState *ls)
  135. {
  136. int old = ls->current;
  137. lua_assert(currIsNewline(ls));
  138. next(ls); /* skip '\n' or '\r' */
  139. if (currIsNewline(ls) && ls->current != old)
  140. next(ls); /* skip '\n\r' or '\r\n' */
  141. if (++ls->linenumber >= MAX_INT)
  142. lexerror(ls, "chunk has too many lines", 0);
  143. }
  144. void luaX_setinput(lua_State *L, LexState *ls, ZIO *z, TString *source,
  145. int firstchar)
  146. {
  147. ls->t.token = 0;
  148. ls->L = L;
  149. ls->current = firstchar;
  150. ls->lookahead.token = TK_EOS; /* no look-ahead token */
  151. ls->z = z;
  152. ls->fs = NULL;
  153. ls->linenumber = 1;
  154. ls->lastline = 1;
  155. ls->source = source;
  156. ls->envn = luaS_newliteral(L, LUA_ENV); /* get env name */
  157. luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER); /* initialize buffer */
  158. }
  159. /*
  160. ** =======================================================
  161. ** LEXICAL ANALYZER
  162. ** =======================================================
  163. */
  164. static int check_next1(LexState *ls, int c)
  165. {
  166. if (ls->current == c)
  167. {
  168. next(ls);
  169. return 1;
  170. }
  171. else return 0;
  172. }
  173. /*
  174. ** Check whether current char is in set 'set' (with two chars) and
  175. ** saves it
  176. */
  177. static int check_next2(LexState *ls, const char *set)
  178. {
  179. lua_assert(set[2] == '\0');
  180. if (ls->current == set[0] || ls->current == set[1])
  181. {
  182. save_and_next(ls);
  183. return 1;
  184. }
  185. else return 0;
  186. }
  187. /* LUA_NUMBER */
  188. /*
  189. ** this function is quite liberal in what it accepts, as 'luaO_str2num'
  190. ** will reject ill-formed numerals.
  191. */
  192. static int read_numeral(LexState *ls, SemInfo *seminfo)
  193. {
  194. TValue obj;
  195. const char *expo = "Ee";
  196. int first = ls->current;
  197. lua_assert(lisdigit(ls->current));
  198. save_and_next(ls);
  199. if (first == '0' && check_next2(ls, "xX")) /* hexadecimal? */
  200. expo = "Pp";
  201. for (;;)
  202. {
  203. if (check_next2(ls, expo)) /* exponent part? */
  204. check_next2(ls, "-+"); /* optional exponent sign */
  205. if (lisxdigit(ls->current))
  206. save_and_next(ls);
  207. else if (ls->current == '.')
  208. save_and_next(ls);
  209. else break;
  210. }
  211. save(ls, '\0');
  212. if (luaO_str2num(luaZ_buffer(ls->buff), &obj) == 0) /* format error? */
  213. lexerror(ls, "malformed number", TK_FLT);
  214. if (ttisinteger(&obj))
  215. {
  216. seminfo->i = ivalue(&obj);
  217. return TK_INT;
  218. }
  219. else
  220. {
  221. lua_assert(ttisfloat(&obj));
  222. seminfo->r = fltvalue(&obj);
  223. return TK_FLT;
  224. }
  225. }
  226. /*
  227. ** skip a sequence '[=*[' or ']=*]'; if sequence is well formed, return
  228. ** its number of '='s; otherwise, return a negative number (-1 iff there
  229. ** are no '='s after initial bracket)
  230. */
  231. static int skip_sep(LexState *ls)
  232. {
  233. int count = 0;
  234. int s = ls->current;
  235. lua_assert(s == '[' || s == ']');
  236. save_and_next(ls);
  237. while (ls->current == '=')
  238. {
  239. save_and_next(ls);
  240. count++;
  241. }
  242. return (ls->current == s) ? count : (-count) - 1;
  243. }
  244. static void read_long_string(LexState *ls, SemInfo *seminfo, int sep)
  245. {
  246. int line = ls->linenumber; /* initial line (for error message) */
  247. save_and_next(ls); /* skip 2nd '[' */
  248. if (currIsNewline(ls)) /* string starts with a newline? */
  249. inclinenumber(ls); /* skip it */
  250. for (;;)
  251. {
  252. switch (ls->current)
  253. {
  254. case EOZ: /* error */
  255. {
  256. const char *what = (seminfo ? "string" : "comment");
  257. const char *msg = luaO_pushfstring(ls->L,
  258. "unfinished long %s (starting at line %d)", what, line);
  259. lexerror(ls, msg, TK_EOS);
  260. break; /* to avoid warnings */
  261. }
  262. case ']':
  263. {
  264. if (skip_sep(ls) == sep)
  265. {
  266. save_and_next(ls); /* skip 2nd ']' */
  267. goto endloop;
  268. }
  269. break;
  270. }
  271. case '\n':
  272. case '\r':
  273. {
  274. save(ls, '\n');
  275. inclinenumber(ls);
  276. if (!seminfo) luaZ_resetbuffer(ls->buff); /* avoid wasting space */
  277. break;
  278. }
  279. default:
  280. {
  281. if (seminfo) save_and_next(ls);
  282. else next(ls);
  283. }
  284. }
  285. }
  286. endloop:
  287. if (seminfo)
  288. seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + (2 + sep),
  289. luaZ_bufflen(ls->buff) - 2 * (2 + sep));
  290. }
  291. static void esccheck(LexState *ls, int c, const char *msg)
  292. {
  293. if (!c)
  294. {
  295. if (ls->current != EOZ)
  296. save_and_next(ls); /* add current to buffer for error message */
  297. lexerror(ls, msg, TK_STRING);
  298. }
  299. }
  300. static int gethexa(LexState *ls)
  301. {
  302. save_and_next(ls);
  303. esccheck(ls, lisxdigit(ls->current), "hexadecimal digit expected");
  304. return luaO_hexavalue(ls->current);
  305. }
  306. static int readhexaesc(LexState *ls)
  307. {
  308. int r = gethexa(ls);
  309. r = (r << 4) + gethexa(ls);
  310. luaZ_buffremove(ls->buff, 2); /* remove saved chars from buffer */
  311. return r;
  312. }
  313. static unsigned long readutf8esc(LexState *ls)
  314. {
  315. unsigned long r;
  316. int i = 4; /* chars to be removed: '\', 'u', '{', and first digit */
  317. save_and_next(ls); /* skip 'u' */
  318. esccheck(ls, ls->current == '{', "missing '{'");
  319. r = gethexa(ls); /* must have at least one digit */
  320. while ((save_and_next(ls), lisxdigit(ls->current)))
  321. {
  322. i++;
  323. r = (r << 4) + luaO_hexavalue(ls->current);
  324. esccheck(ls, r <= 0x10FFFF, "UTF-8 value too large");
  325. }
  326. esccheck(ls, ls->current == '}', "missing '}'");
  327. next(ls); /* skip '}' */
  328. luaZ_buffremove(ls->buff, i); /* remove saved chars from buffer */
  329. return r;
  330. }
  331. static void utf8esc(LexState *ls)
  332. {
  333. char buff[UTF8BUFFSZ];
  334. int n = luaO_utf8esc(buff, readutf8esc(ls));
  335. for (; n > 0; n--) /* add 'buff' to string */
  336. save(ls, buff[UTF8BUFFSZ - n]);
  337. }
  338. static int readdecesc(LexState *ls)
  339. {
  340. int i;
  341. int r = 0; /* result accumulator */
  342. for (i = 0; i < 3 && lisdigit(ls->current); i++) /* read up to 3 digits */
  343. {
  344. r = 10 * r + ls->current - '0';
  345. save_and_next(ls);
  346. }
  347. esccheck(ls, r <= UCHAR_MAX, "decimal escape too large");
  348. luaZ_buffremove(ls->buff, i); /* remove read digits from buffer */
  349. return r;
  350. }
  351. static void read_string(LexState *ls, int del, SemInfo *seminfo)
  352. {
  353. save_and_next(ls); /* keep delimiter (for error messages) */
  354. while (ls->current != del)
  355. {
  356. switch (ls->current)
  357. {
  358. case EOZ:
  359. lexerror(ls, "unfinished string", TK_EOS);
  360. break; /* to avoid warnings */
  361. case '\n':
  362. case '\r':
  363. lexerror(ls, "unfinished string", TK_STRING);
  364. break; /* to avoid warnings */
  365. case '\\': /* escape sequences */
  366. {
  367. int c; /* final character to be saved */
  368. save_and_next(ls); /* keep '\\' for error messages */
  369. switch (ls->current)
  370. {
  371. case 'a':
  372. c = '\a';
  373. goto read_save;
  374. case 'b':
  375. c = '\b';
  376. goto read_save;
  377. case 'f':
  378. c = '\f';
  379. goto read_save;
  380. case 'n':
  381. c = '\n';
  382. goto read_save;
  383. case 'r':
  384. c = '\r';
  385. goto read_save;
  386. case 't':
  387. c = '\t';
  388. goto read_save;
  389. case 'v':
  390. c = '\v';
  391. goto read_save;
  392. case 'x':
  393. c = readhexaesc(ls);
  394. goto read_save;
  395. case 'u':
  396. utf8esc(ls);
  397. goto no_save;
  398. case '\n':
  399. case '\r':
  400. inclinenumber(ls);
  401. c = '\n';
  402. goto only_save;
  403. case '\\':
  404. case '\"':
  405. case '\'':
  406. c = ls->current;
  407. goto read_save;
  408. case EOZ:
  409. goto no_save; /* will raise an error next loop */
  410. case 'z': /* zap following span of spaces */
  411. {
  412. luaZ_buffremove(ls->buff, 1); /* remove '\\' */
  413. next(ls); /* skip the 'z' */
  414. while (lisspace(ls->current))
  415. {
  416. if (currIsNewline(ls)) inclinenumber(ls);
  417. else next(ls);
  418. }
  419. goto no_save;
  420. }
  421. default:
  422. {
  423. esccheck(ls, lisdigit(ls->current), "invalid escape sequence");
  424. c = readdecesc(ls); /* digital escape '\ddd' */
  425. goto only_save;
  426. }
  427. }
  428. read_save:
  429. next(ls);
  430. /* go through */
  431. only_save:
  432. luaZ_buffremove(ls->buff, 1); /* remove '\\' */
  433. save(ls, c);
  434. /* go through */
  435. no_save:
  436. break;
  437. }
  438. default:
  439. save_and_next(ls);
  440. }
  441. }
  442. save_and_next(ls); /* skip delimiter */
  443. seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1,
  444. luaZ_bufflen(ls->buff) - 2);
  445. }
  446. static int llex(LexState *ls, SemInfo *seminfo)
  447. {
  448. luaZ_resetbuffer(ls->buff);
  449. for (;;)
  450. {
  451. switch (ls->current)
  452. {
  453. case '\n':
  454. case '\r': /* line breaks */
  455. {
  456. inclinenumber(ls);
  457. break;
  458. }
  459. case ' ':
  460. case '\f':
  461. case '\t':
  462. case '\v': /* spaces */
  463. {
  464. next(ls);
  465. break;
  466. }
  467. case '-': /* '-' or '--' (comment) */
  468. {
  469. next(ls);
  470. if (ls->current != '-') return '-';
  471. /* else is a comment */
  472. next(ls);
  473. if (ls->current == '[') /* long comment? */
  474. {
  475. int sep = skip_sep(ls);
  476. luaZ_resetbuffer(ls->buff); /* 'skip_sep' may dirty the buffer */
  477. if (sep >= 0)
  478. {
  479. read_long_string(ls, NULL, sep); /* skip long comment */
  480. luaZ_resetbuffer(ls->buff); /* previous call may dirty the buff. */
  481. break;
  482. }
  483. }
  484. /* else short comment */
  485. while (!currIsNewline(ls) && ls->current != EOZ)
  486. next(ls); /* skip until end of line (or end of file) */
  487. break;
  488. }
  489. case '[': /* long string or simply '[' */
  490. {
  491. int sep = skip_sep(ls);
  492. if (sep >= 0)
  493. {
  494. read_long_string(ls, seminfo, sep);
  495. return TK_STRING;
  496. }
  497. else if (sep != -1) /* '[=...' missing second bracket */
  498. lexerror(ls, "invalid long string delimiter", TK_STRING);
  499. return '[';
  500. }
  501. case '=':
  502. {
  503. next(ls);
  504. if (check_next1(ls, '=')) return TK_EQ;
  505. else return '=';
  506. }
  507. case '<':
  508. {
  509. next(ls);
  510. if (check_next1(ls, '=')) return TK_LE;
  511. else if (check_next1(ls, '<')) return TK_SHL;
  512. else return '<';
  513. }
  514. case '>':
  515. {
  516. next(ls);
  517. if (check_next1(ls, '=')) return TK_GE;
  518. else if (check_next1(ls, '>')) return TK_SHR;
  519. else return '>';
  520. }
  521. case '/':
  522. {
  523. next(ls);
  524. if (check_next1(ls, '/')) return TK_IDIV;
  525. else return '/';
  526. }
  527. case '~':
  528. {
  529. next(ls);
  530. if (check_next1(ls, '=')) return TK_NE;
  531. else return '~';
  532. }
  533. case ':':
  534. {
  535. next(ls);
  536. if (check_next1(ls, ':')) return TK_DBCOLON;
  537. else return ':';
  538. }
  539. case '"':
  540. case '\'': /* short literal strings */
  541. {
  542. read_string(ls, ls->current, seminfo);
  543. return TK_STRING;
  544. }
  545. case '.': /* '.', '..', '...', or number */
  546. {
  547. save_and_next(ls);
  548. if (check_next1(ls, '.'))
  549. {
  550. if (check_next1(ls, '.'))
  551. return TK_DOTS; /* '...' */
  552. else return TK_CONCAT; /* '..' */
  553. }
  554. else if (!lisdigit(ls->current)) return '.';
  555. else return read_numeral(ls, seminfo);
  556. }
  557. case '0':
  558. case '1':
  559. case '2':
  560. case '3':
  561. case '4':
  562. case '5':
  563. case '6':
  564. case '7':
  565. case '8':
  566. case '9':
  567. {
  568. return read_numeral(ls, seminfo);
  569. }
  570. case EOZ:
  571. {
  572. return TK_EOS;
  573. }
  574. default:
  575. {
  576. if (lislalpha(ls->current)) /* identifier or reserved word? */
  577. {
  578. TString *ts;
  579. do
  580. {
  581. save_and_next(ls);
  582. }
  583. while (lislalnum(ls->current));
  584. ts = luaX_newstring(ls, luaZ_buffer(ls->buff),
  585. luaZ_bufflen(ls->buff));
  586. seminfo->ts = ts;
  587. if (isreserved(ts)) /* reserved word? */
  588. return ts->extra - 1 + FIRST_RESERVED;
  589. else
  590. {
  591. return TK_NAME;
  592. }
  593. }
  594. else /* single-char tokens (+ - / ...) */
  595. {
  596. int c = ls->current;
  597. next(ls);
  598. return c;
  599. }
  600. }
  601. }
  602. }
  603. }
  604. void luaX_next(LexState *ls)
  605. {
  606. ls->lastline = ls->linenumber;
  607. if (ls->lookahead.token != TK_EOS) /* is there a look-ahead token? */
  608. {
  609. ls->t = ls->lookahead; /* use this one */
  610. ls->lookahead.token = TK_EOS; /* and discharge it */
  611. }
  612. else
  613. ls->t.token = llex(ls, &ls->t.seminfo); /* read next token */
  614. }
  615. int luaX_lookahead(LexState *ls)
  616. {
  617. lua_assert(ls->lookahead.token == TK_EOS);
  618. ls->lookahead.token = llex(ls, &ls->lookahead.seminfo);
  619. return ls->lookahead.token;
  620. }