jslex.c 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501
  1. /*
  2. * This file is part of Espruino, a JavaScript interpreter for Microcontrollers
  3. *
  4. * Copyright (C) 2013 Gordon Williams <gw@pur3.co.uk>
  5. *
  6. * This Source Code Form is subject to the terms of the Mozilla Public
  7. * License, v. 2.0. If a copy of the MPL was not distributed with this
  8. * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  9. *
  10. * ----------------------------------------------------------------------------
  11. * Lexer (convert JsVar strings into a series of tokens)
  12. * ----------------------------------------------------------------------------
  13. */
  14. #include "jslex.h"
  15. void jslSeek(JsLex *lex, JslCharPos seekToChar) {
  16. jsvStringIteratorFree(&lex->it);
  17. jsvStringIteratorNew(&lex->it, lex->sourceVar, seekToChar);
  18. }
  19. void NO_INLINE jslGetNextCh(JsLex *lex) {
  20. lex->currCh = lex->nextCh;
  21. lex->nextCh = jsvStringIteratorGetChar(&lex->it);
  22. jsvStringIteratorNextInline(&lex->it);
  23. }
  24. static inline void jslTokenAppendChar(JsLex *lex, char ch) {
  25. /* Add character to buffer but check it isn't too big.
  26. * Also Leave ONE character at the end for null termination */
  27. if (lex->tokenl < JSLEX_MAX_TOKEN_LENGTH-1) {
  28. lex->token[lex->tokenl++] = ch;
  29. }
  30. #ifdef DEBUG
  31. else {
  32. jsWarnAt("Token name is too long! skipping character", lex, lex->tokenStart);
  33. }
  34. #endif
  35. }
  36. static bool jslIsToken(JsLex *lex, const char *token, int startOffset) {
  37. int i;
  38. for (i=startOffset;i<lex->tokenl;i++) {
  39. if (lex->token[i]!=token[i]) return false;
  40. // if token is smaller than lex->token, there will be a null char
  41. // which will be different from the token
  42. }
  43. return token[lex->tokenl] == 0; // only match if token ends now
  44. }
  45. void jslGetNextToken(JsLex *lex) {
  46. lex->tk = LEX_EOF;
  47. lex->tokenl = 0; // clear token string
  48. if (lex->tokenValue) {
  49. jsvUnLock(lex->tokenValue);
  50. lex->tokenValue = 0;
  51. }
  52. while (lex->currCh && isWhitespace(lex->currCh)) jslGetNextCh(lex);
  53. // newline comments
  54. if (lex->currCh=='/' && lex->nextCh=='/') {
  55. while (lex->currCh && lex->currCh!='\n') jslGetNextCh(lex);
  56. jslGetNextCh(lex);
  57. jslGetNextToken(lex);
  58. return;
  59. }
  60. // block comments
  61. if (lex->currCh=='/' && lex->nextCh=='*') {
  62. while (lex->currCh && !(lex->currCh=='*' && lex->nextCh=='/'))
  63. jslGetNextCh(lex);
  64. if (!lex->currCh) {
  65. lex->tk = LEX_UNFINISHED_COMMENT;
  66. return; /* an unfinished multi-line comment. When in interactive console,
  67. detect this and make sure we accept new lines */
  68. }
  69. jslGetNextCh(lex);
  70. jslGetNextCh(lex);
  71. jslGetNextToken(lex);
  72. return;
  73. }
  74. // record beginning of this token
  75. lex->tokenLastStart = lex->tokenStart;
  76. lex->tokenStart = (JslCharPos)(lex->it.index-2);
  77. // tokens
  78. if (isAlpha(lex->currCh) || lex->currCh=='$') { // IDs
  79. while (isAlpha(lex->currCh) || isNumeric(lex->currCh) || lex->currCh=='$') {
  80. jslTokenAppendChar(lex, lex->currCh);
  81. jslGetNextCh(lex);
  82. }
  83. lex->tk = LEX_ID;
  84. // We do fancy stuff here to reduce number of compares (hopefully GCC creates a jump table)
  85. switch (lex->token[0]) {
  86. case 'b': if (jslIsToken(lex,"break", 1)) lex->tk = LEX_R_BREAK;
  87. break;
  88. case 'c': if (jslIsToken(lex,"case", 1)) lex->tk = LEX_R_CASE;
  89. else if (jslIsToken(lex,"continue", 1)) lex->tk = LEX_R_CONTINUE;
  90. break;
  91. case 'd': if (jslIsToken(lex,"default", 1)) lex->tk = LEX_R_DEFAULT;
  92. else if (jslIsToken(lex,"do", 1)) lex->tk = LEX_R_DO;
  93. break;
  94. case 'e': if (jslIsToken(lex,"else", 1)) lex->tk = LEX_R_ELSE;
  95. break;
  96. case 'f': if (jslIsToken(lex,"false", 1)) lex->tk = LEX_R_FALSE;
  97. else if (jslIsToken(lex,"for", 1)) lex->tk = LEX_R_FOR;
  98. else if (jslIsToken(lex,"function", 1)) lex->tk = LEX_R_FUNCTION;
  99. break;
  100. case 'i': if (jslIsToken(lex,"if", 1)) lex->tk = LEX_R_IF;
  101. else if (jslIsToken(lex,"in", 1)) lex->tk = LEX_R_IN;
  102. else if (jslIsToken(lex,"instanceof", 1)) lex->tk = LEX_R_INSTANCEOF;
  103. break;
  104. case 'n': if (jslIsToken(lex,"new", 1)) lex->tk = LEX_R_NEW;
  105. else if (jslIsToken(lex,"null", 1)) lex->tk = LEX_R_NULL;
  106. break;
  107. case 'r': if (jslIsToken(lex,"return", 1)) lex->tk = LEX_R_RETURN;
  108. break;
  109. case 's': if (jslIsToken(lex,"switch", 1)) lex->tk = LEX_R_SWITCH;
  110. break;
  111. case 't': if (jslIsToken(lex,"this", 1)) lex->tk = LEX_R_THIS;
  112. else if (jslIsToken(lex,"true", 1)) lex->tk = LEX_R_TRUE;
  113. else if (jslIsToken(lex,"typeof", 1)) lex->tk = LEX_R_TYPEOF;
  114. break;
  115. case 'u': if (jslIsToken(lex,"undefined", 1)) lex->tk = LEX_R_UNDEFINED;
  116. break;
  117. case 'w': if (jslIsToken(lex,"while", 1)) lex->tk = LEX_R_WHILE;
  118. break;
  119. case 'v': if (jslIsToken(lex,"var", 1)) lex->tk = LEX_R_VAR;
  120. else if (jslIsToken(lex,"void", 1)) lex->tk = LEX_R_VOID;
  121. break;
  122. default: break;
  123. }
  124. } else if (isNumeric(lex->currCh)) { // Numbers
  125. // TODO: check numbers aren't the wrong format
  126. bool canBeFloating = true;
  127. if (lex->currCh=='0') {
  128. jslTokenAppendChar(lex, lex->currCh);
  129. jslGetNextCh(lex);
  130. }
  131. if ((lex->currCh=='x' || lex->currCh=='X') ||
  132. (lex->currCh=='b' || lex->currCh=='B') ||
  133. (lex->currCh=='o' || lex->currCh=='O')) {
  134. canBeFloating = false;
  135. jslTokenAppendChar(lex, lex->currCh); jslGetNextCh(lex);
  136. }
  137. lex->tk = LEX_INT;
  138. while (isNumeric(lex->currCh) || (!canBeFloating && isHexadecimal(lex->currCh))) {
  139. jslTokenAppendChar(lex, lex->currCh);
  140. jslGetNextCh(lex);
  141. }
  142. if (canBeFloating && lex->currCh=='.') {
  143. lex->tk = LEX_FLOAT;
  144. jslTokenAppendChar(lex, '.');
  145. jslGetNextCh(lex);
  146. while (isNumeric(lex->currCh)) {
  147. jslTokenAppendChar(lex, lex->currCh);
  148. jslGetNextCh(lex);
  149. }
  150. }
  151. // do fancy e-style floating point
  152. if (canBeFloating && (lex->currCh=='e'||lex->currCh=='E')) {
  153. lex->tk = LEX_FLOAT;
  154. jslTokenAppendChar(lex, lex->currCh); jslGetNextCh(lex);
  155. if (lex->currCh=='-' || lex->currCh=='+') { jslTokenAppendChar(lex, lex->currCh); jslGetNextCh(lex); }
  156. while (isNumeric(lex->currCh)) {
  157. jslTokenAppendChar(lex, lex->currCh); jslGetNextCh(lex);
  158. }
  159. }
  160. } else if (lex->currCh=='"' || lex->currCh=='\'') {
  161. char delim = lex->currCh;
  162. lex->tokenValue = jsvNewFromEmptyString();
  163. // strings...
  164. jslGetNextCh(lex);
  165. while (lex->currCh && lex->currCh!=delim) {
  166. if (lex->currCh == '\\') {
  167. jslGetNextCh(lex);
  168. char ch = lex->currCh;
  169. switch (lex->currCh) {
  170. case 'n' : ch = '\n'; jslGetNextCh(lex); break;
  171. case 'a' : ch = '\a'; jslGetNextCh(lex); break;
  172. case 'r' : ch = '\r'; jslGetNextCh(lex); break;
  173. case 't' : ch = '\t'; jslGetNextCh(lex); break;
  174. case 'x' : { // hex digits
  175. char buf[5] = "0x??";
  176. jslGetNextCh(lex);
  177. buf[2] = lex->currCh; jslGetNextCh(lex);
  178. buf[3] = lex->currCh; jslGetNextCh(lex);
  179. ch = (char)stringToInt(buf);
  180. } break;
  181. default:
  182. if (lex->currCh>='0' && lex->currCh<='7') {
  183. // octal digits
  184. char buf[5] = "0";
  185. buf[1] = lex->currCh;
  186. int n=2;
  187. jslGetNextCh(lex);
  188. if (lex->currCh>='0' && lex->currCh<='7') {
  189. buf[n++] = lex->currCh; jslGetNextCh(lex);
  190. if (lex->currCh>='0' && lex->currCh<='7') {
  191. buf[n++] = lex->currCh; jslGetNextCh(lex);
  192. }
  193. }
  194. buf[n]=0;
  195. ch = (char)stringToInt(buf);
  196. } else {
  197. // for anything else, just push the character through
  198. jslGetNextCh(lex);
  199. }
  200. break;
  201. }
  202. if (lex->tokenValue) {
  203. jslTokenAppendChar(lex, ch);
  204. jsvAppendCharacter(lex->tokenValue, ch);
  205. }
  206. } else {
  207. if (lex->tokenValue) {
  208. jslTokenAppendChar(lex, lex->currCh);
  209. jsvAppendCharacter(lex->tokenValue, lex->currCh);
  210. }
  211. jslGetNextCh(lex);
  212. }
  213. }
  214. jslGetNextCh(lex);
  215. lex->tk = LEX_STR;
  216. } else {
  217. // single chars
  218. lex->tk = lex->currCh;
  219. jslGetNextCh(lex);
  220. if (lex->tk=='=' && lex->currCh=='=') { // ==
  221. lex->tk = LEX_EQUAL;
  222. jslGetNextCh(lex);
  223. if (lex->currCh=='=') { // ===
  224. lex->tk = LEX_TYPEEQUAL;
  225. jslGetNextCh(lex);
  226. }
  227. } else if (lex->tk=='!' && lex->currCh=='=') { // !=
  228. lex->tk = LEX_NEQUAL;
  229. jslGetNextCh(lex);
  230. if (lex->currCh=='=') { // !==
  231. lex->tk = LEX_NTYPEEQUAL;
  232. jslGetNextCh(lex);
  233. }
  234. } else if (lex->tk=='<') {
  235. if (lex->currCh=='=') {
  236. lex->tk = LEX_LEQUAL;
  237. jslGetNextCh(lex);
  238. } else if (lex->currCh=='<') {
  239. lex->tk = LEX_LSHIFT;
  240. jslGetNextCh(lex);
  241. if (lex->currCh=='=') { // <<=
  242. lex->tk = LEX_LSHIFTEQUAL;
  243. jslGetNextCh(lex);
  244. }
  245. }
  246. } else if (lex->tk=='>') {
  247. if (lex->currCh=='=') {
  248. lex->tk = LEX_GEQUAL;
  249. jslGetNextCh(lex);
  250. } else if (lex->currCh=='>') {
  251. lex->tk = LEX_RSHIFT;
  252. jslGetNextCh(lex);
  253. if (lex->currCh=='=') { // >>=
  254. lex->tk = LEX_RSHIFTEQUAL;
  255. jslGetNextCh(lex);
  256. } else if (lex->currCh=='>') { // >>>
  257. jslGetNextCh(lex);
  258. if (lex->currCh=='=') { // >>>=
  259. lex->tk = LEX_RSHIFTUNSIGNEDEQUAL;
  260. jslGetNextCh(lex);
  261. } else {
  262. lex->tk = LEX_RSHIFTUNSIGNED;
  263. }
  264. }
  265. }
  266. } else if (lex->tk=='+') {
  267. if (lex->currCh=='=') {
  268. lex->tk = LEX_PLUSEQUAL;
  269. jslGetNextCh(lex);
  270. } else if (lex->currCh=='+') {
  271. lex->tk = LEX_PLUSPLUS;
  272. jslGetNextCh(lex);
  273. }
  274. } else if (lex->tk=='-') {
  275. if (lex->currCh=='=') {
  276. lex->tk = LEX_MINUSEQUAL;
  277. jslGetNextCh(lex);
  278. } else if (lex->currCh=='-') {
  279. lex->tk = LEX_MINUSMINUS;
  280. jslGetNextCh(lex);
  281. }
  282. } else if (lex->tk=='&') {
  283. if (lex->currCh=='=') {
  284. lex->tk = LEX_ANDEQUAL;
  285. jslGetNextCh(lex);
  286. } else if (lex->currCh=='&') {
  287. lex->tk = LEX_ANDAND;
  288. jslGetNextCh(lex);
  289. }
  290. } else if (lex->tk=='|') {
  291. if (lex->currCh=='=') {
  292. lex->tk = LEX_OREQUAL;
  293. jslGetNextCh(lex);
  294. } else if (lex->tk=='|' && lex->currCh=='|') {
  295. lex->tk = LEX_OROR;
  296. jslGetNextCh(lex);
  297. }
  298. } else if (lex->tk=='^' && lex->currCh=='=') {
  299. lex->tk = LEX_XOREQUAL;
  300. jslGetNextCh(lex);
  301. } else if (lex->tk=='*' && lex->currCh=='=') {
  302. lex->tk = LEX_MULEQUAL;
  303. jslGetNextCh(lex);
  304. } else if (lex->tk=='/' && lex->currCh=='=') {
  305. lex->tk = LEX_DIVEQUAL;
  306. jslGetNextCh(lex);
  307. } else if (lex->tk=='%' && lex->currCh=='=') {
  308. lex->tk = LEX_MODEQUAL;
  309. jslGetNextCh(lex);
  310. }
  311. }
  312. /* This isn't quite right yet */
  313. lex->tokenLastEnd = lex->tokenEnd;
  314. lex->tokenEnd = (JslCharPos)(lex->it.index-3)/*because of nextCh/currCh/etc */;
  315. }
  316. static inline void jslPreload(JsLex *lex) {
  317. // set up..
  318. jslGetNextCh(lex);
  319. jslGetNextCh(lex);
  320. jslGetNextToken(lex);
  321. }
  322. void jslInit(JsLex *lex, JsVar *var) {
  323. lex->sourceVar = jsvLockAgain(var);
  324. // reset stuff
  325. lex->tk = 0;
  326. lex->tokenStart = 0;
  327. lex->tokenEnd = 0;
  328. lex->tokenLastStart = 0;
  329. lex->tokenLastEnd = 0;
  330. lex->tokenl = 0;
  331. lex->tokenValue = 0;
  332. // set up iterator
  333. jsvStringIteratorNew(&lex->it, lex->sourceVar, 0);
  334. jslPreload(lex);
  335. }
  336. void jslKill(JsLex *lex) {
  337. lex->tk = LEX_EOF; // safety ;)
  338. jsvStringIteratorFree(&lex->it);
  339. if (lex->tokenValue) {
  340. jsvUnLock(lex->tokenValue);
  341. lex->tokenValue = 0;
  342. }
  343. jsvUnLock(lex->sourceVar);
  344. }
  345. void jslSeekTo(JsLex *lex, JslCharPos seekToChar) {
  346. jslSeek(lex, seekToChar);
  347. jslPreload(lex);
  348. }
  349. void jslReset(JsLex *lex) {
  350. jslSeekTo(lex, 0);
  351. }
  352. void jslTokenAsString(int token, char *str, size_t len) {
  353. // see JS_ERROR_TOKEN_BUF_SIZE
  354. if (token>32 && token<128) {
  355. assert(len>=4);
  356. str[0] = '\'';
  357. str[1] = (char)token;
  358. str[2] = '\'';
  359. str[3] = 0;
  360. return;
  361. }
  362. switch (token) {
  363. case LEX_EOF : strncpy(str, "EOF", len); return;
  364. case LEX_ID : strncpy(str, "ID", len); return;
  365. case LEX_INT : strncpy(str, "INT", len); return;
  366. case LEX_FLOAT : strncpy(str, "FLOAT", len); return;
  367. case LEX_STR : strncpy(str, "STRING", len); return;
  368. }
  369. if (token>=LEX_EQUAL && token<LEX_R_LIST_END) {
  370. const char tokenNames[] =
  371. /* LEX_EQUAL : */ "==\0"
  372. /* LEX_TYPEEQUAL : */ "===\0"
  373. /* LEX_NEQUAL : */ "!=\0"
  374. /* LEX_NTYPEEQUAL : */ "!==\0"
  375. /* LEX_LEQUAL : */ "<=\0"
  376. /* LEX_LSHIFT : */ "<<\0"
  377. /* LEX_LSHIFTEQUAL : */ "<<=\0"
  378. /* LEX_GEQUAL : */ ">=\0"
  379. /* LEX_RSHIFT : */ ">>\0"
  380. /* LEX_RSHIFTUNSIGNED */ ">>>\0"
  381. /* LEX_RSHIFTEQUAL : */ ">>=\0"
  382. /* LEX_RSHIFTUNSIGNEDEQUAL */ ">>>=\0"
  383. /* LEX_PLUSEQUAL : */ "+=\0"
  384. /* LEX_MINUSEQUAL : */ "-=\0"
  385. /* LEX_PLUSPLUS : */ "++\0"
  386. /* LEX_MINUSMINUS */ "--\0"
  387. /* LEX_MULEQUAL : */ "*=\0"
  388. /* LEX_DIVEQUAL : */ "/=\0"
  389. /* LEX_MODEQUAL : */ "%=\0"
  390. /* LEX_ANDEQUAL : */ "&=\0"
  391. /* LEX_ANDAND : */ "&&\0"
  392. /* LEX_OREQUAL : */ "|=\0"
  393. /* LEX_OROR : */ "||\0"
  394. /* LEX_XOREQUAL : */ "^=\0"
  395. // reserved words
  396. /*LEX_R_IF : */ "if\0"
  397. /*LEX_R_ELSE : */ "else\0"
  398. /*LEX_R_DO : */ "do\0"
  399. /*LEX_R_WHILE : */ "while\0"
  400. /*LEX_R_FOR : */ "for\0"
  401. /*LEX_R_BREAK : */ "return\0"
  402. /*LEX_R_CONTINUE */ "continue\0"
  403. /*LEX_R_FUNCTION */ "function\0"
  404. /*LEX_R_RETURN */ "return\0"
  405. /*LEX_R_VAR : */ "var\0"
  406. /*LEX_R_THIS : */ "this\0"
  407. /*LEX_R_TRUE : */ "true\0"
  408. /*LEX_R_FALSE : */ "false\0"
  409. /*LEX_R_NULL : */ "null\0"
  410. /*LEX_R_UNDEFINED */ "undefined\0"
  411. /*LEX_R_NEW : */ "new\0"
  412. /*LEX_R_IN : */ "in\0"
  413. /*LEX_R_INSTANCEOF */ "instanceof\0"
  414. /*LEX_R_SWITCH */ "switch\0"
  415. /*LEX_R_CASE */ "case\0"
  416. /*LEX_R_DEFAULT */ "default\0"
  417. /*LEX_R_TYPEOF : */ "typeof\0"
  418. /*LEX_R_VOID : */ "void\0"
  419. ;
  420. unsigned int p = 0;
  421. int n = token-LEX_EQUAL;
  422. while (n>0 && p<sizeof(tokenNames)) {
  423. while (tokenNames[p] && p<sizeof(tokenNames)) p++;
  424. p++; // skip the zero
  425. n--; // next token
  426. }
  427. assert(n==0);
  428. strncpy(str, &tokenNames[p], len);
  429. return;
  430. }
  431. assert(len>=10);
  432. strncpy(str, "?[",len);
  433. itoa(token, &str[2], 10);
  434. strncat(str, "]",len);
  435. }
  436. void jslGetTokenString(JsLex *lex, char *str, size_t len) {
  437. if (lex->tk == LEX_ID) {
  438. strncpy(str, "ID:", len);
  439. strncat(str, jslGetTokenValueAsString(lex), len);
  440. } else if (lex->tk == LEX_STR) {
  441. strncpy(str, "String:'", len);
  442. strncat(str, jslGetTokenValueAsString(lex), len);
  443. strncat(str, "'", len);
  444. } else
  445. jslTokenAsString(lex->tk, str, len);
  446. }
  447. char *jslGetTokenValueAsString(JsLex *lex) {
  448. assert(lex->tokenl < JSLEX_MAX_TOKEN_LENGTH);
  449. lex->token[lex->tokenl] = 0; // add final null
  450. return lex->token;
  451. }
  452. JsVar *jslGetTokenValueAsVar(JsLex *lex) {
  453. if (lex->tokenValue) {
  454. return jsvLockAgain(lex->tokenValue);
  455. } else {
  456. assert(lex->tokenl < JSLEX_MAX_TOKEN_LENGTH);
  457. lex->token[lex->tokenl] = 0; // add final null
  458. return jsvNewFromString(lex->token);
  459. }
  460. }
  461. /// Match, and return true on success, false on failure
  462. bool jslMatch(JsLex *lex, int expected_tk) {
  463. if (lex->tk!=expected_tk) {
  464. char buf[JS_ERROR_BUF_SIZE];
  465. size_t bufpos = 0;
  466. strncpy(&buf[bufpos], "Got ", JS_ERROR_BUF_SIZE-bufpos);
  467. bufpos = strlen(buf);
  468. jslGetTokenString(lex, &buf[bufpos], JS_ERROR_BUF_SIZE-bufpos);
  469. bufpos = strlen(buf);
  470. strncpy(&buf[bufpos], " expected ", JS_ERROR_BUF_SIZE-bufpos);
  471. bufpos = strlen(buf);
  472. jslTokenAsString(expected_tk, &buf[bufpos], JS_ERROR_BUF_SIZE-bufpos);
  473. jsErrorAt(buf, lex, lex->tokenStart);
  474. // Sod it, skip this token anyway - stops us looping
  475. jslGetNextToken(lex);
  476. return false;
  477. }
  478. jslGetNextToken(lex);
  479. return true;
  480. }