diff options
Diffstat (limited to 'src/lexer.h')
| -rw-r--r-- | src/lexer.h | 117 |
1 files changed, 104 insertions, 13 deletions
diff --git a/src/lexer.h b/src/lexer.h index ca2b790..3d36bf4 100644 --- a/src/lexer.h +++ b/src/lexer.h @@ -1,4 +1,3 @@ - #ifndef INCLUDE_lexerlexer #define INCLUDE_lexerlexer @@ -15,7 +14,7 @@ typedef struct { int col; } _LX; -/* Error reporting with line/column info */ + static void perror_at(_LX *lx, const char *msg) { fprintf(stderr, "[LEXER] Error at line %d, column %d: %s\n", lx->line, lx->col, msg); exit(1); @@ -56,7 +55,32 @@ sit lxnext(_LX *lx) { lx->line++; lx->col = 0; lxget(lx); - return lxnext(lx); // recurse to get next token + return lxnext(lx); + } + + // Comments: // and /* */ + if (c == '/') { + // peek one ahead + int saved = lx->pos; + lxget(lx); + char c2 = lxpeek(lx); + if (c2 == '/') { + while (lxpeek(lx) != '\n' && lxpeek(lx) != 0) lxget(lx); + return lxnext(lx); + } else if (c2 == '*') { + lxget(lx); // consume '*' + for (;;) { + char ch = lxpeek(lx); + if (ch == 0) break; // unterminated, let it go + lxget(lx); + if (ch == '\n') { lx->line++; lx->col = 0; } + if (ch == '*' && lxpeek(lx) == '/') { lxget(lx); break; } + } + return lxnext(lx); + } else { + // not a comment — put position back, fall through to normal '/' handling + lx->pos = saved; + } } if (isalpha(c) || c == '_') { @@ -110,28 +134,85 @@ sit lxnext(_LX *lx) { if (lxpeek(lx) == '|') { lxget(lx); lx->col++; return (_T){TK_OR,0,lx_strdup_checked(lx,"||")}; } } if (c == '"') { - // String literal - int start = lx->pos; // pos is already after the opening quote + // String literal — decode escape sequences into a fresh buffer + int cap = 64, dlen = 0; + char *decoded = (char*)malloc(cap); + if (!decoded) perror_at(lx, "out of memory"); while (lxpeek(lx) != '"' && lxpeek(lx) != 0) { + char ch; if (lxpeek(lx) == '\\') { lxget(lx); // consume backslash - if (lxpeek(lx) != 0) lxget(lx); // consume escaped char + char esc = (char)lxget(lx); + switch (esc) { + case 'n': ch = '\n'; break; + case 't': ch = '\t'; break; + case 'r': ch = '\r'; break; + case '0': ch = '\0'; break; + case '\\': ch = '\\'; break; + case '"': ch = '"'; break; + case '\'': ch = '\''; break; + default: ch = esc; break; + } } else { - lxget(lx); + ch = (char)lxget(lx); } + if (dlen + 2 > cap) { + cap *= 2; + char *tmp = (char*)realloc(decoded, cap); + if (!tmp) { free(decoded); perror_at(lx, "out of memory"); } + decoded = tmp; + } + decoded[dlen++] = ch; } + decoded[dlen] = '\0'; if (lxpeek(lx) == '"') { - int len = lx->pos - start; // length of content lxget(lx); // consume closing quote - char *text = strndup(lx->buf + start, len); // start at content - if (!text) { - perror_at(lx, "out of memory"); - } - return (_T){TK_STRING, 0, text}; + return (_T){TK_STRING, 0, decoded}; } else { + free(decoded); perror_at(lx, "unterminated string literal"); } } + if (c == '\'') { + /* Character literal: decode a single char or escape sequence and emit + * TK_CHARLIT with the integer value in .val so the parser never needs + * to reason about escape sequences. */ + int char_val = 0; + if (lxpeek(lx) == '\\') { + lxget(lx); /* consume backslash */ + char esc = lxget(lx); + switch (esc) { + case 'n': char_val = '\n'; break; + case 't': char_val = '\t'; break; + case 'r': char_val = '\r'; break; + case '0': char_val = '\0'; break; + case '\\': char_val = '\\'; break; + case '\'': char_val = '\''; break; + case '"': char_val = '"'; break; + case 'a': char_val = '\a'; break; + case 'b': char_val = '\b'; break; + case 'f': char_val = '\f'; break; + case 'v': char_val = '\v'; break; + case 'x': { + int h = 0; + while (isxdigit(lxpeek(lx))) { + char hc = lxget(lx); + h = h * 16 + (isdigit(hc) ? hc - '0' : tolower(hc) - 'a' + 10); + } + char_val = h; + break; + } + default: char_val = (unsigned char)esc; break; + } + } else if (lxpeek(lx) != '\'') { + char_val = (unsigned char)lxget(lx); + } + if (lxpeek(lx) != '\'') { + perror_at(lx, "unterminated or multi-character char literal"); + } + lxget(lx); /* consume closing ' */ + return (_T){TK_CHARLIT, char_val, NULL}; + } switch (c) { case '(': return (_T){TK_LPAREN, 0, lx_strdup_checked(lx,"(")}; @@ -144,12 +225,18 @@ sit lxnext(_LX *lx) { case ';': return (_T){TK_SEMI, 0, lx_strdup_checked(lx,";")}; case '+': + if (lxpeek(lx) == '+') { lxget(lx); lx->col++; return (_T){TK_INC, 0, lx_strdup_checked(lx,"++")}; } + if (lxpeek(lx) == '=') { lxget(lx); lx->col++; return (_T){TK_PLUS_EQ,0, lx_strdup_checked(lx,"+=")}; } return (_T){TK_PLUS, 0, lx_strdup_checked(lx,"+")}; case '-': + if (lxpeek(lx) == '-') { lxget(lx); lx->col++; return (_T){TK_DEC, 0, lx_strdup_checked(lx,"--")}; } + if (lxpeek(lx) == '=') { lxget(lx); lx->col++; return (_T){TK_MINUS_EQ,0, lx_strdup_checked(lx,"-=")}; } return (_T){TK_MINUS, 0, lx_strdup_checked(lx,"-")}; case '*': + if (lxpeek(lx) == '=') { lxget(lx); lx->col++; return (_T){TK_STAR_EQ, 0, lx_strdup_checked(lx,"*=")}; } return (_T){TK_STAR, 0, lx_strdup_checked(lx,"*")}; case '/': + if (lxpeek(lx) == '=') { lxget(lx); lx->col++; return (_T){TK_SLASH_EQ,0, lx_strdup_checked(lx,"/=")}; } return (_T){TK_SLASH, 0, lx_strdup_checked(lx,"/")}; case '&': return (_T){TK_AMP, 0, lx_strdup_checked(lx,"&")}; @@ -169,6 +256,10 @@ sit lxnext(_LX *lx) { return (_T){TK_LBRACKET, 0, lx_strdup_checked(lx,"[")}; case ']': return (_T){TK_RBRACKET, 0, lx_strdup_checked(lx,"]")}; + case '?': + return (_T){TK_QUESTION, 0, lx_strdup_checked(lx,"?")}; + case ':': + return (_T){TK_COLON, 0, lx_strdup_checked(lx,":")}; default: return (_T){TK_INVALID, 0, NULL}; } |
