diff options
Diffstat (limited to 'src/lexer.h')
| -rw-r--r-- | src/lexer.h | 182 |
1 files changed, 182 insertions, 0 deletions
diff --git a/src/lexer.h b/src/lexer.h new file mode 100644 index 0000000..ca2b790 --- /dev/null +++ b/src/lexer.h @@ -0,0 +1,182 @@ + +#ifndef INCLUDE_lexerlexer +#define INCLUDE_lexerlexer + +#include "token.h" +#include <ctype.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +typedef struct { + const char *buf; + int pos; + int line; + int col; +} _LX; + +/* Error reporting with line/column info */ +static void perror_at(_LX *lx, const char *msg) { + fprintf(stderr, "[LEXER] Error at line %d, column %d: %s\n", lx->line, lx->col, msg); + exit(1); +} + +#define sic static inline char +#define sii static inline int +#define siv static inline void +#define sit static inline _T + +sic lxpeek(_LX *lx) { return lx->buf[lx->pos]; } + +sic lxget(_LX *lx) { return lx->buf[lx->pos++]; } + +static inline char *lx_strdup_checked(_LX *lx, const char *s) { + char *p = strdup(s); + if (!p) { + perror_at(lx, "out of memory"); + } + return p; +} + +siv lxskipws(_LX *lx) { + while (lxpeek(lx) == ' ' || lxpeek(lx) == '\t') { + if (lxpeek(lx) == '\t') lx->col += 8; else lx->col++; + lxget(lx); + } +} + +sit lxnext(_LX *lx) { + lxskipws(lx); + char c = lxpeek(lx); + if (c == 0) + return (_T){TK_EOF, 0, NULL}; + + // Track newlines + if (c == '\n') { + lx->line++; + lx->col = 0; + lxget(lx); + return lxnext(lx); // recurse to get next token + } + + if (isalpha(c) || c == '_') { + int start = lx->pos; + while (isalnum(lxpeek(lx)) || lxpeek(lx) == '_') + lxget(lx); + int len = lx->pos - start; + char *text = strndup(lx->buf + start, len); + if (!text) { + perror_at(lx, "out of memory"); + } + return (_T){checkkw(text), 0, text}; + } + if (isdigit(c)) { + int start = lx->pos; + while (isdigit(lxpeek(lx))) + lxget(lx); + int len = lx->pos - start; + char *text = strndup(lx->buf + start, len); + if (!text) { + perror_at(lx, "out of memory"); + } + return (_T){TK_NUMBER, atoi(text), text}; + } + + lxget(lx); + lx->col++; + // handle multi-char tokens + if (c == '=') { + if (lxpeek(lx) == '=') { lxget(lx); lx->col++; return (_T){TK_EQ,0,lx_strdup_checked(lx,"==")}; } + return (_T){TK_ASSIGN, 0, lx_strdup_checked(lx,"=")}; + } + if (c == '!') { + if (lxpeek(lx) == '=') { lxget(lx); lx->col++; return (_T){TK_NE,0,lx_strdup_checked(lx,"!=")}; } + return (_T){TK_BANG,0,lx_strdup_checked(lx,"!")}; + } + if (c == '<') { + if (lxpeek(lx) == '=') { lxget(lx); lx->col++; return (_T){TK_LE,0,lx_strdup_checked(lx,"<=")}; } + if (lxpeek(lx) == '<') { lxget(lx); lx->col++; return (_T){TK_SHL,0,lx_strdup_checked(lx,"<<")}; } + return (_T){TK_LT,0,lx_strdup_checked(lx,"<")}; + } + if (c == '>') { + if (lxpeek(lx) == '=') { lxget(lx); lx->col++; return (_T){TK_GE,0,lx_strdup_checked(lx,">=")}; } + if (lxpeek(lx) == '>') { lxget(lx); lx->col++; return (_T){TK_SHR,0,lx_strdup_checked(lx,">>")}; } + return (_T){TK_GT,0,lx_strdup_checked(lx,">")}; + } + if (c == '&') { + if (lxpeek(lx) == '&') { lxget(lx); lx->col++; return (_T){TK_AND,0,lx_strdup_checked(lx,"&&")}; } + } + if (c == '|') { + if (lxpeek(lx) == '|') { lxget(lx); lx->col++; return (_T){TK_OR,0,lx_strdup_checked(lx,"||")}; } + } + if (c == '"') { + // String literal + int start = lx->pos; // pos is already after the opening quote + while (lxpeek(lx) != '"' && lxpeek(lx) != 0) { + if (lxpeek(lx) == '\\') { + lxget(lx); // consume backslash + if (lxpeek(lx) != 0) lxget(lx); // consume escaped char + } else { + lxget(lx); + } + } + if (lxpeek(lx) == '"') { + int len = lx->pos - start; // length of content + lxget(lx); // consume closing quote + char *text = strndup(lx->buf + start, len); // start at content + if (!text) { + perror_at(lx, "out of memory"); + } + return (_T){TK_STRING, 0, text}; + } else { + perror_at(lx, "unterminated string literal"); + } + } + switch (c) { + case '(': + return (_T){TK_LPAREN, 0, lx_strdup_checked(lx,"(")}; + case ')': + return (_T){TK_RPAREN, 0, lx_strdup_checked(lx,")")}; + case '{': + return (_T){TK_LBRACE, 0, lx_strdup_checked(lx,"{")}; + case '}': + return (_T){TK_RBRACE, 0, lx_strdup_checked(lx,"}")}; + case ';': + return (_T){TK_SEMI, 0, lx_strdup_checked(lx,";")}; + case '+': + return (_T){TK_PLUS, 0, lx_strdup_checked(lx,"+")}; + case '-': + return (_T){TK_MINUS, 0, lx_strdup_checked(lx,"-")}; + case '*': + return (_T){TK_STAR, 0, lx_strdup_checked(lx,"*")}; + case '/': + return (_T){TK_SLASH, 0, lx_strdup_checked(lx,"/")}; + case '&': + return (_T){TK_AMP, 0, lx_strdup_checked(lx,"&")}; + case '|': + return (_T){TK_BAR, 0, lx_strdup_checked(lx,"|")}; + case '^': + return (_T){TK_CARET, 0, lx_strdup_checked(lx,"^")}; + case '%': + return (_T){TK_PERCENT, 0, lx_strdup_checked(lx,"%")}; + case ',': + return (_T){TK_COMMA, 0, lx_strdup_checked(lx,",")}; + case '\'': + return (_T){TK_SQUOTE, 0, lx_strdup_checked(lx,"'")}; + case '"': + return (_T){TK_DQUOTE, 0, lx_strdup_checked(lx,"\"")}; + case '[': + return (_T){TK_LBRACKET, 0, lx_strdup_checked(lx,"[")}; + case ']': + return (_T){TK_RBRACKET, 0, lx_strdup_checked(lx,"]")}; + default: + return (_T){TK_INVALID, 0, NULL}; + } +} + +#undef sic +#undef sii +#undef siv +#undef sit + +#endif |
