#ifndef INCLUDE_lexerlexer #define INCLUDE_lexerlexer #include "token.h" #include #include #include #include typedef struct { const char *buf; int pos; int line; int col; } _LX; static void perror_at(_LX *lx, const char *msg) { fprintf(stderr, "[LEXER] Error at line %d, column %d: %s\n", lx->line, lx->col, msg); exit(1); } #define sic static inline char #define sii static inline int #define siv static inline void #define sit static inline _T sic lxpeek(_LX *lx) { return lx->buf[lx->pos]; } sic lxget(_LX *lx) { return lx->buf[lx->pos++]; } static inline char *lx_strdup_checked(_LX *lx, const char *s) { char *p = strdup(s); if (!p) { perror_at(lx, "out of memory"); } return p; } siv lxskipws(_LX *lx) { while (lxpeek(lx) == ' ' || lxpeek(lx) == '\t') { if (lxpeek(lx) == '\t') lx->col += 8; else lx->col++; lxget(lx); } } sit lxnext(_LX *lx) { lxskipws(lx); char c = lxpeek(lx); if (c == 0) return (_T){TK_EOF, 0, NULL}; // Track newlines if (c == '\n') { lx->line++; lx->col = 0; lxget(lx); return lxnext(lx); } // Comments: // and /* */ if (c == '/') { // peek one ahead int saved = lx->pos; lxget(lx); char c2 = lxpeek(lx); if (c2 == '/') { while (lxpeek(lx) != '\n' && lxpeek(lx) != 0) lxget(lx); return lxnext(lx); } else if (c2 == '*') { lxget(lx); // consume '*' for (;;) { char ch = lxpeek(lx); if (ch == 0) break; // unterminated, let it go lxget(lx); if (ch == '\n') { lx->line++; lx->col = 0; } if (ch == '*' && lxpeek(lx) == '/') { lxget(lx); break; } } return lxnext(lx); } else { // not a comment — put position back, fall through to normal '/' handling lx->pos = saved; } } if (isalpha(c) || c == '_') { int start = lx->pos; while (isalnum(lxpeek(lx)) || lxpeek(lx) == '_') lxget(lx); int len = lx->pos - start; char *text = strndup(lx->buf + start, len); if (!text) { perror_at(lx, "out of memory"); } return (_T){checkkw(text), 0, text}; } if (isdigit(c)) { int start = lx->pos; while (isdigit(lxpeek(lx))) lxget(lx); int len = lx->pos - start; char *text = strndup(lx->buf + start, len); if (!text) { perror_at(lx, "out of memory"); } return (_T){TK_NUMBER, atoi(text), text}; } lxget(lx); lx->col++; // handle multi-char tokens if (c == '=') { if (lxpeek(lx) == '=') { lxget(lx); lx->col++; return (_T){TK_EQ,0,lx_strdup_checked(lx,"==")}; } return (_T){TK_ASSIGN, 0, lx_strdup_checked(lx,"=")}; } if (c == '!') { if (lxpeek(lx) == '=') { lxget(lx); lx->col++; return (_T){TK_NE,0,lx_strdup_checked(lx,"!=")}; } return (_T){TK_BANG,0,lx_strdup_checked(lx,"!")}; } if (c == '<') { if (lxpeek(lx) == '=') { lxget(lx); lx->col++; return (_T){TK_LE,0,lx_strdup_checked(lx,"<=")}; } if (lxpeek(lx) == '<') { lxget(lx); lx->col++; return (_T){TK_SHL,0,lx_strdup_checked(lx,"<<")}; } return (_T){TK_LT,0,lx_strdup_checked(lx,"<")}; } if (c == '>') { if (lxpeek(lx) == '=') { lxget(lx); lx->col++; return (_T){TK_GE,0,lx_strdup_checked(lx,">=")}; } if (lxpeek(lx) == '>') { lxget(lx); lx->col++; return (_T){TK_SHR,0,lx_strdup_checked(lx,">>")}; } return (_T){TK_GT,0,lx_strdup_checked(lx,">")}; } if (c == '&') { if (lxpeek(lx) == '&') { lxget(lx); lx->col++; return (_T){TK_AND,0,lx_strdup_checked(lx,"&&")}; } } if (c == '|') { if (lxpeek(lx) == '|') { lxget(lx); lx->col++; return (_T){TK_OR,0,lx_strdup_checked(lx,"||")}; } } if (c == '"') { // String literal — decode escape sequences into a fresh buffer int cap = 64, dlen = 0; char *decoded = (char*)malloc(cap); if (!decoded) perror_at(lx, "out of memory"); while (lxpeek(lx) != '"' && lxpeek(lx) != 0) { char ch; if (lxpeek(lx) == '\\') { lxget(lx); // consume backslash char esc = (char)lxget(lx); switch (esc) { case 'n': ch = '\n'; break; case 't': ch = '\t'; break; case 'r': ch = '\r'; break; case '0': ch = '\0'; break; case '\\': ch = '\\'; break; case '"': ch = '"'; break; case '\'': ch = '\''; break; default: ch = esc; break; } } else { ch = (char)lxget(lx); } if (dlen + 2 > cap) { cap *= 2; char *tmp = (char*)realloc(decoded, cap); if (!tmp) { free(decoded); perror_at(lx, "out of memory"); } decoded = tmp; } decoded[dlen++] = ch; } decoded[dlen] = '\0'; if (lxpeek(lx) == '"') { lxget(lx); // consume closing quote return (_T){TK_STRING, 0, decoded}; } else { free(decoded); perror_at(lx, "unterminated string literal"); } } if (c == '\'') { /* Character literal: decode a single char or escape sequence and emit * TK_CHARLIT with the integer value in .val so the parser never needs * to reason about escape sequences. */ int char_val = 0; if (lxpeek(lx) == '\\') { lxget(lx); /* consume backslash */ char esc = lxget(lx); switch (esc) { case 'n': char_val = '\n'; break; case 't': char_val = '\t'; break; case 'r': char_val = '\r'; break; case '0': char_val = '\0'; break; case '\\': char_val = '\\'; break; case '\'': char_val = '\''; break; case '"': char_val = '"'; break; case 'a': char_val = '\a'; break; case 'b': char_val = '\b'; break; case 'f': char_val = '\f'; break; case 'v': char_val = '\v'; break; case 'x': { int h = 0; while (isxdigit(lxpeek(lx))) { char hc = lxget(lx); h = h * 16 + (isdigit(hc) ? hc - '0' : tolower(hc) - 'a' + 10); } char_val = h; break; } default: char_val = (unsigned char)esc; break; } } else if (lxpeek(lx) != '\'') { char_val = (unsigned char)lxget(lx); } if (lxpeek(lx) != '\'') { perror_at(lx, "unterminated or multi-character char literal"); } lxget(lx); /* consume closing ' */ return (_T){TK_CHARLIT, char_val, NULL}; } switch (c) { case '(': return (_T){TK_LPAREN, 0, lx_strdup_checked(lx,"(")}; case ')': return (_T){TK_RPAREN, 0, lx_strdup_checked(lx,")")}; case '{': return (_T){TK_LBRACE, 0, lx_strdup_checked(lx,"{")}; case '}': return (_T){TK_RBRACE, 0, lx_strdup_checked(lx,"}")}; case ';': return (_T){TK_SEMI, 0, lx_strdup_checked(lx,";")}; case '+': if (lxpeek(lx) == '+') { lxget(lx); lx->col++; return (_T){TK_INC, 0, lx_strdup_checked(lx,"++")}; } if (lxpeek(lx) == '=') { lxget(lx); lx->col++; return (_T){TK_PLUS_EQ,0, lx_strdup_checked(lx,"+=")}; } return (_T){TK_PLUS, 0, lx_strdup_checked(lx,"+")}; case '-': if (lxpeek(lx) == '-') { lxget(lx); lx->col++; return (_T){TK_DEC, 0, lx_strdup_checked(lx,"--")}; } if (lxpeek(lx) == '=') { lxget(lx); lx->col++; return (_T){TK_MINUS_EQ,0, lx_strdup_checked(lx,"-=")}; } return (_T){TK_MINUS, 0, lx_strdup_checked(lx,"-")}; case '*': if (lxpeek(lx) == '=') { lxget(lx); lx->col++; return (_T){TK_STAR_EQ, 0, lx_strdup_checked(lx,"*=")}; } return (_T){TK_STAR, 0, lx_strdup_checked(lx,"*")}; case '/': if (lxpeek(lx) == '=') { lxget(lx); lx->col++; return (_T){TK_SLASH_EQ,0, lx_strdup_checked(lx,"/=")}; } return (_T){TK_SLASH, 0, lx_strdup_checked(lx,"/")}; case '&': return (_T){TK_AMP, 0, lx_strdup_checked(lx,"&")}; case '|': return (_T){TK_BAR, 0, lx_strdup_checked(lx,"|")}; case '^': return (_T){TK_CARET, 0, lx_strdup_checked(lx,"^")}; case '%': return (_T){TK_PERCENT, 0, lx_strdup_checked(lx,"%")}; case ',': return (_T){TK_COMMA, 0, lx_strdup_checked(lx,",")}; case '\'': return (_T){TK_SQUOTE, 0, lx_strdup_checked(lx,"'")}; case '"': return (_T){TK_DQUOTE, 0, lx_strdup_checked(lx,"\"")}; case '[': return (_T){TK_LBRACKET, 0, lx_strdup_checked(lx,"[")}; case ']': return (_T){TK_RBRACKET, 0, lx_strdup_checked(lx,"]")}; case '?': return (_T){TK_QUESTION, 0, lx_strdup_checked(lx,"?")}; case ':': return (_T){TK_COLON, 0, lx_strdup_checked(lx,":")}; default: return (_T){TK_INVALID, 0, NULL}; } } #undef sic #undef sii #undef siv #undef sit #endif