summary | refs | log | tree | commit | diff
path: root/src/lexer.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/lexer.h')
-rw-r--r-- src/lexer.h 117
1 files changed, 104 insertions, 13 deletions
diff --git a/src/lexer.h b/src/lexer.h
index ca2b790..3d36bf4 100644
--- a/src/lexer.h
+++ b/src/lexer.h
@@ -1,4 +1,3 @@
-
#ifndef INCLUDE_lexerlexer
#define INCLUDE_lexerlexer
@@ -15,7 +14,7 @@ typedef struct {
int col;
} _LX;
-/* Error reporting with line/column info */
+
static void perror_at(_LX *lx, const char *msg) {
fprintf(stderr, "[LEXER] Error at line %d, column %d: %s\n", lx->line, lx->col, msg);
exit(1);
@@ -56,7 +55,32 @@ sit lxnext(_LX *lx) {
lx->line++;
lx->col = 0;
lxget(lx);
- return lxnext(lx); // recurse to get next token
+ return lxnext(lx);
+ }
+
+ // Comments: // and /* */
+ if (c == '/') {
+ // peek one ahead
+ int saved = lx->pos;
+ lxget(lx);
+ char c2 = lxpeek(lx);
+ if (c2 == '/') {
+ while (lxpeek(lx) != '\n' && lxpeek(lx) != 0) lxget(lx);
+ return lxnext(lx);
+ } else if (c2 == '*') {
+ lxget(lx); // consume '*'
+ for (;;) {
+ char ch = lxpeek(lx);
+ if (ch == 0) break; // unterminated, let it go
+ lxget(lx);
+ if (ch == '\n') { lx->line++; lx->col = 0; }
+ if (ch == '*' && lxpeek(lx) == '/') { lxget(lx); break; }
+ }
+ return lxnext(lx);
+ } else {
+ // not a comment — put position back, fall through to normal '/' handling
+ lx->pos = saved;
+ }
}
if (isalpha(c) || c == '_') {
@@ -110,28 +134,85 @@ sit lxnext(_LX *lx) {
if (lxpeek(lx) == '|') { lxget(lx); lx->col++; return (_T){TK_OR,0,lx_strdup_checked(lx,"||")}; }
}
if (c == '"') {
- // String literal
- int start = lx->pos; // pos is already after the opening quote
+ // String literal — decode escape sequences into a fresh buffer
+ int cap = 64, dlen = 0;
+ char *decoded = (char*)malloc(cap);
+ if (!decoded) perror_at(lx, "out of memory");
while (lxpeek(lx) != '"' && lxpeek(lx) != 0) {
+ char ch;
if (lxpeek(lx) == '\\') {
lxget(lx); // consume backslash
- if (lxpeek(lx) != 0) lxget(lx); // consume escaped char
+ char esc = (char)lxget(lx);
+ switch (esc) {
+ case 'n': ch = '\n'; break;
+ case 't': ch = '\t'; break;
+ case 'r': ch = '\r'; break;
+ case '0': ch = '\0'; break;
+ case '\\': ch = '\\'; break;
+ case '"': ch = '"'; break;
+ case '\'': ch = '\''; break;
+ default: ch = esc; break;
+ }
} else {
- lxget(lx);
+ ch = (char)lxget(lx);
}
+ if (dlen + 2 > cap) {
+ cap *= 2;
+ char *tmp = (char*)realloc(decoded, cap);
+ if (!tmp) { free(decoded); perror_at(lx, "out of memory"); }
+ decoded = tmp;
+ }
+ decoded[dlen++] = ch;
}
+ decoded[dlen] = '\0';
if (lxpeek(lx) == '"') {
- int len = lx->pos - start; // length of content
lxget(lx); // consume closing quote
- char *text = strndup(lx->buf + start, len); // start at content
- if (!text) {
- perror_at(lx, "out of memory");
- }
- return (_T){TK_STRING, 0, text};
+ return (_T){TK_STRING, 0, decoded};
} else {
+ free(decoded);
perror_at(lx, "unterminated string literal");
}
}
+ if (c == '\'') {
+ /* Character literal: decode a single char or escape sequence and emit
+ * TK_CHARLIT with the integer value in .val so the parser never needs
+ * to reason about escape sequences. */
+ int char_val = 0;
+ if (lxpeek(lx) == '\\') {
+ lxget(lx); /* consume backslash */
+ char esc = lxget(lx);
+ switch (esc) {
+ case 'n': char_val = '\n'; break;
+ case 't': char_val = '\t'; break;
+ case 'r': char_val = '\r'; break;
+ case '0': char_val = '\0'; break;
+ case '\\': char_val = '\\'; break;
+ case '\'': char_val = '\''; break;
+ case '"': char_val = '"'; break;
+ case 'a': char_val = '\a'; break;
+ case 'b': char_val = '\b'; break;
+ case 'f': char_val = '\f'; break;
+ case 'v': char_val = '\v'; break;
+ case 'x': {
+ int h = 0;
+ while (isxdigit(lxpeek(lx))) {
+ char hc = lxget(lx);
+ h = h * 16 + (isdigit(hc) ? hc - '0' : tolower(hc) - 'a' + 10);
+ }
+ char_val = h;
+ break;
+ }
+ default: char_val = (unsigned char)esc; break;
+ }
+ } else if (lxpeek(lx) != '\'') {
+ char_val = (unsigned char)lxget(lx);
+ }
+ if (lxpeek(lx) != '\'') {
+ perror_at(lx, "unterminated or multi-character char literal");
+ }
+ lxget(lx); /* consume closing ' */
+ return (_T){TK_CHARLIT, char_val, NULL};
+ }
switch (c) {
case '(':
return (_T){TK_LPAREN, 0, lx_strdup_checked(lx,"(")};
@@ -144,12 +225,18 @@ sit lxnext(_LX *lx) {
case ';':
return (_T){TK_SEMI, 0, lx_strdup_checked(lx,";")};
case '+':
+ if (lxpeek(lx) == '+') { lxget(lx); lx->col++; return (_T){TK_INC, 0, lx_strdup_checked(lx,"++")}; }
+ if (lxpeek(lx) == '=') { lxget(lx); lx->col++; return (_T){TK_PLUS_EQ,0, lx_strdup_checked(lx,"+=")}; }
return (_T){TK_PLUS, 0, lx_strdup_checked(lx,"+")};
case '-':
+ if (lxpeek(lx) == '-') { lxget(lx); lx->col++; return (_T){TK_DEC, 0, lx_strdup_checked(lx,"--")}; }
+ if (lxpeek(lx) == '=') { lxget(lx); lx->col++; return (_T){TK_MINUS_EQ,0, lx_strdup_checked(lx,"-=")}; }
return (_T){TK_MINUS, 0, lx_strdup_checked(lx,"-")};
case '*':
+ if (lxpeek(lx) == '=') { lxget(lx); lx->col++; return (_T){TK_STAR_EQ, 0, lx_strdup_checked(lx,"*=")}; }
return (_T){TK_STAR, 0, lx_strdup_checked(lx,"*")};
case '/':
+ if (lxpeek(lx) == '=') { lxget(lx); lx->col++; return (_T){TK_SLASH_EQ,0, lx_strdup_checked(lx,"/=")}; }
return (_T){TK_SLASH, 0, lx_strdup_checked(lx,"/")};
case '&':
return (_T){TK_AMP, 0, lx_strdup_checked(lx,"&")};
@@ -169,6 +256,10 @@ sit lxnext(_LX *lx) {
return (_T){TK_LBRACKET, 0, lx_strdup_checked(lx,"[")};
case ']':
return (_T){TK_RBRACKET, 0, lx_strdup_checked(lx,"]")};
+ case '?':
+ return (_T){TK_QUESTION, 0, lx_strdup_checked(lx,"?")};
+ case ':':
+ return (_T){TK_COLON, 0, lx_strdup_checked(lx,":")};
default:
return (_T){TK_INVALID, 0, NULL};
}