From 6cb316da30cb460d54c4eec6fe15e034c89a7b36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20K=C3=A4chele?= Date: Fri, 17 Oct 2025 17:00:49 +0200 Subject: [PATCH 01/33] Make token value hb_string --- src/include/token_struct.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/include/token_struct.h b/src/include/token_struct.h index 2727d2a4c..595bf27d1 100644 --- a/src/include/token_struct.h +++ b/src/include/token_struct.h @@ -3,6 +3,7 @@ #include "location.h" #include "range.h" +#include "util/hb_string.h" typedef enum { TOKEN_WHITESPACE, // ' ' @@ -49,7 +50,7 @@ typedef enum { } token_type_T; typedef struct TOKEN_STRUCT { - char* value; + hb_string_T value; range_T range; location_T location; token_type_T type; From 74ceb0e824b0c0cd5afa50bba1e7c9b4b46489b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20K=C3=A4chele?= Date: Fri, 17 Oct 2025 17:02:12 +0200 Subject: [PATCH 02/33] Make token_init take hb_string value --- src/include/token.h | 3 ++- src/token.c | 9 +++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/include/token.h b/src/include/token.h index 5628e2f0d..7c8dd8778 100644 --- a/src/include/token.h +++ b/src/include/token.h @@ -4,8 +4,9 @@ #include "lexer_struct.h" #include "position.h" #include "token_struct.h" +#include "util/hb_string.h" -token_T* token_init(const char* value, token_type_T type, lexer_T* lexer); +token_T* token_init(hb_string_T value, token_type_T type, lexer_T* lexer); char* token_to_string(const token_T* token); const char* token_type_to_string(token_type_T type); diff --git a/src/token.c b/src/token.c index b6ef42fe8..e0fd81ff8 100644 --- a/src/token.c +++ b/src/token.c @@ -4,6 +4,7 @@ #include "include/range.h" #include "include/token_struct.h" #include "include/util.h" +#include "include/util/hb_string.h" #include #include @@ -13,7 +14,7 @@ size_t token_sizeof(void) { return sizeof(struct TOKEN_STRUCT); } -token_T* token_init(const char* value, const token_type_T type, lexer_T* lexer) { +token_T* token_init(hb_string_T value, const token_type_T type, lexer_T* lexer) { token_T* token = calloc(1, token_sizeof()); if (type == TOKEN_NEWLINE) { @@ -21,11 +22,7 @@ token_T* token_init(const char* value, const token_type_T type, lexer_T* lexer) lexer->current_column = 0; } - if (value) { - token->value = herb_strdup(value); - } else { - token->value = NULL; - } + token->value = value; token->type = type; token->range = (range_T) { .from = lexer->previous_position, .to = lexer->current_position }; From 2de7ee36fe9d1f0f0fec8761d21f4f138edcfcf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20K=C3=A4chele?= Date: Fri, 17 Oct 2025 17:04:48 +0200 Subject: [PATCH 03/33] Allocate lexer errors using the arena --- src/lexer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lexer.c b/src/lexer.c index 45995751c..7de89dc64 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -55,7 +55,7 @@ void lexer_init(lexer_T* lexer, const char* source) { } token_T* lexer_error(lexer_T* lexer, const char* message) { - char error_message[128]; + char *error_message = hb_arena_alloc(lexer->allocator, sizeof(char) * 128); snprintf( error_message, @@ -67,7 +67,7 @@ token_T* lexer_error(lexer_T* lexer, const char* message) { lexer->current_column ); - return token_init(error_message, TOKEN_ERROR, lexer); + return token_init(hb_string_from_c_string(error_message), TOKEN_ERROR, lexer); } static void lexer_advance(lexer_T* lexer) { From ef6120c29c266032321bd578ff17c359d4bc5f42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20K=C3=A4chele?= Date: Fri, 17 Oct 2025 17:14:21 +0200 Subject: [PATCH 04/33] Make lexer_match_and_advance take string --- src/lexer.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/lexer.c b/src/lexer.c index 7de89dc64..85d106ca3 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -55,7 +55,7 @@ void lexer_init(lexer_T* lexer, const char* source) { } token_T* lexer_error(lexer_T* lexer, const char* message) { - char *error_message = hb_arena_alloc(lexer->allocator, sizeof(char) * 128); + char* error_message = hb_arena_alloc(lexer->allocator, sizeof(char) * 128); snprintf( error_message, @@ -157,9 +157,11 @@ static token_T* lexer_advance_utf8_character(lexer_T* lexer, const token_type_T return token; } -static token_T* lexer_match_and_advance(lexer_T* lexer, const char* value, const token_type_T type) { - if (strncmp(lexer->source.data + lexer->current_position, value, strlen(value)) == 0) { - return lexer_advance_with(lexer, value, type); +static token_T* lexer_match_and_advance(lexer_T* lexer, hb_string_T value, const token_type_T type) { + hb_string_T remaining_source = hb_string_slice(lexer->source, lexer->current_position); + if (hb_string_starts_with(remaining_source, value)) { + // TODO(Tim): Fix string + return lexer_advance_with(lexer, value.data, type); } return NULL; @@ -206,11 +208,13 @@ static token_T* lexer_parse_identifier(lexer_T* lexer) { // ===== ERB Parsing static token_T* lexer_parse_erb_open(lexer_T* lexer) { - const char* erb_patterns[] = { "<%==", "<%%=", "<%=", "<%#", "<%-", "<%%", "<%" }; + hb_string_T erb_patterns[7] = { hb_string_from_c_string("<%=="), hb_string_from_c_string("<%%="), + hb_string_from_c_string("<%="), hb_string_from_c_string("<%#"), + hb_string_from_c_string("<%-"), hb_string_from_c_string("<%%"), + hb_string_from_c_string("<%") }; lexer->state = STATE_ERB_CONTENT; - - for (size_t i = 0; i < sizeof(erb_patterns) / sizeof(erb_patterns[0]); i++) { + for (size_t i = 0; i < 7; i++) { token_T* match = lexer_match_and_advance(lexer, erb_patterns[i], TOKEN_ERB_START); if (match) { return match; } } @@ -314,22 +318,22 @@ token_T* lexer_next_token(lexer_T* lexer) { } case '/': { - token_T* token = lexer_match_and_advance(lexer, "/>", TOKEN_HTML_TAG_SELF_CLOSE); + token_T* token = lexer_match_and_advance(lexer, hb_string_from_c_string("/>"), TOKEN_HTML_TAG_SELF_CLOSE); return token ? token : lexer_advance_current(lexer, TOKEN_SLASH); } case '?': { - token_T* token = lexer_match_and_advance(lexer, "?>", TOKEN_XML_DECLARATION_END); + token_T* token = lexer_match_and_advance(lexer, hb_string_from_c_string("?>"), TOKEN_XML_DECLARATION_END); return token ? token : lexer_advance_current(lexer, TOKEN_CHARACTER); } case '-': { - token_T* token = lexer_match_and_advance(lexer, "-->", TOKEN_HTML_COMMENT_END); + token_T* token = lexer_match_and_advance(lexer, hb_string_from_c_string("-->"), TOKEN_HTML_COMMENT_END); return token ? token : lexer_advance_current(lexer, TOKEN_DASH); } case ']': { - token_T* token = lexer_match_and_advance(lexer, "]]>", TOKEN_CDATA_END); + token_T* token = lexer_match_and_advance(lexer, hb_string_from_c_string("]]>"), TOKEN_CDATA_END); return token ? token : lexer_advance_current(lexer, TOKEN_CHARACTER); } From 86c995fd43131b21e5e70ee55b52c4937701c04b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20K=C3=A4chele?= Date: Fri, 17 Oct 2025 17:40:13 +0200 Subject: [PATCH 05/33] Make lexer_advance_with take string --- src/lexer.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/lexer.c b/src/lexer.c index 85d106ca3..3127966e7 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -102,8 +102,8 @@ static void lexer_advance_by(lexer_T* lexer, const size_t count) { } } -static token_T* lexer_advance_with(lexer_T* lexer, const char* value, const token_type_T type) { - lexer_advance_by(lexer, strlen(value)); +static token_T* lexer_advance_with(lexer_T* lexer, hb_string_T value, const token_type_T type) { + lexer_advance_by(lexer, value.length); return token_init(value, type, lexer); } @@ -125,7 +125,8 @@ static token_T* lexer_advance_with_next(lexer_T* lexer, size_t count, token_type } static token_T* lexer_advance_current(lexer_T* lexer, const token_type_T type) { - return lexer_advance_with(lexer, (char[]) { lexer->current_character, '\0' }, type); + hb_string_T value = { .data = lexer->source.data + lexer->current_position, .length = 1 }; + return lexer_advance_with(lexer, value, type); } static token_T* lexer_advance_utf8_character(lexer_T* lexer, const token_type_T type) { @@ -159,10 +160,7 @@ static token_T* lexer_advance_utf8_character(lexer_T* lexer, const token_type_T static token_T* lexer_match_and_advance(lexer_T* lexer, hb_string_T value, const token_type_T type) { hb_string_T remaining_source = hb_string_slice(lexer->source, lexer->current_position); - if (hb_string_starts_with(remaining_source, value)) { - // TODO(Tim): Fix string - return lexer_advance_with(lexer, value.data, type); - } + if (hb_string_starts_with(remaining_source, value)) { return lexer_advance_with(lexer, value, type); } return NULL; } @@ -260,11 +258,17 @@ static token_T* lexer_parse_erb_content(lexer_T* lexer) { static token_T* lexer_parse_erb_close(lexer_T* lexer) { lexer->state = STATE_DATA; - if (lexer_peek_erb_percent_close_tag(lexer, 0)) { return lexer_advance_with(lexer, "%%>", TOKEN_ERB_END); } - if (lexer_peek_erb_equals_close_tag(lexer, 0)) { return lexer_advance_with(lexer, "=%>", TOKEN_ERB_END); } - if (lexer_peek_erb_dash_close_tag(lexer, 0)) { return lexer_advance_with(lexer, "-%>", TOKEN_ERB_END); } + if (lexer_peek_erb_percent_close_tag(lexer, 0)) { + return lexer_advance_with(lexer, hb_string_from_c_string("%%>"), TOKEN_ERB_END); + } + if (lexer_peek_erb_equals_close_tag(lexer, 0)) { + return lexer_advance_with(lexer, hb_string_from_c_string("=%>"), TOKEN_ERB_END); + } + if (lexer_peek_erb_dash_close_tag(lexer, 0)) { + return lexer_advance_with(lexer, hb_string_from_c_string("-%>"), TOKEN_ERB_END); + } - return lexer_advance_with(lexer, "%>", TOKEN_ERB_END); + return lexer_advance_with(lexer, hb_string_from_c_string("%>"), TOKEN_ERB_END); } // ===== Tokenizing Function @@ -307,11 +311,11 @@ token_T* lexer_next_token(lexer_T* lexer) { if (isalnum(lexer_peek(lexer, 1))) { return lexer_advance_current(lexer, TOKEN_HTML_TAG_START); } if (lexer_peek_for_html_comment_start(lexer, 0)) { - return lexer_advance_with(lexer, ""), TOKEN_HTML_COMMENT_END); + token_T* token = lexer_match_and_advance(lexer, hb_string("-->"), TOKEN_HTML_COMMENT_END); return token ? token : lexer_advance_current(lexer, TOKEN_DASH); } case ']': { - token_T* token = lexer_match_and_advance(lexer, hb_string_from_c_string("]]>"), TOKEN_CDATA_END); + token_T* token = lexer_match_and_advance(lexer, hb_string("]]>"), TOKEN_CDATA_END); return token ? token : lexer_advance_current(lexer, TOKEN_CHARACTER); } From 70c32cd7d6fba2e1d6b7763fae6fe5f9698f6c9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20K=C3=A4chele?= Date: Mon, 27 Oct 2025 18:59:13 +0100 Subject: [PATCH 23/33] Fix hb_string_from_c_string usages in extract.c --- src/extract.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/extract.c b/src/extract.c index 7ff547da9..f0afde2f3 100644 --- a/src/extract.c +++ b/src/extract.c @@ -21,9 +21,9 @@ void herb_extract_ruby_to_buffer_with_semicolons(const char* source, hb_buffer_T } case TOKEN_ERB_START: { - if (hb_string_equals(token->value, hb_string_from_c_string("<%#")) - || hb_string_equals(token->value, hb_string_from_c_string("<%%")) - || hb_string_equals(token->value, hb_string_from_c_string("<%%="))) { + if (hb_string_equals(token->value, hb_string("<%#")) + || hb_string_equals(token->value, hb_string("<%%")) + || hb_string_equals(token->value, hb_string("<%%="))) { skip_erb_content = true; } @@ -73,9 +73,9 @@ void herb_extract_ruby_to_buffer(const char* source, hb_buffer_T* output) { } case TOKEN_ERB_START: { - if (hb_string_equals(token->value, hb_string_from_c_string("<%#")) - || hb_string_equals(token->value, hb_string_from_c_string("<%%")) - || hb_string_equals(token->value, hb_string_from_c_string("<%%="))) { + if (hb_string_equals(token->value, hb_string("<%#")) + || hb_string_equals(token->value, hb_string("<%%")) + || hb_string_equals(token->value, hb_string("<%%="))) { skip_erb_content = true; } From ca1fa0bb616989883b8c4f4fbcc234c8ed030d1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20K=C3=A4chele?= Date: Mon, 27 Oct 2025 18:59:54 +0100 Subject: [PATCH 24/33] Fix hb_string_from_c_string usages in parser.c --- src/parser.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/parser.c b/src/parser.c index de58cc58d..c76002c79 100644 --- a/src/parser.c +++ b/src/parser.c @@ -609,7 +609,7 @@ static AST_HTML_ATTRIBUTE_NODE_T* parser_parse_html_attribute(parser_T* parser) token_T* equals_with_whitespace = calloc(1, sizeof(token_T)); equals_with_whitespace->type = TOKEN_EQUALS; // TODO(Tim): This is a leak - equals_with_whitespace->value = hb_string_from_c_string(equals_buffer.value); + equals_with_whitespace->value = hb_string(equals_buffer.value); equals_with_whitespace->location = (location_T) { .start = equals_start, .end = equals_end }; equals_with_whitespace->range = (range_T) { .from = range_start, .to = range_end }; @@ -719,7 +719,7 @@ static bool parser_lookahead_erb_is_attribute(lexer_T* lexer) { static void parser_handle_erb_in_open_tag(parser_T* parser, hb_array_T* children) { bool is_output_tag = !hb_string_is_empty(parser->current_token->value) - && hb_string_starts_with(parser->current_token->value, hb_string_from_c_string("<%=")); + && hb_string_starts_with(parser->current_token->value, hb_string("<%=")); if (!is_output_tag) { hb_array_append(children, parser_parse_erb_tag(parser)); From 78d0f7899d71983ebd0174840670cf08f0065c26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20K=C3=A4chele?= Date: Tue, 28 Oct 2025 07:28:46 +0100 Subject: [PATCH 25/33] WIP: Use hb_string in token_type_to_string --- src/include/token.h | 4 +- src/token.c | 95 +++++++++++++++++++++------------------------ 2 files changed, 47 insertions(+), 52 deletions(-) diff --git a/src/include/token.h b/src/include/token.h index 7c8dd8778..2f4c0d29d 100644 --- a/src/include/token.h +++ b/src/include/token.h @@ -8,9 +8,9 @@ token_T* token_init(hb_string_T value, token_type_T type, lexer_T* lexer); char* token_to_string(const token_T* token); -const char* token_type_to_string(token_type_T type); +hb_string_T token_type_to_string(token_type_T type); -char* token_value(const token_T* token); +hb_string_T token_value(const token_T* token); int token_type(const token_T* token); size_t token_sizeof(void); diff --git a/src/token.c b/src/token.c index 1a3d5aa50..50ffeec91 100644 --- a/src/token.c +++ b/src/token.c @@ -43,53 +43,53 @@ token_T* token_init(hb_string_T value, const token_type_T type, lexer_T* lexer) return token; } -const char* token_type_to_string(const token_type_T type) { +hb_string_T token_type_to_string(const token_type_T type) { switch (type) { - case TOKEN_WHITESPACE: return "TOKEN_WHITESPACE"; - case TOKEN_NBSP: return "TOKEN_NBSP"; - case TOKEN_NEWLINE: return "TOKEN_NEWLINE"; - case TOKEN_IDENTIFIER: return "TOKEN_IDENTIFIER"; - case TOKEN_HTML_DOCTYPE: return "TOKEN_HTML_DOCTYPE"; - case TOKEN_XML_DECLARATION: return "TOKEN_XML_DECLARATION"; - case TOKEN_XML_DECLARATION_END: return "TOKEN_XML_DECLARATION_END"; - case TOKEN_CDATA_START: return "TOKEN_CDATA_START"; - case TOKEN_CDATA_END: return "TOKEN_CDATA_END"; - case TOKEN_HTML_TAG_START: return "TOKEN_HTML_TAG_START"; - case TOKEN_HTML_TAG_END: return "TOKEN_HTML_TAG_END"; - case TOKEN_HTML_TAG_START_CLOSE: return "TOKEN_HTML_TAG_START_CLOSE"; - case TOKEN_HTML_TAG_SELF_CLOSE: return "TOKEN_HTML_TAG_SELF_CLOSE"; - case TOKEN_HTML_COMMENT_START: return "TOKEN_HTML_COMMENT_START"; - case TOKEN_HTML_COMMENT_END: return "TOKEN_HTML_COMMENT_END"; - case TOKEN_EQUALS: return "TOKEN_EQUALS"; - case TOKEN_QUOTE: return "TOKEN_QUOTE"; - case TOKEN_BACKTICK: return "TOKEN_BACKTICK"; - case TOKEN_BACKSLASH: return "TOKEN_BACKSLASH"; - case TOKEN_DASH: return "TOKEN_DASH"; - case TOKEN_UNDERSCORE: return "TOKEN_UNDERSCORE"; - case TOKEN_EXCLAMATION: return "TOKEN_EXCLAMATION"; - case TOKEN_SLASH: return "TOKEN_SLASH"; - case TOKEN_SEMICOLON: return "TOKEN_SEMICOLON"; - case TOKEN_COLON: return "TOKEN_COLON"; - case TOKEN_AT: return "TOKEN_AT"; - case TOKEN_LT: return "TOKEN_LT"; - case TOKEN_PERCENT: return "TOKEN_PERCENT"; - case TOKEN_AMPERSAND: return "TOKEN_AMPERSAND"; - case TOKEN_ERB_START: return "TOKEN_ERB_START"; - case TOKEN_ERB_CONTENT: return "TOKEN_ERB_CONTENT"; - case TOKEN_ERB_END: return "TOKEN_ERB_END"; - case TOKEN_CHARACTER: return "TOKEN_CHARACTER"; - case TOKEN_ERROR: return "TOKEN_ERROR"; - case TOKEN_EOF: return "TOKEN_EOF"; + case TOKEN_WHITESPACE: return hb_string("TOKEN_WHITESPACE"); + case TOKEN_NBSP: return hb_string("TOKEN_NBSP"); + case TOKEN_NEWLINE: return hb_string("TOKEN_NEWLINE"); + case TOKEN_IDENTIFIER: return hb_string("TOKEN_IDENTIFIER"); + case TOKEN_HTML_DOCTYPE: return hb_string("TOKEN_HTML_DOCTYPE"); + case TOKEN_XML_DECLARATION: return hb_string("TOKEN_XML_DECLARATION"); + case TOKEN_XML_DECLARATION_END: return hb_string("TOKEN_XML_DECLARATION_END"); + case TOKEN_CDATA_START: return hb_string("TOKEN_CDATA_START"); + case TOKEN_CDATA_END: return hb_string("TOKEN_CDATA_END"); + case TOKEN_HTML_TAG_START: return hb_string("TOKEN_HTML_TAG_START"); + case TOKEN_HTML_TAG_END: return hb_string("TOKEN_HTML_TAG_END"); + case TOKEN_HTML_TAG_START_CLOSE: return hb_string("TOKEN_HTML_TAG_START_CLOSE"); + case TOKEN_HTML_TAG_SELF_CLOSE: return hb_string("TOKEN_HTML_TAG_SELF_CLOSE"); + case TOKEN_HTML_COMMENT_START: return hb_string("TOKEN_HTML_COMMENT_START"); + case TOKEN_HTML_COMMENT_END: return hb_string("TOKEN_HTML_COMMENT_END"); + case TOKEN_EQUALS: return hb_string("TOKEN_EQUALS"); + case TOKEN_QUOTE: return hb_string("TOKEN_QUOTE"); + case TOKEN_BACKTICK: return hb_string("TOKEN_BACKTICK"); + case TOKEN_BACKSLASH: return hb_string("TOKEN_BACKSLASH"); + case TOKEN_DASH: return hb_string("TOKEN_DASH"); + case TOKEN_UNDERSCORE: return hb_string("TOKEN_UNDERSCORE"); + case TOKEN_EXCLAMATION: return hb_string("TOKEN_EXCLAMATION"); + case TOKEN_SLASH: return hb_string("TOKEN_SLASH"); + case TOKEN_SEMICOLON: return hb_string("TOKEN_SEMICOLON"); + case TOKEN_COLON: return hb_string("TOKEN_COLON"); + case TOKEN_AT: return hb_string("TOKEN_AT"); + case TOKEN_LT: return hb_string("TOKEN_LT"); + case TOKEN_PERCENT: return hb_string("TOKEN_PERCENT"); + case TOKEN_AMPERSAND: return hb_string("TOKEN_AMPERSAND"); + case TOKEN_ERB_START: return hb_string("TOKEN_ERB_START"); + case TOKEN_ERB_CONTENT: return hb_string("TOKEN_ERB_CONTENT"); + case TOKEN_ERB_END: return hb_string("TOKEN_ERB_END"); + case TOKEN_CHARACTER: return hb_string("TOKEN_CHARACTER"); + case TOKEN_ERROR: return hb_string("TOKEN_ERROR"); + case TOKEN_EOF: return hb_string("TOKEN_EOF"); } - return "Unknown token_type_T"; + return hb_string("Unknown token_type_T"); } char* token_to_string(const token_T* token) { - const char* type_string = token_type_to_string(token->type); - const char* template = "#"; + hb_string_T type_string = token_type_to_string(token->type); + hb_string_T template = hb_string("#"); - char* string = calloc(strlen(type_string) + strlen(template) + token->value.length + 16, sizeof(char)); + char* string = calloc(template.length + type_string.length + token->value.length + 16, sizeof(char)); hb_string_T escaped; if (token->type == TOKEN_EOF) { @@ -100,8 +100,9 @@ char* token_to_string(const token_T* token) { sprintf( string, - template, - type_string, + template.data, + type_string.length, + type_string.data, escaped.length, escaped.data, token->range.from, @@ -117,12 +118,8 @@ char* token_to_string(const token_T* token) { return string; } -char* token_value(const token_T* token) { - hb_buffer_T buffer; - hb_buffer_init(&buffer, token->value.length); - hb_buffer_append_string(&buffer, token->value); - - return buffer.value; +hb_string_T token_value(const token_T* token) { + return token->value; } int token_type(const token_T* token) { @@ -149,7 +146,5 @@ token_T* token_copy(token_T* token) { void token_free(token_T* token) { if (!token) { return; } - // if (token->value != NULL) { free(token->value); } - free(token); } From 6c4814d0b257415ad27ba4d83f7500b1a7c415e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20K=C3=A4chele?= Date: Tue, 28 Oct 2025 07:39:18 +0100 Subject: [PATCH 26/33] Use hb_string_T in errors template --- templates/src/errors.c.erb | 5 +---- templates/template.rb | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/templates/src/errors.c.erb b/templates/src/errors.c.erb index b61edee00..6595914e4 100644 --- a/templates/src/errors.c.erb +++ b/templates/src/errors.c.erb @@ -42,10 +42,7 @@ void error_init(ERROR_T* error, const error_type_T type, position_T start, posit if (message) { <%- error.message_arguments.each_with_index do |argument, i| -%> <%- if error.message_template.scan(/%[sdulfz]/)[i] == "%s" -%> - char truncated_argument_<%= i %>[ERROR_MESSAGES_TRUNCATED_LENGTH + 1]; - strncpy(truncated_argument_<%= i %>, <%= argument %>, ERROR_MESSAGES_TRUNCATED_LENGTH); - truncated_argument_<%= i %>[ERROR_MESSAGES_TRUNCATED_LENGTH] = '\0'; - + hb_string_T truncated_argument_<%= i %> = hb_string_truncate(<%= argument %>, ERROR_MESSAGES_TRUNCATED_LENGTH); <%- end -%> <%- end -%> snprintf( diff --git a/templates/template.rb b/templates/template.rb index 749c970db..a651afc22 100755 --- a/templates/template.rb +++ b/templates/template.rb @@ -103,7 +103,7 @@ def ruby_type end def c_type - "const char*" + "hb_string_T" end end From a255eeba0877094d221b5a353694b0a24cc648dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20K=C3=A4chele?= Date: Tue, 28 Oct 2025 07:50:54 +0100 Subject: [PATCH 27/33] Adapt errors call sites to use hb_string_T --- src/include/parser_helpers.h | 4 ++-- src/parser.c | 38 ++++++++++++++++++------------------ src/parser_helpers.c | 9 ++++----- src/prism_helpers.c | 6 +++--- 4 files changed, 28 insertions(+), 29 deletions(-) diff --git a/src/include/parser_helpers.h b/src/include/parser_helpers.h index b3ab98300..ecdd6208a 100644 --- a/src/include/parser_helpers.h +++ b/src/include/parser_helpers.h @@ -15,8 +15,8 @@ token_T* parser_pop_open_tag(const parser_T* parser); void parser_append_unexpected_error( parser_T* parser, - const char* description, - const char* expected, + hb_string_T description, + hb_string_T expected, hb_array_T* errors ); void parser_append_unexpected_token_error(parser_T* parser, token_type_T expected_type, hb_array_T* errors); diff --git a/src/parser.c b/src/parser.c index c76002c79..94f9a48af 100644 --- a/src/parser.c +++ b/src/parser.c @@ -243,8 +243,8 @@ static AST_HTML_TEXT_NODE_T* parser_parse_text_content(parser_T* parser, hb_arra token_T* token = parser_consume_expected(parser, TOKEN_ERROR, document_errors); append_unexpected_error( - "Token Error", - "not TOKEN_ERROR", + hb_string("Token Error"), + hb_string("not TOKEN_ERROR"), token->value, token->location.start, token->location.end, @@ -267,9 +267,9 @@ static AST_HTML_TEXT_NODE_T* parser_parse_text_content(parser_T* parser, hb_arra if (hb_buffer_length(&content) > 0) { text_node = - ast_html_text_node_init(hb_buffer_value(&content), start, parser->current_token->location.start, errors); + ast_html_text_node_init(hb_string(content.value), start, parser->current_token->location.start, errors); } else { - text_node = ast_html_text_node_init("", start, parser->current_token->location.start, errors); + text_node = ast_html_text_node_init(hb_string(""), start, parser->current_token->location.start, errors); } free(content.value); @@ -394,8 +394,8 @@ static AST_HTML_ATTRIBUTE_VALUE_NODE_T* parser_parse_quoted_html_attribute_value if (token_is(parser, TOKEN_IDENTIFIER) || token_is(parser, TOKEN_CHARACTER)) { append_unexpected_error( - "Unescaped quote character in attribute value", - "escaped quote (\\') or different quote style (\")", + hb_string("Unescaped quote character in attribute value"), + hb_string("escaped quote (\\') or different quote style (\")"), opening_quote->value, potential_closing->location.start, potential_closing->location.end, @@ -518,9 +518,9 @@ static AST_HTML_ATTRIBUTE_VALUE_NODE_T* parser_parse_html_attribute_value(parser position_T end = token->location.end; append_unexpected_error( - "Invalid quote character for HTML attribute", - "single quote (') or double quote (\")", - "backtick (`)", + hb_string("Invalid quote character for HTML attribute"), + hb_string("single quote (') or double quote (\")"), + hb_string("backtick (`)"), start, end, errors @@ -535,8 +535,8 @@ static AST_HTML_ATTRIBUTE_VALUE_NODE_T* parser_parse_html_attribute_value(parser } append_unexpected_error( - "Unexpected Token", - "TOKEN_IDENTIFIER, TOKEN_QUOTE, TOKEN_ERB_START", + hb_string("Unexpected Token"), + hb_string("TOKEN_IDENTIFIER, TOKEN_QUOTE, TOKEN_ERB_START"), token_type_to_string(parser->current_token->type), parser->current_token->location.start, parser->current_token->location.end, @@ -799,8 +799,8 @@ static AST_HTML_OPEN_TAG_NODE_T* parser_parse_html_open_tag(parser_T* parser) { parser_append_unexpected_error( parser, - "Unexpected Token", - "TOKEN_IDENTIFIER, TOKEN_AT, TOKEN_ERB_START,TOKEN_WHITESPACE, or TOKEN_NEWLINE", + hb_string("Unexpected Token"), + hb_string("TOKEN_IDENTIFIER, TOKEN_AT, TOKEN_ERB_START,TOKEN_WHITESPACE, or TOKEN_NEWLINE"), errors ); } @@ -863,8 +863,8 @@ static AST_HTML_CLOSE_TAG_NODE_T* parser_parse_html_close_tag(parser_T* parser) append_void_element_closing_tag_error( tag_name, - expected.data, - got.data, + expected, + got, tag_opening->location.start, tag_closing->location.end, errors @@ -974,7 +974,7 @@ static AST_HTML_ELEMENT_NODE_T* parser_parse_html_element(parser_T* parser) { hb_array_T* errors = hb_array_init(8); - parser_append_unexpected_error(parser, "Unknown HTML open tag type", "HTMLOpenTag or HTMLSelfCloseTag", errors); + parser_append_unexpected_error(parser, hb_string("Unknown HTML open tag type"), hb_string("HTMLOpenTag or HTMLSelfCloseTag"), errors); return ast_html_element_node_init( open_tag, @@ -1134,9 +1134,9 @@ static void parser_parse_in_data_state(parser_T* parser, hb_array_T* children, h parser_append_unexpected_error( parser, - "Unexpected token", - "TOKEN_ERB_START, TOKEN_HTML_DOCTYPE, TOKEN_HTML_COMMENT_START, TOKEN_IDENTIFIER, TOKEN_WHITESPACE, " - "TOKEN_NBSP, TOKEN_AT, TOKEN_BACKSLASH, or TOKEN_NEWLINE", + hb_string("Unexpected token"), + hb_string("TOKEN_ERB_START, TOKEN_HTML_DOCTYPE, TOKEN_HTML_COMMENT_START, TOKEN_IDENTIFIER, TOKEN_WHITESPACE, " + "TOKEN_NBSP, TOKEN_AT, TOKEN_BACKSLASH, or TOKEN_NEWLINE"), errors ); } diff --git a/src/parser_helpers.c b/src/parser_helpers.c index a889a678e..3da3bc3b5 100644 --- a/src/parser_helpers.c +++ b/src/parser_helpers.c @@ -94,8 +94,8 @@ void parser_exit_foreign_content(parser_T* parser) { void parser_append_unexpected_error( parser_T* parser, - const char* description, - const char* expected, + hb_string_T description, + hb_string_T expected, hb_array_T* errors ) { token_T* token = parser_advance(parser); @@ -128,10 +128,9 @@ void parser_append_literal_node_from_buffer( hb_array_T* children, position_T start ) { - if (hb_buffer_length(buffer) == 0) { return; } - + if (buffer->length == 0) { return; } AST_LITERAL_NODE_T* literal = - ast_literal_node_init(hb_buffer_value(buffer), start, parser->current_token->location.start, NULL); + ast_literal_node_init(hb_string(buffer->value), start, parser->current_token->location.start, NULL); if (children != NULL) { hb_array_append(children, literal); } hb_buffer_clear(buffer); diff --git a/src/prism_helpers.c b/src/prism_helpers.c index 06ac155b5..22cef65c6 100644 --- a/src/prism_helpers.c +++ b/src/prism_helpers.c @@ -44,9 +44,9 @@ RUBY_PARSE_ERROR_T* ruby_parse_error_from_prism_error( position_T end = position_from_source_with_offset(source, end_offset); return ruby_parse_error_init( - error->message, - pm_diagnostic_id_human(error->diag_id), - pm_error_level_to_string(error->level), + hb_string(error->message), + hb_string(pm_diagnostic_id_human(error->diag_id)), + hb_string(pm_error_level_to_string(error->level)), start, end ); From 29d570409ff0f1f493c2ba73bc53260d5059d2ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20K=C3=A4chele?= Date: Tue, 28 Oct 2025 08:08:35 +0100 Subject: [PATCH 28/33] Stop freeing strings --- templates/src/errors.c.erb | 1 - 1 file changed, 1 deletion(-) diff --git a/templates/src/errors.c.erb b/templates/src/errors.c.erb index 6595914e4..107a9ac86 100644 --- a/templates/src/errors.c.erb +++ b/templates/src/errors.c.erb @@ -134,7 +134,6 @@ static void error_free_<%= error.human %>(<%= error.struct_type %>* <%= error.hu <%- when Herb::Template::SizeTField -%> // size_t is part of struct <%- when Herb::Template::StringField -%> - if (<%= error.human %>-><%= field.name %> != NULL) { free((char*) <%= error.human %>-><%= field.name %>); } <%- else -%> <%= field.inspect %> <%- end -%> From efe44253d821c9a5caae8f15de07e5cd6b7dbfbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20K=C3=A4chele?= Date: Thu, 30 Oct 2025 20:57:00 +0100 Subject: [PATCH 29/33] Revert to malloc in lexer error message --- src/lexer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lexer.c b/src/lexer.c index 89c6b40f3..aa825257b 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -58,7 +58,7 @@ void lexer_init(lexer_T* lexer, const char* source) { token_T* lexer_error(lexer_T* lexer, const char* message) { size_t error_message_length = 128; - char* error_message = hb_arena_alloc(lexer->allocator, sizeof(char) * error_message_length); + char* error_message = malloc(sizeof(char) * error_message_length); // hb_arena_alloc(lexer->allocator, sizeof(char) * error_message_length); snprintf( error_message, From e62ab747353d5bee27fd59f5c65b08987cb2b474 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20K=C3=A4chele?= Date: Thu, 30 Oct 2025 20:57:08 +0100 Subject: [PATCH 30/33] Fix test_token.c --- test/c/test_token.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/c/test_token.c b/test/c/test_token.c index 69335ad9c..529ddfcb2 100644 --- a/test/c/test_token.c +++ b/test/c/test_token.c @@ -4,7 +4,7 @@ #include "../../src/include/token.h" TEST(test_token) - ck_assert_str_eq(token_type_to_string(TOKEN_IDENTIFIER), "TOKEN_IDENTIFIER"); + ck_assert(hb_string_equals(token_type_to_string(TOKEN_IDENTIFIER), hb_string("TOKEN_IDENTIFIER"))); END TEST(test_token_to_string) From a569e8838ad00fa62b5b37e648a7e30b43f8a7fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20K=C3=A4chele?= Date: Thu, 30 Oct 2025 20:57:19 +0100 Subject: [PATCH 31/33] Fix issues in generated code --- src/ast_node.c | 2 +- templates/src/ast_nodes.c.erb | 3 +-- templates/src/ast_pretty_print.c.erb | 2 +- templates/src/errors.c.erb | 6 +++--- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/ast_node.c b/src/ast_node.c index 976abfec8..4058c06e2 100644 --- a/src/ast_node.c +++ b/src/ast_node.c @@ -31,7 +31,7 @@ AST_LITERAL_NODE_T* ast_literal_node_init_from_token(const token_T* token) { ast_node_init(&literal->base, AST_LITERAL_NODE, token->location.start, token->location.end, NULL); - literal->content = NULL; // herb_strdup(token->value); + literal->content = token->value; return literal; } diff --git a/templates/src/ast_nodes.c.erb b/templates/src/ast_nodes.c.erb index 19cef5ef8..c00442570 100644 --- a/templates/src/ast_nodes.c.erb +++ b/templates/src/ast_nodes.c.erb @@ -35,7 +35,7 @@ <%- when Herb::Template::PrismNodeField -%> <%= node.human %>-><%= field.name %> = <%= field.name %>; <%- when Herb::Template::StringField -%> - <%= node.human %>-><%= field.name %> = herb_strdup(<%= field.name %>); + <%= node.human %>-><%= field.name %> = <%= field.name %>; <%- when Herb::Template::AnalyzedRubyField -%> <%= node.human %>-><%= field.name %> = <%= field.name %>; <%- when Herb::Template::VoidPointerField -%> @@ -107,7 +107,6 @@ static void ast_free_<%= node.human %>(<%= node.struct_type %>* <%= node.human % hb_array_free(&<%= node.human %>-><%= field.name %>); } <%- when Herb::Template::StringField -%> - if (<%= node.human %>-><%= field.name %> != NULL) { free((char*) <%= node.human %>-><%= field.name %>); } <%- when Herb::Template::PrismNodeField -%> if (<%= node.human %>-><%= field.name %> != NULL) { // The first argument to `pm_node_destroy` is a `pm_parser_t`, but it's currently unused: diff --git a/templates/src/ast_pretty_print.c.erb b/templates/src/ast_pretty_print.c.erb index f245e0224..b4be01ff5 100644 --- a/templates/src/ast_pretty_print.c.erb +++ b/templates/src/ast_pretty_print.c.erb @@ -41,7 +41,7 @@ void ast_pretty_print_node(AST_NODE_T* node, const size_t indent, const size_t r <%- when Herb::Template::ElementSourceField -%> pretty_print_string_property(element_source_to_string(<%= node.human %>-><%= field.name %>), hb_string("<%= field.name %>"), indent, relative_indent, <%= last %>, buffer); <%- when Herb::Template::StringField -%> - pretty_print_string_property(hb_string(<%= node.human %>-><%= field.name %>), hb_string("<%= field.name %>"), indent, relative_indent, <%= last %>, buffer); + pretty_print_string_property(<%= node.human %>-><%= field.name %>, hb_string("<%= field.name %>"), indent, relative_indent, <%= last %>, buffer); <%- when Herb::Template::PrismNodeField -%> pretty_print_string_property(hb_string("<%= field.name %>"), hb_string("<%= field.name %>"), indent, relative_indent, <%= last %>, buffer); <%- when Herb::Template::NodeField -%> diff --git a/templates/src/errors.c.erb b/templates/src/errors.c.erb index 107a9ac86..68ccdd840 100644 --- a/templates/src/errors.c.erb +++ b/templates/src/errors.c.erb @@ -78,7 +78,7 @@ void error_init(ERROR_T* error, const error_type_T type, position_T start, posit <%- when Herb::Template::SizeTField -%> <%= error.human %>-><%= field.name %> = <%= field.name %>; <%- when Herb::Template::StringField -%> - <%= error.human %>-><%= field.name %> = herb_strdup(<%= field.name %>); + <%= error.human %>-><%= field.name %> = <%= field.name %>; <%- else -%> <%= field.inspect %> <%- end -%> @@ -216,11 +216,11 @@ static void error_pretty_print_<%= error.human %>(<%= error.struct_type %>* erro <%- when Herb::Template::TokenField -%> pretty_print_token_property(error-><%= field.name %>, hb_string("<%= field.name %>"), indent, relative_indent, <%= error.fields.length - 1 == index %>, buffer); <%- when Herb::Template::TokenTypeField -%> - pretty_print_property(hb_string(token_type_to_string(error-><%= field.name %>)), hb_string("<%= field.name %>"), indent, relative_indent, <%= error.fields.length - 1 == index %>, buffer); + pretty_print_property(token_type_to_string(error-><%= field.name %>), hb_string("<%= field.name %>"), indent, relative_indent, <%= error.fields.length - 1 == index %>, buffer); <%- when Herb::Template::SizeTField -%> pretty_print_size_t_property(hb_string(error-><%= field.name %>), hb_string("<%= field.name %>"), indent, relative_indent, <%= error.fields.length - 1 == index %>, buffer); <%- when Herb::Template::StringField -%> - pretty_print_quoted_property(hb_string("<%= field.name %>"), hb_string(error-><%= field.name %>), indent, relative_indent, <%= error.fields.length - 1 == index %>, buffer); + pretty_print_quoted_property(hb_string("<%= field.name %>"), error-><%= field.name %>, indent, relative_indent, <%= error.fields.length - 1 == index %>, buffer); <%- else -%> <%= field.inspect %> <%- end -%> From 93d10abf9a48032dad32270950663c562b70cc7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20K=C3=A4chele?= Date: Thu, 30 Oct 2025 21:14:04 +0100 Subject: [PATCH 32/33] Return uint32_t in utf8_char_byte_length --- src/include/utf8.h | 2 +- src/utf8.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/include/utf8.h b/src/include/utf8.h index 4c2d6c5da..2ba0b44e0 100644 --- a/src/include/utf8.h +++ b/src/include/utf8.h @@ -5,7 +5,7 @@ #include #include -int utf8_char_byte_length(unsigned char first_byte); +uint32_t utf8_char_byte_length(unsigned char first_byte); uint32_t utf8_sequence_length(hb_string_T value); bool utf8_is_valid_continuation_byte(unsigned char byte); diff --git a/src/utf8.c b/src/utf8.c index f24e7015d..79a8aa5e0 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -1,12 +1,13 @@ #include "include/utf8.h" #include "include/util/hb_string.h" +#include // UTF-8 byte patterns: // 0xxxxxxx = 1 byte (ASCII) // 110xxxxx = 2 bytes // 1110xxxx = 3 bytes // 11110xxx = 4 bytes -int utf8_char_byte_length(unsigned char first_byte) { +uint32_t utf8_char_byte_length(unsigned char first_byte) { if ((first_byte & 0x80) == 0) { return 1; } else if ((first_byte & 0xE0) == 0xC0) { @@ -28,13 +29,13 @@ bool utf8_is_valid_continuation_byte(unsigned char byte) { uint32_t utf8_sequence_length(hb_string_T value) { if (hb_string_is_empty(value)) { return 0; } - int expected_length = utf8_char_byte_length(value.data[0]); + uint32_t expected_length = utf8_char_byte_length(value.data[0]); if (value.length - expected_length < expected_length) { return 1; // Not enough bytes, treat as single byte } if (expected_length > 1) { - for (int i = 1; i < expected_length; i++) { + for (uint32_t i = 1; i < expected_length; i++) { if (!utf8_is_valid_continuation_byte((unsigned char) value.data[i])) { return 1; // Invalid continuation byte, treat first byte as single byte } From 1039634b10070a0d6c9d9375a459d28bee4fd340 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20K=C3=A4chele?= Date: Thu, 30 Oct 2025 21:23:34 +0100 Subject: [PATCH 33/33] Use truncate method in lexer --- src/lexer.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/lexer.c b/src/lexer.c index aa825257b..21234dc8c 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -119,8 +119,11 @@ static token_T* lexer_advance_with_next(lexer_T* lexer, size_t count, token_type uint32_t end_position = lexer->current_position; + hb_string_T value = hb_string_slice(lexer->source, start_position); + value = hb_string_truncate(value, end_position - start_position); + token_T* token = token_init( - (hb_string_T) { .data = lexer->source.data + start_position, .length = end_position - start_position }, + value, type, lexer ); @@ -140,8 +143,8 @@ static token_T* lexer_advance_utf8_character(lexer_T* lexer, const token_type_T size_t start_position = lexer->current_position; lexer_advance_utf8_bytes(lexer, char_byte_length); - hb_string_T utf8_char = hb_string_slice(lexer->source, lexer->current_position); - utf8_char.length = MIN(char_byte_length, utf8_char.length); + hb_string_T utf8_char = hb_string_slice(lexer->source, start_position); + utf8_char = hb_string_truncate(utf8_char, char_byte_length); return token_init(utf8_char, type, lexer); } @@ -164,7 +167,7 @@ static token_T* lexer_parse_whitespace(lexer_T* lexer) { uint32_t end_position = lexer->current_position; hb_string_T value = hb_string_slice(lexer->source, start_position); - value.length = end_position - start_position; + value = hb_string_truncate(value, end_position - start_position); token_T* token = token_init(value, TOKEN_WHITESPACE, lexer); @@ -182,6 +185,7 @@ static token_T* lexer_parse_identifier(lexer_T* lexer) { uint32_t end_position = lexer->current_position; hb_string_T value = hb_string_slice(lexer->source, start_position); + value = hb_string_truncate(value, end_position - start_position); value.length = end_position - start_position; token_T* token = token_init(value, TOKEN_IDENTIFIER, lexer); @@ -233,7 +237,7 @@ static token_T* lexer_parse_erb_content(lexer_T* lexer) { uint32_t end_position = lexer->current_position; hb_string_T value = hb_string_slice(lexer->source, start_position); - value.length = end_position - start_position; + value = hb_string_truncate(value, end_position - start_position); return token_init(value, TOKEN_ERB_CONTENT, lexer); }