Make string and char pointers comparable

2025-02-08 23:02:27 +01:00
parent 8a0f282714
commit d88bd652a4
15 changed files with 425 additions and 123 deletions
--- a/source.elna
+++ b/source.elna
@ -25,9 +25,14 @@ type
    first: Position;
 	last: Position
  end,
+  SourceCode = record
+    position: Position;
+	text: String
+  end,
  TokenValue* = union
    int_value: Int;
    string_value: pointer to Char;
+    string: String;
 	boolean_value: Bool;
 	char_value: Char
  end,
@ -179,11 +184,32 @@ begin
  return strncmp(this.ptr, that, length) = 0
 end

+(*
+  Returns the given string with its first `start` characters dropped: the
+  pointer is moved forward and the length is reduced by the same amount.
+  Nothing is copied; the result refers to the same memory as the argument.
+
+  A start greater than the string length is clamped to the length, so the
+  result is then an empty string located at the end of the text instead of
+  a string whose length has wrapped around or gone negative.
+*)
+proc open_substring(string: String, start: Word): String;
+begin
+  if start > string.length then
+    start := string.length
+  end;
+  string.ptr := string.ptr + start;
+  string.length := string.length - start;
+  return string
+end
+
+(*
+  Returns the character located `position` characters after the start of
+  the string. The position is not compared against the string length here,
+  so the caller has to make sure that it is in range.
+*)
+proc char_at(string: String, position: Word): Char;
+var
+  target: pointer to Char;
+begin
+  target := string.ptr + position;
+  return target^
+end
+
 (*
  End of standard procedures.
 *)

-proc read_source(filename: pointer to Char): pointer to Char;
+(*
+  Builds the position used as the starting point of a source text:
+  line 1, column 1.
+*)
+proc make_position(): Position;
+var
+  start: Position;
+begin
+  start.column := 1u;
+  start.line := 1u;
+  return start
+end
+
+proc read_source(filename: pointer to Char, result: pointer to String): Bool;
 var
  input_file: pointer to FILE,
  source_size: Int,
@ -192,26 +218,28 @@ begin
  input_file := fopen(filename, "rb\0".ptr);

  if input_file = nil then
-    return nil
+    return false
+  end;
+  defer
+    fclose(input_file)
  end;
  if fseek(input_file, 0, SEEK_END) <> 0 then
-    fclose(input_file);
-    return nil
+    return false
  end;
  source_size := ftell(input_file);
  if source_size < 0 then
-    fclose(input_file);
-    return nil
+    return false
  end;
  rewind(input_file);

-  input := calloc(source_size + 1, 1);
+  input := malloc(source_size);
  if fread(input, source_size, 1, input_file) <> 1u then
-    input := nil
+    return false
  end;
-  fclose(input_file);
+  result^.length := cast(source_size as Word);
+  result^.ptr := cast(input as pointer to Char);

-  return input
+  return true
 end

 proc escape_char(escape: Char, result: pointer to Char): Bool;
@ -257,12 +285,26 @@ begin
  end
 end

-proc skip_spaces(input: pointer to Char): pointer to Char;
+(*
+  Returns the source code with `length` characters removed from the front
+  of its text and the same amount added to the column of its position.
+  The line number is not changed, even if the removed characters contain
+  a line break.
+*)
+proc advance_source(source_code: SourceCode, length: Word): SourceCode;
 begin
-  while is_space(input^) do
-    input := input + 1
+  source_code.text := open_substring(source_code.text, length);
+  source_code.position.column := source_code.position.column + length;
+
+  return source_code
+end
+
+(*
+  Removes leading whitespace, as decided by is_space, from the source text
+  one character at a time. It stops at the first character that is not
+  whitespace or when no text is left.
+
+  A '\n' increments the line of the position and resets the column to 1;
+  any other whitespace character increments the column. The updated source
+  code is returned.
+*)
+proc skip_spaces(source_code: SourceCode): SourceCode;
+begin
+  while source_code.text.length > 0u and is_space(char_at(source_code.text, 0)) do
+    if char_at(source_code.text, 0) = '\n' then
+      source_code.position.line := source_code.position.line + 1u;
+      source_code.position.column := 1u
+	else
+	  source_code.position.column := source_code.position.column + 1u
+	end;
+    source_code.text := open_substring(source_code.text, 1u)
  end;
-  return input
+  return source_code
 end

 proc lex_identifier(input: pointer to Char): pointer to Char;
@ -273,19 +315,29 @@ begin
  return input
 end

-proc lex_comment(input: pointer to Char): pointer to Char;
+(*
+  Scans the body of a block comment. The caller is expected to have
+  consumed the opening marker already (assumption based on the call in
+  tokenize).
+
+  Characters are consumed from source_code^ until an asterisk directly
+  followed by a closing parenthesis is found. On success the terminator is
+  consumed as well, token_content^ receives a copy of the comment text
+  without the terminator, and true is returned. The copy is obtained with
+  malloc and is exactly token_content^.length bytes long, so it has no room
+  for a terminating NUL character.
+
+  If fewer than two characters remain before a terminator is seen, the
+  pointer and length of token_content^ are reset to nil and 0 and false is
+  returned; the characters scanned so far stay consumed.
+
+  A line break inside the comment increments the line of the position and
+  resets the column to 1, the same way skip_spaces does; every other
+  character advances the column by one.
+*)
+proc lex_comment(source_code: pointer to SourceCode, token_content: pointer to String): Bool;
 var
-  next: pointer to Char;
+  result: pointer to Char;
 begin
-  while input^ <> '\0' do
-    next := input + 1;
+  token_content^.ptr := source_code^.text.ptr;
+  token_content^.length := 0u;

-    if input^ = '*' and next^ = ')' then
-	  return next + 1
+  while source_code^.text.length > 1u do
+    if char_at(source_code^.text, 0) = '*' and char_at(source_code^.text, 1) = ')' then
+	  source_code^ := advance_source(source_code^, 2u);
+
+	  result := cast(malloc(token_content^.length) as pointer to Char);
+	  strncpy(result, token_content^.ptr, token_content^.length);
+	  token_content^.ptr := result;
+
+	  return true
 	end;
-	input := next
+	token_content^.length := token_content^.length + 1u;
+	if char_at(source_code^.text, 0) = '\n' then
+	  source_code^.text := open_substring(source_code^.text, 1u);
+	  source_code^.position.line := source_code^.position.line + 1u;
+	  source_code^.position.column := 1u
+	else
+	  source_code^ := advance_source(source_code^, 1)
+	end
  end;
-  return nil
+  token_content^.ptr := nil;
+  token_content^.length := 0u;
+  return false
 end

 proc lex_character(input: pointer to Char, current_token: pointer to Token): pointer to Char;
@ -458,7 +510,7 @@ begin
 	elsif current_token^.kind = TOKEN_AT then
 	  write_c('@')
 	elsif current_token^.kind = TOKEN_COMMENT then
-	  write_s("COMMENT")
+	  write_s("(* COMMENT *)")
 	elsif current_token^.kind = TOKEN_INTEGER then
 	  write_c('<');
      write_i(current_token^.value.int_value);
@ -562,154 +614,167 @@ begin
  return current_token
 end

-proc tokenize(input_pointer: pointer to Char, tokens_size: pointer to Word): pointer to Token;
+proc tokenize(source_code: SourceCode, tokens_size: pointer to Word): pointer to Token;
 var
  token_end: pointer to Char,
  tokens: pointer to Token,
  current_token: pointer to Token,
-  token_length: Word;
+  token_length: Word,
+  first_char: Char,
+  token_content: String;
 begin
  tokens_size^ := 0u;
  tokens := nil;
+  source_code := skip_spaces(source_code);

-  input_pointer := skip_spaces(input_pointer);
-
-  while input_pointer^ <> '\0' do
+  while source_code.text.length <> 0u do
 	tokens := cast(reallocarray(tokens, tokens_size^ + 1u, sizeof(Token)) as pointer to Token);
    current_token := tokens + tokens_size^;
+	first_char := char_at(source_code.text, 0);

-    if is_alpha(input_pointer^) or input_pointer^ = '_' then
-      token_end := lex_identifier(input_pointer + 1);
-	  token_length := cast(token_end as Word) - cast(input_pointer as Word);
+    if is_alpha(first_char) or first_char = '_' then
+      token_end := lex_identifier(source_code.text.ptr + 1);
+	  token_length := cast(token_end - source_code.text.ptr as Word);

-      current_token^ := categorize_identifier(input_pointer, token_length);
+      current_token^ := categorize_identifier(source_code.text.ptr, token_length);

-	  input_pointer := token_end
-	elsif is_digit(input_pointer^) then
+      source_code := advance_source(source_code, token_length)
+	elsif is_digit(first_char) then
 	  token_end := nil;
-	  current_token^.value.int_value := strtol(input_pointer, @token_end, 10);
+	  current_token^.value.int_value := strtol(source_code.text.ptr, @token_end, 10);
+	  token_length := cast(token_end - source_code.text.ptr as Word);

 	  if token_end^ = 'u' then
 	    current_token^.kind := TOKEN_WORD;
-	    input_pointer := token_end + 1
+        source_code := advance_source(source_code, token_length + 1u)
 	  else
 	    current_token^.kind := TOKEN_INTEGER;
-	    input_pointer := token_end
+        source_code := advance_source(source_code, token_length)
 	  end
-	elsif input_pointer^ = '(' then
-      input_pointer := input_pointer + 1;
-	  if input_pointer^ = '*' then
-	    token_end := lex_comment(input_pointer + 1);
+	elsif first_char = '(' then
+	  source_code := advance_source(source_code, 1u);

-		if token_end <> nil then
-	      token_length := cast(token_end as Word) - cast(input_pointer as Word);
-	      current_token^.value.string_value := cast(calloc(token_length + 1u, 1) as pointer to Char);
-	      strncpy(current_token^.value.string_value, input_pointer, token_length);
-		  current_token^.kind := TOKEN_COMMENT;
+	  if source_code.text.length = 0u then
+	    current_token^.kind := TOKEN_LEFT_PAREN
+	  elsif char_at(source_code.text, 0u) = '*' then
+	    source_code := advance_source(source_code, 1u);

-		  input_pointer := token_end
+		if lex_comment(@source_code, @token_content) then
+	      current_token^.value.string := token_content;
+		  current_token^.kind := TOKEN_COMMENT
 		else
 	      current_token^.kind := 0
 		end
 	  else
 	    current_token^.kind := TOKEN_LEFT_PAREN
 	  end
-	elsif input_pointer^ = ')' then
+	elsif first_char = ')' then
 	  current_token^.kind := TOKEN_RIGHT_PAREN;
-      input_pointer := input_pointer + 1
-	elsif input_pointer^ = '\'' then
-	  token_end := lex_character(input_pointer + 1, current_token);
+	  source_code := advance_source(source_code, 1u)
+	elsif first_char = '\'' then
+	  token_end := lex_character(source_code.text.ptr + 1, current_token);
+	  token_length := cast(token_end - source_code.text.ptr as Word);

 	  if token_end^ = '\'' then
 	  	current_token^.kind := TOKEN_CHARACTER;
-	    input_pointer := token_end + 1
+	    source_code := advance_source(source_code, token_length + 1u)
 	  else
-	    input_pointer := input_pointer + 1
+	    source_code := advance_source(source_code, 1u)
 	  end
-	elsif input_pointer^ = '"' then
-	  token_end := lex_string(input_pointer + 1, current_token);
+	elsif first_char = '"' then
+	  token_end := lex_string(source_code.text.ptr + 1, current_token);

      if token_end^ = '"' then
 		current_token^.kind := TOKEN_STRING;
-	    input_pointer := token_end + 1
+	    token_length := cast(token_end - source_code.text.ptr as Word);
+	    source_code := advance_source(source_code, token_length + 1u)
 	  end
-	elsif input_pointer^ = '[' then
+	elsif first_char = '[' then
 	  current_token^.kind := TOKEN_LEFT_SQUARE;
-      input_pointer := input_pointer + 1
-	elsif input_pointer^ = ']' then
+	  source_code := advance_source(source_code, 1u)
+	elsif first_char = ']' then
 	  current_token^.kind := TOKEN_RIGHT_SQUARE;
-      input_pointer := input_pointer + 1
-	elsif input_pointer^ = '>' then
-	  input_pointer := input_pointer + 1;
-	  if input_pointer^ = '=' then
+	  source_code := advance_source(source_code, 1u)
+	elsif first_char = '>' then
+	  source_code := advance_source(source_code, 1u);
+
+      if source_code.text.length = 0u then
+	    current_token^.kind := TOKEN_GREATER_THAN
+	  elsif char_at(source_code.text, 0) = '=' then
 	    current_token^.kind := TOKEN_GREATER_EQUAL;
-		input_pointer := input_pointer + 1
+	    source_code := advance_source(source_code, 1u)
 	  else
 	    current_token^.kind := TOKEN_GREATER_THAN
 	  end
-	elsif input_pointer^ = '<' then
-	  input_pointer := input_pointer + 1;
-	  if input_pointer^ = '=' then
+	elsif first_char = '<' then
+	  source_code := advance_source(source_code, 1u);
+
+	  if source_code.text.length = 0u then
+	    current_token^.kind := TOKEN_LESS_THAN
+	  elsif char_at(source_code.text, 0) = '=' then
 	    current_token^.kind := TOKEN_LESS_EQUAL;
-		input_pointer := input_pointer + 1
-	  elsif input_pointer^ = '>' then
+	    source_code := advance_source(source_code, 1u)
+	  elsif char_at(source_code.text, 0) = '>' then
 	    current_token^.kind := TOKEN_NOT_EQUAL;
-		input_pointer := input_pointer + 1
+	    source_code := advance_source(source_code, 1u)
 	  else
 	    current_token^.kind := TOKEN_LESS_THAN
 	  end
-	elsif input_pointer^ = '=' then
+	elsif first_char = '=' then
 	  current_token^.kind := TOKEN_EQUAL;
-	  input_pointer := input_pointer + 1
-	elsif input_pointer^ = ';' then
+	  source_code := advance_source(source_code, 1u)
+	elsif first_char = ';' then
 	  current_token^.kind := TOKEN_SEMICOLON;
-	  input_pointer := input_pointer + 1
-	elsif input_pointer^ = '.' then
+	  source_code := advance_source(source_code, 1u)
+	elsif first_char = '.' then
 	  current_token^.kind := TOKEN_DOT;
-	  input_pointer := input_pointer + 1
-	elsif input_pointer^ = ',' then
+	  source_code := advance_source(source_code, 1u)
+	elsif first_char = ',' then
 	  current_token^.kind := TOKEN_COMMA;
-	  input_pointer := input_pointer + 1
-	elsif input_pointer^ = '+' then
+	  source_code := advance_source(source_code, 1u)
+	elsif first_char = '+' then
 	  current_token^.kind := TOKEN_PLUS;
-	  input_pointer := input_pointer + 1
-	elsif input_pointer^ = '-' then
+	  source_code := advance_source(source_code, 1u)
+	elsif first_char = '-' then
 	  current_token^.kind := TOKEN_MINUS;
-	  input_pointer := input_pointer + 1
-	elsif input_pointer^ = '*' then
+	  source_code := advance_source(source_code, 1u)
+	elsif first_char = '*' then
 	  current_token^.kind := TOKEN_MULTIPLICATION;
-	  input_pointer := input_pointer + 1
-	elsif input_pointer^ = '/' then
+	  source_code := advance_source(source_code, 1u)
+	elsif first_char = '/' then
 	  current_token^.kind := TOKEN_DIVISION;
-	  input_pointer := input_pointer + 1
-	elsif input_pointer^ = '%' then
+	  source_code := advance_source(source_code, 1u)
+	elsif first_char = '%' then
 	  current_token^.kind := TOKEN_REMAINDER;
-	  input_pointer := input_pointer + 1
-	elsif input_pointer^ = ':' then
-	  input_pointer := input_pointer + 1;
-	  if input_pointer^ = '=' then
+	  source_code := advance_source(source_code, 1u)
+	elsif first_char = ':' then
+	  source_code := advance_source(source_code, 1u);
+
+	  if source_code.text.length = 0u then
+	    current_token^.kind := TOKEN_COLON
+	  elsif char_at(source_code.text, 0) = '=' then
 	    current_token^.kind := TOKEN_ASSIGNMENT;
-	    input_pointer := input_pointer + 1
+	    source_code := advance_source(source_code, 1u)
 	  else
 	    current_token^.kind := TOKEN_COLON
 	  end
-	elsif input_pointer^ = '^' then
+	elsif first_char = '^' then
 	  current_token^.kind := TOKEN_HAT;
-	  input_pointer := input_pointer + 1
-	elsif input_pointer^ = '@' then
+	  source_code := advance_source(source_code, 1u)
+	elsif first_char = '@' then
 	  current_token^.kind := TOKEN_AT;
-	  input_pointer := input_pointer + 1
+	  source_code := advance_source(source_code, 1u)
 	else
 	  current_token^.kind := 0;
-      input_pointer := input_pointer + 1
+	  source_code := advance_source(source_code, 1u)
 	end;

 	if current_token^.kind <> 0 then
      tokens_size^ := tokens_size^ + 1u;
-      input_pointer := skip_spaces(input_pointer)
+      source_code := skip_spaces(source_code)
 	else
 	  write_s("Lexical analysis error on \"");
-	  write_c(input_pointer^);
+	  write_c(first_char);
 	  write_s("\".\n")
 	end
  end;
@ -819,9 +884,9 @@ end

 proc process(argc: Int, argv: pointer to pointer to Char): Int;
 var
-  input: pointer to Char,
  tokens: pointer to Token,
  tokens_size: Word,
+  source_code: SourceCode,
  command_line: pointer to CommandLine;
 begin
  command_line := parse_command_line(argc, argv);
@ -829,12 +894,12 @@ begin
    return 2
  end;

-  input := read_source(command_line^.input);
-  if input = nil then
+  source_code.position := make_position();
+  if not read_source(command_line^.input, @source_code.text) then
    perror(command_line^.input);
 	return 3
  end;
-  tokens := tokenize(input, @tokens_size);
+  tokens := tokenize(source_code, @tokens_size);

  if command_line^.tokenize then
    print_tokens(tokens, tokens_size)