.global _tokenize, classification, transitions

.section .rodata

#
# The classification table assigns each 7-bit ASCII code to a group (class).
# All characters of the same class are handled equivalently by the state
# machine; the class number is the column index into the transition table.
#
# Classes:
#
.equ CLASS_INVALID, 0x0
.equ CLASS_DIGIT, 0x01 # 1-9 ('0' has its own class).
.equ CLASS_CHARACTER, 0x02 # Letters other than a-f and x.
.equ CLASS_SPACE, 0x03 # HT, LF, CR and the space character.
.equ CLASS_COLON, 0x04
.equ CLASS_EQUALS, 0x05
.equ CLASS_LEFT_PAREN, 0x06
.equ CLASS_RIGHT_PAREN, 0x07
.equ CLASS_ASTERISK, 0x08
.equ CLASS_UNDERSCORE, 0x09
.equ CLASS_SINGLE, 0x0a # Characters that form a token on their own.
.equ CLASS_HEX, 0x0b # a-f.
.equ CLASS_ZERO, 0x0c
.equ CLASS_X, 0x0d # Lowercase x.
.equ CLASS_EOF, 0x0e # NUL terminator.
.equ CLASS_DOT, 0x0f

.equ CLASS_COUNT, 16

# One byte per character code, 0x00 through 0x7f. Runs of equal classes are
# emitted with ".fill count, 1, class"; the comment gives the code range.
.type classification, @object
classification:
	.byte CLASS_EOF # 00 NUL
	.fill 8, 1, CLASS_INVALID # 01-08 SOH..BS
	.byte CLASS_SPACE, CLASS_SPACE # 09-0A HT, LF
	.fill 2, 1, CLASS_INVALID # 0B-0C VT, FF
	.byte CLASS_SPACE # 0D CR
	.fill 18, 1, CLASS_INVALID # 0E-1F SO..US

	.byte CLASS_SPACE # 20 Space
	.byte CLASS_SINGLE # 21 !
	.fill 3, 1, CLASS_INVALID # 22-24 double quote, hash, dollar
	.byte CLASS_SINGLE, CLASS_SINGLE # 25-26 %, &
	.byte CLASS_INVALID # 27 single quote
	.byte CLASS_LEFT_PAREN, CLASS_RIGHT_PAREN # 28-29 (, )
	.byte CLASS_ASTERISK # 2A *
	.byte CLASS_SINGLE, CLASS_SINGLE # 2B-2C +, comma
	.byte CLASS_INVALID # 2D -
	.byte CLASS_DOT # 2E .
	.byte CLASS_SINGLE # 2F /

	.byte CLASS_ZERO # 30 0
	.fill 9, 1, CLASS_DIGIT # 31-39 1..9
	.byte CLASS_COLON # 3A :
	.byte CLASS_SINGLE # 3B ;
	.byte CLASS_INVALID # 3C <
	.byte CLASS_EQUALS # 3D =
	.byte CLASS_INVALID, CLASS_INVALID # 3E-3F >, ?

	.byte CLASS_SINGLE # 40 @
	.fill 26, 1, CLASS_CHARACTER # 41-5A A..Z
	.byte CLASS_SINGLE # 5B [
	.byte CLASS_INVALID # 5C backslash
	.byte CLASS_SINGLE, CLASS_SINGLE # 5D-5E ], ^
	.byte CLASS_UNDERSCORE # 5F _

	.byte CLASS_INVALID # 60 backtick
	.fill 6, 1, CLASS_HEX # 61-66 a..f
	.fill 17, 1, CLASS_CHARACTER # 67-77 g..w
	.byte CLASS_X # 78 x
	.byte CLASS_CHARACTER, CLASS_CHARACTER # 79-7A y, z
	.byte CLASS_INVALID # 7B {
	.byte CLASS_SINGLE # 7C |
	.byte CLASS_INVALID # 7D }
	.byte CLASS_SINGLE # 7E ~
	.byte CLASS_INVALID # 7F DEL
.size classification, . - classification # 128 entries.

.section .data

# The transition table describes transitions from one state to another, given
# a symbol (character class).
#
# The table has m rows and n columns, where m is the number of states and n is
# the number of classes (CLASS_COUNT). So given the current state and a
# classified character the table can be used to look up the next state.
#
# Each cell is one .word (4 bytes, read with lw by _lookup_state).
# - The least significant byte of the word is a row number (beginning with 0).
#   It specifies the target state. "ff" means that this is an end state and no
#   transition is possible.
# - The next byte is the action that should be performed when transitioning.
#   _analyze_token dispatches on it: 01 accumulate, 02 print, 03 skip,
#   04 comment; any other value (00 in this table) is reported as an error.
#
# NOTE(review): nothing in this file writes to the table, so it could probably
# live in .rodata; left in .data because the symbol is exported - confirm that
# no other module modifies it.
#
.balign 4 # Cells are loaded with lw, so keep them word aligned.
.type transitions, @object
transitions:
	# Column order (two source lines per row):
	#     Invalid Digit   Alpha   Space   :       =       (       )
	#     *       _       Single  Hex     0       x       NUL     .

	# 00 Start
	.word 0x00ff, 0x0103, 0x0102, 0x0300, 0x0101, 0x0105, 0x0106, 0x0107
	.word 0x0108, 0x0102, 0x010b, 0x0102, 0x010c, 0x0102, 0x00ff, 0x010e

	# 01 Colon
	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x0104, 0x02ff, 0x02ff
	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff

	# 02 Identifier
	.word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
	.word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff

	# 03 Integer
	.word 0x02ff, 0x0103, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
	.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x0103, 0x02ff, 0x02ff, 0x02ff

	# 04 Assign
	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff

	# 05 Equals
	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff

	# 06 Left paren
	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
	.word 0x0109, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff

	# 07 Right paren
	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff

	# 08 Asterisk
	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff

	# 09 Comment
	.word 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109
	.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109

	# 0a Closing comment
	.word 0x00ff, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x04ff
	.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109

	# 0b Single character token
	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff

	# 0c Zero
	.word 0x02ff, 0x00ff, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
	.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x00ff, 0x010d, 0x02ff, 0x02ff

	# 0d Hexadecimal
	.word 0x02ff, 0x010d, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
	.word 0x02ff, 0x02ff, 0x02ff, 0x010d, 0x010d, 0x00ff, 0x02ff, 0x02ff

	# 0e Dot
	.word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
	.word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff

# Derive the size from the data itself: 15 states * CLASS_COUNT cells * 4 bytes.
.size transitions, . - transitions

.section .text

# Returns the class from the classification table for the given character.
#
# The table only covers the 7-bit ASCII range, while callers pass bytes loaded
# with lbu (0..255). Anything above CLASSIFY_LAST_CHAR is therefore reported as
# CLASS_INVALID instead of reading past the end of the table.
#
# Parameters:
# a0 - Character.
#
# Sets a0 to the class number. Clobbers t0.
.equ CLASSIFY_LAST_CHAR, 0x7f # Highest character code with a table entry.

.type _classify, @function
_classify:
	li t0, CLASSIFY_LAST_CHAR
	bgtu a0, t0, .Lclassify_out_of_range # Unsigned: also rejects negative values.

	la t0, classification
	add t0, t0, a0 # Character class pointer.
	lbu a0, (t0) # Character class.
	ret

.Lclassify_out_of_range:
	li a0, CLASS_INVALID
	ret

# Fetches the transition table cell for a state and a character class.
#
# The cell address is transitions + (state * CLASS_COUNT + class) * 4.
#
# Parameters:
# a0 - Current state (row).
# a1 - Character class (column).
#
# Sets a0 to the cell contents: the next state together with the action.
# Clobbers t0.
.type _lookup_state, @function
_lookup_state:
	li t0, CLASS_COUNT
	mul a0, a0, t0 # Index of the first cell in the state's row.
	add a0, a0, a1 # Index of the cell for this class.
	slli a0, a0, 2 # Cell index -> byte offset (4 bytes per cell).

	la t0, transitions
	add t0, t0, a0 # Cell address.
	lw a0, 0(t0)

	ret

# Classifies a character and looks up the transition for it, i.e. performs
# _lookup_state(state, _classify(character)).
#
# Parameters:
# a0 - Current state.
# a1 - Character.
#
# Sets a0 to the transition table cell selected by the state and the class of
# the character.
.type _next_state, @function
_next_state:
	# Prologue: 16-byte frame holding ra, the caller's s0 and one local.
	addi sp, sp, -16
	sw ra, 12(sp)
	sw s0, 8(sp)
	addi s0, sp, 16

	# The state has to survive the call to _classify, so park it in the frame.
	sw a0, -12(s0)

	mv a0, a1
	call _classify # a0 = class of the character.

	mv a1, a0 # Class becomes the second argument.
	lw a0, -12(s0) # State becomes the first argument again.
	call _lookup_state

	# Epilogue.
	lw ra, 12(sp)
	lw s0, 8(sp)
	addi sp, sp, 16
	ret

# Actions stored in the second byte of a transition table cell.
.equ ACTION_ACCUMULATE, 0x01 # Consume the character, stay in the token.
.equ ACTION_PRINT, 0x02 # Token is complete; the character is not consumed.
.equ ACTION_SKIP, 0x03 # Consume the character and move the token start.
.equ ACTION_COMMENT, 0x04 # Consume the character that closes a comment.

# Number of bytes handed to _write_error when no known action applies.
.equ ERROR_CONTEXT_LENGTH, 3

# Scans a single token by running the state machine from the start state
# until a transition ends the token.
#
# Parameters:
# a0 - Source text pointer (NUL-terminated text).
#
# Sets a0 to the advanced text pointer, i.e. the first byte that was not
# consumed. The returned pointer never moves past the NUL terminator.
#
# Frame layout (32 bytes, keeps sp 16-byte aligned across the calls):
# 28(sp) ra, 24(sp) s0, 20(sp) s1, 16(sp) s2, 12(sp) token start pointer.
.type _analyze_token, @function
_analyze_token:
	# Prologue.
	addi sp, sp, -32
	sw ra, 28(sp)
	sw s0, 24(sp)
	sw s1, 20(sp) # Preserve s1 used for current source text position.
	sw s2, 16(sp) # Preserve s2 containing the current state.
	addi s0, sp, 32

	mv s1, a0
	sw a0, 12(sp) # Keeps a pointer to the beginning of a token.
	li s2, 0x00 # Initial, start state.

.Lanalyze_token_loop:
	mv a0, s2
	lbu a1, (s1)
	call _next_state

	andi s2, a0, 0xff # Next state (low byte of the cell).
	srli t1, a0, 8
	andi t1, t1, 0xff # Transition action (second byte of the cell).

	li t0, ACTION_ACCUMULATE
	beq t1, t0, .Lanalyze_token_accumulate

	li t0, ACTION_PRINT
	beq t1, t0, .Lanalyze_token_print

	li t0, ACTION_SKIP
	beq t1, t0, .Lanalyze_token_skip

	li t0, ACTION_COMMENT
	beq t1, t0, .Lanalyze_token_comment

	# No known action: report the text at the current position.
	# NOTE(review): _write_error is defined outside this file; it is presumably
	# called with a0 = pointer and a1 = length - confirm. Close to the end of
	# the input the fixed length can reach beyond the NUL terminator.
	mv a0, s1
	li a1, ERROR_CONTEXT_LENGTH
	call _write_error

.Lanalyze_token_reject:
	# Step over the offending character so the caller makes progress, but
	# never over the terminator: the caller has to see it to stop.
	lbu t0, (s1)
	beqz t0, .Lanalyze_token_end
	addi s1, s1, 1

	j .Lanalyze_token_end

.Lanalyze_token_accumulate:
	addi s1, s1, 1

	j .Lanalyze_token_loop

.Lanalyze_token_skip:
	addi s1, s1, 1
	lw t0, 12(sp)
	addi t0, t0, 1 # The skipped character is not part of the token.
	sw t0, 12(sp)

	j .Lanalyze_token_loop

.Lanalyze_token_print:
	/* DEBUG
	lw a0, 12(sp)
	mv a1, s1
	sub a1, a1, a0
	call _write_error
	DEBUG */

	j .Lanalyze_token_end

.Lanalyze_token_comment:
	addi s1, s1, 1

	/* DEBUG
	lw a0, 12(sp)
	mv a1, s1
	sub a1, a1, a0
	call _write_error
	DEBUG */

.Lanalyze_token_end:
	mv a0, s1 # Return the advanced text pointer.

	# Restore saved registers.
	lw s1, 20(sp)
	lw s2, 16(sp)

	# Epilogue.
	lw ra, 28(sp)
	lw s0, 24(sp)
	addi sp, sp, 32
	ret

# Tokenizes NUL-terminated source text by calling _analyze_token until the
# terminator is reached. Empty text is accepted and produces no calls.
#
# Parameters:
# a0 - Source text pointer.
#
# On return a0 points at the NUL terminator.
.type _tokenize, @function
_tokenize:
	# Prologue. The frame is 16 bytes to keep sp 16-byte aligned.
	addi sp, sp, -16
	sw ra, 12(sp)
	sw s0, 8(sp)
	addi s0, sp, 16

.Ltokenize_loop:
	# Test exactly one byte: a word load would look at bytes behind the
	# terminator and could be misaligned.
	lbu t0, (a0)
	beqz t0, .Ltokenize_end

	call _analyze_token # a0 = position after the token.

	j .Ltokenize_loop

.Ltokenize_end:
	# Epilogue.
	lw ra, 12(sp)
	lw s0, 8(sp)
	addi sp, sp, 16
	ret