; genforth.inc
; macros to build forth systems from assembly.

;;;;;;;;;;;;;;;;;;;;;; IMPLEMENTATIONS OF THREADING ;;;;;;;;;;;;;;;;;;;;;;;;;

; PC is the program counter
; DSP is the data stack pointer
; RSP is the return stack pointer
; R0 is a scratch register, R0L is its lower byte, R0H its second byte (bits 8-15)
; TOS is a buffer for the top of stack
; INSTRTBL is a table for bytecode threading

;;; BYTECODE, (ZERO-BASED ?) CODE ARRAY
;;; -----------------------------------
; BCCA_Enter: entry hook for the bytecode / code-array threading model.
; Expands to nothing: no entry code is emitted for this model.
Macro	BCCA_Enter
EndM	BCCA_Enter
; BCCA_Next: fetch the bytecode at [PC], turn it into a code address in R0
; and jump there.
;   CS_R0   - when zero, R0 is set up here: cleared if CA_Base is zero,
;             otherwise loaded with the code-array base.
;   CA_bits - when 8, the bytecode is loaded straight into R0H; otherwise it
;             is loaded into the low byte and R0 is shifted left by CA_bits.
;   PCeqSI  - when non-zero, lodsb performs both the fetch and the PC step.
; The numeric columns are the author's cost annotations (presumably size in
; bytes followed by two cycle counts - TODO confirm).
; NOTE(review): the test reads CA_Base but the load uses CA_base; they name
; the same symbol only when assembling case-insensitively - confirm.
Macro	BCCA_Next
    ife CS_R0		; assembled only when CS_R0 = 0 (R0 is NOT callee-saved)
      ife CA_Base	; code array based at zero: clearing R0 is enough
	clr R0		; 2, 2  , 1	; or movzx R0,[byte PC] ; 3, 6  , 3
      else
	mov R0,CA_base	; 5, 2  , 1	; preload the code-array base
      endif
    endif
    if CA_bits = 8
	mov R0H,[PC]	; 2, 4  , 1	; replaces the load-and-shift used below
	inc PC		; 1, 2  , 1
    else
      if PCeqSI
	lodsb		; 1, 5  , 5	; fetch and advance in one instruction
      else
	mov R0L,[PC]	; 2, 4  , 1	; or mov R0H,[PC] if CA_bits=8
	inc PC		; 1, 2  , 1
      endif
	shl R0,CA_bits	; 3, 3  , 2	; number of bits to shift for array
    endif
    jmp R0		; 2, 7+m, 5	; continue at the computed address
EndM	BCCA_Next
;TOTAL  BCCA:		;13,18+m,10	; CA_bits=8: 256 bytes element array;
;	BCZBCA:		;10,18+m,10	; but then, there may be more cache
;	BCCACSR0:	; 8,16+m, 9	; misses on a small cache, and/or
;	BCCASI:		;11,14+m,11	; the interspace may be space costly,
;	BCZBCASI:	; 8,14+m,11	; and require some hand optimization
;	BCCASICSR0:	; 6,12+m,10	; or writing specifically an automatic
;	BCCA8:		;10,15+m, 8	; optimizer.
;	BCZBCA8:	; 7,15+m, 8	; R0 can also be made a
;	BCCA8CSR0:	; 5,13+m, 7	; callee-saved register, so that we
; have 5,13+m,7 !!!

; BCCA_PreNext: same address computation as BCCA_Next, but the resulting
; address is pushed on the machine stack instead of being jumped to.
; The ret that would consume it is left commented out, so the macro itself
; does not transfer control.
;   CS_R0   - when zero, R0 is set up here: cleared if CA_Base is zero,
;             otherwise loaded with the code-array base.
;   CA_bits - when 8, the bytecode is loaded straight into R0H; otherwise it
;             is loaded into the low byte and R0 is shifted left by CA_bits.
;   PCeqSI  - when non-zero, lodsb performs both the fetch and the PC step.
; NOTE(review): the test reads CA_Base but the load uses CA_base; they name
; the same symbol only when assembling case-insensitively - confirm.
Macro	BCCA_PreNext
    ife CS_R0		; assembled only when CS_R0 = 0 (R0 is NOT callee-saved)
      ife CA_Base	; code array based at zero: clearing R0 is enough
	clr R0		; 2, 2  , 1	; or movzx R0,[byte PC] ; 3, 6  , 3
      else
	mov R0,CA_base	; 5, 2  , 1	; preload the code-array base
      endif
    endif
    if CA_bits = 8
	mov R0H,[PC]	; 2, 4  , 1	; replaces the load-and-shift used below
	inc PC		; 1, 2  , 1
    else
      if PCeqSI
	lodsb		; 1, 5  , 5	; fetch and advance in one instruction
      else
	mov R0L,[PC]	; 2, 4  , 1	; or mov R0H,[PC] if CA_bits=8
	inc PC		; 1, 2  , 1
      endif
	shl R0,CA_bits	; 3, 3  , 2	; number of bits to shift for array
    endif
    push R0		; 1, 2  , 1	; leave the target where a later ret finds it
    ;ret		; 1,10+m, 5
EndM	BCCA_PreNext
;TOTAL  BCCA:		;13,23+m,11
;	BCZBCA:		;10,23+m,11
;	BCCACSR0:	; 8,21+m,10
;	BCCASI:		;11,22+m,14
;	BCZBCASI:	; 8,22+m,14
;	BCZBCASICSR0:	; 6,20+m,13
;	BCCA8:		;10,20+m, 9
;	BCZBCA8:	; 7,20+m, 9
;	BCCA8CSR0:	; 5,18+m, 8

;BCCA8SI
;push imm		; 5, 2  , 1
;lodsd			; 1, 5  , 5
;mov [esp+1],al 	; 4, 2  , 1
;ret			; 1,10+m, 5
;TOTAL			;11,19+m,12

;BCCA8
;push imm		; 5, 2  , 1
;mov R0L,[PC]		; 2, 4  , 1
;inc PC			; 1, 2  , 1
;mov [esp+1],R0L 	; 4, 2  , 1
;ret			; 1,10+m, 5
;TOTAL			;13,20+m, 9

;BCCA8CSR0
;mov R0H,[PC]		; 2, 4  , 1
;inc PC			; 1, 2  , 1
;push R0	 	; 2, 2  , 1
;ret			; 1,10+m, 5
;TOTAL			; 6,18+m, 8


;;; BYTECODE, INDIRECT THREADED
;;; ---------------------------
; BCIT_Enter: entry hook for bytecode indirect threading.
; Expands to nothing: no entry code is emitted for this model.
Macro	BCIT_Enter
EndM	BCIT_Enter
; BCIT_Next: fetch the bytecode at [PC] into the low byte of R0 and jump
; through the dword stored at R0*4.
;   CSR0     - when zero, R0 is set up here.
;   INSTRTBL - when non-zero, R0 starts as INSTRTBL/4; otherwise R0 is cleared.
; Because the bytecode overwrites the low byte of R0, R0*4 addresses
; INSTRTBL + 4*bytecode only if INSTRTBL/4 has a zero low byte.
; NOTE(review): this macro tests CSR0 while the BCCA macros test CS_R0 -
; confirm whether these are meant to be the same switch.
Macro	BCIT_Next
	ife CSR0		; assembled only when CSR0 = 0
	  if INSTRTBL
	    mov R0,INSTRTBL/4	; 5, 2  , 1;
	  else
	    clr R0		; 2, 2  , 1;
	  endif
	endif
	mov R0L,[PC]		; 2, 4  , 1; was movzx ebx,[byte esi]; 3, 6, 3
	inc PC			; 1, 2  , 1; was lodsb; movzx eax,al ; 4, 8, 8
	jmp [R0*4]		; 3,11+m, 6
EndM	BCIT_Next
;TOTAL:	BCIT			;11,19+m, 9		; or 
;	BCZBIT:			; 8,19+m, 9
;	BCITCSR0:		; 6,17+m, 8

; BCIT_PreNext: fetch the bytecode at [PC] into the low byte of R0, push the
; dword stored at R0*4 and return to it.
;   CSR0     - when zero, R0 is set up here.
;   INSTRTBL - when non-zero, R0 starts as INSTRTBL/4; otherwise R0 is cleared.
; Because the bytecode overwrites the low byte of R0, R0*4 addresses
; INSTRTBL + 4*bytecode only if INSTRTBL/4 has a zero low byte.
; NOTE(review): the ret is live here, whereas the BCCA, SIIT, IT, SIDT and DT
; PreNext macros leave it commented out - confirm which is intended.
Macro	BCIT_PreNext
	ife CSR0		; assembled only when CSR0 = 0
	  if INSTRTBL
	    mov R0,INSTRTBL/4	; 5, 2  , 1;
	  else
	    clr R0		; 2, 2  , 1;
	  endif
	endif
	mov R0L,[PC]		; 2, 4  , 1; was movzx ebx,[byte esi]; 3, 6, 3
	inc PC			; 1, 2  , 1; was lodsb; movzx eax,al ; 4, 8, 8
	push [R0*4]		; 7, 5  , 4
	ret			; 1,10+m, 5
EndM	BCIT_PreNext
;TOTAL: BCIT			;16,23+m,12
;TOTAL: BCZBIT			;13,23+m,12
;TOTAL: BCITCSR0		;11,21+m,11

;;; WORDCODE, (ZERO-BASED ?) CODE ARRAY / DIRECT THREADED
;;; -----------------------------------------------------
; WC_Enter: entry hook for wordcode threading.
; When WCE is non-zero it advances PC by 2; otherwise it expands to nothing.
Macro	WC_Enter
  if WCE
    add PC,2		; 3, 2  , 1	; or inc PC;inc PC ; 2, 4  , 2
  endif
EndM	WC_Enter
; WC_Next: fetch the token at [PC], turn it into a code address in R0 and
; jump there.
;   CS_R0   - when zero, R0 is set up here: cleared if CA_Base is zero,
;             otherwise loaded with the code-array base.
;   CA_bits - when 8, the token is loaded straight into R0H; otherwise it is
;             loaded into the low byte and R0 is shifted left by CA_bits.
;   PCeqSI  - when non-zero, lodsb performs both the fetch and the PC step.
; NOTE(review): the body is the bytecode sequence unchanged - it fetches one
; byte and advances PC by one. A wordcode variant presumably needs a 16-bit
; fetch and a step of two - confirm before use.
; NOTE(review): the test reads CA_Base but the load uses CA_base; they name
; the same symbol only when assembling case-insensitively - confirm.
Macro	WC_Next
    ife CS_R0		; assembled only when CS_R0 = 0 (R0 is NOT callee-saved)
      ife CA_Base	; code array based at zero: clearing R0 is enough
	clr R0		; 2, 2  , 1	; or movzx R0,[byte PC] ; 3, 6  , 3
      else
	mov R0,CA_base	; 5, 2  , 1	; preload the code-array base
      endif
    endif
    if CA_bits = 8
	mov R0H,[PC]	; 2, 4  , 1	; replaces the load-and-shift used below
	inc PC		; 1, 2  , 1
    else
      if PCeqSI
	lodsb		; 1, 5  , 5	; fetch and advance in one instruction
      else
	mov R0L,[PC]	; 2, 4  , 1	; or mov R0H,[PC] if CA_bits=8
	inc PC		; 1, 2  , 1
      endif
	shl R0,CA_bits	; 3, 3  , 2	; number of bits to shift for array
    endif
    jmp R0		; 2, 7+m, 5	; continue at the computed address
EndM	WC_Next
;TOTAL  BCCA:		;13,18+m,10	; CA_bits=8: 256 bytes element array;
;	BCZBCA:		;10,18+m,10	; but then, there may be more cache
;	BCCACSR0:	; 8,16+m, 9	; misses on a small cache, and/or
;	BCCASI:		;11,14+m,11	; the interspace may be space costly,
;	BCZBCASI:	; 8,14+m,11	; and require some hand optimization
;	BCCASICSR0:	; 6,12+m,10	; or writing specifically an automatic
;	BCCA8:		;10,15+m, 8	; optimizer.
;	BCZBCA8:	; 7,15+m, 8	; R0 can also be made a
;	BCCA8CSR0:	; 5,13+m, 7	; callee-saved register, so that we
; have 5,13+m,7 !!!

; WC_PreNext: same address computation as WC_Next, but the resulting address
; is pushed on the machine stack instead of being jumped to. The ret that
; would consume it is left commented out.
; Named WC_PreNext so that the wordcode section has its own PreNext macro
; and does not define BCCA_PreNext a second time.
;   CS_R0   - when zero, R0 is set up here: cleared if CA_Base is zero,
;             otherwise loaded with the code-array base.
;   CA_bits - when 8, the token is loaded straight into R0H; otherwise it is
;             loaded into the low byte and R0 is shifted left by CA_bits.
;   PCeqSI  - when non-zero, lodsb performs both the fetch and the PC step.
; NOTE(review): the body is the bytecode sequence unchanged - it fetches one
; byte and advances PC by one. A wordcode variant presumably needs a 16-bit
; fetch and a step of two - confirm before use.
Macro	WC_PreNext
    ife CS_R0		; assembled only when CS_R0 = 0 (R0 is NOT callee-saved)
      ife CA_Base	; code array based at zero: clearing R0 is enough
	clr R0		; 2, 2  , 1	; or movzx R0,[byte PC] ; 3, 6  , 3
      else
	mov R0,CA_base	; 5, 2  , 1	; preload the code-array base
      endif
    endif
    if CA_bits = 8
	mov R0H,[PC]	; 2, 4  , 1	; replaces the load-and-shift used below
	inc PC		; 1, 2  , 1
    else
      if PCeqSI
	lodsb		; 1, 5  , 5	; fetch and advance in one instruction
      else
	mov R0L,[PC]	; 2, 4  , 1	; or mov R0H,[PC] if CA_bits=8
	inc PC		; 1, 2  , 1
      endif
	shl R0,CA_bits	; 3, 3  , 2	; number of bits to shift for array
    endif
    push R0		; 1, 2  , 1	; leave the target where a later ret finds it
    ;ret		; 1,10+m, 5
EndM	WC_PreNext
;TOTAL  BCCA:		;13,23+m,11
;	BCZBCA:		;10,23+m,11
;	BCCACSR0:	; 8,21+m,10
;	BCCASI:		;11,22+m,14
;	BCZBCASI:	; 8,22+m,14
;	BCZBCASICSR0:	; 6,20+m,13
;	BCCA8:		;10,20+m, 9
;	BCZBCA8:	; 7,20+m, 9
;	BCCA8CSR0:	; 5,18+m, 8

;BCCA8SI
;push imm		; 5, 2  , 1
;lodsd			; 1, 5  , 5
;mov [esp+1],al 	; 4, 2  , 1
;ret			; 1,10+m, 5
;TOTAL			;11,19+m,12

;BCCA
;push 16-bit imm	; 4, 2  , 2	; or 3, 2, 2
;push [word PC]		; 3, 4  , 2	; 
;add PC,2		; 3, 2  , 1
;ret			; 1,10+m, 5
;TOTAL			;11,18+m,10

; WC: minimal wordcode dispatch with the program counter in SI: load a
; 16-bit token with lodsw and jump to it.
; Only this first variant is assembled. The alternatives sketched by the
; author are kept below as comments: they followed an unconditional jmp, so
; they could never execute, and the definition had no closing EndM.
Macro	WC
lodsw			; 2, 5
jmp ax			; 3, 5
;TOTAL:			  5,10
;alternatively:
;add esi,2		; 3, 1 ; or inc esi; inc esi 2, 2
;jmp [word esi]		; 3, 5
;
;push [word esi]		; 3, 4
;add esi,2		; 3, 1	; or inc esi;inc esi (2,2)
;ret			; 1, 5  ;
;TOTAL:			; 7,10 	; or 6,11
EndM	WC


;;; WORDCODE, INDIRECT THREADED
;;; ---------------------------
; WCIT_Enter: entry hook for wordcode indirect threading.
; Expands to nothing: no entry code is emitted for this model.
Macro	WCIT_Enter
EndM	WCIT_Enter
; WCIT_Next: fetch the token at [PC] into the low byte of R0 and jump
; through the dword stored at R0*4.
;   CSR0     - when zero, R0 is set up here.
;   INSTRTBL - when non-zero, R0 starts as INSTRTBL/4; otherwise R0 is cleared.
; NOTE(review): the instruction sequence is the same as BCIT_Next - a single
; byte is fetched and PC advances by one. A wordcode variant presumably
; needs a 16-bit fetch and a step of two - confirm before use.
Macro	WCIT_Next
	ife CSR0		; assembled only when CSR0 = 0
	  if INSTRTBL
	    mov R0,INSTRTBL/4	; 5, 2  , 1
	  else
	    clr R0		; 2, 2  , 1
	  endif
	endif
	mov R0L,[PC]		; 2, 4  , 1; was movzx ebx,[byte esi]; 3, 6, 3
	inc PC			; 1, 2  , 1; was lodsb; movzx eax,al ; 4, 8, 8
	jmp [R0*4]		; 3,11+m, 6
EndM	WCIT_Next
;TOTAL:	WCIT			;11,19+m, 9		; or 
;	WCZBIT:			; 8,19+m, 9
;	WCITCSR0:		; 6,17+m, 8

; WCIT_PreNext: fetch the token at [PC] into the low byte of R0, push the
; dword stored at R0*4 and return to it.
;   CSR0     - when zero, R0 is set up here.
;   INSTRTBL - when non-zero, R0 starts as INSTRTBL/4; otherwise R0 is cleared.
; NOTE(review): a single byte is fetched and PC advances by one; a wordcode
; variant presumably needs a 16-bit fetch and a step of two - confirm.
; NOTE(review): the ret is live here, whereas the BCCA, SIIT, IT, SIDT and DT
; PreNext macros leave it commented out - confirm which is intended.
Macro	WCIT_PreNext
	ife CSR0		; assembled only when CSR0 = 0
	  if INSTRTBL
	    mov R0,INSTRTBL/4	; 5, 2  , 1;
	  else
	    clr R0		; 2, 2  , 1;
	  endif
	endif
	mov R0L,[PC]		; 2, 4  , 1; was movzx ebx,[byte esi]; 3, 6, 3
	inc PC			; 1, 2  , 1; was lodsb; movzx eax,al ; 4, 8, 8
	push [R0*4]		; 7, 5  , 4
	ret			; 1,10+m, 5
EndM	WCIT_PreNext
;TOTAL: WCIT			;16,23+m,12
;TOTAL: WCZBIT			;13,23+m,12
;TOTAL: WCITCSR0		;11,21+m,11


;;; INDIRECT-THREADED, PC=(E)SI
;;; ------------------------
; SIIT_Enter: entry hook for indirect threading with PC = (E)SI.
; Expands to nothing: no entry code is emitted for this model.
Macro	SIIT_Enter
EndM
; SIIT_Next: indirect-threaded dispatch using the string-load instruction.
; lodsd reads the next cell into eax and advances the source index; the jump
; then goes through the dword that eax points to.
Macro	SIIT_Next
	lodsd			; 1, 5  , 5
	jmp [dword eax]		; 2, 7+m, 5
EndM
; TOTAL				; 3,12+m,10

; SIIT_PreNext: like SIIT_Next, but the dword that eax points to is pushed
; instead of being jumped through. The ret that would consume it is left
; commented out, so the macro itself does not transfer control.
Macro	SIIT_PreNext
	lodsd			; 1, 5  , 5
	push [dword eax]	; 2, 5  , 4
	;ret			; 1,10+m, 5
EndM	SIIT_PreNext
; TOTAL				; 4,20+m,14

;;; INDIRECT-THREADED
;;; -----------------
; IT_Enter: entry hook for indirect threading with a generic PC register.
; Expands to nothing: no entry code is emitted for this model.
Macro	IT_Enter
EndM	IT_Enter
; IT_Next: indirect-threaded dispatch. Loads the cell at [PC] into R0,
; advances PC by 4, then jumps through the dword that R0 points to.
Macro	IT_Next
	mov R0,[PC]		; 2, 2  , 1
	add PC,4		; 3, 2  , 1
	jmp [R0]		; 2, 7+m, 5
EndM	IT_Next
; TOTAL				; 7,11+m, 7

; IT_PreNext: like IT_Next, but the dword that R0 points to is pushed
; instead of being jumped through. The ret that would consume it is left
; commented out, so the macro itself does not transfer control.
Macro	IT_PreNext
	mov R0,[PC]		; 2, 2  , 1
	add PC,4		; 3, 2  , 1
	push [R0]		; 2, 5  , 4
	;ret			; 1,10+m, 5
EndM	IT_PreNext
; TOTAL				; 7,19+m,11

;;; RET-BASED DIRECT THREADED
;;; -------------------------
; R_Enter: entry hook for ret-based direct threading.
; Currently expands to nothing; the author reserved it as the place for code
; enabling an alternative kind of threading.
Macro	R_Enter
; may be some code to allow an alternative kind of threading !
EndM	R_Enter
; R_Next: ret-based dispatch. A single ret pops the next code address from
; the machine stack and continues there.
Macro	R_Next
	ret		; 1,  10+m, 5
EndM	R_Next
;TOTAL	R:		; 1,  10+m, 5

;;; DIRECT-THREADED, PC=SI
;;; ----------------------
; SIDT_Enter: entry hook for direct threading with PC = SI.
; Expands to nothing: no entry code is emitted for this model.
Macro	SIDT_Enter
EndM	SIDT_Enter
; SIDT_Next: direct-threaded dispatch using the string-load instruction.
; lodsd reads the next cell into eax and advances the source index; the cell
; is itself the code address, so the jump goes straight to eax.
Macro	SIDT_Next
	lodsd			; 1, 5  , 5
	jmp eax			; 2, 7+m, 5
EndM	SIDT_Next
;	TOTAL:			; 3,12+m,10
; SIDT_PreNext: like SIDT_Next, but the fetched code address is pushed
; instead of being jumped to. The ret that would consume it is left
; commented out, so the macro itself does not transfer control.
Macro	SIDT_PreNext
	lodsd			; 1, 5  , 5
	push eax		; 1, 2  , 1
	;ret			; 1,10+m, 5
EndM	SIDT_PreNext
;	TOTAL:			; 3,17+m,11


;;; DIRECT-THREADED
;;; ---------------
; DT_Enter: entry hook for direct threading with a generic PC register.
; Expands to nothing: no entry code is emitted for this model.
Macro	DT_Enter
EndM	DT_Enter
; DT_Next: direct-threaded dispatch. Loads the code address stored at [PC]
; into R0, advances PC by 4, then jumps to R0.
Macro	DT_Next
	mov R0,[PC]		; 2, 4  , 1
	add PC,4		; 3, 2  , 1
	jmp R0			; 2, 7+m, 5
EndM	DT_Next
;	TOTAL:			; 7,13+m, 7

; DT_PreNext: like DT_Next, but the fetched code address is pushed instead
; of being jumped to. The ret that would consume it is left commented out,
; so the macro itself does not transfer control.
Macro	DT_PreNext
	mov R0,[PC]		; 2, 4  , 1
	add PC,4		; 3, 2  , 1
	push R0			; 1, 2  , 1
	;ret			; 1,10+m, 5
EndM	DT_PreNext
;	TOTAL:			; 7,18+m, 8


;;; DIRECT-THREADED w/ ENTER
;;; ------------------------
; DTE_Enter: entry code for direct threading with an Enter step.
; Advances PC by 4, so the PC update is done here rather than at dispatch.
Macro	DTE_Enter
	add PC,4		; 3, 2  , 1
EndM	DTE_Enter
; DTE_Next: dispatch for direct threading with an Enter step. Jumps through
; the cell at [PC] without touching PC; the PC advance is the add in
; DTE_Enter.
; Named DTE_Next so that it does not redefine the plain DT_Next (which
; advances PC itself) and so that Threading = DTE resolves through the
; generic Next macro.
Macro	DTE_Next
	jmp [PC]		; 2,10+m, 5
EndM	DTE_Next
;	TOTAL:			; 5,12+m, 6

; DTE_PreNext: pushes the cell at [PC] instead of jumping through it; PC is
; not modified here. The ret that would consume the pushed address is left
; commented out, so the macro itself does not transfer control.
; Named DTE_PreNext so that it does not redefine the plain DT_PreNext
; (which advances PC itself).
Macro	DTE_PreNext
	push [PC]		; 2, 5  , 4
	;ret			; 1,10+m, 5
EndM	DTE_PreNext
;	TOTAL:			; 5,17+m,10

;;;;;;;;;;;;;;;;;;;;;;;;; IMPLEMENTATIONS OF A STACK ;;;;;;;;;;;;;;;;;;;;;;;;;;





;;;;;;;;;;;;;;;;;;;;;;;;;;;; EFFICIENCY MEASURES ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; External loop for &X Threading with R:
; &X&LOOP: push offset &X&LOOP ; &X&_Next
; (mixing the push and the Next to minimize the +m)

; or more efficiently, prepend the next code to the whole routine !
; thus, mixed BCIT/R threading achieves best speed when used in mode R
; and good speed (25+m/12) with most compact threading when used in mode BCIT
; mixed /R 

;Threading	Toksiz	Direct use	Prepended to R	Loop over R
; (any)		n	x,a+m,b		x+1,a+m+8,b+4	1,a+m+11,b+7
; R:		4	 1,10+m, 5	N/A		N/A
; DT:		4	 5,13+m, 8
; SIDT:		4	 3,12+m,10	 4,20+m,12	1,23+m,17
; BCCA:		1	
; BCZBCA:	1	
; BCCA8:	1	
; BCZBCA8:	1	
; BCIT:		1	33+m,15		1,		  ,25+m,12
; WCCA:		2	
; WCZBCA:	2	
; WCDT:		2	
; WCZBDT:	2					
; WCIT:		2	
; WCZBIT:	2	


;;;;;;;;;;;;;;;;;;;;;;; EQUATES FOR DEFAULT BEHAVIOUR ;;;;;;;;;;;;;;;;;;;;;;;;;
; Threading selects which family of macros the generic Enter and Next
; macros expand to; its value is the name prefix of that family.
Threading	equ	R	; default: ret-based threading (R_Enter / R_Next)

;;; Generic code
;;; ------------
; Enter: generic entry hook. Expands to the _Enter macro of the threading
; model named by Threading (R_Enter with the default equate).
; The leading percent sign presumably makes the assembler substitute the
; value of Threading before the line is assembled, and the ampersand pastes
; it onto the literal suffix - confirm against the assembler manual.
Macro	Enter
  % Threading&<_Enter>
EndM	Enter
; Next: generic dispatch. Expands to the _Next macro of the threading model
; named by Threading (R_Next with the default equate).
; The leading percent sign presumably makes the assembler substitute the
; value of Threading before the line is assembled, and the ampersand pastes
; it onto the literal suffix - confirm against the assembler manual.
Macro	Next
  % Threading&<_Next>
EndM	Next
