; ---------------------------------------------------------------------------
; ballZup by matja for EFnet #asm compo 5 (1k demo, effect, or game)
; ---------------------------------------------------------------------------
; Remake of vector balls, this time with per-pixel depth buffer.
; Nothing else really special going on, dodgy FPU code stole all my bytes :P
; Assemble with NASM! (NASM forevAH!)
;
; Requirements -
; - DOS 2.0+, 384K of base ram, VGA/MCGA, mad fast chipz ;)
;
; Respectz and Greetz due (reverse MD5 hashed order) -
; wbear,Frenzy,benw,mboy,Scali,DAL9000,mcarp,med,Spec-Chum,katz3h,brain,
; hex86,Mortiis!,CoaXCable,Atomicfoo,MoMsN,Auroran,as,spASM,RedMercry,Kalms,
; vulture,razzi,TNSE,|nsomniac,TommyGun,Parazite,Zhenya,Spansh,sosay,czth,
; attack,Reptile4,sulphur,kawfee,ticker,xor,TasmBoy,nANDy,khaladan,iretd,
; auspex,Jakdaw,ETHry,Ravian,GooRoo,#asm,#coders(both),#ukscene,WIJ,CoolPhat
;
; flames to mat@dimebar.net plzthx
;
; * commented version *
;
; I've put comments on long FPU instruction sequences showing the
; FPU stack state after each instruction has executed, where the leftmost
; value is st0 (stack top), eg.
;   fld1            ; 1         (st0 = 1)
;   fldpi           ; pi, 1     (st0 = pi, st1 = 1)
;   faddp st1, st0  ; pi+1      (st0 = pi+1)
;   fstp [mem]      ;           (empty)
;
; ---------------------------------------------------------------------------

; Flat binary (.com) image: all offsets are assembled relative to 0x100.
org 0x100                         

; Edge length, in texels, of the square sphere depth sprite.  Used as the
; row and column count when the sprite is generated and as the initial
; value of psds_ssize.
SPHERE_SIZE         equ     64
; Paragraph offset of the frame depth buffer inside the allocated block.
; Not referenced by name in this listing; the code uses [buffers] directly.
FRAME_DEPTH_SEG     equ     0x0000
; Paragraph offset of the sphere depth sprite inside the allocated block
; (0x4000 paragraphs = 256k, i.e. directly after the frame depth buffer).
SPHERE_DEPTH_SEG    equ     0x4000

start:
    ; -------------------------------------------------------------
    ; Memory setup
    ;   256k frame depth buffer
    ;    64k reserved for the sphere sprite depth values
    ; -------------------------------------------------------------
    ; DOS gives a .com program all available memory, so the program's
    ; own block is first shrunk to 64k and the working buffers are
    ; then requested as a separate block.

    mov bx, 0x1000              ; new size of our block: 0x1000 paragraphs = 64k
    push ds
    pop es                      ; es = segment of the block being resized
    mov ah, 0x4a                ; int 21h / 4Ah: resize memory block
    int 0x21

    mov bx, 0x6000              ; 0x6000 paragraphs = 384k
    mov ah, 0x48                ; int 21h / 48h: allocate memory
    int 0x21
    jnc .got_block              ; carry clear -> ax = segment of new block
    ret                         ; allocation failed: leave immediately
.got_block:
    mov [buffers], ax           ; remember base segment of the buffers

    ; -------------------------------------------------------------
    ; Set mode 13h and greyscale palette
    ; -------------------------------------------------------------

    mov ax, 0x13
    int 0x10

    ; Tell the DAC which entry to start writing at (index 0, via port
    ; 0x3c8) before streaming colour bytes to the data port (0x3c9).
    ; This index write was missing from the original release, which
    ; therefore relied on the DAC already pointing at entry 0
    ; (thanks Libthium for spotting it).

    xor ax, ax                  ; al = first index / first grey level, ah = counter
    mov dx, 0x3c8
    out dx, al                  ; DAC write index = 0
    inc dx                      ; dx = 0x3c9, DAC data port
.palette_entry:
    out dx, al                  ; red
    out dx, al                  ; green
    out dx, al                  ; blue
    mov al, ah                  ; next level = counter / 4 (6-bit DAC range);
    shr al, 2                   ; computed before the counter is bumped, so
    inc ah                      ; entry i ends up with (i-1)/4
    jnz .palette_entry          ; 256 entries, stop when ah wraps to 0

    ; Initialise the FPU.
    ; It is usually in a usable state when the program is spawned by DOS,
    ; but it is good practice to put everything we use into a known state.

    finit

    ; We need ds = cs to reach our variables.  cs = ds = es on entry, but
    ; that is not guaranteed once interrupts have been called, so set it
    ; again (cs: overrides would also work, at the cost of one extra byte
    ; per instruction that uses them).

    push cs
    pop ds

    ; -------------------------------------------------------------
    ; Generate sphere depth values
    ; -------------------------------------------------------------
    ; Walk a SPHERE_SIZE x SPHERE_SIZE grid with x and y (sg_x, sg_y)
    ; both starting at -1 and advancing by sg_d (2/SPHERE_SIZE) per
    ; texel, and store
    ;     (1 - sqrt(1 - (x*x + y*y))) * 65535
    ; as a 32-bit integer for each texel.  Where x*x + y*y > 1 the
    ; square root is invalid, so fistp stores the x87 "integer
    ; indefinite" value 0x80000000 instead; put_sprite_depth_scale
    ; tests for exactly that value to skip texels.

    mov ax, SPHERE_DEPTH_SEG
    add ax, [buffers]               ; segment of the sphere depth values
    mov es, ax
    xor di, di                      ; es:di -> first texel

    mov dword [sg_y], 0xBF800000    ; sg_y = -1.0 (single-precision bit pattern)
    mov cx, SPHERE_SIZE             ; rows remaining

.sphere_row:
    push cx
    mov dword [sg_x], 0xBF800000    ; sg_x = -1.0
    mov cx, SPHERE_SIZE             ; texels remaining in this row

.sphere_texel:
    fld dword [sg_x]                ; x
    fmul st0, st0                   ; x*x
    fld dword [sg_y]                ; y, x*x
    fmul st0, st0                   ; y*y, x*x
    faddp st1, st0                  ; x*x+y*y
    fld1                            ; 1, x*x+y*y
    fsubrp st1, st0                 ; 1-(x*x+y*y)
    fsqrt                           ; sqrt(1-(x*x+y*y))
    fld1                            ; 1, sqrt(1-(x*x+y*y))
    fsubrp st1, st0                 ; 1-sqrt(1-(x*x+y*y))
    fmul dword [FPU_CONST_65535]    ; (1-sqrt(1-(x*x+y*y)))*65535
    fistp dword [es:di]             ; store as 32-bit integer
    add di, 4

    fld dword [sg_d]                ; sg_x += sg_d
    fadd dword [sg_x]
    fstp dword [sg_x]

    dec cx
    jnz .sphere_texel               ; more texels in this row?

    fld dword [sg_d]                ; sg_y += sg_d
    fadd dword [sg_y]
    fstp dword [sg_y]

    pop cx
    dec cx
    jnz .sphere_row                 ; more rows?



main:
    ; -------------------------------------------------------------
    ; Main loop - one pass per displayed frame
    ; -------------------------------------------------------------

    ; Reset every depth value to "far".  clear_depth_buffer reads its
    ; only variable through a cs: override, so ds is reloaded once,
    ; after the call, for everything that follows.

    call clear_depth_buffer

    push cs
    pop ds

    ; Restart the random sequence so that get_rand hands out the same
    ; numbers, in the same order, on every frame.

    xor eax, eax
    mov [rand_seed], eax

    ; main_tt is the per-sprite time value: it starts at main_t and is
    ; stepped by 0.01 for each of the 628 sprites drawn this frame.

    mov eax, [main_t]
    mov [main_tt], eax

    ; -------------------------------------------------------------
    ; Rotation angles for this frame
    ;   main_ra = -0.5 (constant)
    ;   main_rb = main_t * 1.4
    ;   main_rc = main_t * 1.2
    ; -------------------------------------------------------------

    mov eax, [FPU_CONST_M0P5]       ; plain 32-bit copy of the float
    mov [main_ra], eax

    fld dword [main_t]              ; t
    fld st0                         ; t, t
    fmul dword [FPU_CONST_1P4]      ; t*1.4, t
    fstp dword [main_rb]            ; t
    fmul dword [FPU_CONST_1P2]      ; t*1.2
    fstp dword [main_rc]            ;

    ; -------------------------------------------------------------
    ; Calculate and draw sprites
    ; -------------------------------------------------------------

    mov cx, 628                     ; 628 sprites, one every 0.01 radians
sprite_loop:
    push cx                         ; loop counter lives on the stack

    ; -------------------------------------------------------------
    ; Evaluate both shape functions at main_tt
    ;   knot_func_0 -> main_x2 .. main_scale2
    ;   knot_func_1 -> main_x  .. main_scale
    ; Each function consumes the time value pushed for it.
    ; -------------------------------------------------------------

    mov bx, main_x2                 ; destination for the first shape
    fld dword [main_tt]
    call knot_func_0
    mov si, bx                      ; si -> first shape, used by the blend
    mov bx, main_x                  ; destination for the second shape
    fld dword [main_tt]
    call knot_func_1

    ; blend = sin(main_t)*0.5 + 0.5

    fld dword [main_t]              ; main_t
    fsin                            ; sin(main_t)
    fld dword [FPU_CONST_0P5]       ; 0.5, sin(main_t)
    fmul st1, st0                   ; 0.5, sin(main_t)*0.5
    faddp st1, st0                  ; blend

    fld st0                         ; blend, blend
    fld1                            ; 1, blend, blend
    fsubrp st1, st0                 ; 1-blend, blend

    ; For each of x, y, z, scale:
    ;   [bx] = [bx]*(1-blend) + [si]*blend
    ; A four-pass loop is smaller than four unrolled copies.

    mov bp, 4
.blend_component:
    fld dword [si]                  ; v2, 1-blend, blend
    fmul st0, st2                   ; v2*blend, 1-blend, blend
    fld dword [bx]                  ; v1, v2*blend, 1-blend, blend
    fmul st0, st2                   ; v1*(1-blend), v2*blend, 1-blend, blend
    faddp st1, st0                  ; v1*(1-blend)+v2*blend, 1-blend, blend
    fstp dword [bx]                 ; 1-blend, blend
    add si, 4
    add bx, 4
    dec bp
    jnz .blend_component
    fstp st0                        ; drop 1-blend
    fstp st0                        ; drop blend

    ; -------------------------------------------------------------
    ; Rotate the point about y, then x, then z
    ; -------------------------------------------------------------
    ; One generic 2D rotation routine is applied to a pair of
    ; coordinates at a time.

    fld dword [main_rb]
    mov si, main_x
    mov di, main_z
    call rotate                     ; (x, z) pair: rotation about y
    fld dword [main_ra]
    mov si, main_y
    mov di, main_z
    call rotate                     ; (y, z) pair: rotation about x
    fld dword [main_rc]
    mov si, main_x
    mov di, main_y
    call rotate                     ; (x, y) pair: rotation about z

    ; -------------------------------------------------------------
    ; View transform
    ; The constants are hand-picked so that everything fits on screen.
    ; -------------------------------------------------------------
    ; A copy of z stays on the FPU stack through this whole section;
    ; it becomes the sprite's depth afterwards.

    ; main_biasedz = main_z + 10   (camera pulled back along -z)

    fld dword [main_z]              ; z
    fld dword [FPU_CONST_10]        ; 10, z
    fadd st0, st1                   ; z+10, z
    fstp dword [main_biasedz]       ; z

    ; main_sprite_size = (5 / main_biasedz) * main_scale

    fld dword [main_scale]          ; scale, z
    fld dword [FPU_CONST_5]         ; 5, scale, z
    fdiv dword [main_biasedz]       ; 5/biasedz, scale, z
    fmulp st1, st0                  ; (5/biasedz)*scale, z
    fstp dword [main_sprite_size]   ; z

    ; half = main_sprite_size * SPHERE_SIZE_W * 0.5
    ; (half the on-screen sprite edge; shared by both axes)

    fld dword [main_sprite_size]    ; size, z
    fimul word [SPHERE_SIZE_W]      ; size*64, z
    fmul dword [FPU_CONST_0P5]      ; half, z

    ; main_screen_x = (x/biasedz)*screen_scale + SCREEN_WIDTH/2 - half

    fld dword [main_x]              ; x, half, z
    fdiv dword [main_biasedz]
    fmul dword [main_screen_scale]
    fadd dword [SCREEN_WIDTH_OVER_2]
    fsub st0, st1                   ; screen_x, half, z
    fistp dword [main_screen_x]     ; half, z

    ; main_screen_y = (y/biasedz)*screen_scale + SCREEN_HEIGHT/2 - half

    fld dword [main_y]              ; y, half, z
    fdiv dword [main_biasedz]
    fmul dword [main_screen_scale]
    fadd dword [SCREEN_HEIGHT_OVER_2]
    fsubrp st1, st0                 ; screen_y, z
    fistp dword [main_screen_y]     ; z

    ; Fill in the arguments of put_sprite_depth_scale.
    ;
    ; psds_offset is the pixel index in the depth buffer of the
    ; sprite's top-left corner; only the low 16 bits are kept:
    ;   psds_offset = main_screen_x + main_screen_y * 320

    imul ax, word [main_screen_y], 320
    add ax, word [main_screen_x]
    mov [psds_offset], ax

    ; The z value is still in st0.  Scale it by 65535 so it can be held
    ; as an integer in the depth buffer (that float constant is already
    ; in memory and gives enough depth precision).

    fmul dword [FPU_CONST_65535]
    fistp dword [psds_depth]

    ; psds_scale = main_sprite_size (bit-for-bit copy of the float)

    mov eax, [main_sprite_size]
    mov [psds_scale], eax

    ; Arguments are in place: write the sprite into the depth buffer.

    call put_sprite_depth_scale

    ; Make sure ds addresses the program's variables again after the
    ; call, so they can be reached without cs: overrides.

    push cs
    pop ds

    ; main_tt += 0.01 for the next sprite

    fld dword [FPU_CONST_0P01]
    fadd dword [main_tt]
    fstp dword [main_tt]

    ; Recover the loop counter saved at sprite_loop and go round again
    ; while it is non-zero.  sprite_loop is further away than a short
    ; conditional jump can reach, so the near (16-bit displacement)
    ; form is requested explicitly.

    pop cx
    dec cx
    jnz near sprite_loop

    ; main_t += 0.01 for the next frame.
    ; (Poor form: the animation speed follows the machine's speed.  A
    ; real-time counter should drive main_t, but there was no space
    ; left for that.)

    fld dword [FPU_CONST_0P01]
    fadd dword [main_t]
    fstp dword [main_t]

    ; Show the finished frame.

    call display_depth_buffer

    ; Poll the keyboard (int 16h, ah = 1).  ZF set means no key is
    ; waiting, so start the next frame.  main is out of short-jump
    ; range, hence the explicit near form.

    mov ah, 1
    int 0x16
    jz near main

    ; A key is waiting: return to text mode (mode 3) ...

    mov ax, 3
    int 0x10

exit:
    ; ... and terminate through DOS with return code 0.

    mov ax, 0x4c00
    int 0x21
    ret                     ; paranoid


; -----------------------------------------------------------------
; rotate - rotate a coordinate pair in place
; -----------------------------------------------------------------
; In:
;   st0  = angle r in radians (consumed)
;   [si] = x   (32-bit float)
;   [di] = y   (32-bit float)
; Out:
;   [si] = x*cos(r) - y*sin(r)
;   [di] = x*sin(r) + y*cos(r)
; si and di are left unchanged.  Uses up to three FPU stack slots
; beyond the angle itself.

rotate:
    fsincos                 ; cos(r), sin(r)
    fld dword [di]          ; y, cos(r), sin(r)
    fmul st0, st2           ; y*sin(r), cos(r), sin(r)
    fld dword [si]          ; x, y*sin(r), cos(r), sin(r)
    fmul st0, st2           ; x*cos(r), y*sin(r), cos(r), sin(r)
    fsubrp st1, st0         ; x*cos(r)-y*sin(r), cos(r), sin(r)
    fld dword [si]          ; x, x*cos(r)-y*sin(r), cos(r), sin(r)
    fmulp st3, st0          ; x*cos(r)-y*sin(r), cos(r), x*sin(r)
    fstp dword [si]         ; cos(r), x*sin(r)      (new x written)
    fmul dword [di]         ; y*cos(r), x*sin(r)    (y is still the old value)
    faddp st1, st0          ; x*sin(r)+y*cos(r)
    fstp dword [di]         ;                       (new y written)
    ret

; -----------------------------------------------------------------
; knot_func_0 - randomly scattered balls
; -----------------------------------------------------------------
; In:
;   bx  = address of the x/y/z/scale record to fill (4 floats)
;   st0 = time value t (consumed; FPU stack is one entry shorter on return)
; Out:
;   [bx]    x     = sin(t + rand)
;   [bx+4]  y     = sin(t + rand)
;   [bx+8]  z     = sin(t + rand)
;   [bx+12] scale = 0.4
; Each "rand" is a fresh get_rand value, drawn in x, y, z order.
; bx is preserved; eax is overwritten.

knot_func_0:
    mov eax, [FPU_CONST_0P4]
    mov [bx+12], eax        ; scale = 0.4 (bit-for-bit copy of the constant)

    fld st0                 ; t, t
    call get_rand           ; r, t, t
    faddp st1, st0          ; t+r, t
    fsin                    ; sin(t+r), t
    fstp dword [bx]         ; t

    fld st0                 ; t, t
    call get_rand           ; r, t, t
    faddp st1, st0          ; t+r, t
    fsin                    ; sin(t+r), t
    fstp dword [bx+4]       ; t

    call get_rand           ; r, t
    faddp st1, st0          ; t+r        (t itself is used up here)
    fsin                    ; sin(t+r)
    fstp dword [bx+8]       ;

    ret



; -----------------------------------------------------------------
; get_rand - next pseudo-random angle
; -----------------------------------------------------------------
; Advances rand_seed with
;     rand_seed = rand_seed * 0x343fd + 0x269ec3   (mod 2^32)
; and pushes (low word of rand_seed / 65535) * 2*pi onto the FPU
; stack.  The low word is loaded as a SIGNED 16-bit integer, so the
; result lies roughly between -pi and +pi.
; Out: st0 = angle, eax = new rand_seed.
 
get_rand:
    imul eax, dword [rand_seed], 0x343fd
    add eax, 0x269ec3
    mov [rand_seed], eax

    fldpi                           ; pi
    fadd st0, st0                   ; 2*pi
    fild word [rand_seed]           ; r, 2*pi      (signed low word)
    fdiv dword [FPU_CONST_65535]    ; r/65535, 2*pi
    fmulp st1, st0                  ; (r/65535)*2*pi
    ret


; -----------------------------------------------------------------
; knot_func_1 - toroidal helix
; -----------------------------------------------------------------
; In:
;   bx  = address of the x/y/z/scale record to fill (4 floats)
;   st0 = time value t (consumed; FPU stack is one entry shorter on return)
; Out:
;   [bx]    x     = cos(t) * ((3 + sin(16*t)) * 0.4)
;   [bx+4]  y     = cos(16*t) * 0.4
;   [bx+8]  z     = sin(t) * ((3 + sin(16*t)) * 0.4)
;   [bx+12] scale = 0.4
; bx is preserved.

knot_func_1:
                                    ; t
    fld dword [FPU_CONST_16]        ; 16, t
    fmul st0, st1                   ; 16*t, t
    fsincos                         ; cos(16*t), sin(16*t), t
    fmul dword [FPU_CONST_0P4]      ; cos(16*t)*0.4, sin(16*t), t
    fstp dword [bx+4]               ; sin(16*t), t
    fadd dword [FPU_CONST_3]        ; 3+sin(16*t), t
    fmul dword [FPU_CONST_0P4]      ; (3+sin(16*t))*0.4, t
    fxch st1                        ; t, (3+sin(16*t))*0.4
    fsincos                         ; cos(t), sin(t), (3+sin(16*t))*0.4
    fmul st0, st2                   ; cos(t)*(3+sin(16*t))*0.4, sin(t), (3+sin(16*t))*0.4
    fstp dword [bx]                 ; sin(t), (3+sin(16*t))*0.4
    fmulp st1, st0                  ; sin(t)*(3+sin(16*t))*0.4
    fstp dword [bx+8]               ;
    fld dword [FPU_CONST_0P4]       ; 0.4
    fstp dword [bx+12]              ;
    ret


; -----------------------------------------------------------------
; put_sprite_depth_scale - draw the scaled sphere into the depth buffer
; -----------------------------------------------------------------
; In:
;   [psds_offset] = pixel index of the sprite's top-left corner in the
;                   depth buffer (16-bit)
;   [psds_scale]  = scale (32-bit float, 1.0 = original size)
;   [psds_depth]  = depth of the sprite, added to every sphere texel
;                   (32-bit signed integer)
; Effect:
;   For each destination pixel the nearest source texel is sampled; if
;   it belongs to the sphere and sphere depth + psds_depth is not
;   greater (signed) than the value already stored, it replaces that
;   value.
;   Nothing is drawn when the scaled size rounds to zero or less.
; Leaves ds = cs.  Overwrites eax, bx, cx, edx, si, di, es, gs and
; temp_32; the FPU stack is balanced.

put_sprite_depth_scale:

    push cs
    pop ds

    fld1                            ; 1
    fdiv dword [psds_scale]         ; 1/psds_scale
    fst dword [psds_ooscale]        ; psds_ooscale = source rows per dest row
    fmul dword [FPU_CONST_65535]    ; (1/psds_scale)*65535
    fistp dword [psds_idx]          ; psds_idx = 16.16 source step per dest pixel
    fild word [psds_ssize]          ; psds_ssize
    fmul dword [psds_scale]         ; psds_ssize*psds_scale
    fistp word [psds_newsize]       ; psds_newsize = dest edge length in pixels
    fldz                            ; 0
    fstp dword [psds_yoscale]       ; psds_yoscale = 0 (current source row)

    ; Both loops below count with dec/jnz (or inc/cmp/jnz), so a size of
    ; zero or less would wrap around and run for tens of thousands of
    ; iterations over unrelated memory.  Bail out instead.

    cmp word [psds_newsize], 0
    jle .done

    mov ax, word [buffers]
    add ax, SPHERE_DEPTH_SEG
    mov es, ax                      ; es = sphere depth values

    ; The depth buffer is 256k, so a sprite can straddle a 64k segment
    ; boundary.  To avoid checking for that per pixel (for speed and
    ; size), turn the start position into a linear address and
    ; re-normalise it into a seg:ofs pair whose offset is below 16.
    ; This limits the biggest sprite we can draw to 51 pixels high.

    movzx edx, word [buffers]
    shl edx, 4                      ; linear address of the depth buffer
    movzx eax, word [psds_offset]
    shl eax, 2                      ; 4 bytes per depth value
    add edx, eax
    mov di, dx
    and di, 0xf
    shr edx, 4
    mov gs, dx                      ; gs:di = depth buffer write start

    xor cx, cx                      ; cx = destination row

.row:
    push cx

    ; bx = byte offset of the source row = round(psds_yoscale) * psds_ssize * 4.
    ; psds_ssize is a 16-bit variable, so it is zero-extended before the
    ; multiply rather than read as a dword (which would pull in the two
    ; bytes of whatever variable happens to follow it).

    fld dword [psds_yoscale]
    fistp dword [temp_32]
    mov eax, [temp_32]
    movzx edx, word [psds_ssize]
    imul eax, edx

    shl eax, 2
    mov bx, ax

    xor eax, eax                    ; eax = 16.16 source column
    mov cx, [psds_newsize]          ; cx = destination pixels left in this row
    push di
.texel:
    push eax
    shr eax, 16                     ; integer part of the source column
    shl eax, 2                      ; -> byte offset
    mov si, ax
    mov edx, [es:bx+si]
    cmp edx, 0x80000000             ; value stored for texels outside the sphere
    je .skip
    add edx, [psds_depth]
    cmp edx, [gs:di]
    jg .skip                        ; existing value is nearer: keep it
    mov [gs:di], edx
.skip:
    pop eax
    add eax, [psds_idx]
    add di, 4
    dec cx
    jnz .texel

    pop di
    add di, 1280                    ; next destination row (320 pixels * 4 bytes)

    fld dword [psds_yoscale]        ; psds_yoscale += psds_ooscale
    fadd dword [psds_ooscale]
    fstp dword [psds_yoscale]

    pop cx
    inc cx
    cmp cx, [psds_newsize]

    jnz .row

.done:
    ret


; -----------------------------------------------------------------
; display_depth_buffer - convert the depth buffer to pixels
; -----------------------------------------------------------------
; For every 32-bit value in the depth buffer:
;   pixel = low byte of round(depth * -0.00078 + 128)
; The buffer is read as four consecutive 64k segments starting at
; [buffers]; each contributes 0x4000 values, written as consecutive
; bytes from a000:0000.
; Needs ds = cs.  Overwrites eax, cx, si, di, es, gs and temp_32.

display_depth_buffer:
    push word 0xa000
    pop es
    xor di, di                      ; es:di -> start of video memory
    mov ax, [buffers]               ; ax = segment of the first 64k slice
    mov cx, 4                       ; slices remaining
.ddb_slice:
    mov gs, ax
    push cx
    xor si, si                      ; gs:si -> first depth value of the slice
    mov cx, 0x4000                  ; depth values in one slice
.ddb_value:
    fild dword [gs:si]              ; depth
    fmul dword [FPU_CONST_M0P00078] ; depth * -0.00078
    fadd dword [FPU_CONST_128]      ; depth * -0.00078 + 128
    fistp dword [temp_32]           ; round to integer
    mov eax, [temp_32]
    mov [es:di], al                 ; keep only the low byte
    inc di
    add si, 4
    dec cx
    jnz .ddb_value
    pop cx
    mov ax, gs                      ; eax was reused above, so take the
    add ax, 0x1000                  ; slice segment back from gs; +64k
    dec cx
    jnz .ddb_slice
    ret


; -----------------------------------------------------------------
; clear_depth_buffer - reset the depth buffer to "far"
; -----------------------------------------------------------------
; Fills all four 64k segments of the depth buffer with the large (far)
; value 400000, so that smaller (nearer) values can be written over it
; while the next frame is drawn.
; Does not depend on ds ([buffers] is read through cs:).
; Overwrites eax, cx, dx, di and es; clears the direction flag.

clear_depth_buffer:
    cld                         ; rep stosd must count upwards; the program
                                ; never sets DF anywhere else, so do not
                                ; rely on its state at entry
    mov dx, [cs:buffers]
    mov es, dx
    mov cx, 4                   ; segments to fill
    mov eax, 400000         ; conveniently makes a nice dark grey
cdb_l1:
    push cx
    mov cx, 0x4000              ; dwords per 64k segment
    xor di, di
    rep stosd
    add dx, 0x1000              ; advance 64k
    mov es, dx
    pop cx
    dec cx
    jnz cdb_l1
    ret


; -----------------------------------------------------------------
; Initialised vars and consts
; -----------------------------------------------------------------
; Constants we use ...
; Remembering to keep manual note of their sizes and format cause
; NASM won't automatically do that ;(

FPU_CONST_M0P5          dd -0.5
FPU_CONST_1P2           dd 1.2
FPU_CONST_1P4           dd 1.4
FPU_CONST_5             dd 5.0
FPU_CONST_10            dd 10.0
FPU_CONST_0P5           dd 0.5
FPU_CONST_65535         dd 65535.0
FPU_CONST_0P01          dd 0.01
FPU_CONST_0P4           dd 0.4
FPU_CONST_3             dd 3.0
FPU_CONST_16            dd 16.0
FPU_CONST_M0P00078      dd -0.00078
FPU_CONST_128           dd 128.0
SCREEN_WIDTH_OVER_2     dd 160.0
SCREEN_HEIGHT_OVER_2    dd 100.0
sg_d                    dd 0.03125
psds_ssize              dw SPHERE_SIZE
main_screen_scale       dd 500.0
SPHERE_SIZE_W           dw 64

; main time var, starts at 0

main_t                  dd 0.0


; -----------------------------------------------------------------
; Uninitialised variables
; -----------------------------------------------------------------
; Temporary vars, arguments to funcs, args returned from funcs, etc.
; The .com file does not contain these, so their initial values cannot
; be guaranteed.  That is no problem because every one of them is
; written before it is read back.

segment .bss

buffers             resw 1      ; segment of the block returned by int 21h/48h

temp_16             resw 1      ; not referenced by name in this listing
temp_32             resd 1      ; scratch slot for fistp -> integer register

sg_x                resd 1      ; sphere generator: current x (float)
sg_y                resd 1      ; sphere generator: current y (float)

; main_x .. main_scale must stay contiguous and in this order: they are
; filled through [bx], [bx+4], [bx+8], [bx+12] and walked four bytes at
; a time by the blend loop.
main_x              resd 1
main_y              resd 1
main_z              resd 1
main_scale          resd 1
main_biasedz        resd 1      ; main_z + 10
main_tt             resd 1      ; per-sprite time value

; Second x/y/z/scale record (filled by knot_func_0); same layout rule.
main_x2             resd 1
main_y2             resd 1
main_z2             resd 1
main_scale2         resd 1

main_ra             resd 1      ; rotation angle for the (y, z) pair
main_rb             resd 1      ; rotation angle for the (x, z) pair
main_rc             resd 1      ; rotation angle for the (x, y) pair
main_nx             resd 1      ; not referenced by name in this listing
main_ny             resd 1      ; not referenced by name in this listing
main_nz             resd 1      ; not referenced by name in this listing

main_screen_x       resd 1      ; sprite left edge, 32-bit integer
main_screen_y       resd 1      ; sprite top edge, 32-bit integer
main_sprite_size    resd 1      ; sprite scale factor (float)
main_dboffset       resd 1      ; not referenced by name in this listing

; Arguments and working values of put_sprite_depth_scale.
psds_newsize        resw 1      ; scaled sprite edge in pixels
psds_idx            resd 1      ; 16.16 source step per destination pixel
psds_offset         resw 1      ; argument: top-left pixel index
psds_yoscale        resd 1      ; current source row (float)
psds_ooscale        resd 1      ; 1 / psds_scale
psds_depth          resd 1      ; argument: sprite depth (integer)
psds_scale          resd 1      ; argument: scale (float)

rand_seed           resd 1      ; get_rand state, zeroed at the start of each frame

; ---------------------------------------------------------------------------