; Raspberry Casket Player V2.x (24-Aug-2023)
; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; Provided by Chris 'platon42' Hodges <chrisly@platon42.de>
; Latest: https://git.platon42.de/chrisly42/PretrackerRaspberryCasket
; Rewritten by platon42/Desire based on a resourced, binary identical
; version of the original Pretracker V1.0 replayer binary provided
; by hitchhikr (thanks!), originally written in C by Pink/Abyss.
; This version is the hard work of reverse engineering all the
; offsets, removing all the C compiler crud, removing dead and
; surplus code (maybe artefacts from earlier ideas that did nothing),
; optimizing the code where possible. This resulted in both reduced
; size of the replayer, faster sample calculation and speeding the
; tick routine up significantly.
; Bugs from the original replayer were fixed.
; I also added a few optional features that come in handy, such as
; song-end detection and precalc progress support.
; Also: Open source. It's 2023, keeping the code closed is just not
; part of the demoscene spirit (anymore?), at least for a replayer.
; This player is still being optimized and worked on since its
; first release in late 2022.
; Verification
; ~~~~~~~~~~~~
; The first versions of the replayer had been verified against about
; 60 Pretracker tunes to create an identical internal state for each tick
; and identical samples (if certain optimization switches are disabled).
; During the process this identical state and identical samples promise
; had to be dropped due to bugs in the original player and optimizations.
; This is especially the case for the track delay feature of Pretracker
; that could in some cases cause odd behaviour and unwanted muting that
; has been fixed in Raspberry Casket. So the verification is now heavily
; reduced to about 20 songs that still are identical.
; I do, however, now also have an emulated Paula output verification
; that compares the generated sound between the original code and
; Raspberry Casket. Divergences are manually checked from time to time.
; I might have introduced bugs though. If you find some problems,
; please let me know under chrisly@platon42.de. Thank you.
; Usage
; ~~~~~
; The new replayer comes as a drop-in binary replacement if you wish.
; In this case you will get faster sample generation (about 12%
; faster on 68000) and about 45% less CPU time spent. However, you
; won't get features such as song-end detection and precalc progress this way.
; This mode uses the old CPU DMA wait that takes away 8 raster lines.
; If you want to get rid of the unnecessary waiting, you can switch
; to a copper driven audio control. If you want to use the top portion
; of the copperlist for this, you probably need to double buffer it.
; Otherwise, you could also position the copperlist at the end of
; the display and use single buffering if you call the tick routine
; during the vertical blank.
; Please use the documented sizes for the MySong and MyPlayer data
; structures, which are the symbols sv_SIZEOF and pv_SIZEOF
; respectively (about 2K and 12K with volume table).
; The source needs two common include files to compile (custom.i and
; dmabits.i). You should leave assembler optimizations enabled.
; (0. If you're using copper list mode, call pre_PrepareCopperlist.)
; 1. Call pre_SongInit with
;    - a pointer to MySong (sv_SIZEOF) in a1 and
;    - the music data in a2.
;    It will return the amount of sample memory needed in d0.
; 2. Then call pre_PlayerInit with
;    - a pointer to MyPlayer (pv_SIZEOF) in a0
;    - a pointer to chip memory sample buffer in a1
;    - the pointer to MySong in a2
;    - a pointer to a longword for progress information or null in a3
;    This will create the samples, too.
; 3. After that, regularly call pre_PlayerTick with MyPlayer in a0
;    (and the copperlist in a1 if you're using copper list mode).
; Size
; ~~~~
; The original C compiled code was... just bad. The new binary is
; less than 1/3rd of the original one.
; The code has been also optimized in a way that it compresses better.
; The original code compressed with Blueberry's Shrinkler goes from
; 18052 bytes down to 9023 bytes.
; Raspberry Casket, depending on the features compiled in, is about
; 5716 bytes and shrinkles down to ~4071 bytes (in isolation).
; So this means that the optimization is not just "on the outside".
; About 2.4 KB of the code (and data) are spent for the sample generation,
; the remaining code for playback.
; Timing
; ~~~~~~
; 1. Sample precalculation
; Sample generation is faster than the original 1.0 player and also
; faster than the 1.5 player, which got a slightly better performance
; than the 1.0 one (compiler change?).
; According to my measurements on my set of Pretracker tunes,
; Raspberry Casket needs between 10% to 20% less instructions.
; Of these instructions, about 5% are `muls` operations and the new
; player is only able to shave off between 3% and 8% of those,
; so this is probably the limiting factor.
; 2. Playback
; Raspberry Casket is about twice as fast as the old replayer for playback.
; Unfortunately, the replayer is still pretty slow and has high
; jitter compared to other standard music replayers.
; This means it may take up to 32 raster lines (13-18 on average)
; which is significantly more than a standard Protracker replayer
; (the original one could take about 60 raster lines worst case and
; about 34 on average!).
; Watch out for Presto, the LightSpeedPlayer variant that should
; solve this problem.
; Changelog see https://git.platon42.de/chrisly42/PretrackerRaspberryCasket#Changelog

; Here come the various options for you to configure.
; To create an optimized drop-in replacement for the old V1.0 binary:

; This player was based on the V1.0 binary of the pretracker replayer
; and thus can only play files created with versions up to V1.0.
; Subsongs and sound effects are not supported. There is usually no
; use for this in an intro anyway.
; Enabling this switch will patch the song runtime to make it backward
; compatible with this replayer. Song data will be modified in memory.
; I would encourage you to still use V1.0 as the files saved with
; Pretracker V1.5 are unnecessarily bigger (about 458 bytes) and might
; lose some instrument names it seems.

; The original binary had a lot of extra checks that I would consider
; paranoia. For example it handled songs without instruments, waves
; or patterns.
; It contains code that I think is unreachable and a relic of
; being used for the actual tracker itself (releasing notes after
; a delay, looping patterns instead of advancing song).
; Or for combinations of song stopping (F00) and note delay in the
; same step, which no sane musician would use.
; This switch gets rid of the code. If you find a song that doesn't
; work with this disabled, please let me know.

; There is some pitch shifting applied during track delay by
; a maximum of one period -- depending on the delay value.
; I guess this might add some phasing to the sound (which would be
; only audible in a mono mix, I would think), but it comes with an
; extra multiplication per channel.
; Moreover, I traced down that this may reduce the Amiga period
; value lower than the minimum allowed value of 124. This may cause
; distortions (like playing a note in Protracker with pitch B-3)
; on the delayed track.
; If you really want (to risk) that, enable this switch.

; The binary comes with a jump table that has three official entry
; points. As you have the source, you could just bsr to the three
; functions (pre_SongInit, pre_PlayerInit, pre_PlayerTick) directly.
; If you want a drop-in (binary) replacement instead, enable this
; switch to get the jump table back.

; The original Pretracker replayer does not come with a song-end
; detection. If you want to have that (e.g. for a music disk), enable
; this switch and check for pv_songend_detected_b (relative to the
; MyPlayer structure) which goes to true when song-end is reached.

; Do you want to have information on the sample generation progress
; during the call to pre_PlayerInit? Then enable this and call
; pre_PlayerInit with a pointer to a longword in a3.
; Please make sure yourself that the initial value is zero.
; It will be incremented by the number of samples (in bytes)
; for each waveform done. You can get the total number of samples
; from the (previously undocumented) return value of pre_SongInit.

; Use slightly faster and smaller code for sample generation that
; might be off by 1 (rounding error) compared to the original code.

; Use tables for volume calculation instead of multiplication.
; Is slightly faster on 68000 (about 11%), but probably has no
; benefit for 68020+ and needs about 3.5 KB more RAM (MyPlayer)
; and 40 bytes of code.

; I found some obvious bugs in the code. This switch enables bugfixes,
; but the sound might not be exactly the same as in the tracker /
; original player (e.g. there is a bug in the volume envelope that
; will cause a pause in the decay curve that is not supposed to
; happen).

; You want to take care of what registers may be trashed by these
; routines because you have your own ABI? Good!
; Setting this to 0 will remove the register saving/restoring on
; the stack. All registers then may be trashed.
; Otherwise, d2-d7/a2-a6 are preserved as with the AmigaOS standard.

; Enable output to copperlist instead of audio registers and DMA wait.
; This gives you slightly more CPU time and less jitter (maybe 7
; rasterlines).
; When enabled, provide a pointer to a copperlist in a1 on calling
; pre_PlayerTick. It will update the necessary fields in the
; copperlist. To initially generate a copperlist (or two), use the
; pre_PrepareCopperlist subroutine. It will write out 37 copper
; commands (WAITs and MOVEs) starting for a given rasterline (if
; you're below rasterline 255, make sure you have the necessary
; wait for this case yourself!).
; There are two major reasonable ways to use this:
; - Your intro does no fancy copper shenanigans:
;   You can reserve the space of 37 longwords inside your normal
;   copperlist. The position should be after rasterline ~35.
; - You do a lot of copper stuff during the frame:
;   Create the pretracker copperlist to be at the very end of the
;   frame (rasterline 300 is a good spot). Make sure your custom
;   copper code has a static jump to the 37 copper instructions
;   at its end and terminate the copperlist correctly (hint:
;   move.l d1,(a0) after bsr pre_PrepareCopperlist will terminate
;   the copperlist with $fffffffe).
PRETRACKER_COPPER_OUTPUT = 0 ; 0 = standard CPU wait, 1 = Copperlist


        include "raspberry_casket.i"
        include "hardware/custom.i"
        include "hardware/dmabits.i"

; Code starts here

; Jump table for the drop-in (binary) replacement: three longword offsets,
; each relative to pre_FuncTable, in the order pre_SongInit, pre_PlayerInit,
; pre_PlayerTick.
; NOTE(review): the pre_FuncTable label itself is not visible in this part
; of the file -- confirm it sits directly in front of the first dc.l.
        dc.l    pre_SongInit-pre_FuncTable
        dc.l    pre_PlayerInit-pre_FuncTable
        dc.l    pre_PlayerTick-pre_FuncTable
        ;dc.b    '$VER: Raspberry Casket 1.1',0

; pre_PrepareCopperlist - initialize copperlist for replaying
; a0.l = copperlist (37 longwords for 4 channels, 5+8*NUM_CHANNELS)
; d0.w = rasterline (<239 or >=256)
; out: a0 = copperlist ptr after init
;      d1 = $fffffffe (so a following move.l d1,(a0) writes the end marker)
; d0 is modified, d2-d3/d6/d7 are saved and restored on the stack.
;
; Longwords written, in order:
;   1                 wait entry for the given rasterline
;   1                 dmacon <- DMAF_AUDIO (SETCLR bit not set)
;   1                 wait entry 5 rasterlines later
;   5*NUM_CHANNELS    audio register / data pairs (data = 0)
;   1                 dmacon <- DMAF_SETCLR (no channel bits yet)
;   1                 wait entry another 5 rasterlines later
;   3*NUM_CHANNELS    audio register / data pairs (data = 0)
;
; NOTE(review): the entry label, the .chloop2 target used by the last dbra
; and the final return are not visible in this listing -- confirm against
; the complete file.
        movem.l d2-d3/d6/d7,-(sp)
        moveq.l #-2,d1                  ; low word $fffe = second word of each wait entry
        lsl.w   #8,d0                   ; rasterline into the upper byte ...
        move.b  #$07,d0                 ; ... and $07 into the lower byte
        move.w  d0,(a0)+
        move.w  d1,(a0)+

        move.l  #(dmacon<<16)|DMAF_AUDIO,(a0)+

        add.w   #$500,d0                ; next wait is 5 rasterlines further down
        move.w  d0,(a0)+
        move.w  d1,(a0)+

        ; writing 5 entries per channel (5*4 = 20 longwords for 4 channels):
        ; five consecutive word registers starting at ac_ptr, data word 0 each
        move.w  #aud0+ac_ptr,d2
        moveq.l #0,d3                   ; placeholder data word
        moveq.l #NUM_CHANNELS-1,d7
.chloop moveq.l #5-1,d6
.dloop  move.w  d2,(a0)+                ; register offset
        move.w  d3,(a0)+                ; data (0 for now)
        addq.w  #2,d2                   ; next word register
        dbra    d6,.dloop
        addq.w  #ac_SIZEOF-ac_dat,d2    ; skip the rest of this channel's register block
        dbra    d7,.chloop

        move.l  #(dmacon<<16)|DMAF_SETCLR,(a0)+

        add.w   #$500,d0                ; again 5 rasterlines later
        move.w  d0,(a0)+
        move.w  d1,(a0)+

        ; writing 3 entries per channel (3*4 = 12 longwords for 4 channels):
        ; three consecutive word registers starting at ac_ptr, data word 0 each
        move.w  #aud0+ac_ptr,d2
        moveq.l #NUM_CHANNELS-1,d7
        moveq.l #3-1,d6
.dloop2 move.w  d2,(a0)+                ; register offset
        move.w  d3,(a0)+                ; data (0 for now)
        addq.w  #2,d2
        dbra    d6,.dloop2
        add.w   #ac_SIZEOF-ac_per,d2    ; advance to the next channel's ac_ptr
        dbra    d7,.chloop2
        movem.l (sp)+,d2-d3/d6/d7


; SongInit - initialize data structure belonging to a song
; In:
; - a0: MyPlayer structure (unused)
; - a1: MySong structure (must be sv_SIZEOF bytes!)
; - a2: Pretracker song data
; Out:
; - d0: chipmemory (bytes) required for samples or 0 on error
; Registers d2-d7/a2-a5 are saved and restored on the stack.
; a0 is overwritten and a1 is advanced while the wave tables are filled,
; so neither holds its input value afterwards.
; With the V1.5 patch path the song data itself is modified ($003c(a2)).
;
; NOTE(review): the local branch targets referenced below (.error,
; .nopatchv15, .fillcount, .pattableloop, .instinfoloop, ...), any
; conditional assembly directives and the final return are not visible in
; this listing. The comments describe the instructions as they appear;
; confirm label positions against the complete file.
        movem.l d2-d7/a2-a5,-(sp)
        moveq.l #0,d0
        movem.l (a2),d1/d3-d6                      ; song offsets $0000/$0004/$0008/$000c/$0010
        move.b  d1,d2                           ; d2.b = lowest byte of the first longword (file version)
        move.b  d0,d1                           ; clear that byte for the magic compare
        cmp.l   #$50525400,d1                   ; "PRT" magic ($50,$52,$54) followed by a zero byte
        bne     .error
        moveq.l #MAX_INSTRUMENTS-1,d7           ; notice there's one extra name (available in 1.5, but not usable)!
        cmp.b   #$1e,d2
        bgt     .error
        bne.s   .nopatchv15
        move.l  $005c(a2),d0                    ; make song backward compatible
        ror.w   #8,d0                           ; swap the two lower bytes
        move.l  d0,$003c(a2)
        addq.l  #8,d3                           ; skip over first pattern data (offset $0004)
        moveq.l #2*MAX_INSTRUMENTS-1,d7         ; v1.5 has 32 slots (the other ones used for sfx)
        cmp.b   #$1b,d2
        bgt     .error

        ; clear the whole MySong structure
        move.l  a1,a0
        move.w  #sv_SIZEOF,d0
        bsr     pre_MemClr

        add.l   a2,d3                           ; add to offset $0004
        move.l  d3,sv_pos_data_adr(a1)          ; address to position data (POSD)
        lea     (a2,d4.l),a4                    ; add to offset $0008

        moveq.l #0,d0
        moveq.l #0,d4
        lea     $003c(a2),a0
        move.b  (a0)+,sv_pat_restart_pos_w+1(a1)    ; $003c song restart pos
        move.b  (a0)+,d1                            ; $003d number of patterns
        move.b  (a0)+,sv_pat_pos_len_w+1(a1)        ; $003e songlength in pattern positions
        move.b  (a0)+,d0                            ; $003f number of steps!
        move.b  (a0)+,d4                            ; $0040 number of instruments
        move.b  (a0)+,sv_num_waves_b(a1)            ; $0041 number of waves

        move.b  d0,sv_num_steps_b(a1)

        ; default wave generation order: written backwards from the table end,
        ; so entry i holds i; a3 ends up at the start of the table
        lea     sv_wavegen_order_table+MAX_WAVES(a1),a3
        moveq.l #MAX_WAVES-1,d3                 ; fill 24 bytes with default order of waves?
        move.b  d3,-(a3)
        dbra    d3,.fillcount

        cmp.b   #$19,d2                         ; only versions above $19 carry an ordering table
        bls.s   .hasnowaveordering

        moveq.l #MAX_WAVES-1,d3
        move.b  (a0)+,(a3)+                     ; $0042 wave generation ordering
        dbra    d3,.waveorderloop


        ; pattern table: one start address per pattern (d1 = number of patterns)
        lea     sv_pattern_table(a1),a0
        move.l  a4,(a0)+
        add.w   d0,a4                           ; *3 bytes per pattern line
        add.w   d0,a4
        add.w   d0,a4
        subq.b  #1,d1
        bne.s   .pattableloop

        ; skip the instrument names (d7+1 names, each zero terminated or 23 chars)
        lea     (a2,d5.l),a0                    ; offset (from $000c) into instrument names
        moveq.l #23-1,d0                        ; max 23 chars
        tst.b   (a0)+
        dbeq    d0,.inststrloop
        dbra    d7,.instrnamesloop

        move.l  d4,d0                           ; number of instruments
        beq.s   .noinstsskip
        lsl.w   #3,d0
        add.l   a0,d0                           ; skip 8 bytes of info per instrument (ININ)
        lea     sv_inst_patterns_table(a1),a3
        lea     sv_inst_infos_table(a1),a4
        cmp.w   #MAX_INSTRUMENTS,d4             ; clamp instrument count to MAX_INSTRUMENTS
        ble.s   .notruncto32
        moveq.l #MAX_INSTRUMENTS,d4
        subq.w  #1,d4                           ; dbra counter
        move.l  d0,(a3)+                        ; start address of this instrument's pattern

        ; the raw instrument info bytes are translated through lookup tables
        ; addressed relative to pre_vib_delay_table (a5)
        moveq.l #0,d1
        move.b  (a0)+,d1                        ; ii_vibrato_delay
        lea     pre_vib_delay_table(pc),a5
        move.b  (a5,d1.w),d1
        addq.w  #1,d1
        move.w  d1,uii_vibrato_delay(a4)

        moveq.l #0,d1
        move.b  (a0)+,d1                        ; ii_vibrato_depth
        move.b  pre_vib_depth_table-pre_vib_delay_table(a5,d1.w),uii_vibrato_depth+1(a4)

        move.b  (a0)+,d1                        ; ii_vibrato_speed
        move.b  pre_vib_speed_table-pre_vib_delay_table(a5,d1.w),d1
        muls    uii_vibrato_depth(a4),d1        ; bake in this strange vibrato stuff
        asr.w   #4,d1
        move.w  d1,uii_vibrato_speed(a4)

        moveq.l #0,d1
        move.b  (a0)+,d1                        ; ii_adsr_attack
        add.w   d1,d1                           ; word table index
        move.w  pre_fast_roll_off_16-pre_vib_delay_table(a5,d1.w),d1
        move.w  d1,uii_adsr_attack(a4)

        moveq.l #0,d1
        move.b  (a0)+,d1                        ; ii_adsr_decay
        move.b  pre_ramp_up_16-pre_vib_delay_table(a5,d1.w),uii_adsr_decay+1(a4)

        move.b  (a0)+,d1                        ; ii_adsr_sustain
        ; what is this? a patch?
        ; a sustain value of 15 is replaced by 16 before scaling by 64
        cmp.b   #15,d1
        bne.s   .dont_patch_sustain
        moveq.l #16,d1
        lsl.w   #6,d1
        move.w  d1,uii_adsr_sustain(a4)

        moveq.l #0,d1
        move.b  (a0)+,d1                        ; ii_adsr_release
        move.b  pre_ramp_up_16-pre_vib_delay_table(a5,d1.w),uii_adsr_release(a4)

        move.b  (a0)+,d1                        ; ii_pattern_steps
        move.b  d1,uii_pattern_steps(a4)
        add.l   d1,d0                           ; 3 bytes per instrument pattern step
        add.l   d1,d0
        add.l   d1,d0                           ; calc next start address
        lea     uii_SIZEOF(a4),a4
        dbra    d4,.instinfoloop

        ; skip the wave names (MAX_WAVES names, each zero terminated or 23 chars)
        lea     (a2,d6.l),a0                    ; offset (from $0010) into wave names
        moveq.l #MAX_WAVES-1,d7
        moveq.l #23-1,d0                        ; max 23 chars
        tst.b   (a0)+
        dbeq    d0,.wavestrloop
        dbra    d7,.wavenamesloop

        move.l  a0,d0
        lsr.w   #1,d0                           ; carry = lowest address bit
        bcc.s   .addressiseven
        addq.l  #1,a0                           ; make address even
        move.l  a0,sv_waveinfo_ptr(a1)

        moveq.l #2,d0                           ; at least empty sample
        moveq.l #0,d7
        move.b  sv_num_waves_b(a1),d7           ; number of waves (zero: nothing more to do)
        beq.s   .hasnoinstruments

        ; per wave: store length, total length and waveinfo pointer, sum up d0.
        ; a1 is used as running pointer from here on; the table offsets are
        ; taken relative to sv_waveinfo_table.
        ; NOTE(review): presumably sv_waveinfo_table is the first field of
        ; MySong, as a1 is not adjusted beforehand -- confirm.
        move.l  sv_waveinfo_ptr(a1),a3
        subq.w  #1,d7
        moveq.l #0,d1
        move.b  wi_sam_len_b(a3),d1
        addq.w  #1,d1
        lsl.w   #7,d1                           ; (wi_sam_len_b + 1) * 128
        move.l  d1,sv_wavelength_table-sv_waveinfo_table(a1)
        btst    #2,wi_flags_b(a3)               ; bit clear: branch, skipping the scaling below
        beq.s   .onlythreeocts
        mulu    #15,d1
        lsr.l   #3,d1           ; * 1.875
        move.l  d1,sv_wavetotal_table-sv_waveinfo_table(a1)
        move.l  a3,(a1)+                        ; waveinfo pointer for this wave
        add.l   d1,d0
        lea     wi_SIZEOF(a3),a3
        dbra    d7,.wavetableloop
        ; d0 will contain the size of the samples
        movem.l (sp)+,d2-d7/a2-a5

; PlayerInit - initialize player and calculate samples
; In:
; - a0: MyPlayer (must have size of pv_SIZEOF, will be initialized)
; - a1: sample buffer
; - a2: MySong (must have been filled with SongInit before!)
; - a3: pointer to a longword for the progress of samples bytes generated (or null)
; Registers d2-d7/a2-a6 are saved and restored on the stack.
; Steps: clear MyPlayer, compute per-wave sample start pointers, build the
; interpolated period table, set the initial pattern/channel state, call
; pre_WaveGen and fill the volume table.
;
; NOTE(review): the loop labels referenced below (.samplestartsloop,
; .periodtableloop, .perfineipolloop, .chaninitloop2, .vol_innerloop,
; .vol_outerloop), the targets .hasnosamplebuffer / .earlyexit and the
; final return are not visible in this listing -- confirm their positions
; against the complete file.
        movem.l d2-d7/a2-a6,-(sp)
        move.l  a0,a4
        move.l  a2,a6

        move.w  #pv_SIZEOF,d0
        bsr     pre_MemClr     ; keeps a1 unchanged!

        move.l  a6,pv_my_song(a4)

; ----------------------------------------
; proposed register assignment:
; a0 = sample output / scratch
; a1 = scratch
; a3 = waveinfo
; a4 = MyPlayer
; a6 = MySong

        move.l  a3,pv_precalc_progress_ptr(a4)

        move.l  a1,pv_sample_buffer_ptr(a4)     ; also sets the flags for the null check
        beq.s   .hasnosamplebuffer  ; PARANOIA

        moveq.l #0,d7
        move.w  d7,(a1)+               ; empty sample
        move.b  sv_num_waves_b(a6),d7
        move.w  d7,pv_wg_wave_counter_w(a4)
        beq.s   .hasnosamplebuffer  ; PARANOIA

        ; sample start pointers: each wave follows the previous one,
        ; advanced by the total length from sv_wavetotal_table
        lea     pv_wave_sample_table(a4),a0
        lea     sv_wavetotal_table(a6),a3
        subq.w  #1,d7
        move.l  a1,(a0)+               ; write sample start pointer
        adda.l  (a3)+,a1
        dbra    d7,.samplestartsloop

; ----------------------------------------

        lea     pre_delta_period_table(pc),a0
        lea     pv_period_table(a4),a1

        ; fill the missing entries in the period table by interpolating
        ; table format as read here: one word (first period), then one byte
        ; per note with the difference to the next note's period.
        ; 16 entries are written per note, each 1/16 of the difference lower
        ; (16.16 fixed point in d0, step in d1).
        move.w  (a0)+,d2                ; first period
        moveq.l #3*NOTES_IN_OCTAVE-1,d7
        moveq.l #0,d0
        moveq.l #0,d1
        move.b  (a0)+,d1                ; difference to the next note
        move.w  d2,d0                   ; d0 = period of this note
        sub.w   d1,d2                   ; d2 = period of the next note
        swap    d1
        lsr.l   #4,d1                   ; d1 = difference / 16 as 16.16 fixed point

        moveq.l #16-1,d6
        move.w  d0,(a1)+                ; integer part of the interpolated period
        swap    d0
        sub.l   d1,d0
        swap    d0
        dbra    d6,.perfineipolloop
        dbra    d7,.periodtableloop

; ----------------------------------------

        ; NOTE(review): the bytes written are $00,$ff,$ff,$06 -- the last one is
        ; $06, which does not match "speed_even = 0" below; confirm field order.
        move.l  #$00ffff06,pv_pat_curr_row_b(a4)    ; pattern frame = 0, line = $ff, pattern pos = $ff, speed_even = 0
        move.l  #$06060100,pv_pat_speed_odd_b(a4)   ; and pv_pat_line_ticks_b, pv_pat_stopped_b, pv_songend_detected_b
        addq.w  #2,pv_stop_len_lof(a4)              ; structure was cleared above, so this yields 2
        move.w  #$007b,pv_stop_per_vol_trg(a4)

        ; per channel defaults; d0 counts the channel number
        lea     pv_channeldata(a4),a0
        moveq.l #NUM_CHANNELS-1,d7
        moveq.l #0,d0
        move.b  #MAX_VOLUME,pcd_pat_vol_b(a0)
        st      pcd_track_delay_offset_b(a0)
        move.l  sv_waveinfo_ptr(a6),pcd_waveinfo_ptr(a0)    ; we should actually have no wave selected
        addq.w  #3,pcd_adsr_phase_w(a0)             ; cleared field + 3 = phase 3

        ; copy the three "stop" longwords into the channel's output block
        lea     pv_sample_buffer_ptr(a4),a1
        lea     pcd_out_base(a0),a2
        move.l  (a1)+,(a2)+     ; pv_sample_buffer_ptr -> pcd_out_ptr_l
        move.l  (a1)+,(a2)+     ; pv_stop_len_lof      -> pcd_out_len_w / pcd_out_lof_w
        move.l  (a1)+,(a2)+     ; pv_stop_per_vol_trg  -> pcd_out_per_w / pcd_out_vol_b / pcd_out_trg_b

        move.b  d0,pcd_channel_num_b(a0)
        bset    d0,pcd_channel_mask_b(a0)           ; mask = 1 << channel number
        addq.b  #1,d0
        lea     pcd_SIZEOF(a0),a0
        dbra    d7,.chaninitloop2

; ----------------------------------------

        bset    #1,$BFE001          ; filter off

        tst.b   sv_num_waves_b(a6)
        beq     .earlyexit          ; PARANOIA

        lea     sv_wavegen_order_table(a6),a1
        bsr.s   pre_WaveGen

; ----------------------------------------
        ; volume table: for each volume d0 = 0..MAX_VOLUME, MAX_VOLUME*2 bytes
        ; holding (d0 * index) >> 6; d1 is the running product
        lea     pv_volume_table(a4),a0
        moveq.l #(MAX_VOLUME+1)-1,d7
        moveq.l #0,d0
        moveq.l #MAX_VOLUME*2-1,d6
        moveq.l #0,d1
        move.w  d1,d2
        lsr.w   #6,d2
        move.b  d2,(a0)+
        add.w   d0,d1
        dbra    d6,.vol_innerloop
        addq.w  #1,d0
        dbra    d7,.vol_outerloop

; ----------------------------------------
        movem.l (sp)+,d2-d7/a2-a6

        include "raspberry_casket_wavegen.asm"

; PlayerTick - Play one frame of music (called every VBL)
; In:
; - a0: MyPlayer
; - a1: copperlist (if enabled)
        movem.l d2-d7/a2-a6,-(sp)
        move.l  a0,a4
        move.l  a1,pv_copperlist_ptr(a4)
        movea.l pv_my_song(a4),a6
        tst.b   pv_pat_stopped_b(a4)
        beq     .inst_pattern_processing    ; don't process if music has been stopped

; ----------------------------------------
; processes the current pattern position
; registers used:
; d0: pitch shift (lower part)
; d1: scratch
; d2: effect cmd
; d3: pitch_ctrl
; d4: inst number
; d5: effect cmd
; d6: unused (flag later)
; d7: pitch
; a0: pattern data pointer
; a1: short-term scratch
; a2: unused
; a3: unused
; a4: pv
; a5: channel struct
; a6: mysong struct
        lea     pv_channeldata(a4),a5   ; start with first channel
        ; I think this is something leftover from the tracker itself.
        ; Nothing sets pcd_pat_adsr_rel_delay_b from inside the player.
        ; It is used as a counter to release a note (ADSR) after a given time.
        ; It's not the same as the instrument ADSR release (see pcd_note_off_delay_b)
        tst.b   pcd_pat_adsr_rel_delay_b(a5)
        ble.s   .handle_2nd_instrument
        subq.b  #1,pcd_pat_adsr_rel_delay_b(a5)
        bne.s   .handle_2nd_instrument

        move.w  pcd_adsr_volume_w(a5),d3
        lsr.w   #6,d3
        move.w  d3,pcd_adsr_vol64_w(a5)
        moveq.l #16,d4
        move.w  d4,pcd_adsr_pos_w(a5)
        sub.w   d3,d4
        lsr.w   #1,d4
        add.b   pcd_adsr_release_b(a5),d4
        move.b  d4,pcd_adsr_phase_speed_b(a5)
        move.w  #3,pcd_adsr_phase_w(a5)

; ----------------------------------------
        moveq.l #0,d1
        move.b  pcd_pat_2nd_inst_num4_b(a5),d1
        beq.s   .handle_current_instrument

        tst.b   pcd_pat_2nd_inst_delay_b(a5)
        beq.s   .trigger_2nd_instrument
        subq.b  #1,pcd_pat_2nd_inst_delay_b(a5)
        bra.s   .handle_current_instrument

        move.b  d1,pcd_new_inst_num_b(a5)
        move.w  d1,pcd_inst_num4_w(a5)
        add.w   d1,d1
        add.w   d1,d1
        lea     sv_inst_infos_table-uii_SIZEOF(a6),a1
        add.w   d1,a1
        move.l  a1,pcd_inst_info_ptr(a5)    ; loads 2nd instrument
        move.b  uii_pattern_steps(a1),pcd_inst_pattern_steps_b(a5)

        moveq.l #0,d1
        move.l  a5,a1
        move.l  d1,(a1)+    ; pcd_pat_portamento_dest_w and pcd_pat_pitch_slide_w
        move.l  d1,(a1)+    ; pcd_pat_vol_ramp_speed_b, pcd_pat_2nd_inst_num4_b, pcd_pat_2nd_inst_delay_b, pcd_wave_offset_b
        move.l  d1,(a1)+    ; pcd_inst_pitch_slide_w and pcd_inst_sel_arp_note_w
        move.w  d1,(a1)+    ; pcd_inst_note_pitch_w
        addq.l  #2,a1

        move.l  d1,(a1)+    ; pcd_inst_line_ticks_b, pcd_inst_pitch_pinned_b, pcd_inst_vol_slide_b, pcd_inst_step_pos_b

        subq.b  #1,d1
        move.w  d1,(a1)+    ; pcd_inst_wave_num4_w

        move.l  #$ff010010,(a1)+    ; pcd_track_delay_offset_b, pcd_inst_speed_stop_b, pcd_inst_pitch_w
        move.l  #(MAX_VOLUME<<16)|(MAX_VOLUME<<8)|MAX_VOLUME,(a1)+  ; pcd_inst_vol_w / pcd_loaded_inst_vol_b / pcd_pat_vol_b

        bra.s   .continue_with_inst

; ----------------------------------------

; ----------------------------------------
; handle portamento
        move.w  pcd_pat_portamento_dest_w(a5),d3
        beq.s   .no_portamento_active
        move.w  pcd_inst_curr_port_pitch_w(a5),d2
        moveq.l #0,d1
        move.b  pcd_pat_portamento_speed_b(a5),d1
        cmp.w   d3,d2
        bge.s   .do_portamento_down
        add.w   d1,d2
        cmp.w   d3,d2
        bgt.s   .portamento_note_reached
        bra.s   .update_portamento_value

        sub.w   d1,d2
        cmp.w   d3,d2
        bge.s   .update_portamento_value

        clr.w   pcd_pat_portamento_dest_w(a5)
        move.w  d3,d2
        move.w  d2,pcd_inst_curr_port_pitch_w(a5)

; ----------------------------------------
; handle volume ramping
        move.b  pcd_pat_vol_ramp_speed_b(a5),d1
        beq.s   .no_vol_ramping_active
        add.b   pcd_pat_vol_b(a5),d1
        bpl.s   .noclip_pat_vol_min
        moveq.l #0,d1
        cmp.b   #MAX_VOLUME,d1
        ble.s   .noclip_pat_vol_max
        moveq.l #MAX_VOLUME,d1
        move.b  d1,pcd_pat_vol_b(a5)

; ----------------------------------------
; enters with channel number in d0

        ; handle delayed note and note off first
        moveq.l #0,d4
        move.b  pcd_note_delay_b(a5),d4
        blt     .pat_play_cont
        beq.s   .no_note_delay
        subq.b  #1,d4
        beq.s   .note_delay_end_reached

        move.b  d4,pcd_note_delay_b(a5)     ; note still delayed
        bra     .pat_play_cont              ; I believe that with activated track delay, we must jump here

        st       pcd_note_delay_b(a5)       ; release note delay
        moveq.l #0,d5
        move.b  pcd_channel_num_b(a5),d5
        move.w  pv_curr_pat_pos_w(a4),d2
        add.w   d2,d2
        add.w   d2,d2          ; *4
        add.w   d5,d2
        add.w   d2,d2          ; 8*pos+2*chan
        movea.l sv_pos_data_adr(a6),a1
        adda.w  d2,a1
        ;move.l  a1,d2
        ;cmpa.w  #0,a1
        ;beq     .pat_play_other ; this is probably never happening!
        moveq.l #0,d2
        move.b  pv_pat_curr_row_b(a4),d2
        cmp.b   sv_num_steps_b(a6),d2
        bcc     .pat_play_cont

        move.b  ppd_pat_num(a1),d5
        beq     .pat_play_cont
        add.w   d5,d5
        add.w   d5,d5
        add.w   #sv_pattern_table,d5
        move.l  -4(a6,d5.w),a0
        move.l  a0,d5   ; move to data register due to cc's
        beq     .pat_play_cont

        move.b  ppd_pat_shift(a1),d0        ; pattern pitch shift (signed)
        ext.w   d0

        add.w   d2,a0
        add.w   d2,d2
        add.w   d2,a0                       ; pattern data

        move.b  pdb_inst_effect(a0),d4      ; instrument and command byte
        moveq.l #15,d2
        and.w   d4,d2
        lsr.w   #4,d4                       ; instrument nr bits 0-4

        moveq.l #0,d5
        move.b  pdb_effect_data(a0),d5

        cmp.b   #$e,d2
        bne.s   .pat_exy_cmd_cont
        ; handle $exy commands
        tst.b   pcd_note_delay_b(a5)
        bne.s   .pat_exy_cmd_cont           ; ignore if already running note delay
        move.l  d5,d3
        moveq.l #15,d1
        and.w   d3,d1
        lsr.w   #4,d3
        sub.w   #$d,d3
        bne.s   .pat_is_not_ed_cmd
        ; note delay in x sub steps
        IFNE    PRETRACKER_PARANOIA_MODE    ; who does this kind of stuff?
        tst.b   pv_pat_speed_even_b(a4)
        beq.s   .pat_exy_cmd_cont
        move.b  d1,pcd_note_delay_b(a5)
        bra     .pat_play_cont             ; I believe that with activated track delay, we must jump here

        addq.b  #$d-$a,d3
        bne.s   .pat_exy_cmd_cont
        ; note off in x sub steps
        move.b  d1,pcd_note_off_delay_b(a5)

        st      pcd_note_delay_b(a5)

; ----------------------------------------
; read out pattern editor data

        moveq.l #0,d6                   ; clear arp flag
        move.b  d6,pcd_pat_vol_ramp_speed_b(a5)
        move.w  d6,pcd_pat_pitch_slide_w(a5)
        move.b  pdb_pitch_ctrl(a0),d3   ; pitch and control byte
        bpl.s   .noselinst16plus
        add.w   #16,d4                  ; add high bit of instrument number
        moveq.l #$3f,d7
        and.w   d3,d7                   ; pitch
        add.w   d4,d4
        add.w   d4,d4
        beq.s   .no_new_note            ; if no instrument
        tst.w   d7
        bne.s   .no_new_note            ; if it has pitch

        ; only change of instrument, not pitch
        move.b  pcd_loaded_inst_vol_b(a5),pcd_pat_vol_b(a5)

        cmp.w   pcd_inst_num4_w(a5),d4
        bne.s   .no_new_note
        ; attack!
        move.l   d6,pcd_adsr_phase_w(a5)   ; and pcd_adsr_volume_w
        ;clr.w   pcd_adsr_volume_w(a5)
        ;move.b  #1,pcd_adsr_trigger_b(a5) ; never read


; d2 = effect cmd, d3 = pitch_ctrl, d4 = inst number, d5 = effect data, d7 = pitch

        and.w   #$40,d3     ; ARP bit
        bne.s   .is_an_arp_note ; d3 is zero in this case, no no 2nd inst number

        ; d3.l must be 0 in this case
        ; normal note, not an ARP note
        tst.b   d2
        bne.s   .arp_processing_done

        ; d2.l is guaranteed to be zero

        ; 0xx: play second instrument
        tst.b   d5
        beq.s   .no_effect

        move.w  d4,d3       ; 1st instrument num

        moveq.l #15,d4      ; FIXME seems like it only supports the lower 15 instruments
        and.w   d5,d4
        add.w   d4,d4
        add.w   d4,d4       ; 2nd instrument from pattern effect

        tst.b   d7
        bne.s   .arp_processing_done

        ; play 2nd inst without (new) pitch
        addq.w  #1,d0       ; pattern pitch shift
        lsl.w   #4,d0
        bra.s   .check_for_2nd_instrument

        move.b  d2,d3
        or.b    d5,d3
        beq.s   .all_arp_notes_zero     ; if we branch there, both d2 and d3 MUST be 0 already

        move.b  d2,pcd_arp_note_1_b(a5)

        move.b  d5,d2
        lsr.b   #4,d2
        move.b  d2,pcd_arp_note_2_b(a5)

        moveq.l #15,d2
        and.b   d5,d2
        move.b  d2,pcd_arp_note_3_b(a5)

        moveq.l #0,d2   ; make sure we don't get a random command here

        moveq.l #1,d6   ; set ARP flag
        moveq.l #0,d3

; ----------------------------------------
; d2 = effect cmd, d3 = alt inst number (or 0), d4 = inst number, d5 = effect cmd, d6 = ARP flag, d7 = pitch

        cmp.b   #NOTE_OFF_PITCH,d7
        beq.s   .release_note

        tst.b   d7
        beq     .start_patt_effect_handling

        add.w   d7,d0               ; pattern pitch shift
        lsl.w   #4,d0

        cmp.b   #3,d2       ; is command portamento?
        beq.s   .cont_after_inst_trigger

        tst.b   d4
        beq.s   .cont_after_inst_trigger

        move.w  d4,d1

        move.b  d1,pcd_new_inst_num_b(a5)
        move.w  d1,pcd_inst_num4_w(a5)
        add.w   d1,d1
        add.w   d1,d1
        lea     sv_inst_infos_table-uii_SIZEOF(a6),a1
        add.w   d1,a1
        move.l  a1,pcd_inst_info_ptr(a5)
        move.b  uii_pattern_steps(a1),pcd_inst_pattern_steps_b(a5)

        moveq.l #0,d1
        move.l  a5,a1
        move.l  d1,(a1)+    ; pcd_pat_portamento_dest_w and pcd_pat_pitch_slide_w
        move.l  d1,(a1)+    ; pcd_pat_vol_ramp_speed_b, pcd_pat_2nd_inst_num4_b, pcd_pat_2nd_inst_delay_b, pcd_wave_offset_b
        move.l  d1,(a1)+    ; pcd_inst_pitch_slide_w and pcd_inst_sel_arp_note_w
        move.l  d1,(a1)+    ; pcd_inst_note_pitch_w and pcd_inst_curr_port_pitch_w

        move.l  d1,(a1)+    ; pcd_inst_line_ticks_b, pcd_inst_pitch_pinned_b, pcd_inst_vol_slide_b, pcd_inst_step_pos_b

        subq.b  #1,d1
        move.w  d1,(a1)+    ; pcd_inst_wave_num4_w

        move.l  #$ff010010,(a1)+    ; pcd_track_delay_offset_b, pcd_inst_speed_stop_b, pcd_inst_pitch_w
        move.l  #(MAX_VOLUME<<16)|(MAX_VOLUME<<8)|MAX_VOLUME,(a1)+  ; pcd_inst_vol_w / pcd_loaded_inst_vol_b / pcd_pat_vol_b

        bra.s   .cont_after_inst_trigger

        ; Switch the channel's ADSR envelope to phase 3, starting from the current ADSR volume.
        ; NOTE(review): presumably the target of the NOTE_OFF_PITCH branch (.release_note) -- label not visible here.
        ; d4 and d7 are used as scratch and are overwritten.
        ; FIXME we have the identical code (different regs) three times (one is inactive)
        move.w  pcd_adsr_volume_w(a5),d4
        asr.w   #6,d4                           ; d4 = current ADSR volume / 64
        move.w  d4,pcd_adsr_vol64_w(a5)
        moveq.l #16,d7
        move.w  d7,pcd_adsr_pos_w(a5)           ; position accumulator starts at 16
        sub.w   d4,d7
        lsr.w   #1,d7                           ; d7 = (16 - vol64) / 2
        add.b   pcd_adsr_release_b(a5),d7       ; plus the release value of the channel
        move.b  d7,pcd_adsr_phase_speed_b(a5)
        move.w  #3,pcd_adsr_phase_w(a5)         ; phase 3 (same value the note cut-off code stores)
        bra.s   .start_patt_effect_handling

        tst.b   d6                  ; has ARP?
        bne.s   .has_arp_check_portamento

        move.l  d6,pcd_arp_notes_l(a5) ; the whole ARP flag in d6 must be 0

        cmp.b   #3,d2               ; is command portamento?
        beq.s   .pat_new_portamento

        ; clear portamento
        move.w  #$10,pcd_inst_pitch_w(a5)
        move.w  d0,pcd_inst_curr_port_pitch_w(a5)
        clr.w   pcd_pat_portamento_dest_w(a5)
        bra.s   .start_patt_effect_handling

        add.w   #$10,d0             ; pattern pitch shift
        move.w  d0,pcd_pat_portamento_dest_w(a5)

        move.w  pcd_inst_pitch_w(a5),d1
        add.w   d1,pcd_inst_curr_port_pitch_w(a5)
        clr.w   pcd_inst_pitch_w(a5)
        tst.b   d5
        beq.s   .execute_pattern_command
        move.b  d5,pcd_pat_portamento_speed_b(a5)
        bra.s   .execute_pattern_command

; ----------------------------------------
        ; FIXME can we move this code to avoid blocking the two registers d3/d5
        tst.b   d3
        beq.s   .has_no_second_inst
        move.b  d3,pcd_pat_2nd_inst_num4_b(a5)
        move.b  d5,d3
        lsr.b   #4,d3
        move.b  d3,pcd_pat_2nd_inst_delay_b(a5)

        tst.b   d6                  ; ARP bit is set, cannot have a command
        bne     .pat_play_cont
        add.w   d2,d2
        move.w  .pattern_command_jmptable(pc,d2.w),d2
        jmp     .pattern_command_jmptable(pc,d2.w)

        dc.w    .pat_play_nop-.pattern_command_jmptable
        dc.w    .pat_slide_up-.pattern_command_jmptable
        dc.w    .pat_slide_down-.pattern_command_jmptable
        dc.w    .pat_play_nop-.pattern_command_jmptable     ; portamento is handled above
        dc.w    .pat_set_vibrato-.pattern_command_jmptable
        dc.w    .pat_set_track_delay-.pattern_command_jmptable
        dc.w    .pat_play_nop-.pattern_command_jmptable
        dc.w    .pat_play_nop-.pattern_command_jmptable
        dc.w    .pat_play_nop-.pattern_command_jmptable
        dc.w    .pat_set_wave_offset-.pattern_command_jmptable
        dc.w    .pat_volume_ramp-.pattern_command_jmptable
        dc.w    .pat_pos_jump-.pattern_command_jmptable
        dc.w    .pat_set_volume-.pattern_command_jmptable
        dc.w    .pat_pat_break-.pattern_command_jmptable
        dc.w    .pat_play_nop-.pattern_command_jmptable
        dc.w    .pat_set_speed-.pattern_command_jmptable

; d5 = command parameter data
; ----------------------------------------
        lea     pv_pat_speed_even_b(a4),a1
        cmp.b   #MAX_SPEED,d5
        bhs.s   .pat_set_speed_shuffle
        move.b  d5,(a1)+            ; pv_pat_speed_even_b
        move.b  d5,(a1)+            ; pv_pat_speed_odd_b
        move.b  d5,(a1)+            ; pv_pat_line_ticks_b
        sne     (a1)+               ; pv_pat_stopped_b
        seq     (a1)+               ; pv_songend_detected_b
        bra     .pat_play_cont
        moveq.l #15,d2
        and.w   d5,d2               ; odd speed
        lsr.w   #4,d5               ; even speed
        move.b  d5,(a1)+            ; pv_pat_speed_even_b
        move.b  d2,(a1)+            ; pv_pat_speed_odd_b
        btst    #0,pv_pat_curr_row_b(a4)
        beq.s   .pat_shuffle_on_even
        move.b  d2,d5               ; toggle speed to odd row
        move.b  d5,(a1)+            ; pv_pat_line_ticks_b
        bra     .pat_play_cont

; ----------------------------------------
        clr.w   pcd_vibrato_pos_w(a5)
        move.w  #1,pcd_vibrato_delay_w(a5)
        moveq.l #15,d2
        and.w   d5,d2

        lea     pre_vib_depth_table(pc),a1
        move.b  (a1,d2.w),pcd_vibrato_depth_w+1(a5)
        lsr.b   #4,d5
        move.b  pre_vib_speed_table-pre_vib_depth_table(a1,d5.w),d2
        muls    pcd_vibrato_depth_w(a5),d2
        asr.w   #4,d2
        move.w  d2,pcd_vibrato_speed_w(a5)
        bra.s   .pat_play_cont

; ----------------------------------------
        cmp.b   #NUM_CHANNELS-1,pcd_channel_num_b(a5)
        beq.s   .pat_play_cont  ; we are at channel 3 -- track delay not available here

        tst.b   d5
        bne.s   .pat_track_delay_set
        IFNE    PRETRACKER_BUGFIX_CODE      ; clearing track delay when it already was cleared will overwrite the note needlessly
        tst.b   pcd_track_delay_steps_b(a5)
        beq.s   .pat_play_cont
        move.b  d5,pcd_SIZEOF+pcd_pat_vol_b(a5)
        move.b  d5,pcd_track_delay_steps_b(a5)
        bra.s   .pat_play_cont

        moveq.l #15,d2
        and.b   d5,d2
        add.b   d2,d2
        cmp.b   pcd_track_delay_steps_b(a5),d2
        beq.s   .pat_track_set_only_vol
        move.b  d2,pcd_track_delay_steps_b(a5)
        move.b  d2,pcd_SIZEOF+pcd_track_init_delay_b(a5)
        lsr.b   #4,d5
        move.b  d5,pcd_track_delay_vol16_b(a5)
        bra.s   .pat_play_cont

; ----------------------------------------
; Pattern command: store a signed volume ramp speed taken from the parameter byte.
; NOTE(review): presumably the .pat_volume_ramp jump table target -- label not visible here.
; In:  d5 = command parameter data (high nibble = ramp up, low nibble = ramp down)
; Out: pcd_pat_vol_ramp_speed_b(a5) updated unless the parameter is 0; d3 used as scratch
        tst.b   d5
        beq.s   .pat_play_cont          ; parameter 0: leave the ramp speed untouched
        moveq.l #15,d3
        and.b   d5,d3                   ; d3 = low nibble
        beq.s   .pat_vol_ramp_up        ; low nibble 0: use the high nibble instead
        ; NOTE: Changed behaviour: using d3 instead of d5
        ; if both lower and upper were specified, this
        ; probably led to a drastic decrease of volume.
        neg.b   d3                      ; low nibble is stored negated
        move.b  d3,pcd_pat_vol_ramp_speed_b(a5)
        bra.s   .pat_play_cont
        lsr.b   #4,d5                   ; d5 = high nibble, stored as positive value
        move.b  d5,pcd_pat_vol_ramp_speed_b(a5)
        bra.s   .pat_play_cont

; ----------------------------------------
        neg.w   d5
        move.w  d5,pcd_pat_pitch_slide_w(a5)
        bra.s   .pat_play_cont

; ----------------------------------------
        move.b  d5,pcd_wave_offset_b(a5)
        bra.s   .pat_play_cont

; ----------------------------------------
        move.b  d5,pv_next_pat_pos_b(a4)
        bra.s   .pat_play_cont

; ----------------------------------------
        move.b  d5,pv_next_pat_row_b(a4)
        bra.s   .pat_play_cont

; ----------------------------------------
        cmp.b   #MAX_VOLUME,d5
        bls.s   .pat_set_volume_nomax
        moveq.l #MAX_VOLUME,d5
        move.b  d5,pcd_pat_vol_b(a5)

; ----------------------------------------

        cmp.b   #NUM_CHANNELS-1,pcd_channel_num_b(a5)
        beq.s   .pat_channels_loop_end

        lea     pcd_SIZEOF(a5),a5

        tst.b   pcd_track_delay_steps_b-pcd_SIZEOF(a5)  ; check if the next channel has track delay
        bne.s   .pat_play_cont              ; skip channel that has track delay enabled
        bra     .pre_pat_chan_loop


; end of pattern loop

; ----------------------------------------
; Pattern advancing and pattern break and jump handling. Song looping and song-end detection.
        subq.b  #1,pv_pat_line_ticks_b(a4)
        bne.s   .no_pattern_advance

        ; clear note delay info
        moveq.l #0,d0
        move.b  d0,pv_channeldata+pcd_note_delay_b+REPTN*pcd_SIZEOF(a4)

        move.b  sv_num_steps_b(a6),d1       ; number of steps in pattern

        move.b  pv_pat_curr_row_b(a4),d0
        addq.b  #1,d0                       ; normal step increment

        move.w  pv_curr_pat_pos_w(a4),d3    ; current song position

        move.b  pv_next_pat_row_b(a4),d2    ; $ff means no pattern break
        bmi.s   .no_pattern_break
        st      pv_next_pat_row_b(a4)       ; processed break, set to $ff
        move.b  d2,d0
        IFNE    0 ; PRETRACKER_BUGFIX_CODE  ; currently disabled to keep old behaviour
        moveq.l #0,d2                       ; clear mask
        cmp.b   d1,d0
        blo.s   .has_legal_break_pos
        move.b  d1,d0                       ; limit to last step
        subq.b  #1,d0
        bra.s   .has_legal_break_pos

        cmp.b   d1,d0
        blo.s   .pattern_end_not_reached
        moveq.l #0,d0
        move.b  pv_loop_pattern_b(a4),d0    ; keep same pattern rolling?
        bne.s   .pattern_end_not_reached
        addq.w  #1,d3                       ; pattern break will increment song pos -- if there is no new pattern position

        move.b  pv_next_pat_pos_b(a4),d4
        bmi.s   .no_new_position            ; has a new pattern position
        st      pv_next_pat_pos_b(a4)
        cmp.b   d3,d4
        bhi.s   .no_backjump
        st      pv_songend_detected_b(a4)   ; detect jumping back
        move.b  d4,d3                       ; load new position
        IFNE    0 ; PRETRACKER_BUGFIX_CODE  ; currently disabled to keep old behaviour
        not.b   d2
        and.b   d2,d0                       ; if we had NO pattern break, we will clear d0
        moveq.l #0,d0

        cmp.w   sv_pat_pos_len_w(a6),d3
        blo.s   .no_restart_song
        move.w  sv_pat_restart_pos_w(a6),d3
        st      pv_songend_detected_b(a4)
        move.b  d0,pv_pat_curr_row_b(a4)
        move.w  d3,pv_curr_pat_pos_w(a4)

        move.b  pv_pat_speed_even_b(a4),d1
        lsr.b   #1,d0
        bcc.s   .set_speed_even
        move.b  pv_pat_speed_odd_b(a4),d1
        move.b  d1,pv_pat_line_ticks_b(a4)

; ----------------------------------------
; processes the instrument pattern for each running instrument
; registers used:
; d0: pitch
; d1: volume
; d2: inst num
; d3: scratch
; a0: pattern data pointer
; a1: scratch
; a2: instrument info
; a3: wave info
; a4: pv
; a5: channel struct
; a6: mysong struct

        lea     pv_channeldata(a4),a5

        move.l  pcd_waveinfo_ptr(a5),a3

        move.l  pcd_inst_info_ptr(a5),a2
        move.l  a2,d3
        beq     .inst_no_inst_active

        ; calculate pitch -- funny that there is no min check (seems to happen later though)
        move.w  pcd_inst_pitch_slide_w(a5),d0
        add.w   pcd_pat_pitch_slide_w(a5),d0
        beq.s   .inst_no_pitch_slides_active
        add.w   pcd_inst_pitch_w(a5),d0
        cmp.w   #(3*NOTES_IN_OCTAVE)<<4,d0
        ble.s   .inst_noclip_pitch_max
        move.w  #(3*NOTES_IN_OCTAVE)<<4,d0
        move.w  d0,pcd_inst_pitch_w(a5)

        move.b  pcd_inst_vol_slide_b(a5),d1
        beq.s   .inst_no_vol_slide_active
        add.b   pcd_inst_vol_w+1(a5),d1
        bpl.s   .inst_noclip_vol_zero
        moveq.l #0,d1
        cmp.b   #MAX_VOLUME,d1
        ble.s   .inst_noclip_vol_max
        moveq.l #MAX_VOLUME,d1
        move.b  d1,pcd_inst_vol_w+1(a5)

        move.b  pcd_inst_line_ticks_b(a5),d2
        bne     .inst_still_ticking

        moveq.l #0,d0
        move.w  d0,pcd_inst_pitch_slide_w(a5)
        move.b  d0,pcd_inst_vol_slide_b(a5)

;        IFNE    PRETRACKER_PARANOIA_MODE ; new step is never written
;        move.w  pcd_inst_new_step_w(a5),d1
;        blt.s   .inst_no_new_step_pos
;        cmp.w   #$20,d1
;        ble.s   .inst_good_new_step_pos
;        moveq.l #$20,d1
;        move.b  d1,pcd_inst_step_pos_b(a5)
;        move.w  #$ffff,pcd_inst_new_step_w(a5)
;        ENDC
        move.b  pcd_inst_step_pos_b(a5),d0
        cmp.b   pcd_inst_pattern_steps_b(a5),d0
        bhs     .inst_pat_loop_exit

        moveq.l #-1,d4
        moveq.l #0,d7
        moveq.l #0,d3       ; flag for stitching -- if set, must not trigger new note
        ; enters with d4 = -1, meaning no first note pos yet

        move.w  pcd_inst_num4_w(a5),d1
        movea.l sv_inst_patterns_table-4(a6,d1.w),a0

        add.w   d0,a0
        add.w   d0,a0
        add.w   d0,a0

        moveq.l #0,d2               ; default to not stitched
        move.b  (a0)+,d1            ; pdb_pitch_ctrl get pitch byte
        bpl.s   .inst_note_is_not_stitched  ; means that note is stitched

        tst.w   d4
        bpl.s   .inst_no_update_first_note
        move.w  d0,d4               ; position of first note before stitching
        moveq.l #1,d2               ; next note will be fetched immediately
        move.b  (a0)+,d6            ; pdb_pitch_ctrl get pitch byte
        smi     d2                  ; note stitched?
        ;neg.b   d2

        tst.b   d3
        bne.s   .skippitchloading
        moveq.l #$3f,d1
        and.w   d6,d1
        beq.s   .skippitchloading   ; no new note
        subq.w  #1,d1
        lsl.w   #4,d1
        move.w  d1,pcd_inst_note_pitch_w(a5)
        and.w   #1<<6,d6
        sne     pcd_inst_pitch_pinned_b(a5)
        neg.b   pcd_inst_pitch_pinned_b(a5) ; only to be state binary compatible
        moveq.l #15,d6
        and.b   (a0)+,d6        ; pdb_effect_cmd command number
        add.w   d6,d6
        move.w  .inst_command_jmptable(pc,d6.w),d3
        moveq.l #0,d5
        move.b  (a0)+,d5        ; pdb_effect_data command parameter byte, note that condition codes are used in inst_set_speed
        jmp     .inst_command_jmptable(pc,d3.w)

        dc.w    .inst_select_wave-.inst_command_jmptable
        dc.w    .inst_slide_up-.inst_command_jmptable
        dc.w    .inst_slide_down-.inst_command_jmptable
        dc.w    .inst_adsr-.inst_command_jmptable
        dc.w    .inst_select_wave-.inst_command_jmptable
        dc.w    .inst_nop-.inst_command_jmptable
        dc.w    .inst_nop-.inst_command_jmptable
        dc.w    .inst_nop-.inst_command_jmptable
        dc.w    .inst_nop-.inst_command_jmptable
        dc.w    .inst_nop-.inst_command_jmptable
        dc.w    .inst_vol_slide-.inst_command_jmptable
        dc.w    .inst_jump_to_step-.inst_command_jmptable
        dc.w    .inst_set_volume-.inst_command_jmptable
        dc.w    .inst_nop-.inst_command_jmptable
        dc.w    .inst_use_pat_arp-.inst_command_jmptable
        dc.w    .inst_set_speed-.inst_command_jmptable

; d0 = current step / next step
; d5 = command parameter data / scratch
; d2 = note stitched flag
; d1 = scratch
; d3 = scratch
; d6 = scratch
; ----------------------------------------
        clr.b   pcd_wave_nosync(a5)
        pea     .inst_cmd_cont_next(pc)
        subq.w  #1,d5
        cmp.w   #MAX_WAVES,d5
        bhs.s   .inst_set_wave_rts
        add.w   d5,d5
        add.w   d5,d5
        cmp.w   pcd_inst_wave_num4_w(a5),d5
        beq.s   .inst_set_wave_rts

        move.w  d5,pcd_inst_wave_num4_w(a5)
        move.l  sv_waveinfo_table(a6,d5.w),a3
        move.l  a3,pcd_waveinfo_ptr(a5)
        move.l  pv_wave_sample_table(a4,d5.w),a1
        move.w  wi_chipram_w(a3),d5

        move.b  pcd_channel_mask_b(a5),d3
        or.b    d3,pv_trigger_mask_w+1(a4)
        move.b  d3,pcd_out_trg_b(a5)

        moveq.l #0,d3

        move.w  wi_loop_offset_w(a3),d1     ; is unlikely >= 32768 -- if it is, it will be past end of sample

        tst.w   wi_subloop_len_w(a3)
        beq.s   .inst_set_wave_has_no_subloop

        tst.w   d6
        and.b   pcd_last_wave_was_looping_b(a5),d6 ; mask out nosync if not allowed
        beq.s   .inst_set_wave_has_subloop

        ; nosync version
        st      pcd_wave_nosync(a5)
        move.w  d3,pcd_out_lof_w(a5)
        ;move.l  a1,pcd_inst_wave_ptr(a5)
        move.l  a1,pcd_out_ptr_l(a5)

        move.w  pcd_inst_loop_offset_w(a5),d1
        cmp.w   d1,d5
        cmp.w   pcd_inst_loop_offset_w(a5),d5
        bhs.s   .inst_set_wave_ns_keep_pp
        st      pcd_inst_ping_pong_dir_b(a5)    ; force forward if before loop offset
        IFEQ    PRETRACKER_BUGFIX_CODE          ; this adds an extra jump to the wave that's not desired
        clr.w   pcd_inst_subloop_wait_w(a5)

        sub.w   wi_subloop_step_w(a3),d1
        move.w  d1,pcd_inst_loop_offset_w(a5)

        tst.w   d6
        beq.s   .inst_select_wave_nosync_no_subloop

        ; note that nosync on non-looping waves doesn't effectively do anything special
        adda.w  d1,a1
        sub.w   d1,d5
        ;cmp.w   #1,d5 ; not necessary as increases in steps of 2
        bhi.s   .inst_set_wave_ns_has_min_length
        moveq.l #2,d5
        move.w  d5,pcd_out_len_w(a5)
        moveq.l #-1,d3

        bra.s   .inst_set_wave_ns_has_subloop


        adda.w  d1,a1                       ; add loop offset (which is actually not a loop offset for one-shot samples)
        sub.w   d1,d5
        ;cmp.w   #1,d5 ; not necessary as increases in steps of 2
        bhi.s   .inst_set_wave_has_min_length
        moveq.l #2,d5
        move.l  pv_sample_buffer_ptr(a4),a1 ; fix start address to empty sample
        move.w  d5,pcd_out_len_w(a5)

        moveq.l #-1,d3
        move.w  d3,pcd_out_lof_w(a5)
        ;move.l  a1,pcd_inst_wave_ptr(a5)
        move.l  a1,pcd_out_ptr_l(a5)
        move.w  d1,pcd_inst_loop_offset_w(a5)

        not.b   d3
        move.b  d3,pcd_last_wave_was_looping_b(a5) ; allow nosync command
        st      pcd_inst_ping_pong_dir_b(a5)
        moveq.l #0,d5
        move.b  wi_subloop_wait_b(a3),d5
        addq.w  #1,d5
        move.w  d5,pcd_inst_subloop_wait_w(a5)

; ----------------------------------------
; Instrument command acting on the ADSR envelope, selected by the parameter in d5.
; NOTE(review): presumably the .inst_adsr jump table target -- label not visible here.
;   d5 == 1: set up phase 3 from the current ADSR volume (second half of this block)
;   d5 == 2: clear ADSR phase and ADSR volume
;   other  : no action
; d5 and d6 are used as scratch and are overwritten.
        subq.w  #1,d5
        beq.s   .inst_adsr_release      ; parameter was 1
        subq.w  #1,d5
        bne     .inst_cmd_cont_next     ; parameter was neither 1 nor 2
        ; d5.l is zero
        move.w  d5,pcd_adsr_phase_w(a5)     ; phase 0 is dispatched to .adsr_attack by the ADSR code
        move.w  d5,pcd_adsr_volume_w(a5)
        bra     .inst_cmd_cont_next

        ; same arithmetic as the pattern-side note release, using d5/d6 instead of d4/d7
        move.w  pcd_adsr_volume_w(a5),d5
        asr.w   #6,d5                       ; d5 = current ADSR volume / 64
        move.w  d5,pcd_adsr_vol64_w(a5)
        moveq.l #16,d6
        move.w  d6,pcd_adsr_pos_w(a5)       ; position accumulator starts at 16
        sub.w   d5,d6
        lsr.w   #1,d6                       ; d6 = (16 - vol64) / 2
        add.b   pcd_adsr_release_b(a5),d6   ; plus the release value of the channel
        move.b  d6,pcd_adsr_phase_speed_b(a5)
        move.w  #3,pcd_adsr_phase_w(a5)
        bra.s   .inst_cmd_cont_next

; ----------------------------------------
        moveq.l #15,d3
        and.w   d5,d3
        beq.s   .inst_vol_slide_up
        ; NOTE: Changed behaviour: using d3 instead of d5
        ; if both lower and upper were specified, this
        ; probably led to a drastic decrease of volume.
        neg.w   d3
        move.b  d3,pcd_inst_vol_slide_b(a5)
        bra.s   .inst_cmd_cont_next

        lsr.w   #4,d5
        move.b  d5,pcd_inst_vol_slide_b(a5)
        bra.s   .inst_cmd_cont_next

; ----------------------------------------
        cmp.w   d0,d5
        bge.s   .inst_cmd_cont_next         ; only backward jumps allowed (?)
        ; this stuff is PARANOIA
        tst.b   d4
        bmi.s   .inst_jump_to_step_doit     ; we did not have a stitched note before
        cmp.b   d3,d4
        ble     .inst_cmd_cont_next         ; we are jumping back to the stitched note, ignore
        move.w  d5,d0
        tst.b   d7                          ; check if we had jumped before
        bne.s   .inst_we_were_stitched
        moveq.l #-1,d4                      ; mark as no first stitch pos

        move.w  pcd_inst_num4_w(a5),d1
        movea.l sv_inst_patterns_table-4(a6,d1.w),a0

        add.w   d5,a0
        add.w   d5,a0
        add.w   d5,a0

        bra.s   .inst_fetch_next
        moveq.l #0,d2
        bra.s   .inst_cont_from_nasty_double_jump

; ----------------------------------------
        cmp.w   #MAX_VOLUME,d5
        ble.s   .inst_set_volume_nomax
        moveq.l #MAX_VOLUME,d5
        move.w  d5,pcd_inst_vol_w(a5)
        bra.s   .inst_cmd_cont_next

; ----------------------------------------
        moveq.l #3,d3
        and.w   d5,d3
        beq.s   .inst_use_pat_arp_play_base
        lsr.w   #4,d5
        beq.s   .inst_use_pat_arp_skip_empty
        subq.w  #1,d5
        bne.s   .inst_cmd_cont_next     ; illegal high nibble (only 0/1 allowed)

        ; pick arp note
        move.b  pcd_arp_notes_l-1(a5,d3.w),d3

        ; play base note
        lsl.w   #4,d3
        move.w  d3,pcd_inst_sel_arp_note_w(a5)
        bra.s   .inst_cmd_cont_next

        ; pick arp note, if it's 0, skip it
        move.b  pcd_arp_notes_l-1(a5,d3.w),d3
        bne.s   .inst_use_pat_arp_set

        addq.w  #1,d0
        bra.s   .inst_fetch_next

; ----------------------------------------
        neg.w   d5
        move.w  d5,pcd_inst_pitch_slide_w(a5)
        bra.s   .inst_cmd_cont_next

; ----------------------------------------
        seq     d3
        or.b    d3,d5
        move.b  d5,pcd_inst_speed_stop_b(a5)

; ----------------------------------------
        addq.w  #1,d0
        tst.b   d2
        beq.s   .inst_pat_loop_exit2
        ; d2 != 0 in this case, hence d3 will be set
        moveq.l #1,d7                       ; mark that we are in at least next iteration
        move.b  d2,d3                       ; mark stitching
        cmp.b   pcd_inst_pattern_steps_b(a5),d0
        blo     .inst_pat_loop

        st      d2
        add.b   pcd_inst_speed_stop_b(a5),d2
        move.b  d2,pcd_inst_line_ticks_b(a5)
        move.b  d0,pcd_inst_step_pos_b(a5)  ; update inst step pos

        tst.b   pcd_inst_wave_num4_w+1(a5)
        bpl.s   .inst_wave_selected

.inst_no_wave_selected                      ; FIXME this code is dubious at best -- it selects wave 0 if no wave was selected before
        moveq.l #0,d5
        moveq.l #0,d6
        bsr     .inst_select_wave_subroutine

        cmp.b   #$ff,d2
        beq.s   .inst_pat_loop_exit3
        subq.b  #1,pcd_inst_line_ticks_b(a5)


; ----------------------------------------
; Per-channel ADSR processing: dispatch on the current ADSR phase.
; a5 = channel
; Loads d1 = instrument volume and d2 = current ADSR volume; d3/d4 are scratch.
        move.w  pcd_inst_vol_w(a5),d1
        tst.b   pcd_new_inst_num_b(a5)
        bne.s   .load_instrument        ; a new instrument number is pending
        move.l  a3,d3                   ; test the wave info pointer for NULL
        beq.s   .no_inst_selected

        move.w  pcd_adsr_volume_w(a5),d2
        move.w  pcd_adsr_phase_w(a5),d4
        beq.s   .adsr_attack            ; phase 0
        subq.w  #1,d4
        move.w  d4,d3                   ; d3 = phase - 1, tested again later
        beq.s   .adsr_decay_and_release ; we distinguish via d3 == 0 -> decay
        subq.w  #1,d4
        beq     .adsr_sustain           ; phase 2

        ; remaining phases (3 is the value stored by the release / note cut-off code):
        ; add vol64 to the position accumulator and only continue once it reaches 16
        move.w  pcd_adsr_pos_w(a5),d4
        add.w   pcd_adsr_vol64_w(a5),d4
        move.w  d4,pcd_adsr_pos_w(a5)
        sub.w   #16,d4
        blt.s   .adsr_done              ; accumulator still below 16: nothing more this tick
        move.w  d4,pcd_adsr_pos_w(a5)   ; keep the remainder (accumulator - 16)

        ; same code for both release and decay
        moveq.l #0,d4
        moveq.l #-$71,d5        ; same as $8f, we only need the byte
        move.b  pcd_adsr_phase_speed_b(a5),d4
        cmp.b   d5,d4
        bhs.s   .adsr_absurd_slow_release
        move.b  d4,d5
        addq.b  #1,d5
        add.w   d4,d4
        lea     pre_roll_off_table(pc),a1
        move.w  (a1,d4.w),d4
        bra.s   .adsr_release_cont

        moveq.l #2,d4           ; FIXME I guess this should be 1, if I look at the roll-off table
        move.b  d5,pcd_adsr_phase_speed_b(a5)

        tst.w   d3
        beq.s   .adsr_is_actually_decay

        sub.w   d4,d2
        bpl.s   .adsr_done
        moveq.l #0,d2
        bra.s   .adsr_done

        sub.w   d4,d2

        cmp.w   uii_adsr_sustain(a2),d2
        bgt.s   .adsr_done
        move.w  #2,pcd_adsr_phase_w(a5)
        move.w  uii_adsr_sustain(a2),d2
        bra.s   .adsr_done

        move.b  d1,pcd_loaded_inst_vol_b(a5)
        move.l  uii_vibrato_delay(a2),pcd_vibrato_delay_w(a5)  ; and uii_vibrato_depth
        ;move.w  uii_vibrato_delay(a2),pcd_vibrato_delay_w(a5)
        ;move.w  uii_vibrato_depth(a2),pcd_vibrato_depth_w(a5)

        move.l  uii_vibrato_speed(a2),pcd_vibrato_speed_w(a5)  ; and uii_adsr_release
        ;move.w  uii_vibrato_speed(a2),pcd_vibrato_speed_w(a5)
        ;move.b  uii_adsr_release(a2),pcd_adsr_release_b(a5)

        moveq.l #0,d2
        move.l  d2,pcd_adsr_phase_w(a5)     ; and pcd_adsr_volume_w
        ;move.w  d2,pcd_adsr_phase_w(a5)
        ;move.w  d2,pcd_adsr_volume_w(a5)

        move.l  d2,pcd_new_inst_num_b(a5)   ; and pcd_vibrato_pos_w

        add.w   uii_adsr_attack(a2),d2
        cmp.w   #MAX_VOLUME<<4,d2
        blt.s   .adsr_done

        move.w  #MAX_VOLUME<<4,d2
        move.w  #1,pcd_adsr_phase_w(a5)
        move.b  uii_adsr_decay+1(a2),pcd_adsr_phase_speed_b(a5)


        move.w  d2,pcd_adsr_volume_w(a5)

        ; handle note cut-off command (EAx command)
        tst.b   pcd_note_off_delay_b(a5)
        beq.s   .dont_release_note
        subq.b  #1,pcd_note_off_delay_b(a5)
        bne.s   .dont_release_note
        ; cut off note
        clr.w   pcd_adsr_volume_w(a5)
        move.w  #3,pcd_adsr_phase_w(a5)


; ----------------------------------------
; calculate final volume output = inst_vol * ADSR volume * pattern volume

        lea     pv_volume_table(a4),a1
        lsl.w   #3,d2
        and.w   #127<<7,d2
        or.b    d1,d2
        move.b  (a1,d2.w),d1
        lsl.w   #7,d1
        or.b    pcd_pat_vol_b(a5),d1
        move.b  (a1,d1.w),pcd_out_vol_b(a5)
        lsr.w   #4,d2
        mulu    d2,d1
        lsr.w   #6,d1

        moveq.l #0,d2
        move.b  pcd_pat_vol_b(a5),d2
        mulu    d1,d2
        lsr.w   #6,d2
        move.b  d2,pcd_out_vol_b(a5)

; ----------------------------------------
; wave loop pos advancing and pattern wave offset command handling

        moveq.l #0,d1
        tst.b   wi_allow_9xx_b(a3)
        sne     d1
        and.b   pcd_wave_offset_b(a5),d1

        move.w  wi_subloop_len_w(a3),d3
        beq     .wave_has_no_subloop
        move.w  d3,pcd_out_len_w(a5)        ; FIXME can we move this to wave loading?
        move.w  pcd_inst_subloop_wait_w(a5),d5

        move.b  pcd_inst_ping_pong_dir_b(a5),d4
        move.w  wi_subloop_step_w(a3),d2
        tst.w   d1
        beq.s   .wave_with_subloop_but_no_wave_offset

        ; update loop offset from pattern
        lsl.w   #7,d1

        clr.b   pcd_wave_offset_b(a5)

        ; keep current direction of ping-pong unchanged
        tst.b   d4
        beq.s   .wave_move_one_step_ahead
        sub.w   d2,d1       ; go in reverse direction one step?
        bra.s   .wave_submove_cont
        add.w   d2,d1       ; go in reverse direction one step?
        bra.s   .wave_submove_cont

        move.w  pcd_inst_loop_offset_w(a5),d1
        subq.w  #1,d5
        bgt.s   .wave_subloop_wait

        ; subloop moves!
        ; reset subloop wait
        moveq.l #0,d5
        move.b  wi_subloop_wait_b(a3),d5

        tst.b   d4
        bne.s   .loop_is_moving_forwards
        sub.w   d2,d1                   ; decrement offset in backward direction one step

        move.w  wi_loop_start_w(a3),d2
        sub.w   d1,d2                   ; calc how many bytes we are past front
        bmi.s   .wave_new_loop_pos_fits
        bra.s   .wave_loop_dir_changed

        add.w   d2,d1                   ; increment offset in forward direction one step

        move.w  d1,d4
        add.w   d3,d4                   ; calculate new end position of loop sample

        move.w  wi_loop_end_w(a3),d2
        cmp.w   d1,d2
        bhs.s   .is_not_past_loop_end   ; are we starting playback past the end of the loop
        move.w  wi_chipram_w(a3),d2     ; use sample end
        sub.w   d4,d2                   ; space left = (max(loop end, sample end) - curr start)
        bhi.s   .wave_new_loop_pos_fits ; max(loop end, sample end) > curr start?

        add.w   d2,d1                   ; fix front of loop
        not.b   pcd_inst_ping_pong_dir_b(a5)
        IFEQ    PRETRACKER_FASTER_CODE  ; this extra code doesn't seem to be justified
        tst.w   d2
        bne.s   .wave_new_loop_pos_fits ; perfect fit for last loop
        ; partial fit only
        subq.w  #1,d5                   ; why, oh why?
        move.w  d1,pcd_inst_loop_offset_w(a5)

        move.w  d5,pcd_inst_subloop_wait_w(a5)
        move.w  d1,pcd_out_lof_w(a5)

        moveq.l #0,d1
        bra.s   .wave_load_sample_offset

        tst.w   d1
        beq.s   .wave_loop_handling_done

        ; apply offset from pattern for sample without subloop
        lsl.w   #7,d1

        clr.b   pcd_wave_offset_b(a5)

        move.b  pcd_channel_mask_b(a5),d2   ; trigger output
        or.b    d2,pv_trigger_mask_w+1(a4)
        move.b  d2,pcd_out_trg_b(a5)

        move.w  wi_chipram_w(a3),d2
        sub.w   d1,d2
        bhi.s   .waveoffset_is_not_past_end
        moveq.l #2,d2
        ; FIXME actually we should set the start address to the empty sample
        ; FIXME (or at least to the beginning of the sample?) to avoid an audible glitch
        moveq.l #0,d1
        move.w  d2,pcd_out_len_w(a5)
        move.w  pcd_inst_wave_num4_w(a5),d2
        add.l   pv_wave_sample_table(a4,d2.w),d1
        move.l  d1,pcd_inst_wave_ptr(a5)
        move.l  d1,pcd_out_ptr_l(a5)


; ----------------------------------------
; pitch handling
        move.w  pcd_inst_pitch_w(a5),d0
        sub.w   #$10,d0
        tst.b   pcd_inst_pitch_pinned_b(a5)
        bne.s   .pitch_pinned
        add.w   pcd_inst_sel_arp_note_w(a5),d0
        add.w   pcd_inst_curr_port_pitch_w(a5),d0
        sub.w   #$10,d0
        add.w   pcd_inst_note_pitch_w(a5),d0

; ----------------------------------------
; vibrato processing
        tst.b   pcd_vibrato_delay_w+1(a5)
        beq.s   .vibrato_already_active
        subq.b  #1,pcd_vibrato_delay_w+1(a5)
        bne.s   .vibrato_still_delayed

        move.w  pcd_vibrato_speed_w(a5),d2
        beq.s   .vibrato_disabled      ; no speed -- skip stuff
        move.w  pcd_vibrato_depth_w(a5),d4
        move.w  d2,d1
        add.w   pcd_vibrato_pos_w(a5),d1
        cmp.w   d1,d4
        blt.s   .vibrato_flipit
        neg.w   d4
        cmp.w   d1,d4
        ble.s   .vibrato_cont
        neg.w   d2
        move.w  d2,pcd_vibrato_speed_w(a5)
        move.w  d4,d1
        move.w  d1,pcd_vibrato_pos_w(a5)

        asr.w   #3,d1
        add.w   d1,d0

; ----------------------------------------
; select right sample corresponding to current pitch

        move.l  pcd_out_ptr_l(a5),d4
        move.w  pcd_out_len_w(a5),d3
        move.w  d0,d5
        sub.w   #$219,d5
        ble     .is_normal_octave
        btst    #2,wi_flags_b(a3)
        beq     .check_for_pitch_high_clipping

        ; select high pitch version of the sample
        move.w  #NOTES_IN_OCTAVE*16,d2
        moveq.l #1,d1
        sub.w   d2,d0
        sub.w   d2,d5
        blt.s   .oct1
        addq.w  #1,d1
        sub.w   d2,d0
        sub.w   d2,d5
        blt.s   .oct2
        addq.w  #1,d1
        sub.w   d2,d0
        moveq.l #0,d2
        move.w  wi_chipram_w(a3),d2

        move.w  pcd_out_lof_w(a5),d7
        addq.w  #1,d7   ; compare to $ffff
        beq.s   .high_oct_oneshot_wave
        subq.w  #1,d7
        lsr.w   d1,d7                       ; halve/quarter/eighth loop offset
        lsr.w   d1,d3                       ; halve/quarter/eighth loop length
        move.w  d7,pcd_out_lof_w(a5)
        bra.s   .cont_after_loop_fix

        tst.b   pcd_out_trg_b(a5)
        beq.s   .no_retrigger_new

        move.w  pcd_inst_wave_num4_w(a5),d7
        movea.l pv_wave_sample_table(a4,d7.w),a3

        sub.l   a3,d4                       ; calc whatever original offset into sample
        move.w  d3,d6
        add.w   d4,d6

        move.w  d2,d7
        sub.w   d6,d7                       ; calculate (remaining) one-shot length
        cmp.w   d7,d3                       ; I think this case was already catered for in loading
        bcc.s   .sam_length_okay
        moveq.l #2,d3
        bra.s   .cont_after_no_sample_left
        add.w   d6,d3
        sub.w   d2,d3
        lsr.w   d1,d3

        lsr.w   d1,d4
        add.l   a3,d4

        move.w  d3,pcd_out_len_w(a5)
        subq.w  #1,d1
        ;bmi.s   .is_normal_octave          ; this should never happen -- d1 is at least 1
        ; find offset in sample buffer for the right octave
        add.l   d2,d4
        lsr.w   #1,d2
        dbra    d1,.movetoloopposloop

        cmp.w   #$231,d0
        ble.s   .noclippitchhigh
        move.w  #$231,d0                    ; That's probably B-3+1, mapping to period $71 (although $7c is the last safe value)
        add.w   d0,d0
        bge.s   .noclippitchlow
        moveq.l #0,d0
        move.w  pv_period_table(a4,d0.w),pcd_out_per_w(a5)

        tst.b   pcd_out_trg_b(a5)
        beq.s   .wasnottriggered
        ; this code seems to move the sample start to "loop offset" for first trigger
        moveq.l #0,d0
        move.w  pcd_out_lof_w(a5),d0
        addq.w  #1,d0       ; compare to $ffff
        beq.s   .hasnoloop2
        subq.w  #1,d0
        add.l   d0,d4
        clr.w   pcd_out_lof_w(a5)
        move.l  d4,pcd_out_ptr_l(a5)

        ; this code is probably here to ensure triggering the wave when the octave sample changes
        cmp.w   pcd_last_trigger_length_w(a5),d3
        beq.s   .hassamesamlen
        move.w  d3,pcd_last_trigger_length_w(a5)
        move.b  pcd_channel_mask_b(a5),d3
        or.b    d3,pv_trigger_mask_w+1(a4)
        cmp.b   pcd_out_trg_b(a5),d3
        beq.s   .hassamesamlen
        or.b    d3,pv_trigger_mask_w+1(a4)
        ; we need to mark the first length loading, without trigger that is actually not a real trigger, so we can filter it
        ; also this is used to detect changes triggers only related to change of octaves
        tas     d3
        move.b  d3,pcd_out_trg_b(a5)


; ----------------------------------------
; track delay handling
        cmp.b   #NUM_CHANNELS-1,pcd_channel_num_b(a5)
        beq     .updatechannels

        lea     pcd_SIZEOF(a5),a5

        move.b  pcd_track_delay_steps_b-pcd_SIZEOF(a5),d3
        beq     .inst_chan_loop         ; no track delay

        moveq.l #MAX_TRACK_DELAY-1,d0   ; load from last buffer

        ; advance and wrap offset
        move.b  pcd_track_delay_offset_b(a5),d1
        addq.w  #1,d1
        and.w   d0,d1
        move.b  d1,pcd_track_delay_offset_b(a5)

        ; write previous channel data to this channel's buffer
        move.w  d1,d2
        lsl.w   #4,d2
        lea     pcd_track_delay_buffer(a5,d2.w),a3
        lea     pcd_out_base-pcd_SIZEOF(a5),a1
        move.l  (a1)+,(a3)+             ; ocd_sam_ptr
        move.l  (a1)+,(a3)+             ; ocd_length/ocd_loop_offset
        move.l  (a1)+,(a3)+             ; ocd_period/ocd_volume/ocd_trigger

        moveq.l #0,d5
        tst.b   pcd_track_init_delay_b(a5)
        bmi.s   .track_delay_ready

        subq.b  #1,pcd_track_init_delay_b(a5)
        bmi.s   .track_delay_trigger_first

        lea     pcd_out_base(a5),a3
        lea     pv_sample_buffer_ptr(a4),a1
        move.l  (a1)+,(a3)+             ; ocd_sam_ptr
        move.l  (a1)+,(a3)+             ; ocd_length/ocd_loop_offset
        move.l  (a1)+,(a3)+             ; ocd_period/ocd_volume/ocd_trigger

        bra.s   .check_next_channel

        move.b  pcd_channel_mask_b(a5),d5

        sub.b   d3,d1
        and.w   d1,d0

        moveq.l #7,d4
        and.w   d4,d1
        lea     pre_minus4plus4_table(pc),a1
        move.b  (a1,d1.w),d1
        ext.w   d1

        lsl.w   #4,d0
        lea     pcd_track_delay_buffer(a5,d0.w),a1
        lea     pcd_out_base(a5),a3
        move.l  (a1)+,(a3)+             ; ocd_sam_ptr
        move.l  (a1)+,(a3)+             ; ocd_length/ocd_loop_offset

        ; FIXME this seems odd! Why modulate the period by the distance?
        move.w  (a1)+,d0                ; ocd_period
        muls    d0,d1
        swap    d1
        add.w   d1,d0
        move.w  d0,(a3)+                ; ocd_period
        move.w  (a1)+,(a3)+             ; ocd_period

        move.w  pcd_track_delay_vol16_b-pcd_SIZEOF(a5),d4
        clr.b   d4
        add.w   d4,d4
        move.b  (a1)+,d4            ; ocd_volume
        move.b  (a1)+,d2            ; ocd_trigger
        lea     pv_volume_table(a4),a1
        move.b  (a1,d4.w),(a3)+     ; ocd_volume (this track)
        moveq.l #0,d4
        move.b  (a1)+,d4            ; ocd_volume
        move.b  pcd_track_delay_vol16_b-pcd_SIZEOF(a5),d2
        ext.w   d2
        mulu    d4,d2               ; apply track delay volume
        lsr.w   #4,d2
        move.b  d2,(a3)+            ; fix volume
        move.b  (a1)+,d2            ; ocd_trigger

        add.b   d2,d2               ; change mask to next channel
        or.b    d5,d2
        move.b  d2,(a3)+            ; ocd_trigger (this track)
        or.b    d2,pv_trigger_mask_w+1(a4)
        bra     .check_next_channel

; ----------------------------------------
        ; so this changed a lot from the original routine
        move.w  pv_trigger_mask_w(a4),d2

        move.l  pv_copperlist_ptr(a4),d0
        beq     .skipcopperlist
        move.l  d0,a5
        move.b  d2,1*4+3(a5)    ; dmacon
        move.b  d2,(1+1+1+5*NUM_CHANNELS)*4+3(a5)       ; dmacon after wait, dmacon, wait, 20 writes

        lea     pv_channeldata+pcd_out_base(a4),a0
        move.l  pv_sample_buffer_ptr(a4),d3
        moveq.l #0,d5
        move.w  d5,pv_trigger_mask_w(a4)
        moveq.l #-1,d1
        lea     3*4+2(a5),a1
        lea     (1+1+1+5*NUM_CHANNELS+1+1)*4+2(a5),a2   ; wait, dmacon, wait, 20 writes, dmacon, wait
        moveq.l #NUM_CHANNELS-1,d7
        moveq.l #0,d2
        move.w  ocd_loop_offset(a0),d2
        cmp.w   d1,d2
        bne.s   .is_looping_sample
        tst.b   ocd_trigger(a0)
        beq.s   .one_shot_clear_loop
        move.b  d5,ocd_trigger(a0)
        move.l  ocd_sam_ptr(a0),d0
        move.l  d3,d6                       ; set loop start
        move.w  ocd_length(a0),d4
        lsr.w   #1,d4
        move.w  d4,2*4(a1)  ; ac_len
        moveq.l #1,d4
        bra.s   .setptrvolper
        move.l  d3,d6
        move.l  d3,d0
        moveq.l #1,d4
        move.w  d4,2*4(a1)  ; ac_len
        bra.s   .setptrvolper

        move.l  ocd_sam_ptr(a0),d0
        add.l   d2,d0                       ; add loop offset to sample start
        move.l  d0,d6                       ; make a copy for loop start

        move.w  ocd_length(a0),d4
        lsr.w   #1,d4
        move.w  d4,2*4(a1)  ; ac_len
        tst.b   ocd_trigger(a0)
        beq.s   .setptrvolper
        move.b  d5,ocd_trigger(a0)
        sub.l   d2,d0                       ; if triggered, deduct loop offset so we are back at sample start
        move.w  d0,1*4(a1)  ; ac_ptr (lo)
        swap    d0
        move.w  d0,(a1)     ; ac_ptr (hi)
        move.w  ocd_period(a0),3*4(a1)
        move.b  ocd_volume(a0),4*4+1(a1)
        move.w  d6,1*4(a2)  ; ac_ptr (lo)
        swap    d6
        move.w  d6,(a2)     ; ac_ptr (hi)
        move.w  d4,2*4(a2)  ; ac_len

        lea     pcd_SIZEOF(a0),a0
        lea     5*4(a1),a1
        lea     3*4(a2),a2
        dbra    d7,.checkchan

        lea     $dff000,a5

        ; turn channels off and remember raster position
        move.w  d2,dmacon(a5)       ; turn dma channels off
        move.w  vhposr(a5),d5       ; I know this only works for the lower 256 rasterlines...
        add.w   #4<<8,d5            ; target rasterpos

        ; in the meanwhile we can update both channels that
        ; - need triggering by setting the new start and length
        ; - need updating of loop offset only
        ; update volume and period in any case
        lea     pv_channeldata+pcd_out_base(a4),a0
        lea     aud0(a5),a1
        move.l  pv_sample_buffer_ptr(a4),d3
        moveq.l #0,d6
        move.w  d6,pv_trigger_mask_w(a4)
        moveq.l #-1,d1
        moveq.l #NUM_CHANNELS-1,d7
        tst.b   ocd_trigger(a0)
        beq.s   .updateloop
        move.b  d6,ocd_trigger(a0)
        ; set start and length of the new sample
        move.w  ocd_length(a0),d0
        lsr.w   #1,d0
        move.w  d0,ac_len(a1)
        move.l  ocd_sam_ptr(a0),ac_ptr(a1)
        bra.s   .setvolperchan
        ; just update loop offset if looping
        moveq.l #0,d0
        move.w  ocd_loop_offset(a0),d0
        cmp.w   d1,d0
        beq.s   .setvolperchan
        add.l   ocd_sam_ptr(a0),d0
        move.l  d0,ac_ptr(a1)
        move.b  ocd_volume(a0),ac_vol+1(a1)
        move.w  ocd_period(a0),ac_per(a1)
        lea     pcd_SIZEOF(a0),a0
        lea     ac_SIZEOF(a1),a1
        dbra    d7,.checkchan

        tst.w   d2
        beq.s   .skiprasterwait ; if no channel needed triggering, we are done!

        or.w    #DMAF_SETCLR,d2
        cmp.w   vhposr(a5),d5
        bgt.s   .rasterwait1

        move.w  d2,dmacon(a5)   ; enable triggered channels
        add.w   #4<<8,d5        ; target rasterpos

        cmp.w   vhposr(a5),d5
        bgt.s   .rasterwait2

        lea     pv_channeldata+pcd_out_base(a4),a0
        lea     aud(a5),a1
        lsr.b   #1,d2
        bcc.s   .nosetloopchan
        moveq.l #0,d0
        move.w  ocd_loop_offset(a0),d0
        cmp.w   d1,d0
        beq.s   .setchan_no_loop2
        add.l   ocd_sam_ptr(a0),d0
        bra.s   .keepchanrunning2
        move.l  d3,d0
        move.w  #1,ac_len(a1)
        move.l  d0,ac_ptr(a1)
        lea     pcd_SIZEOF(a0),a0
        lea     ac_SIZEOF(a1),a1
        tst.b   d2
        bne.s   .chanloop


        movem.l (sp)+,d2-d7/a2-a6

; table data currently about 450 bytes
        ; Tables used by WaveGen
pre_roll_off_table: ; used by WaveGen
        ; Word-sized entries, written partly in hex and partly in decimal.
        ; The values start at $400 and shrink towards 1. The tail does not
        ; fall strictly: it alternates between neighbouring small values
        ; (4/3, then 3/2, then 2/1) before settling.
        dc.w    $400,$200,$180,$140,$100,$C0,$A0,$80,$78,$74,$6E
        dc.w    $69,$64,$5A,$46,$40,$38,$30,$28,$20,$1F,$1E,$1D
        dc.w    $1C,$1B,$1A,$19,$18,$17,$16,$15,$14,$13,$12,$11
        dc.w    $10,15,14,13,13,12,12,11,11,10,10,9,9,8,8,8,8,7,7
        dc.w    7,7,6,6,6,6,5,5,5,5,4,4,4,4,4,4,4,4,4,4,3,4,4,3,4
        dc.w    4,3,4,3,4,3,4,3,4,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3
        dc.w    2,3,3,2,3,3,2,3,2,3,2,3,2,3,2,3,2,2,2,2,2,2,2,2,1
        dc.w    2,1,2,1,2,1,2,1,1,2,1,1,1,2,1

pre_modulator_ramp_8: ; used by WaveGen
        ; Eight word entries. The commented-out line gives the decimal
        ; equivalents of the hex values that are actually assembled
        ; ($96D = 2413).
        ;dc.w     77,293,539,1079,1337,1877,2413,3031 ; the 1079 value is strange (938 better?)
        dc.w    $4D,$125,$21B,$437,$539,$755,$96D,$BD7

; 16 byte entries: linear then steep quadratic slope
        dc.b    2,3,4,5,6,7,8,9,10,11,12,13,14,20,40,80

; 16 byte entries: linear (a bit wonky), then a bit quadratic, then steep
        dc.b    0,8,9,10,11,12,13,14,18,20,28,40,50,70,160,255

; 16 byte entries: linear then almost quadratic
        dc.b    0,4,8,10,12,14,16,18,20,24,32,40,56,96,150,255

; 16 byte entries: rising from 0 to 143, small steps first, large steps at the end
        dc.b    0,1,3,6,7,9,10,11,12,13,14,16,19,35,55,143

; 16 word entries: falling from $400 down to 1 (first row hex, second row decimal)
        dc.w    $400,$200,$80,$64,$50,$40,$30,$20
        dc.w    16,14,12,10,8,4,2,1

        ; Eight signed byte entries, each a small step scaled by 16:
        ; -4,-5,-1,1,2,3,4,0
        ; Presumably the table read via pre_minus4plus4_table in the track
        ; delay code (index masked with 7, entry sign-extended and used to
        ; nudge the period) -- the label is not visible here, confirm.
        ; NOTE(review): the second entry $b0 is -80 = -5*16; a step of -3
        ; would be $d0. Confirm whether -5 is intended (e.g. kept for
        ; compatibility) before touching the data.
        dc.b    $c0,$b0,$f0,$10,$20,$30,$40,$00

        dc.w    $350
        dc.b    $30,$2e,$2a,$28,$27,$23,$20,$20,$1e,$1c,$1c,$18,$18,$17,$15,$13
        dc.b    $13,$12,$11,$10,$0f,$0e,$0e,$0c,$0c,$0b,$0b,$0a,$09,$09,$09,$07
        dc.b    $08,$06,$07,$00