From 815d5f6e29d6d0daca41a4016a92a920b178a6fb Mon Sep 17 00:00:00 2001 From: chrisly42 Date: Sat, 19 Aug 2023 21:40:08 +0200 Subject: [PATCH] Bugfix for songend detection and more optimizations. --- README.md | 21 +++-- binaries/raspberry_casket.bin | Bin 5840 -> 5802 bytes src/raspberry_casket.asm | 137 +++++++++++++++++-------------- src/raspberry_casket_wavegen.asm | 2 +- 4 files changed, 89 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index ee84a8d..fa23dc3 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ surplus code (maybe artefacts from earlier ideas that did nothing), optimizing the code where possible. This resulted in both reduced size of the replayer, faster sample calculation and speeding the tick routine up significantly. +Bugs from the original replayer were fixed. I also added a few optional features that come in handy, such as song-end detection and precalc progress support. @@ -25,9 +26,8 @@ Pretracker 1.5 tunes, given you don't use sfx or sub-songs. Also: Open source. It's 2023, keeping the code closed is just not part of the demoscene spirit (anymore?), at least for a replayer. -Also note that this is not the final state of the source code. -I could go over many places still and try to rework them. -But I wanted the code to be out in public. +This player is still being optimized and worked on since its +first release in late 2022. Productions that I know have been using Raspberry Casket so far: @@ -44,7 +44,12 @@ During the process this identical state and identical samples promise had to be dropped due to bugs in the original player and optimizations. This is especially the case for the track delay feature of Pretracker that could in some cases cause odd behaviour and unwanted muting that -has been fixed in Raspberry Casket. +has been fixed in Raspberry Casket. So the verification is now heavily +reduced to about 20 songs that still are identical. 
+ +I do, however, now also have an emulated Paula output verification that +compares the generated sound between the original code and Raspberry Casket. +Divergences are manually checked from time to time. If you find some problems, please let me know under chrisly@platon42.de. Thank you. @@ -96,7 +101,7 @@ The original code compressed with *Blueberry's* Shrinkler goes from 18052 bytes down to 9023 bytes. Raspberry Casket, depending on the features compiled in, is about -5840 bytes and shrinkles down to ~4144 bytes (in isolation). +5802 bytes and shrinkles down to ~4125 bytes (in isolation). So this means that the optimization is not just "on the outside". @@ -107,7 +112,7 @@ the remaining code for playback. #### Sample precalculation -Sample generation is a faster than the original 1.0 player and also +Sample generation is faster than the original 1.0 player and also faster than the 1.5 player, which got a slightly better performance than the 1.0 one (compiler change?). @@ -161,7 +166,9 @@ solve this problem. - Moved pattern table init from PlayerInit to SongInit, optimized SongInit a bit. - Wave order table filling moved and optimized in SongInit. - Added Presto player draft. -- Drop-in replacement code size: 5840 bytes. +- Bugfix: Songend detection for back-jumps was broken since at least V1.1. +- Optimized some more wave selection code. +- Drop-in replacement code size: 5802 bytes. ### V1.x (unreleased) - Fixed a bug regarding the copper output mode with looping waves having a loop-offset. 
diff --git a/binaries/raspberry_casket.bin b/binaries/raspberry_casket.bin index a80d9d845a681f2e261605d957af57413a49e767..39a1ae5dd16b206c909c27663046cdf958c1cc71 100644 GIT binary patch delta 327 zcmcbhyGnP$X2wYqxBO?kGS!pwHiRm&} zwd*kON;@;emN9s_rCHf%Gw^t2Gq9uyT{2*^mmO+(smT>x4K>`5Ui6O$b-C0lG$=d=2?7i82D3rLL8K|DDXI@D zf*^vHYCDN=jA)34Xmpf@hK8VsXvluFy}U2)GdvgSle%l+&0ejA-&Dg(e6liZK)`+4 zun%I`okUO8*{gzLdJUT#rHW?^kLk)YEE)wESx@mvI^OxKbBr{=}^e#EMnFM!&vPK8qiV&KGbTdy<_X!?ZweBzW E2a(BT4gdfE diff --git a/src/raspberry_casket.asm b/src/raspberry_casket.asm index 2a7aae7..e7cc6e5 100755 --- a/src/raspberry_casket.asm +++ b/src/raspberry_casket.asm @@ -1,5 +1,5 @@ ;-------------------------------------------------------------------- -; Raspberry Casket Player V2.x (16-Aug-2023) +; Raspberry Casket Player V2.x (19-Aug-2023) ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ; ; Provided by Chris 'platon42' Hodges @@ -16,24 +16,33 @@ ; optimizing the code where possible. This resulted in both reduced ; size of the replayer, faster sample calculation and speeding the ; tick routine up significantly. +; Bugs from the original replayer were fixed. ; ; I also added a few optional features that come in handy, such as ; song-end detection and precalc progress support. ; -; It took me more than a month and it was not fun. -; ; Also: Open source. It's 2023, keeping the code closed is just not ; part of the demoscene spirit (anymore?), at least for a replayer. ; -; Also note that this is not the final state of the source code. -; I could go over many places still and try to rework them. -; But I wanted the code to be out in public. +; This player is still being optimized and worked on since its +; first release in late 2022. ; ; Verification ; ~~~~~~~~~~~~ -; The replayer has been verified on about 60 Pretracker tunes to -; create an identical internal state for each tick and identical -; samples (if certain optimizations switches are disabled). 
+; The first versions of the replayer had been verified against about +; 60 Pretracker tunes to create an identical internal state for each tick +; and identical samples (if certain optimization switches are disabled). +; +; During the process this identical state and identical samples promise +; had to be dropped due to bugs in the original player and optimizations. +; This is especially the case for the track delay feature of Pretracker +; that could in some cases cause odd behaviour and unwanted muting that +; has been fixed in Raspberry Casket. So the verification is now heavily +; reduced to about 20 songs that still are identical. +; +; I do, however, now also have an emulated Paula output verification +; that compares the generated sound between the original code and +; Raspberry Casket. Divergences are manually checked from time to time. ; ; I might have introduced bugs though. If you find some problems, ; please let me know under chrisly@platon42.de. Thank you. @@ -87,20 +96,35 @@ ; 18052 bytes down to 9023 bytes. ; ; Raspberry Casket, depending on the features compiled in, is about -; 5840 bytes and shrinkles down to ~4144 bytes (in isolation). +; 5802 bytes and shrinkles down to ~4125 bytes (in isolation). ; ; So this means that the optimization is not just "on the outside". ; +; About 2.4 KB of the code (and data) are spent for the sample generation, +; the remaining code for playback. +; ; Timing ; ~~~~~~ -; Sample generation is a bit faster (I guess around 10-15%), but most -; of the time is spent on muls operations, so this is the limiting -; factor. +; +; 1. Sample precalculation +; +; Sample generation is faster than the original 1.0 player and also +; faster than the 1.5 player, which got a slightly better performance +; than the 1.0 one (compiler change?). +; +; According to my measurements on my set of Pretracker tunes, +; Raspberry Casket needs between 10% and 20% fewer instructions. 
+; Of these instructions, about 5% are `muls` operations and the new +; player is only able to shave off between 3% and 8% percent of those, +; so this is probably the limiting factor. +; +; 2. Playback ; ; Raspberry Casket is about twice as fast as the old replayer for playback. ; ; Unfortunately, the replayer is still pretty slow and has high ; jitter compared to other standard music replayers. +; ; This means it may take up to 32 raster lines (13-18 on average) ; which is significant more than a standard Protracker replayer ; (the original one could take about 60 raster lines worst case and @@ -745,10 +769,9 @@ pre_PlayerTick: ; Nothing sets pcd_pat_adsr_rel_delay_b from inside the player. ; It is used as a counter to release a note (ADSR) after a given time. ; It's not the same as the instrument ADSR release (see pcd_note_off_delay_b) - move.b pcd_pat_adsr_rel_delay_b(a5),d1 + tst.b pcd_pat_adsr_rel_delay_b(a5) ble.s .handle_2nd_instrument - subq.b #1,d1 - move.b d1,pcd_pat_adsr_rel_delay_b(a5) + subq.b #1,pcd_pat_adsr_rel_delay_b(a5) bne.s .handle_2nd_instrument move.w pcd_adsr_volume_w(a5),d3 @@ -769,10 +792,9 @@ pre_PlayerTick: move.b pcd_pat_2nd_inst_num4_b(a5),d1 beq.s .handle_current_instrument - move.b pcd_pat_2nd_inst_delay_b(a5),d3 + tst.b pcd_pat_2nd_inst_delay_b(a5) beq.s .trigger_2nd_instrument - subq.b #1,d3 - move.b d3,pcd_pat_2nd_inst_delay_b(a5) + subq.b #1,pcd_pat_2nd_inst_delay_b(a5) bra.s .handle_current_instrument .trigger_2nd_instrument @@ -1027,7 +1049,7 @@ pre_PlayerTick: beq.s .release_note tst.b d7 - beq.s .start_patt_effect_handling + beq .start_patt_effect_handling add.w d7,d0 ; pattern pitch shift lsl.w #4,d0 @@ -1274,7 +1296,7 @@ pre_PlayerTick: .pat_play_nop .pat_play_cont cmp.b #NUM_CHANNELS-1,pcd_channel_num_b(a5) - beq .pat_channels_loop_end + beq.s .pat_channels_loop_end lea pcd_SIZEOF(a5),a5 @@ -1290,7 +1312,7 @@ pre_PlayerTick: ; Pattern advancing and pattern break and jump handling. Song looping and song-end detection. 
.pattern_advancing subq.b #1,pv_pat_line_ticks_b(a4) - bne .no_pattern_advance + bne.s .no_pattern_advance ; clear note delay info moveq.l #0,d0 @@ -1334,8 +1356,8 @@ pre_PlayerTick: bmi.s .no_new_position ; has a new pattern position st pv_next_pat_pos_b(a4) IFNE PRETRACKER_SONG_END_DETECTION - cmp.b d4,d3 - bgt.s .no_backjump + cmp.b d3,d4 + bhi.s .no_backjump st pv_songend_detected_b(a4) ; detect jumping back .no_backjump ENDC @@ -1349,7 +1371,7 @@ pre_PlayerTick: .no_new_position cmp.w sv_pat_pos_len_w(a6),d3 - blt.s .no_restart_song + blo.s .no_restart_song move.w sv_pat_restart_pos_w(a6),d3 IFNE PRETRACKER_SONG_END_DETECTION st pv_songend_detected_b(a4) @@ -1472,7 +1494,7 @@ pre_PlayerTick: tst.b d3 bne.s .skippitchloading - andi.w #$3F,d1 + and.w #$3f,d1 beq.s .skippitchloading ; no new note subq.w #1,d1 lsl.w #4,d1 @@ -1516,21 +1538,20 @@ pre_PlayerTick: ; d6 = scratch ; ---------------------------------------- .inst_select_wave + pea .inst_cmd_cont_next(pc) subq.w #1,d5 cmp.w #MAX_WAVES,d5 - bhs .inst_cmd_cont_next + bhs.s .inst_set_wave_rts add.w d5,d5 add.w d5,d5 cmp.w pcd_inst_wave_num4_w(a5),d5 - beq .inst_cmd_cont_next + beq.s .inst_set_wave_rts + +.inst_select_wave_subroutine move.w d5,pcd_inst_wave_num4_w(a5) move.l sv_waveinfo_table(a6,d5.w),a3 move.l a3,pcd_waveinfo_ptr(a5) - move.l pv_wave_sample_table(a4,d5.w),a1 - - pea .inst_cmd_cont_next(pc) -.inst_select_wave_subroutine move.w wi_chipram_w(a3),d5 move.b pcd_channel_mask_b(a5),d3 @@ -1542,16 +1563,19 @@ pre_PlayerTick: tst.w d6 bne.s .inst_select_wave_nosync - move.w wi_loop_offset_w(a3),d6 ; is unlikely 32768 + move.w wi_loop_offset_w(a3),d6 ; is unlikely >= 32768 -- if it is, it will be past end of sample tst.w wi_subloop_len_w(a3) bne.s .inst_set_wave_has_subloop .inst_set_wave_has_no_subloop - adda.w d6,a1 ; add loop offset + adda.w d6,a1 ; add loop offset (which is actually not a loop offset for one-shot samples) sub.w d6,d5 ;cmp.w #1,d5 ; not necessary as increases in steps of 2 bhi.s 
.inst_set_wave_has_min_length moveq.l #2,d5 + IFNE PRETRACKER_BUGFIX_CODE + move.l pv_sample_buffer_ptr(a4),a1 ; fix start address to empty sample + ENDC .inst_set_wave_has_min_length move.w d5,pcd_out_len_w(a5) @@ -1566,7 +1590,7 @@ pre_PlayerTick: move.b wi_subloop_wait_b(a3),d5 addq.w #1,d5 move.w d5,pcd_inst_subloop_wait_w(a5) - +.inst_set_wave_rts rts ; ---------------------------------------- @@ -1763,20 +1787,14 @@ pre_PlayerTick: bpl.s .inst_wave_selected .inst_no_wave_selected ; FIXME this code is dubious at best -- it selects wave 0 if no wave was selected before - clr.w pcd_inst_wave_num4_w(a5) - move.l sv_waveinfo_ptr(a6),a3 - move.l a3,pcd_waveinfo_ptr(a5) - - move.l pv_wave_sample_table(a4),a1 - + moveq.l #0,d5 moveq.l #0,d6 bsr .inst_select_wave_subroutine .inst_wave_selected - cmp.b #$FF,d2 + cmp.b #$ff,d2 beq.s .inst_pat_loop_exit3 - subq.b #1,d2 - move.b d2,pcd_inst_line_ticks_b(a5) + subq.b #1,pcd_inst_line_ticks_b(a5) .inst_pat_loop_exit3 .inst_no_inst_active @@ -1879,10 +1897,9 @@ pre_PlayerTick: move.w d2,pcd_adsr_volume_w(a5) ; handle note cut-off command (EAx command) - move.b pcd_note_off_delay_b(a5),d4 + tst.b pcd_note_off_delay_b(a5) beq.s .dont_release_note - subq.b #1,d4 - move.b d4,pcd_note_off_delay_b(a5) + subq.b #1,pcd_note_off_delay_b(a5) bne.s .dont_release_note ; cut off note clr.w pcd_adsr_volume_w(a5) @@ -1943,12 +1960,12 @@ pre_PlayerTick: bra.s .wave_submove_cont .wave_with_subloop_but_no_wave_offset + move.w pcd_inst_loop_offset_w(a5),d1 subq.w #1,pcd_inst_subloop_wait_w(a5) bgt.s .wave_subloop_wait ; subloop moves! move.b pcd_inst_ping_pong_dir_b(a5),d4 - move.w pcd_inst_loop_offset_w(a5),d1 .wave_submove_cont ; reset subloop wait moveq.l #0,d5 @@ -1996,18 +2013,13 @@ pre_PlayerTick: subq.w #1,pcd_inst_subloop_wait_w(a5) ; why, oh why? 
.wave_new_loop_pos_fits - move.w d1,d4 - move.w d4,pcd_inst_loop_offset_w(a5) - bra.s .done_lof_calc + move.w d1,pcd_inst_loop_offset_w(a5) .wave_subloop_wait - move.w pcd_inst_loop_offset_w(a5),d4 -.done_lof_calc - move.w d4,pcd_out_lof_w(a5) + move.w d1,pcd_out_lof_w(a5) - move.w pcd_inst_wave_num4_w(a5),d1 ; FIXME can we move this to wave loading? - move.l pv_wave_sample_table(a4,d1.w),pcd_out_ptr_l(a5) - bra.s .loop_handling_done + moveq.l #0,d1 + bra.s .wave_load_sample_offset .wave_has_no_subloop moveq.l #0,d1 @@ -2036,7 +2048,7 @@ pre_PlayerTick: ENDC .waveoffset_is_not_past_end move.w d2,pcd_out_len_w(a5) - +.wave_load_sample_offset move.w pcd_inst_wave_num4_w(a5),d2 add.l pv_wave_sample_table(a4,d2.w),d1 move.l d1,pcd_out_ptr_l(a5) @@ -2059,10 +2071,9 @@ pre_PlayerTick: ; ---------------------------------------- ; vibrato processing - move.b pcd_vibrato_delay_w+1(a5),d1 + tst.b pcd_vibrato_delay_w+1(a5) beq.s .vibrato_already_active - subq.b #1,d1 - move.b d1,pcd_vibrato_delay_w+1(a5) + subq.b #1,pcd_vibrato_delay_w+1(a5) bne.s .vibrato_still_delayed .vibrato_already_active @@ -2121,8 +2132,8 @@ pre_PlayerTick: beq.s .has_no_loop_offset subq.w #1,d7 lsr.w d1,d7 - move.w d7,pcd_out_lof_w(a5) ; halve/quarter/eighth loop offset lsr.w d1,d3 + move.w d7,pcd_out_lof_w(a5) ; halve/quarter/eighth loop offset move.w d3,pcd_out_len_w(a5) ; halve/quarter/eighth loop length bra.s .cont_after_loop_fix @@ -2306,7 +2317,7 @@ pre_PlayerTick: or.b d5,d2 move.b d2,(a3)+ ; ocd_trigger (this track) or.b d2,pv_trigger_mask_w+1(a4) - bra.s .check_next_channel + bra .check_next_channel ; ---------------------------------------- .updatechannels diff --git a/src/raspberry_casket_wavegen.asm b/src/raspberry_casket_wavegen.asm index 1945751..9ec7a8f 100644 --- a/src/raspberry_casket_wavegen.asm +++ b/src/raspberry_casket_wavegen.asm @@ -1108,7 +1108,7 @@ pre_WaveGen: btst #5,wi_mod_density_b(a3) ; post bit beq.s .nopostmodulator - bsr pre_Modulator + bsr.s pre_Modulator 
.nopostmodulator ; ----------------------------------------