diff --git a/README.md b/README.md
index b81c2a8..ee84a8d 100644
--- a/README.md
+++ b/README.md
@@ -89,14 +89,14 @@ The source needs two common include files to compile (`custom.i` and
 ### Size
 
 The original C compiled code was... just bad. The new binary is
-about 1/3 of the original one.
+less than 1/3 of the original one.
 
 The code has been also optimized in a way that it compresses better.
 The original code compressed with *Blueberry's* Shrinkler goes from
 18052 bytes down to 9023 bytes.
 
 Raspberry Casket, depending on the features compiled in, is about
-5850 bytes and shrinkles down to ~4154 bytes (in isolation).
+5840 bytes and shrinkles down to ~4144 bytes (in isolation).
 
 So this means that the optimization is not just "on the outside".
 
@@ -159,8 +159,9 @@ solve this problem.
 - This removes a big source of cpu jitter when track delay is enabled (no longer clearing the track delay buffer).
 - This also fixes usages of illegal period 0 in the lead-in that could cause the replay to miss the first trigger.
 - Moved pattern table init from PlayerInit to SongInit, optimized SongInit a bit.
+- Wave order table filling moved earlier within SongInit and optimized.
 - Added Presto player draft.
-- Drop-in replacement code size: 5850 bytes.
+- Drop-in replacement code size: 5840 bytes.
 
 ### V1.x (unreleased)
 - Fixed a bug regarding the copper output mode with looping waves having a loop-offset.
diff --git a/binaries/raspberry_casket.bin b/binaries/raspberry_casket.bin
index 2c4d8bc..a80d9d8 100644
Binary files a/binaries/raspberry_casket.bin and b/binaries/raspberry_casket.bin differ
diff --git a/src/raspberry_casket.asm b/src/raspberry_casket.asm
index 8918e5e..2a7aae7 100755
--- a/src/raspberry_casket.asm
+++ b/src/raspberry_casket.asm
@@ -80,14 +80,14 @@
 ; Size
 ; ~~~~
 ; The original C compiled code was... just bad. The new binary is
-; about 1/3rd of the original one.
+; less than 1/3rd of the original one.
 ;
 ; The code has been also optimized in a way that it compresses better.
 ; The original code compressed with Blueberry's Shrinkler goes from
 ; 18052 bytes down to 9023 bytes.
 ;
 ; Raspberry Casket, depending on the features compiled in, is about
-; 5850 bytes and shrinkles down to ~4154 bytes (in isolation).
+; 5840 bytes and shrinkles down to ~4144 bytes (in isolation).
 ;
 ; So this means that the optimization is not just "on the outside".
 ;
@@ -394,11 +394,27 @@ pre_SongInit:
 
         move.b  d0,sv_num_steps_b(a1)
 
-        mulu    #3,d0                           ; *3 bytes per pattern line
+        lea     sv_wavegen_order_table+MAX_WAVES(a1),a3
+        moveq.l #MAX_WAVES-1,d3                 ; fill 24 bytes with default order of waves?
+.fillcount
+        move.b  d3,-(a3)
+        dbra    d3,.fillcount
 
+        cmp.b   #$19,d2                         ; check if version is higher than 19
+        bls.s   .hasnowaveordering
+
+        moveq.l #MAX_WAVES-1,d3
+.waveorderloop
+        move.b  (a0)+,(a3)+                     ; $0042 wave generation ordering
+        dbra    d3,.waveorderloop
+
+.hasnowaveordering
+        
         lea     sv_pattern_table(a1),a0
 .pattableloop
         move.l  a4,(a0)+
+        add.w   d0,a4                           ; *3 bytes per pattern line
+        add.w   d0,a4
         add.w   d0,a4
         subq.b  #1,d1
         bne.s   .pattableloop
@@ -494,26 +510,6 @@ pre_SongInit:
 .addressiseven
         move.l  a0,sv_waveinfo_ptr(a1)
 
-        lea     sv_wavegen_order_table(a1),a0
-        cmpi.b  #$19,d2                         ; check if version is higher than 19
-        bhi.s   .haswaveorderinfo
-
-        moveq.l #0,d0                           ; fill 24 bytes with default order of waves?
-        moveq.l #MAX_WAVES-1,d7
-.fillcount
-        move.b  d0,(a0)+
-        addq.b  #1,d0
-        dbra    d7,.fillcount
-        bra.s   .contafterworkaround
-
-.haswaveorderinfo
-        moveq.l #(MAX_WAVES/4)-1,d7
-        lea     $0042(a2),a2                    ; offset into wave ordering
-.memcpyloop
-        move.l  (a2)+,(a0)+
-        dbra    d7,.memcpyloop
-
-.contafterworkaround
         moveq.l #2,d0                           ; at least empty sample
         moveq.l #0,d7
         move.b  sv_num_waves_b(a1),d7           ; has instruments?