#include "optical_model.hpp"
#include "utils.hpp"
#include <algorithm> // std::max
#include <cstdint>   // uint8_t
#include <iostream>
#include <stdexcept>
#include <cmath>
#include <random>

// --- Kernel Declarations ---
__global__ void k_modulate(const float* x, const float* A, const float* P, cufftComplex* field, int N_pixels, int S);
__global__ void k_intensity_log1p(const cufftComplex* freq, float* y, int N_elements);
// NEW: Two-layer MLP kernels
__global__ void k_linear_relu_forward(const float* input, const float* W, const float* b, float* output, int B, int input_size, int output_size);
__global__ void k_linear_forward_mlp(const float* input, const float* W, const float* b, float* output, int B, int input_size, int output_size);
__global__ void k_relu_backward(const float* grad_output, const float* forward_output, float* grad_input, int N);
__global__ void k_linear_backward_input(const float* grad_output, const float* W, float* grad_input, int B, int input_size, int output_size);
__global__ void k_accum_linear_grads(const float* input, const float* grad_output, float* gW, float* gb, int B, int input_size, int output_size);
__global__ void k_softmax_xent_loss_grad(const float* logits, const uint8_t* labels, float* grad_logits, float* total_loss, int B, int C);
__global__ void k_reduce_grad_map(const float* grad_y, int B, int S, int stride, float* grad_map);
__global__ void k_sigmoid(const float* logits, float* probs, int N);
// NEW: Multi-scale optical processing kernels
__global__ void k_downsample_2x2(const float* input, float* output, int input_h, int input_w, int B);
__global__ void k_concatenate_features(const float* scale1, const float* scale2, const float* scale3,
                                       float* multiscale, int B, int s1_size, int s2_size, int s3_size);
// NEW: 6-scale mirror concatenation kernel
__global__ void k_concatenate_6scale_mirror(const float* s1, const float* s2, const float* s3,
                                            const float* s1_mir, const float* s2_mir, const float* s3_mir,
                                            float* multiscale, int B, int s1_size, int s2_size, int s3_size);
// NEW: Memory-efficient flip kernels for mirror architecture
__global__ void k_flip_horizontal(const float* input, float* output, int height, int width, int B);
__global__ void k_flip_vertical(const float* input, float* output, int height, int width, int B);
// NEW: Diagnostic kernels for bottleneck analysis
__global__ void k_analyze_activation_saturation(const float* activations, float* stats, int N);
__global__ void k_analyze_gradient_flow(const float* gradients, float* stats, int N);
__global__ void k_bottleneck_detector(const float* input_features, const float* hidden_act,
                                     const float* logits, float* bottleneck_metrics,
                                     int batch_size, int input_size, int hidden_size, int output_size);
// Rich FFT extraction kernels: separate magnitude and phase channels (legacy dual-channel path)
__global__ void k_intensity_magnitude_phase(const cufftComplex* freq, float* y, int N_elements);
__global__ void k_rich_fft_extraction(const cufftComplex* freq, float* magnitude_out, float* phase_out, int N_elements);
__global__ void k_concatenate_dual_channels(const float* magnitude_channel, const float* phase_channel,
                                           float* rich_features, int B, int channel_size);

// --- Device Memory Management ---
// C++ OPTIMIZATION: Allocate persistent GPU buffers once
void allocate_device_buffers(DeviceBuffers& db, int B) {
    const size_t S = IMG_SIZE, H = HIDDEN_SIZE, C = NUM_CLASSES;
    const size_t MS = MULTISCALE_SIZE; // Enhanced 6-scale mirror feature size = 2058

    // Batch-dependent buffers
    check_cuda(cudaMalloc(&db.d_batch_in, sizeof(float) * B * S), "alloc d_batch_in");
    check_cuda(cudaMalloc(&db.d_batch_lbl, sizeof(uint8_t) * B), "alloc d_batch_lbl");

    // Multi-scale optical processing buffers
    check_cuda(cudaMalloc(&db.d_field_scale1, sizeof(cufftComplex) * B * SCALE_1_SIZE), "alloc d_field_scale1");
    check_cuda(cudaMalloc(&db.d_freq_scale1, sizeof(cufftComplex) * B * SCALE_1_SIZE), "alloc d_freq_scale1");
    check_cuda(cudaMalloc(&db.d_features_scale1, sizeof(float) * B * SCALE_1_SIZE), "alloc d_features_scale1");

    check_cuda(cudaMalloc(&db.d_field_scale2, sizeof(cufftComplex) * B * SCALE_2_SIZE), "alloc d_field_scale2");
    check_cuda(cudaMalloc(&db.d_freq_scale2, sizeof(cufftComplex) * B * SCALE_2_SIZE), "alloc d_freq_scale2");
    check_cuda(cudaMalloc(&db.d_features_scale2, sizeof(float) * B * SCALE_2_SIZE), "alloc d_features_scale2");

    check_cuda(cudaMalloc(&db.d_field_scale3, sizeof(cufftComplex) * B * SCALE_3_SIZE), "alloc d_field_scale3");
    check_cuda(cudaMalloc(&db.d_freq_scale3, sizeof(cufftComplex) * B * SCALE_3_SIZE), "alloc d_freq_scale3");
    check_cuda(cudaMalloc(&db.d_features_scale3, sizeof(float) * B * SCALE_3_SIZE), "alloc d_features_scale3");

    // Mirror architecture: allocate mirrored feature buffers
    check_cuda(cudaMalloc(&db.d_features_scale1_mirror, sizeof(float) * B * SCALE_1_SIZE), "alloc d_features_scale1_mirror");
    check_cuda(cudaMalloc(&db.d_features_scale2_mirror, sizeof(float) * B * SCALE_2_SIZE), "alloc d_features_scale2_mirror");
    check_cuda(cudaMalloc(&db.d_features_scale3_mirror, sizeof(float) * B * SCALE_3_SIZE), "alloc d_features_scale3_mirror");

    // LEGACY: Rich dual-channel processing buffers (not used in intelligent solution)
    // check_cuda(cudaMalloc(&db.d_magnitude_features, sizeof(float) * B * MS), "alloc d_magnitude_features");
    // check_cuda(cudaMalloc(&db.d_phase_features, sizeof(float) * B * MS), "alloc d_phase_features");

    check_cuda(cudaMalloc(&db.d_multiscale_features, sizeof(float) * B * MS), "alloc d_multiscale_features");
    check_cuda(cudaMalloc(&db.d_hidden, sizeof(float) * B * H), "alloc d_hidden");
    check_cuda(cudaMalloc(&db.d_logits, sizeof(float) * B * C), "alloc d_logits");
    check_cuda(cudaMalloc(&db.d_probs, sizeof(float) * B * C), "alloc d_probs");
    check_cuda(cudaMalloc(&db.d_grad_logits, sizeof(float) * B * C), "alloc d_grad_logits");
    check_cuda(cudaMalloc(&db.d_grad_hidden, sizeof(float) * B * H), "alloc d_grad_hidden");
    check_cuda(cudaMalloc(&db.d_grad_multiscale, sizeof(float) * B * MS), "alloc d_grad_multiscale");

    // Fungi buffers
    check_cuda(cudaMalloc(&db.d_A, sizeof(float) * S), "alloc d_A");
    check_cuda(cudaMalloc(&db.d_P, sizeof(float) * S), "alloc d_P");
    check_cuda(cudaMalloc(&db.d_grad_map, sizeof(float) * S), "alloc d_grad_map");

    // C++ OPTIMIZATION: Persistent weight buffers (allocated once, updated in-place)
    check_cuda(cudaMalloc(&db.d_W1, sizeof(float) * H * MS), "alloc persistent d_W1");
    check_cuda(cudaMalloc(&db.d_b1, sizeof(float) * H), "alloc persistent d_b1");
    check_cuda(cudaMalloc(&db.d_W2, sizeof(float) * C * H), "alloc persistent d_W2");
    check_cuda(cudaMalloc(&db.d_b2, sizeof(float) * C), "alloc persistent d_b2");
    check_cuda(cudaMalloc(&db.d_gW1, sizeof(float) * H * MS), "alloc persistent d_gW1");
    check_cuda(cudaMalloc(&db.d_gb1, sizeof(float) * H), "alloc persistent d_gb1");
    check_cuda(cudaMalloc(&db.d_gW2, sizeof(float) * C * H), "alloc persistent d_gW2");
    check_cuda(cudaMalloc(&db.d_gb2, sizeof(float) * C), "alloc persistent d_gb2");
    check_cuda(cudaMalloc(&db.d_loss_scalar, sizeof(float)), "alloc persistent d_loss_scalar");

    // CRITICAL: Bottleneck detection buffer - [4] metrics array
    check_cuda(cudaMalloc(&db.d_bottleneck_metrics, sizeof(float) * 4), "alloc bottleneck_metrics");
}

void free_device_buffers(DeviceBuffers& db) {
    // Free batch-dependent buffers
    if (db.d_batch_in) cudaFree(db.d_batch_in);
    if (db.d_batch_lbl) cudaFree(db.d_batch_lbl);

    // Free multi-scale optical processing buffers
    if (db.d_field_scale1) cudaFree(db.d_field_scale1);
    if (db.d_freq_scale1) cudaFree(db.d_freq_scale1);
    if (db.d_features_scale1) cudaFree(db.d_features_scale1);
    if (db.d_field_scale2) cudaFree(db.d_field_scale2);
    if (db.d_freq_scale2) cudaFree(db.d_freq_scale2);
    if (db.d_features_scale2) cudaFree(db.d_features_scale2);
    if (db.d_field_scale3) cudaFree(db.d_field_scale3);
    if (db.d_freq_scale3) cudaFree(db.d_freq_scale3);
    if (db.d_features_scale3) cudaFree(db.d_features_scale3);
    // Free mirror architecture buffers
    if (db.d_features_scale1_mirror) cudaFree(db.d_features_scale1_mirror);
    if (db.d_features_scale2_mirror) cudaFree(db.d_features_scale2_mirror);
    if (db.d_features_scale3_mirror) cudaFree(db.d_features_scale3_mirror);
    if (db.d_multiscale_features) cudaFree(db.d_multiscale_features);

    if (db.d_hidden) cudaFree(db.d_hidden);
    if (db.d_logits) cudaFree(db.d_logits);
    if (db.d_probs) cudaFree(db.d_probs);
    if (db.d_grad_logits) cudaFree(db.d_grad_logits);
    if (db.d_grad_hidden) cudaFree(db.d_grad_hidden);
    if (db.d_grad_multiscale) cudaFree(db.d_grad_multiscale);

    // Free fungi buffers
    if (db.d_A) cudaFree(db.d_A);
    if (db.d_P) cudaFree(db.d_P);
    if (db.d_grad_map) cudaFree(db.d_grad_map);

    // Free persistent weight buffers
    if (db.d_W1) cudaFree(db.d_W1);
    if (db.d_b1) cudaFree(db.d_b1);
    if (db.d_W2) cudaFree(db.d_W2);
    if (db.d_b2) cudaFree(db.d_b2);
    if (db.d_gW1) cudaFree(db.d_gW1);
    if (db.d_gb1) cudaFree(db.d_gb1);
    if (db.d_gW2) cudaFree(db.d_gW2);
    if (db.d_gb2) cudaFree(db.d_gb2);
    if (db.d_loss_scalar) cudaFree(db.d_loss_scalar);

    // CRITICAL: Free bottleneck detection buffer
    if (db.d_bottleneck_metrics) cudaFree(db.d_bottleneck_metrics);

    // LEGACY: Free rich dual-channel buffers (not used in intelligent solution)
    // if (db.d_magnitude_features) cudaFree(db.d_magnitude_features);
    // if (db.d_phase_features) cudaFree(db.d_phase_features);
}

// --- Adam Updater (Host) ---
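// Update rule implemented below (standard Adam with decoupled, AdamW-style
// weight decay; matches the loop body exactly):
//   m_t = b1*m_{t-1} + (1-b1)*g_t          v_t = b2*v_{t-1} + (1-b2)*g_t^2
//   m_hat = m_t / (1 - b1^t)               v_hat = v_t / (1 - b2^t)
//   P -= lr * (m_hat / (sqrt(v_hat) + eps) + wd * P)
// (Bias updates pass wd = 0, so decay is applied to weights only.)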
static void adam_update(std::vector<float>& P, std::vector<float>& m, std::vector<float>& v,
                        const float* g_dev, size_t n, float lr, float wd, int t) {
    std::vector<float> g(n);
    check_cuda(cudaMemcpy(g.data(), g_dev, sizeof(float) * n, cudaMemcpyDeviceToHost), "D2H grads for Adam");
    float b1 = 0.9f, b2 = 0.999f, eps = 1e-8f;
    float b1t = 1.f - std::pow(b1, (float)t);
    float b2t = 1.f - std::pow(b2, (float)t);
    for (size_t i = 0; i < n; ++i) {
        m[i] = b1 * m[i] + (1 - b1) * g[i];
        v[i] = b2 * v[i] + (1 - b2) * g[i] * g[i];
        float mh = m[i] / b1t;
        float vh = v[i] / b2t;
        P[i] -= lr * (mh / (std::sqrt(vh) + eps) + wd * P[i]);
    }
}

// C++ OPTIMIZATION: Efficient GPU weight management
void upload_params_to_gpu(const OpticalParams& params, DeviceBuffers& db) {
    const size_t MS = MULTISCALE_SIZE, H = HIDDEN_SIZE, C = NUM_CLASSES;

    // Upload weights to persistent GPU buffers
    check_cuda(cudaMemcpy(db.d_W1, params.W1.data(), sizeof(float) * H * MS, cudaMemcpyHostToDevice), "upload W1");
    check_cuda(cudaMemcpy(db.d_b1, params.b1.data(), sizeof(float) * H, cudaMemcpyHostToDevice), "upload b1");
    check_cuda(cudaMemcpy(db.d_W2, params.W2.data(), sizeof(float) * C * H, cudaMemcpyHostToDevice), "upload W2");
    check_cuda(cudaMemcpy(db.d_b2, params.b2.data(), sizeof(float) * C, cudaMemcpyHostToDevice), "upload b2");
}

void download_params_from_gpu(OpticalParams& params, const DeviceBuffers& db) {
    const size_t MS = MULTISCALE_SIZE, H = HIDDEN_SIZE, C = NUM_CLASSES;

    // Download updated weights from GPU
    check_cuda(cudaMemcpy(params.W1.data(), db.d_W1, sizeof(float) * H * MS, cudaMemcpyDeviceToHost), "download W1");
    check_cuda(cudaMemcpy(params.b1.data(), db.d_b1, sizeof(float) * H, cudaMemcpyDeviceToHost), "download b1");
    check_cuda(cudaMemcpy(params.W2.data(), db.d_W2, sizeof(float) * C * H, cudaMemcpyDeviceToHost), "download W2");
    check_cuda(cudaMemcpy(params.b2.data(), db.d_b2, sizeof(float) * C, cudaMemcpyDeviceToHost), "download b2");
}

// --- Training Step for Multi-Scale Two-Layer MLP ---
// CHANGE LOG: Multi-scale optical processing for 90%+ accuracy
// FORWARD: multi_scale_features -> W1*features+b1 -> ReLU -> W2*hidden+b2 -> logits
// BACKWARD: Full backpropagation through both layers with multi-scale features
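// Tensor shapes per batch (B = batch size): input [B, IMG_SIZE=784] ->
// multiscale features [B, MULTISCALE_SIZE=2058] -> hidden [B, HIDDEN_SIZE]
// -> logits [B, NUM_CLASSES]. All matrices are row-major; weights are stored
// as [out_size, in_size].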
float train_batch(const float* h_batch_in, const uint8_t* h_batch_lbl,
                  int B, FungiSoA& fungi, OpticalParams& params,
                  DeviceBuffers& db, FFTPlan& fft,
                  float lr, float wd, int t_adam) {
    const int S = IMG_SIZE, H = HIDDEN_SIZE, C = NUM_CLASSES, MS = MULTISCALE_SIZE;

    check_cuda(cudaMemcpy(db.d_batch_in, h_batch_in, sizeof(float) * B * S, cudaMemcpyHostToDevice), "H2D input");
    check_cuda(cudaMemcpy(db.d_batch_lbl, h_batch_lbl, sizeof(uint8_t) * B, cudaMemcpyHostToDevice), "H2D labels");

    // Multi-scale optical processing for 90%+ accuracy
    fungi_build_masks_GPU(fungi, db.d_A, db.d_P);

    // DIAGNOSTIC: Analyze mask statistics every 10 batches
    static int debug_counter = 0;
    if (debug_counter % 10 == 0) { // Every 10 batches
        fungi_analyze_mask_statistics(db.d_A, db.d_P, IMG_SIZE);
    }
    debug_counter++;

    // Scale 1: Full resolution 28x28 = 784 features
    k_modulate<<<(B * S + 255) / 256, 256>>>(db.d_batch_in, db.d_A, db.d_P, db.d_field_scale1, B * S, S);
    cufftExecC2C(fft.plan_fwd_scale1, db.d_field_scale1, db.d_freq_scale1, CUFFT_FORWARD);
    k_intensity_magnitude_phase<<<(B * SCALE_1_SIZE + 255) / 256, 256>>>(db.d_freq_scale1, db.d_features_scale1, B * SCALE_1_SIZE);

    // Scale 2: Half resolution 14x14 = 196 features
    // BUGFIX: stage the downsampled image in the float feature buffer first.
    // Previously the float image and the complex field aliased the same buffer,
    // a data race (k_modulate threads overwrite pixels other threads have not
    // yet read). d_features_scale2 is free here and is overwritten by the
    // intensity kernel below anyway. The first SCALE_2_SIZE mask entries are
    // reused at this scale (a simplification; no separate 14x14 mask is trained).
    k_downsample_2x2<<<(B * SCALE_2_SIZE + 255) / 256, 256>>>(db.d_batch_in, db.d_features_scale2, IMG_H, IMG_W, B);
    k_modulate<<<(B * SCALE_2_SIZE + 255) / 256, 256>>>(db.d_features_scale2, db.d_A, db.d_P, db.d_field_scale2, B * SCALE_2_SIZE, SCALE_2_SIZE);
    cufftExecC2C(fft.plan_fwd_scale2, db.d_field_scale2, db.d_freq_scale2, CUFFT_FORWARD);
    k_intensity_magnitude_phase<<<(B * SCALE_2_SIZE + 255) / 256, 256>>>(db.d_freq_scale2, db.d_features_scale2, B * SCALE_2_SIZE);

    // Scale 3: Quarter resolution 7x7 = 49 features (downsamples the scale-2 feature map, not the raw image)
    k_downsample_2x2<<<(B * SCALE_3_SIZE + 255) / 256, 256>>>(db.d_features_scale2, db.d_features_scale3, SCALE_2, SCALE_2, B);
    k_modulate<<<(B * SCALE_3_SIZE + 255) / 256, 256>>>(db.d_features_scale3, db.d_A, db.d_P, db.d_field_scale3, B * SCALE_3_SIZE, SCALE_3_SIZE);
    cufftExecC2C(fft.plan_fwd_scale3, db.d_field_scale3, db.d_freq_scale3, CUFFT_FORWARD);
    k_intensity_magnitude_phase<<<(B * SCALE_3_SIZE + 255) / 256, 256>>>(db.d_freq_scale3, db.d_features_scale3, B * SCALE_3_SIZE);

    // Mirror processing: create horizontally flipped versions for enhanced features
    k_flip_horizontal<<<(B * SCALE_1_SIZE + 255) / 256, 256>>>(db.d_features_scale1, db.d_features_scale1_mirror, SCALE_1, SCALE_1, B);
    k_flip_horizontal<<<(B * SCALE_2_SIZE + 255) / 256, 256>>>(db.d_features_scale2, db.d_features_scale2_mirror, SCALE_2, SCALE_2, B);
    k_flip_horizontal<<<(B * SCALE_3_SIZE + 255) / 256, 256>>>(db.d_features_scale3, db.d_features_scale3_mirror, SCALE_3, SCALE_3, B);

    // 6-SCALE MIRROR: concatenate normal + mirrored features into one 2058-wide vector
    k_concatenate_6scale_mirror<<<(B * MS + 255) / 256, 256>>>(
        db.d_features_scale1, db.d_features_scale2, db.d_features_scale3,
        db.d_features_scale1_mirror, db.d_features_scale2_mirror, db.d_features_scale3_mirror,
        db.d_multiscale_features, B, SCALE_1_SIZE, SCALE_2_SIZE, SCALE_3_SIZE);

    // C++ OPTIMIZATION: Use persistent GPU buffers (NO malloc/free per batch!)
    // Forward pass: Layer 1 (with ReLU) - Enhanced 2058 features with better FFT extraction
    k_linear_relu_forward<<<(B*H+255)/256, 256>>>(db.d_multiscale_features, db.d_W1, db.d_b1, db.d_hidden, B, MS, H);

    // Forward pass: Layer 2 (linear)
    k_linear_forward_mlp<<<(B*C+255)/256, 256>>>(db.d_hidden, db.d_W2, db.d_b2, db.d_logits, B, H, C);

    // CRITICAL: Real-time bottleneck detection - analyze information flow
    static int bottleneck_counter = 0;
    if (bottleneck_counter % 5 == 0) { // Every 5 batches for performance
        cudaMemset(db.d_bottleneck_metrics, 0, sizeof(float) * 4);
        int max_threads = std::max(std::max(MS, H), C);
        k_bottleneck_detector<<<(max_threads + 255) / 256, 256>>>(
            db.d_multiscale_features, db.d_hidden, db.d_logits, db.d_bottleneck_metrics,
            B, MS, H, C);

        // Download metrics and report critical bottlenecks
        float h_metrics[4] = {0};
        cudaMemcpy(h_metrics, db.d_bottleneck_metrics, sizeof(float) * 4, cudaMemcpyDeviceToHost);

        float dead_features_pct = (h_metrics[0] / MS) * 100.0f;        // % dead input features
        float dead_neurons_pct = (h_metrics[1] / H) * 100.0f;         // % dead hidden neurons
        float saturated_neurons_pct = (h_metrics[2] / H) * 100.0f;    // % saturated hidden neurons
        float poor_discrimination_pct = (h_metrics[3] / C) * 100.0f;  // % poor output discrimination

        // ALERT: Critical bottleneck detection
        if (dead_features_pct > 20.0f || dead_neurons_pct > 30.0f ||
            saturated_neurons_pct > 30.0f || poor_discrimination_pct > 40.0f) {
            printf("🚨 CRITICAL BOTTLENECK DETECTED:\n");
            printf("   📉 Dead Features: %.1f%% | Dead Neurons: %.1f%% | Saturated: %.1f%% | Poor Discrim: %.1f%%\n",
                   dead_features_pct, dead_neurons_pct, saturated_neurons_pct, poor_discrimination_pct);
        }
    }
    bottleneck_counter++;

    // Loss computation (using persistent buffer)
    cudaMemset(db.d_loss_scalar, 0, sizeof(float));
    k_softmax_xent_loss_grad<<<(B + 255) / 256, 256>>>(db.d_logits, (const uint8_t*)db.d_batch_lbl, db.d_grad_logits, db.d_loss_scalar, B, C);

    // Backward pass: Layer 2 gradients (using persistent buffers)
    cudaMemset(db.d_gW2, 0, sizeof(float)*C*H);
    cudaMemset(db.d_gb2, 0, sizeof(float)*C);
    k_accum_linear_grads<<<(C+255)/256, 256>>>(db.d_hidden, db.d_grad_logits, db.d_gW2, db.d_gb2, B, H, C);

    // Backward pass: Hidden layer gradient
    k_linear_backward_input<<<(B*H+255)/256, 256>>>(db.d_grad_logits, db.d_W2, db.d_grad_hidden, B, H, C);

    // Backward pass: ReLU gradient
    k_relu_backward<<<(B*H+255)/256, 256>>>(db.d_grad_hidden, db.d_hidden, db.d_grad_hidden, B*H);

    // Backward pass: Layer 1 gradients (using persistent buffers with multi-scale)
    cudaMemset(db.d_gW1, 0, sizeof(float)*H*MS);
    cudaMemset(db.d_gb1, 0, sizeof(float)*H);
    k_accum_linear_grads<<<(H+255)/256, 256>>>(db.d_multiscale_features, db.d_grad_hidden, db.d_gW1, db.d_gb1, B, MS, H);

    // Backward pass: gradient w.r.t. the multi-scale features, reduced to a
    // per-pixel map for the fungi (simplified: only the scale-1 block, i.e.
    // the first S entries of each MS-wide row, feeds the mask update)
    k_linear_backward_input<<<(B*MS+255)/256, 256>>>(db.d_grad_hidden, db.d_W1, db.d_grad_multiscale, B, MS, H);
    k_reduce_grad_map<<<(S + 255) / 256, 256>>>(db.d_grad_multiscale, B, S, MS, db.d_grad_map);

    // Adam updates for all parameters (using persistent buffers with multi-scale)
    adam_update(params.W1, params.m_W1, params.v_W1, db.d_gW1, H * MS, lr, wd, t_adam);
    adam_update(params.b1, params.m_b1, params.v_b1, db.d_gb1, H, lr, 0.0f, t_adam);
    adam_update(params.W2, params.m_W2, params.v_W2, db.d_gW2, C * H, lr, wd, t_adam);
    adam_update(params.b2, params.m_b2, params.v_b2, db.d_gb2, C, lr, 0.0f, t_adam);

    // BUGFIX: Upload updated weights back to GPU after Adam
    upload_params_to_gpu(params, db);

    float h_loss;
    check_cuda(cudaMemcpy(&h_loss, db.d_loss_scalar, sizeof(float), cudaMemcpyDeviceToHost), "D2H loss");

    return h_loss / B;
}
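// Usage sketch (host-side training loop; EPOCHS and BATCH are illustrative
// names, not defined in this file):
//   init_params(params, seed);
//   allocate_device_buffers(db, BATCH);
//   create_fft_plan(fft, BATCH);
//   upload_params_to_gpu(params, db);
//   int t = 1;
//   for (int e = 0; e < EPOCHS; ++e)
//       for (each batch of BATCH samples; ++t)
//           float loss = train_batch(batch_pixels, batch_labels, BATCH,
//                                    fungi, params, db, fft, lr, wd, t);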

// --- Inference for Multi-Scale Two-Layer MLP ---
void infer_batch(const float* h_batch_in, int B,
                 const FungiSoA& fungi, const OpticalParams& params,
                 DeviceBuffers& db, FFTPlan& fft,
                 std::vector<int>& out_predictions) {
    const int S = IMG_SIZE, H = HIDDEN_SIZE, C = NUM_CLASSES, MS = MULTISCALE_SIZE;
    check_cuda(cudaMemcpy(db.d_batch_in, h_batch_in, sizeof(float) * B * S, cudaMemcpyHostToDevice), "H2D infer input");

    // Multi-scale optical processing for inference
    fungi_build_masks_GPU(const_cast<FungiSoA&>(fungi), db.d_A, db.d_P);

    // Scale 1: Full resolution 28x28 = 784 features
    k_modulate<<<(B * S + 255) / 256, 256>>>(db.d_batch_in, db.d_A, db.d_P, db.d_field_scale1, B * S, S);
    cufftExecC2C(fft.plan_fwd_scale1, db.d_field_scale1, db.d_freq_scale1, CUFFT_FORWARD);
    k_intensity_magnitude_phase<<<(B * SCALE_1_SIZE + 255) / 256, 256>>>(db.d_freq_scale1, db.d_features_scale1, B * SCALE_1_SIZE);

    // Scale 2: Half resolution 14x14 = 196 features
    // BUGFIX: stage the downsampled image in the float feature buffer first
    // (see train_batch for details on the former buffer-aliasing race)
    k_downsample_2x2<<<(B * SCALE_2_SIZE + 255) / 256, 256>>>(db.d_batch_in, db.d_features_scale2, IMG_H, IMG_W, B);
    k_modulate<<<(B * SCALE_2_SIZE + 255) / 256, 256>>>(db.d_features_scale2, db.d_A, db.d_P, db.d_field_scale2, B * SCALE_2_SIZE, SCALE_2_SIZE);
    cufftExecC2C(fft.plan_fwd_scale2, db.d_field_scale2, db.d_freq_scale2, CUFFT_FORWARD);
    k_intensity_magnitude_phase<<<(B * SCALE_2_SIZE + 255) / 256, 256>>>(db.d_freq_scale2, db.d_features_scale2, B * SCALE_2_SIZE);

    // Scale 3: Quarter resolution 7x7 = 49 features (downsamples the scale-2 feature map, not the raw image)
    k_downsample_2x2<<<(B * SCALE_3_SIZE + 255) / 256, 256>>>(db.d_features_scale2, db.d_features_scale3, SCALE_2, SCALE_2, B);
    k_modulate<<<(B * SCALE_3_SIZE + 255) / 256, 256>>>(db.d_features_scale3, db.d_A, db.d_P, db.d_field_scale3, B * SCALE_3_SIZE, SCALE_3_SIZE);
    cufftExecC2C(fft.plan_fwd_scale3, db.d_field_scale3, db.d_freq_scale3, CUFFT_FORWARD);
    k_intensity_magnitude_phase<<<(B * SCALE_3_SIZE + 255) / 256, 256>>>(db.d_freq_scale3, db.d_features_scale3, B * SCALE_3_SIZE);

    // Mirror processing: create horizontally flipped versions for enhanced features
    k_flip_horizontal<<<(B * SCALE_1_SIZE + 255) / 256, 256>>>(db.d_features_scale1, db.d_features_scale1_mirror, SCALE_1, SCALE_1, B);
    k_flip_horizontal<<<(B * SCALE_2_SIZE + 255) / 256, 256>>>(db.d_features_scale2, db.d_features_scale2_mirror, SCALE_2, SCALE_2, B);
    k_flip_horizontal<<<(B * SCALE_3_SIZE + 255) / 256, 256>>>(db.d_features_scale3, db.d_features_scale3_mirror, SCALE_3, SCALE_3, B);

    // 6-SCALE MIRROR: concatenate normal + mirrored features (inference path)
    k_concatenate_6scale_mirror<<<(B * MS + 255) / 256, 256>>>(
        db.d_features_scale1, db.d_features_scale2, db.d_features_scale3,
        db.d_features_scale1_mirror, db.d_features_scale2_mirror, db.d_features_scale3_mirror,
        db.d_multiscale_features, B, SCALE_1_SIZE, SCALE_2_SIZE, SCALE_3_SIZE);

    // C++ OPTIMIZATION: Use persistent GPU buffers for inference
    // Forward pass: Layer 1 (with ReLU) - Enhanced 2058 features with better FFT extraction
    k_linear_relu_forward<<<(B*H+255)/256, 256>>>(db.d_multiscale_features, db.d_W1, db.d_b1, db.d_hidden, B, MS, H);

    // Forward pass: Layer 2 (linear)
    k_linear_forward_mlp<<<(B*C+255)/256, 256>>>(db.d_hidden, db.d_W2, db.d_b2, db.d_logits, B, H, C);

    std::vector<float> h_logits(B * C);
    check_cuda(cudaMemcpy(h_logits.data(), db.d_logits, sizeof(float) * B * C, cudaMemcpyDeviceToHost), "D2H logits");

    out_predictions.resize(B);
    for (int b = 0; b < B; ++b) {
        int best_class = 0;
        float max_logit = h_logits[b * C];
        for (int c = 1; c < C; ++c) {
            if (h_logits[b * C + c] > max_logit) {
                max_logit = h_logits[b * C + c];
                best_class = c;
            }
        }
        out_predictions[b] = best_class;
    }
}

// --- Kernels ---
// Optical modulation: per-pixel amplitude/phase mask applied to the input.
// S is the per-image pixel count, so mask indexing stays consistent across the
// batch (BUGFIX: i % IMG_SIZE previously mixed mask rows across batch elements
// at the coarser scales, where the per-image size is smaller than IMG_SIZE).
__global__ void k_modulate(const float* x, const float* A, const float* P, cufftComplex* field, int N_pixels, int S) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N_pixels) return;

    int pixel_idx = i % S;

    float input_val = x[i];
    float amp = A[pixel_idx];
    float phase = P[pixel_idx];

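    // Complex field: field = x * A * e^{iP} = x * A * (cos P + i*sin P)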
    field[i].x = input_val * amp * cosf(phase);
    field[i].y = input_val * amp * sinf(phase);
}

__global__ void k_intensity_log1p(const cufftComplex* freq, float* y, int N_elements) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N_elements) return;

    float intensity = freq[i].x * freq[i].x + freq[i].y * freq[i].y;
    y[i] = log1pf(intensity);
}

// BOTTLENECK FIX: Enhanced extraction with magnitude and phase information
__global__ void k_intensity_magnitude_phase(const cufftComplex* freq, float* y, int N_elements) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N_elements) return;

    float real = freq[i].x;
    float imag = freq[i].y;

    // Preserve both magnitude and phase information
    float magnitude = sqrtf(real * real + imag * imag);
    float phase = atan2f(imag, real);

    // Combine into one scalar per bin: log-magnitude plus bounded phase and
    // approximate sign-of-real / sign-of-imag terms (the ratios saturate to
    // +/-1). Note this is still a 1-D projection of the complex value; the
    // weights 0.5 / 0.2 / 0.1 are empirical mixing coefficients.
    y[i] = log1pf(magnitude) + 0.5f * tanhf(phase) + 0.2f * (real / (fabsf(real) + 1e-6f)) + 0.1f * (imag / (fabsf(imag) + 1e-6f));
}

// Rich FFT extraction: writes two channels (magnitude and phase), doubling the
// per-bin information relative to the single-channel kernels above.
// LEGACY: part of the dual-channel path; not launched anywhere in this file.
__global__ void k_rich_fft_extraction(const cufftComplex* freq, float* magnitude_out, float* phase_out, int N_elements) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N_elements) return;

    float real = freq[i].x;
    float imag = freq[i].y;

    // Magnitude channel: log-compressed, plus a small monotone bias term
    float magnitude = sqrtf(real * real + imag * imag);
    magnitude_out[i] = log1pf(magnitude) + 0.1f * atan2f(magnitude, 1.0f);

    // Phase channel: atan2 phase in (-pi, pi], squashed smoothly into (-1, 1)
    const float PI = 3.14159265f;
    float phase = atan2f(imag, real);
    phase_out[i] = tanhf(2.0f * phase / PI);
}

// Concatenate magnitude and phase channels into one feature vector per sample.
// LEGACY: companion to k_rich_fft_extraction; not launched anywhere in this file.
__global__ void k_concatenate_dual_channels(const float* magnitude_channel, const float* phase_channel,
                                           float* rich_features, int B, int channel_size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int total_size = 2 * channel_size; // magnitude + phase = double size

    if (idx >= B * total_size) return;

    int batch_idx = idx / total_size;
    int feature_idx = idx % total_size;

    if (feature_idx < channel_size) {
        // First half: magnitude channel [0, channel_size)
        rich_features[idx] = magnitude_channel[batch_idx * channel_size + feature_idx];
    } else {
        // Second half: phase channel [channel_size, 2*channel_size)
        int phase_idx = feature_idx - channel_size;
        rich_features[idx] = phase_channel[batch_idx * channel_size + phase_idx];
    }
}

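// LEGACY: single-layer forward kernel from the original one-layer model;
// not launched anywhere in this file (the two-layer path uses k_linear_forward_mlp).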
__global__ void k_linear_forward(const float* y, const float* W, const float* b, float* logits, int B, int S, int C) {
    int batch_class = blockIdx.x * blockDim.x + threadIdx.x;
    if (batch_class >= B * C) return;

    int batch_idx = batch_class / C;
    int class_idx = batch_class % C;

    float sum = b[class_idx];
    for (int s = 0; s < S; ++s) {
        sum += W[class_idx * S + s] * y[batch_idx * S + s];
    }
    logits[batch_class] = sum;
}

__global__ void k_softmax_xent_loss_grad(const float* logits, const uint8_t* labels, float* grad_logits, float* total_loss, int B, int C) {
    int b = blockIdx.x * blockDim.x + threadIdx.x;
    if (b >= B) return;

    const float* b_logits = logits + b * C;
    float max_val = -1e20f;
    for (int c = 0; c < C; ++c) {
        if (b_logits[c] > max_val) max_val = b_logits[c];
    }

    float exp_sum = 0.f;
    float exp_vals[10]; // assumes C <= 10 (NUM_CLASSES)
    for (int c = 0; c < C; ++c) {
        exp_vals[c] = expf(b_logits[c] - max_val);
        exp_sum += exp_vals[c];
    }

    uint8_t true_label = labels[b];
    float* b_grad = grad_logits + b * C;
    float loss = 0.f;

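    // Gradient of softmax cross-entropy w.r.t. logits: dL/dz_c = p_c - 1[c == true_label]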
    for (int c = 0; c < C; ++c) {
        float prob = exp_vals[c] / exp_sum;
        b_grad[c] = prob - (c == true_label ? 1.f : 0.f);
        if (c == true_label) {
            loss = -logf(fmaxf(prob, 1e-9f));
        }
    }
    atomicAdd(total_loss, loss);
}

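// LEGACY: single-layer input-gradient kernel; not launched anywhere in this
// file (the two-layer path uses k_linear_backward_input).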
__global__ void k_backprop_y(const float* grad_logits, const float* W, float* grad_y, int B, int S, int C) {
    int batch_pixel = blockIdx.x * blockDim.x + threadIdx.x;
    if (batch_pixel >= B * S) return;

    int batch_idx = batch_pixel / S;
    int pixel_idx = batch_pixel % S;

    float sum = 0.f;
    for (int c = 0; c < C; ++c) {
        sum += grad_logits[batch_idx * C + c] * W[c * S + pixel_idx];
    }
    grad_y[batch_pixel] = sum;
}

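// LEGACY: single-layer weight/bias gradient kernel; not launched anywhere in
// this file (the two-layer path uses k_accum_linear_grads).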
__global__ void k_accum_grads_Wb(const float* y, const float* grad_logits, float* gW, float* gb, int B, int S, int C) {
    int class_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (class_idx >= C) return;

    float gb_sum = 0.f;
    for (int b = 0; b < B; ++b) {
        gb_sum += grad_logits[b * C + class_idx];
    }
    gb[class_idx] = gb_sum;

    for (int s = 0; s < S; ++s) {
        float gw_sum = 0.f;
        for (int b = 0; b < B; ++b) {
            gw_sum += grad_logits[b * C + class_idx] * y[b * S + s];
        }
        gW[class_idx * S + s] = gw_sum;
    }
}

// Reduce per-sample feature gradients to a per-pixel map for the fungi update.
// stride is the row width of grad_y (MULTISCALE_SIZE in train_batch, where only
// the first S entries of each row are reduced; BUGFIX: the old code strode by S
// and read the wrong elements for every batch element after the first).
__global__ void k_reduce_grad_map(const float* grad_y, int B, int S, int stride, float* grad_map) {
    int pixel = blockIdx.x * blockDim.x + threadIdx.x;
    if (pixel >= S) return;

    float acc = 0.f;
    for (int b = 0; b < B; ++b) {
        acc += fabsf(grad_y[b * stride + pixel]);
    }
    grad_map[pixel] = acc / static_cast<float>(B);
}

__global__ void k_sigmoid(const float* logits, float* probs, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N) return;
    probs[i] = 1.f / (1.f + expf(-logits[i]));
}

void create_fft_plan(FFTPlan& fft, int batch) {
    // Scale 1: 28x28 FFT plans
    int n1[2] = {SCALE_1, SCALE_1};
    cufftPlanMany(&fft.plan_fwd_scale1, 2, n1, nullptr, 1, SCALE_1_SIZE, nullptr, 1, SCALE_1_SIZE, CUFFT_C2C, batch);
    cufftPlanMany(&fft.plan_inv_scale1, 2, n1, nullptr, 1, SCALE_1_SIZE, nullptr, 1, SCALE_1_SIZE, CUFFT_C2C, batch);

    // Scale 2: 14x14 FFT plans
    int n2[2] = {SCALE_2, SCALE_2};
    cufftPlanMany(&fft.plan_fwd_scale2, 2, n2, nullptr, 1, SCALE_2_SIZE, nullptr, 1, SCALE_2_SIZE, CUFFT_C2C, batch);
    cufftPlanMany(&fft.plan_inv_scale2, 2, n2, nullptr, 1, SCALE_2_SIZE, nullptr, 1, SCALE_2_SIZE, CUFFT_C2C, batch);

    // Scale 3: 7x7 FFT plans
    int n3[2] = {SCALE_3, SCALE_3};
    cufftPlanMany(&fft.plan_fwd_scale3, 2, n3, nullptr, 1, SCALE_3_SIZE, nullptr, 1, SCALE_3_SIZE, CUFFT_C2C, batch);
    cufftPlanMany(&fft.plan_inv_scale3, 2, n3, nullptr, 1, SCALE_3_SIZE, nullptr, 1, SCALE_3_SIZE, CUFFT_C2C, batch);
}

void destroy_fft_plan(FFTPlan& fft) {
    if (fft.plan_fwd_scale1) cufftDestroy(fft.plan_fwd_scale1);
    if (fft.plan_inv_scale1) cufftDestroy(fft.plan_inv_scale1);
    if (fft.plan_fwd_scale2) cufftDestroy(fft.plan_fwd_scale2);
    if (fft.plan_inv_scale2) cufftDestroy(fft.plan_inv_scale2);
    if (fft.plan_fwd_scale3) cufftDestroy(fft.plan_fwd_scale3);
    if (fft.plan_inv_scale3) cufftDestroy(fft.plan_inv_scale3);
}
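// Plan lifecycle sketch (host; `BATCH` is an illustrative name). Plans are
// created once for a fixed batch size and reused every step, so every batch
// fed to train_batch / infer_batch must contain exactly `batch` images:
//   FFTPlan fft{};
//   create_fft_plan(fft, BATCH);
//   ... training / inference ...
//   destroy_fft_plan(fft);
// (Passing nullptr for inembed/onembed to cufftPlanMany means the advanced
// layout is ignored and the batched images are assumed tightly packed.)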

// CHANGE LOG: Updated initialization for 6-scale mirror two-layer MLP
// ORIGINAL: Single layer initialization (IMG_SIZE=784)
// NEW: Xavier/Glorot initialization for both layers (MULTISCALE_SIZE=2058)
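// Worked example (HIDDEN_SIZE = 256 is assumed here purely for illustration;
// the real value comes from the header):
//   std_W1 = sqrt(2 / (2058 + 256)) ~= 0.0294
//   std_W2 = sqrt(2 / (256 + 10))   ~= 0.0867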
void init_params(OpticalParams& p, unsigned seed) {
    std::mt19937 gen(seed);

    // Xavier initialization: std = sqrt(2 / (fan_in + fan_out))
    float std_W1 = std::sqrt(2.0f / (MULTISCALE_SIZE + HIDDEN_SIZE));
    float std_W2 = std::sqrt(2.0f / (HIDDEN_SIZE + NUM_CLASSES));
    std::normal_distribution<float> dist_W1(0.f, std_W1);
    std::normal_distribution<float> dist_W2(0.f, std_W2);

    // First layer: MULTISCALE_SIZE -> HIDDEN_SIZE
    size_t W1_size = HIDDEN_SIZE * MULTISCALE_SIZE;
    p.W1.resize(W1_size);
    p.b1.resize(HIDDEN_SIZE);
    p.m_W1.resize(W1_size);
    p.v_W1.resize(W1_size);
    p.m_b1.resize(HIDDEN_SIZE);
    p.v_b1.resize(HIDDEN_SIZE);

    for (size_t i = 0; i < W1_size; ++i) {
        p.W1[i] = dist_W1(gen);
        p.m_W1[i] = 0.f;
        p.v_W1[i] = 0.f;
    }
    for (size_t i = 0; i < HIDDEN_SIZE; ++i) {
        p.b1[i] = 0.f;
        p.m_b1[i] = 0.f;
        p.v_b1[i] = 0.f;
    }

    // Second layer: HIDDEN_SIZE -> NUM_CLASSES
    size_t W2_size = NUM_CLASSES * HIDDEN_SIZE;
    p.W2.resize(W2_size);
    p.b2.resize(NUM_CLASSES);
    p.m_W2.resize(W2_size);
    p.v_W2.resize(W2_size);
    p.m_b2.resize(NUM_CLASSES);
    p.v_b2.resize(NUM_CLASSES);

    for (size_t i = 0; i < W2_size; ++i) {
        p.W2[i] = dist_W2(gen);
        p.m_W2[i] = 0.f;
        p.v_W2[i] = 0.f;
    }
    for (size_t i = 0; i < NUM_CLASSES; ++i) {
        p.b2[i] = 0.f;
        p.m_b2[i] = 0.f;
        p.v_b2[i] = 0.f;
    }
}

// --- NEW KERNELS FOR TWO-LAYER MLP ---

// Linear layer with ReLU activation: output = ReLU(W * input + b)
__global__ void k_linear_relu_forward(const float* input, const float* W, const float* b, float* output, int B, int input_size, int output_size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= B * output_size) return;

    int batch_idx = idx / output_size;
    int out_idx = idx % output_size;

    float sum = b[out_idx];
    for (int i = 0; i < input_size; ++i) {
        sum += W[out_idx * input_size + i] * input[batch_idx * input_size + i];
    }
    output[idx] = fmaxf(0.0f, sum); // ReLU activation
}

// Linear layer without activation: output = W * input + b
__global__ void k_linear_forward_mlp(const float* input, const float* W, const float* b, float* output, int B, int input_size, int output_size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= B * output_size) return;

    int batch_idx = idx / output_size;
    int out_idx = idx % output_size;

    float sum = b[out_idx];
    for (int i = 0; i < input_size; ++i) {
        sum += W[out_idx * input_size + i] * input[batch_idx * input_size + i];
    }
    output[idx] = sum;
}

// ReLU backward: grad_input = grad_output * (forward_output > 0)
__global__ void k_relu_backward(const float* grad_output, const float* forward_output, float* grad_input, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N) return;
    grad_input[i] = grad_output[i] * (forward_output[i] > 0.0f ? 1.0f : 0.0f);
}

// Linear backward (input gradients): grad_input = W^T * grad_output
__global__ void k_linear_backward_input(const float* grad_output, const float* W, float* grad_input, int B, int input_size, int output_size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= B * input_size) return;

    int batch_idx = idx / input_size;
    int in_idx = idx % input_size;

    float sum = 0.0f;
    for (int o = 0; o < output_size; ++o) {
        sum += W[o * input_size + in_idx] * grad_output[batch_idx * output_size + o];
    }
    grad_input[idx] = sum;
}

// Accumulate gradients for linear layer weights and biases
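// gb[o] = sum_b grad_output[b, o];  gW[o, i] = sum_b grad_output[b, o] * input[b, i].
// One thread per output unit; each thread loops over input_size * B terms.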
__global__ void k_accum_linear_grads(const float* input, const float* grad_output, float* gW, float* gb, int B, int input_size, int output_size) {
    int out_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (out_idx >= output_size) return;

    // Accumulate bias gradient
    float gb_sum = 0.0f;
    for (int b = 0; b < B; ++b) {
        gb_sum += grad_output[b * output_size + out_idx];
    }
    gb[out_idx] = gb_sum;

    // Accumulate weight gradients
    for (int in_idx = 0; in_idx < input_size; ++in_idx) {
        float gw_sum = 0.0f;
        for (int b = 0; b < B; ++b) {
            gw_sum += grad_output[b * output_size + out_idx] * input[b * input_size + in_idx];
        }
        gW[out_idx * input_size + in_idx] = gw_sum;
    }
}

// --- NEW KERNELS FOR MULTI-SCALE OPTICAL PROCESSING ---

// Downsample by 2x2 average pooling (e.g., 28x28 -> 14x14)
__global__ void k_downsample_2x2(const float* input, float* output, int input_h, int input_w, int B) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int output_h = input_h / 2;
    int output_w = input_w / 2;
    int output_size = output_h * output_w;

    if (idx >= B * output_size) return;

    int batch_idx = idx / output_size;
    int out_pixel = idx % output_size;
    int out_y = out_pixel / output_w;
    int out_x = out_pixel % output_w;

    // Average 2x2 region
    float sum = 0.0f;
    for (int dy = 0; dy < 2; ++dy) {
        for (int dx = 0; dx < 2; ++dx) {
            int in_y = out_y * 2 + dy;
            int in_x = out_x * 2 + dx;
            int in_pixel = in_y * input_w + in_x;
            sum += input[batch_idx * (input_h * input_w) + in_pixel];
        }
    }
    output[idx] = sum * 0.25f; // Average of 4 pixels
}
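// Launch pattern used in this file (one thread per output pixel):
//   k_downsample_2x2<<<(B * out_size + 255) / 256, 256>>>(in, out, in_h, in_w, B);
// e.g. 28x28 -> 14x14 passes IMG_H, IMG_W with out_size = SCALE_2_SIZE = 196.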


// Concatenate multi-scale features: [scale1 | scale2 | scale3]
__global__ void k_concatenate_features(const float* scale1, const float* scale2, const float* scale3,
                                       float* multiscale, int B, int s1_size, int s2_size, int s3_size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int total_size = s1_size + s2_size + s3_size;

    if (idx >= B * total_size) return;

    int batch_idx = idx / total_size;
    int feature_idx = idx % total_size;

    if (feature_idx < s1_size) {
        // Copy from scale1
        multiscale[idx] = scale1[batch_idx * s1_size + feature_idx];
    } else if (feature_idx < s1_size + s2_size) {
        // Copy from scale2
        int s2_idx = feature_idx - s1_size;
        multiscale[idx] = scale2[batch_idx * s2_size + s2_idx];
    } else {
        // Copy from scale3
        int s3_idx = feature_idx - s1_size - s2_size;
        multiscale[idx] = scale3[batch_idx * s3_size + s3_idx];
    }
}

// Horizontal flip (left-right): output[b][row][col] = input[b][row][width-1-col] (out of place, not in place)
__global__ void k_flip_horizontal(const float* input, float* output, int height, int width, int B) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int total_pixels = B * height * width;

    if (idx >= total_pixels) return;

    int batch_idx = idx / (height * width);
    int pixel_idx = idx % (height * width);
    int row = pixel_idx / width;
    int col = pixel_idx % width;

    // Flip column: new_col = width - 1 - col
    int flipped_col = width - 1 - col;
    int flipped_idx = batch_idx * (height * width) + row * width + flipped_col;

    output[idx] = input[flipped_idx];
}

// Vertical flip (top-bottom): output[b][row][col] = input[b][height-1-row][col]
// (out of place). Not launched anywhere in this file; kept for experimenting
// with vertical-mirror features.
__global__ void k_flip_vertical(const float* input, float* output, int height, int width, int B) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int total_pixels = B * height * width;

    if (idx >= total_pixels) return;

    int batch_idx = idx / (height * width);
    int pixel_idx = idx % (height * width);
    int row = pixel_idx / width;
    int col = pixel_idx % width;

    // Flip row: new_row = height - 1 - row
    int flipped_row = height - 1 - row;
    int flipped_idx = batch_idx * (height * width) + flipped_row * width + col;

    output[idx] = input[flipped_idx];
}

// 6-scale mirror concatenation: [s1 | s2 | s3 | s1_mir | s2_mir | s3_mir]
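// With the sizes used here (784, 196, 49) each 2058-wide row is laid out as:
//   [0,784) s1 | [784,980) s2 | [980,1029) s3 |
//   [1029,1813) s1_mir | [1813,2009) s2_mir | [2009,2058) s3_mir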
__global__ void k_concatenate_6scale_mirror(const float* s1, const float* s2, const float* s3,
                                            const float* s1_mir, const float* s2_mir, const float* s3_mir,
                                            float* multiscale, int B, int s1_size, int s2_size, int s3_size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int single_size = s1_size + s2_size + s3_size; // 1029
    int total_size = 2 * single_size; // 2058

    if (idx >= B * total_size) return;

    int batch_idx = idx / total_size;
    int feature_idx = idx % total_size;

    if (feature_idx < single_size) {
        // First half: normal features [s1 | s2 | s3]
        if (feature_idx < s1_size) {
            // Copy from scale1
            multiscale[idx] = s1[batch_idx * s1_size + feature_idx];
        } else if (feature_idx < s1_size + s2_size) {
            // Copy from scale2
            int s2_idx = feature_idx - s1_size;
            multiscale[idx] = s2[batch_idx * s2_size + s2_idx];
        } else {
            // Copy from scale3
            int s3_idx = feature_idx - s1_size - s2_size;
            multiscale[idx] = s3[batch_idx * s3_size + s3_idx];
        }
    } else {
        // Second half: mirrored features [s1_mir | s2_mir | s3_mir]
        int mirror_idx = feature_idx - single_size;
        if (mirror_idx < s1_size) {
            // Copy from mirrored scale1
            multiscale[idx] = s1_mir[batch_idx * s1_size + mirror_idx];
        } else if (mirror_idx < s1_size + s2_size) {
            // Copy from mirrored scale2
            int s2_idx = mirror_idx - s1_size;
            multiscale[idx] = s2_mir[batch_idx * s2_size + s2_idx];
        } else {
            // Copy from mirrored scale3
            int s3_idx = mirror_idx - s1_size - s2_size;
            multiscale[idx] = s3_mir[batch_idx * s3_size + s3_idx];
        }
    }
}


// ================= BOTTLENECK ANALYSIS KERNELS =================

// Analyze activation saturation (ReLU dead neurons)
__global__ void k_analyze_activation_saturation(const float* activations, float* stats, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N) return;

    float val = activations[idx];
    
    // Use atomic operations to gather statistics
    if (val <= 1e-6f) {
        atomicAdd(&stats[0], 1.0f); // Dead neurons (ReLU=0)
    } else if (val >= 0.99f) {
        atomicAdd(&stats[1], 1.0f); // Near-saturated activations (heuristic cutoff; ReLU itself is unbounded)
    }
    atomicAdd(&stats[2], val);      // Sum for mean
    atomicAdd(&stats[3], val*val);  // Sum squares for variance
}

// Analyze gradient flow (vanishing/exploding gradients)
__global__ void k_analyze_gradient_flow(const float* gradients, float* stats, int N) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N) return;

    float grad = gradients[idx];
    float abs_grad = fabsf(grad);

    if (abs_grad < 1e-6f) {
        atomicAdd(&stats[0], 1.0f); // Vanishing gradients
    } else if (abs_grad > 10.0f) {
        atomicAdd(&stats[1], 1.0f); // Exploding gradients
    }
    atomicAdd(&stats[2], abs_grad); // Sum for mean
}

// CRITICAL: Real-time bottleneck detection - identifies where information is lost
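// Output layout of bottleneck_metrics[4] (raw counts; converted to
// percentages on the host in train_batch):
//   [0] near-constant input features   [1] dead hidden neurons
//   [2] saturated hidden neurons       [3] output classes with tiny logit range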
__global__ void k_bottleneck_detector(const float* input_features, const float* hidden_act,
                                     const float* logits, float* bottleneck_metrics,
                                     int batch_size, int input_size, int hidden_size, int output_size) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Feature diversity analysis (input layer) - detect information collapse
    if (tid < input_size) {
        float feature_sum = 0.0f, feature_var = 0.0f;
        for (int b = 0; b < batch_size; b++) {
            float val = input_features[b * input_size + tid];
            feature_sum += val;
        }
        float mean = feature_sum / batch_size;

        for (int b = 0; b < batch_size; b++) {
            float val = input_features[b * input_size + tid];
            feature_var += (val - mean) * (val - mean);
        }
        feature_var /= batch_size;

        // Low variance = information loss (features all the same value)
        if (feature_var < 1e-4f) atomicAdd(&bottleneck_metrics[0], 1.0f); // Dead features count
    }

    // Hidden activation analysis - detect neural saturation
    if (tid < hidden_size) {
        float hidden_sum = 0.0f;
        for (int b = 0; b < batch_size; b++) {
            hidden_sum += hidden_act[b * hidden_size + tid];
        }
        float hidden_mean = hidden_sum / batch_size;

        // Saturation detection (critical bottleneck indicators)
        if (hidden_mean < 0.01f) atomicAdd(&bottleneck_metrics[1], 1.0f);     // Dead neurons
        if (hidden_mean > 0.99f) atomicAdd(&bottleneck_metrics[2], 1.0f);     // Saturated neurons
    }

    // Logits analysis (output bottleneck) - detect poor class discrimination
    if (tid < output_size) {
        float logit_range = 0.0f;
        float min_logit = 1e10f, max_logit = -1e10f;

        for (int b = 0; b < batch_size; b++) {
            float val = logits[b * output_size + tid];
            min_logit = fminf(min_logit, val);
            max_logit = fmaxf(max_logit, val);
        }
        logit_range = max_logit - min_logit;

        // Small range = poor discrimination ability (critical bottleneck)
        if (logit_range < 0.1f) atomicAdd(&bottleneck_metrics[3], 1.0f); // Poor discrimination count
    }
}