|
|
#include "optical_model.hpp" |
|
|
#include "utils.hpp" |
|
|
#include <iostream> |
|
|
#include <stdexcept> |
|
|
#include <cmath> |
|
|
#include <random>
#include <algorithm>  // std::max
#include <cstdint>    // uint8_t
|
|
|
|
|
// ================= KERNEL FORWARD DECLARATIONS =================
|
|
__global__ void k_modulate(const float* x, const float* A, const float* P, cufftComplex* field, int N_pixels); |
|
|
__global__ void k_intensity_log1p(const cufftComplex* freq, float* y, int N_elements); |
|
|
// NEW: Two-layer MLP kernels |
|
|
__global__ void k_linear_relu_forward(const float* input, const float* W, const float* b, float* output, int B, int input_size, int output_size); |
|
|
__global__ void k_linear_forward_mlp(const float* input, const float* W, const float* b, float* output, int B, int input_size, int output_size); |
|
|
__global__ void k_relu_backward(const float* grad_output, const float* forward_output, float* grad_input, int N); |
|
|
__global__ void k_linear_backward_input(const float* grad_output, const float* W, float* grad_input, int B, int input_size, int output_size); |
|
|
__global__ void k_accum_linear_grads(const float* input, const float* grad_output, float* gW, float* gb, int B, int input_size, int output_size); |
|
|
__global__ void k_softmax_xent_loss_grad(const float* logits, const uint8_t* labels, float* grad_logits, float* total_loss, int B, int C); |
|
|
__global__ void k_reduce_grad_map(const float* grad_y, int B, int S, int row_stride, float* grad_map);
|
|
__global__ void k_sigmoid(const float* logits, float* probs, int N); |
|
|
// NEW: Multi-scale optical processing kernels |
|
|
__global__ void k_downsample_2x2(const float* input, float* output, int input_h, int input_w, int B); |
|
|
__global__ void k_concatenate_features(const float* scale1, const float* scale2, const float* scale3, |
|
|
float* multiscale, int B, int s1_size, int s2_size, int s3_size); |
|
|
// NEW: 6-scale mirror concatenation kernel |
|
|
__global__ void k_concatenate_6scale_mirror(const float* s1, const float* s2, const float* s3, |
|
|
const float* s1_mir, const float* s2_mir, const float* s3_mir, |
|
|
float* multiscale, int B, int s1_size, int s2_size, int s3_size); |
|
|
// NEW: Memory-efficient flip kernels for mirror architecture |
|
|
__global__ void k_flip_horizontal(const float* input, float* output, int height, int width, int B); |
|
|
__global__ void k_flip_vertical(const float* input, float* output, int height, int width, int B); |
|
|
// NEW: Diagnostic kernels for bottleneck analysis |
|
|
__global__ void k_analyze_activation_saturation(const float* activations, float* stats, int N); |
|
|
__global__ void k_analyze_gradient_flow(const float* gradients, float* stats, int N); |
|
|
__global__ void k_bottleneck_detector(const float* input_features, const float* hidden_act, |
|
|
const float* logits, float* bottleneck_metrics, |
|
|
int batch_size, int input_size, int hidden_size, int output_size); |
|
|
// BREAKTHROUGH: Rich FFT extraction preserving ALL complex information |
|
|
__global__ void k_intensity_magnitude_phase(const cufftComplex* freq, float* y, int N_elements); |
|
|
__global__ void k_rich_fft_extraction(const cufftComplex* freq, float* magnitude_out, float* phase_out, int N_elements); |
|
|
__global__ void k_concatenate_dual_channels(const float* magnitude_channel, const float* phase_channel, |
|
|
float* rich_features, int B, int channel_size); |
|
|
|
|
|
// ================= DEVICE BUFFER MANAGEMENT =================
|
|
// C++ OPTIMIZATION: Allocate persistent GPU buffers once |
|
|
void allocate_device_buffers(DeviceBuffers& db, int B) { |
|
|
const size_t S = IMG_SIZE, H = HIDDEN_SIZE, C = NUM_CLASSES; |
|
|
const size_t MS = MULTISCALE_SIZE; // Enhanced 6-scale mirror feature size = 2058 |
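    // Breakdown: 2 * (SCALE_1_SIZE + SCALE_2_SIZE + SCALE_3_SIZE) = 2 * (784 + 196 + 49) = 2 * 1029 = 2058 features per sample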
|
|
|
|
|
// Batch-dependent buffers |
|
|
check_cuda(cudaMalloc(&db.d_batch_in, sizeof(float) * B * S), "alloc d_batch_in"); |
|
|
check_cuda(cudaMalloc(&db.d_batch_lbl, sizeof(uint8_t) * B), "alloc d_batch_lbl"); |
|
|
|
|
|
// Multi-scale optical processing buffers |
|
|
check_cuda(cudaMalloc(&db.d_field_scale1, sizeof(cufftComplex) * B * SCALE_1_SIZE), "alloc d_field_scale1"); |
|
|
check_cuda(cudaMalloc(&db.d_freq_scale1, sizeof(cufftComplex) * B * SCALE_1_SIZE), "alloc d_freq_scale1"); |
|
|
check_cuda(cudaMalloc(&db.d_features_scale1, sizeof(float) * B * SCALE_1_SIZE), "alloc d_features_scale1"); |
|
|
|
|
|
check_cuda(cudaMalloc(&db.d_field_scale2, sizeof(cufftComplex) * B * SCALE_2_SIZE), "alloc d_field_scale2"); |
|
|
check_cuda(cudaMalloc(&db.d_freq_scale2, sizeof(cufftComplex) * B * SCALE_2_SIZE), "alloc d_freq_scale2"); |
|
|
check_cuda(cudaMalloc(&db.d_features_scale2, sizeof(float) * B * SCALE_2_SIZE), "alloc d_features_scale2"); |
|
|
|
|
|
check_cuda(cudaMalloc(&db.d_field_scale3, sizeof(cufftComplex) * B * SCALE_3_SIZE), "alloc d_field_scale3"); |
|
|
check_cuda(cudaMalloc(&db.d_freq_scale3, sizeof(cufftComplex) * B * SCALE_3_SIZE), "alloc d_freq_scale3"); |
|
|
check_cuda(cudaMalloc(&db.d_features_scale3, sizeof(float) * B * SCALE_3_SIZE), "alloc d_features_scale3"); |
|
|
|
|
|
// Mirror architecture: allocate mirrored feature buffers |
|
|
check_cuda(cudaMalloc(&db.d_features_scale1_mirror, sizeof(float) * B * SCALE_1_SIZE), "alloc d_features_scale1_mirror"); |
|
|
check_cuda(cudaMalloc(&db.d_features_scale2_mirror, sizeof(float) * B * SCALE_2_SIZE), "alloc d_features_scale2_mirror"); |
|
|
check_cuda(cudaMalloc(&db.d_features_scale3_mirror, sizeof(float) * B * SCALE_3_SIZE), "alloc d_features_scale3_mirror"); |
|
|
|
|
|
// LEGACY: Rich dual-channel processing buffers (not used in intelligent solution) |
|
|
// check_cuda(cudaMalloc(&db.d_magnitude_features, sizeof(float) * B * MS), "alloc d_magnitude_features"); |
|
|
// check_cuda(cudaMalloc(&db.d_phase_features, sizeof(float) * B * MS), "alloc d_phase_features"); |
|
|
|
|
|
check_cuda(cudaMalloc(&db.d_multiscale_features, sizeof(float) * B * MS), "alloc d_multiscale_features"); |
|
|
check_cuda(cudaMalloc(&db.d_hidden, sizeof(float) * B * H), "alloc d_hidden"); |
|
|
check_cuda(cudaMalloc(&db.d_logits, sizeof(float) * B * C), "alloc d_logits"); |
|
|
check_cuda(cudaMalloc(&db.d_probs, sizeof(float) * B * C), "alloc d_probs"); |
|
|
check_cuda(cudaMalloc(&db.d_grad_logits, sizeof(float) * B * C), "alloc d_grad_logits"); |
|
|
check_cuda(cudaMalloc(&db.d_grad_hidden, sizeof(float) * B * H), "alloc d_grad_hidden"); |
|
|
check_cuda(cudaMalloc(&db.d_grad_multiscale, sizeof(float) * B * MS), "alloc d_grad_multiscale"); |
|
|
|
|
|
// Fungi buffers |
|
|
check_cuda(cudaMalloc(&db.d_A, sizeof(float) * S), "alloc d_A"); |
|
|
check_cuda(cudaMalloc(&db.d_P, sizeof(float) * S), "alloc d_P"); |
|
|
check_cuda(cudaMalloc(&db.d_grad_map, sizeof(float) * S), "alloc d_grad_map"); |
|
|
|
|
|
// C++ OPTIMIZATION: Persistent weight buffers (allocated once, updated in-place) |
|
|
check_cuda(cudaMalloc(&db.d_W1, sizeof(float) * H * MS), "alloc persistent d_W1"); |
|
|
check_cuda(cudaMalloc(&db.d_b1, sizeof(float) * H), "alloc persistent d_b1"); |
|
|
check_cuda(cudaMalloc(&db.d_W2, sizeof(float) * C * H), "alloc persistent d_W2"); |
|
|
check_cuda(cudaMalloc(&db.d_b2, sizeof(float) * C), "alloc persistent d_b2"); |
|
|
check_cuda(cudaMalloc(&db.d_gW1, sizeof(float) * H * MS), "alloc persistent d_gW1"); |
|
|
check_cuda(cudaMalloc(&db.d_gb1, sizeof(float) * H), "alloc persistent d_gb1"); |
|
|
check_cuda(cudaMalloc(&db.d_gW2, sizeof(float) * C * H), "alloc persistent d_gW2"); |
|
|
check_cuda(cudaMalloc(&db.d_gb2, sizeof(float) * C), "alloc persistent d_gb2"); |
|
|
check_cuda(cudaMalloc(&db.d_loss_scalar, sizeof(float)), "alloc persistent d_loss_scalar"); |
|
|
|
|
|
// CRITICAL: Bottleneck detection buffer - [4] metrics array |
|
|
check_cuda(cudaMalloc(&db.d_bottleneck_metrics, sizeof(float) * 4), "alloc bottleneck_metrics"); |
|
|
} |
|
|
|
|
|
void free_device_buffers(DeviceBuffers& db) { |
|
|
// Free batch-dependent buffers |
|
|
if (db.d_batch_in) cudaFree(db.d_batch_in); |
|
|
if (db.d_batch_lbl) cudaFree(db.d_batch_lbl); |
|
|
|
|
|
// Free multi-scale optical processing buffers |
|
|
if (db.d_field_scale1) cudaFree(db.d_field_scale1); |
|
|
if (db.d_freq_scale1) cudaFree(db.d_freq_scale1); |
|
|
if (db.d_features_scale1) cudaFree(db.d_features_scale1); |
|
|
if (db.d_field_scale2) cudaFree(db.d_field_scale2); |
|
|
if (db.d_freq_scale2) cudaFree(db.d_freq_scale2); |
|
|
if (db.d_features_scale2) cudaFree(db.d_features_scale2); |
|
|
if (db.d_field_scale3) cudaFree(db.d_field_scale3); |
|
|
if (db.d_freq_scale3) cudaFree(db.d_freq_scale3); |
|
|
if (db.d_features_scale3) cudaFree(db.d_features_scale3); |
|
|
// Free mirror architecture buffers |
|
|
if (db.d_features_scale1_mirror) cudaFree(db.d_features_scale1_mirror); |
|
|
if (db.d_features_scale2_mirror) cudaFree(db.d_features_scale2_mirror); |
|
|
if (db.d_features_scale3_mirror) cudaFree(db.d_features_scale3_mirror); |
|
|
if (db.d_multiscale_features) cudaFree(db.d_multiscale_features); |
|
|
|
|
|
if (db.d_hidden) cudaFree(db.d_hidden); |
|
|
if (db.d_logits) cudaFree(db.d_logits); |
|
|
if (db.d_probs) cudaFree(db.d_probs); |
|
|
if (db.d_grad_logits) cudaFree(db.d_grad_logits); |
|
|
if (db.d_grad_hidden) cudaFree(db.d_grad_hidden); |
|
|
if (db.d_grad_multiscale) cudaFree(db.d_grad_multiscale); |
|
|
|
|
|
// Free fungi buffers |
|
|
if (db.d_A) cudaFree(db.d_A); |
|
|
if (db.d_P) cudaFree(db.d_P); |
|
|
if (db.d_grad_map) cudaFree(db.d_grad_map); |
|
|
|
|
|
// Free persistent weight buffers |
|
|
if (db.d_W1) cudaFree(db.d_W1); |
|
|
if (db.d_b1) cudaFree(db.d_b1); |
|
|
if (db.d_W2) cudaFree(db.d_W2); |
|
|
if (db.d_b2) cudaFree(db.d_b2); |
|
|
if (db.d_gW1) cudaFree(db.d_gW1); |
|
|
if (db.d_gb1) cudaFree(db.d_gb1); |
|
|
if (db.d_gW2) cudaFree(db.d_gW2); |
|
|
if (db.d_gb2) cudaFree(db.d_gb2); |
|
|
if (db.d_loss_scalar) cudaFree(db.d_loss_scalar); |
|
|
|
|
|
// CRITICAL: Free bottleneck detection buffer |
|
|
if (db.d_bottleneck_metrics) cudaFree(db.d_bottleneck_metrics); |
|
|
|
|
|
// LEGACY: Free rich dual-channel buffers (not used in intelligent solution) |
|
|
// if (db.d_magnitude_features) cudaFree(db.d_magnitude_features); |
|
|
// if (db.d_phase_features) cudaFree(db.d_phase_features); |
|
|
} |
|
|
|
|
|
// ================= HOST-SIDE ADAM OPTIMIZER =================
|
|
static void adam_update(std::vector<float>& P, std::vector<float>& m, std::vector<float>& v, |
|
|
const float* g_dev, size_t n, float lr, float wd, int t) { |
|
|
std::vector<float> g(n); |
|
|
check_cuda(cudaMemcpy(g.data(), g_dev, sizeof(float) * n, cudaMemcpyDeviceToHost), "D2H grads for Adam"); |
|
|
float b1 = 0.9f, b2 = 0.999f, eps = 1e-8f; |
|
|
float b1t = 1.f - std::pow(b1, (float)t); |
|
|
float b2t = 1.f - std::pow(b2, (float)t); |
|
|
for (size_t i = 0; i < n; ++i) { |
|
|
m[i] = b1 * m[i] + (1 - b1) * g[i]; |
|
|
v[i] = b2 * v[i] + (1 - b2) * g[i] * g[i]; |
|
|
float mh = m[i] / b1t; |
|
|
float vh = v[i] / b2t; |
|
|
P[i] -= lr * (mh / (std::sqrt(vh) + eps) + wd * P[i]); |
|
|
} |
|
|
} |
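// Reference for the update implemented above (Adam with decoupled weight decay):
//   m_t   = b1*m_{t-1} + (1 - b1)*g_t
//   v_t   = b2*v_{t-1} + (1 - b2)*g_t^2
//   m_hat = m_t / (1 - b1^t),   v_hat = v_t / (1 - b2^t)
//   p    <- p - lr * ( m_hat / (sqrt(v_hat) + eps) + wd * p )
// Note: gradients are copied device-to-host on every call, so this optimizer runs on the CPU.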
|
|
|
|
|
// C++ OPTIMIZATION: Efficient GPU weight management |
|
|
void upload_params_to_gpu(const OpticalParams& params, DeviceBuffers& db) { |
|
|
const size_t MS = MULTISCALE_SIZE, H = HIDDEN_SIZE, C = NUM_CLASSES; |
|
|
|
|
|
// Upload weights to persistent GPU buffers |
|
|
check_cuda(cudaMemcpy(db.d_W1, params.W1.data(), sizeof(float) * H * MS, cudaMemcpyHostToDevice), "upload W1"); |
|
|
check_cuda(cudaMemcpy(db.d_b1, params.b1.data(), sizeof(float) * H, cudaMemcpyHostToDevice), "upload b1"); |
|
|
check_cuda(cudaMemcpy(db.d_W2, params.W2.data(), sizeof(float) * C * H, cudaMemcpyHostToDevice), "upload W2"); |
|
|
check_cuda(cudaMemcpy(db.d_b2, params.b2.data(), sizeof(float) * C, cudaMemcpyHostToDevice), "upload b2"); |
|
|
} |
|
|
|
|
|
void download_params_from_gpu(OpticalParams& params, const DeviceBuffers& db) { |
|
|
const size_t MS = MULTISCALE_SIZE, H = HIDDEN_SIZE, C = NUM_CLASSES; |
|
|
|
|
|
// Download updated weights from GPU |
|
|
check_cuda(cudaMemcpy(params.W1.data(), db.d_W1, sizeof(float) * H * MS, cudaMemcpyDeviceToHost), "download W1"); |
|
|
check_cuda(cudaMemcpy(params.b1.data(), db.d_b1, sizeof(float) * H, cudaMemcpyDeviceToHost), "download b1"); |
|
|
check_cuda(cudaMemcpy(params.W2.data(), db.d_W2, sizeof(float) * C * H, cudaMemcpyDeviceToHost), "download W2"); |
|
|
check_cuda(cudaMemcpy(params.b2.data(), db.d_b2, sizeof(float) * C, cudaMemcpyDeviceToHost), "download b2"); |
|
|
} |
|
|
|
|
|
// ================= TRAINING =================
|
|
// CHANGE LOG: Multi-scale optical processing for 90%+ accuracy |
|
|
// FORWARD: multi_scale_features -> W1*features+b1 -> ReLU -> W2*hidden+b2 -> logits |
|
|
// BACKWARD: Full backpropagation through both layers with multi-scale features |
|
|
float train_batch(const float* h_batch_in, const uint8_t* h_batch_lbl, |
|
|
int B, FungiSoA& fungi, OpticalParams& params, |
|
|
DeviceBuffers& db, FFTPlan& fft, |
|
|
float lr, float wd, int t_adam) { |
|
|
const int S = IMG_SIZE, H = HIDDEN_SIZE, C = NUM_CLASSES, MS = MULTISCALE_SIZE; |
|
|
|
|
|
check_cuda(cudaMemcpy(db.d_batch_in, h_batch_in, sizeof(float) * B * S, cudaMemcpyHostToDevice), "H2D input"); |
|
|
check_cuda(cudaMemcpy(db.d_batch_lbl, h_batch_lbl, sizeof(uint8_t) * B, cudaMemcpyHostToDevice), "H2D labels"); |
|
|
|
|
|
// Multi-scale optical processing for 90%+ accuracy |
|
|
fungi_build_masks_GPU(fungi, db.d_A, db.d_P); |
|
|
|
|
|
    // DIAGNOSTIC: Analyze mask statistics every 10 batches
|
|
static int debug_counter = 0; |
|
|
if (debug_counter % 10 == 0) { // Every 10 batches |
|
|
fungi_analyze_mask_statistics(db.d_A, db.d_P, IMG_SIZE); |
|
|
} |
|
|
debug_counter++; |
|
|
|
|
|
// Scale 1: Full resolution 28x28 = 784 features |
|
|
k_modulate<<<(B * S + 255) / 256, 256>>>(db.d_batch_in, db.d_A, db.d_P, db.d_field_scale1, B * S); |
|
|
cufftExecC2C(fft.plan_fwd_scale1, db.d_field_scale1, db.d_freq_scale1, CUFFT_FORWARD); |
|
|
k_intensity_magnitude_phase<<<(B * SCALE_1_SIZE + 255) / 256, 256>>>(db.d_freq_scale1, db.d_features_scale1, B * SCALE_1_SIZE); |
|
|
|
|
|
// Scale 2: Half resolution 14x14 = 196 features |
|
|
    // Stage the downsampled image in d_features_scale2 so k_modulate does not read and write the same buffer
    k_downsample_2x2<<<(B * SCALE_2_SIZE + 255) / 256, 256>>>(db.d_batch_in, db.d_features_scale2, IMG_H, IMG_W, B);
    k_modulate<<<(B * SCALE_2_SIZE + 255) / 256, 256>>>(db.d_features_scale2, db.d_A, db.d_P, db.d_field_scale2, B * SCALE_2_SIZE);
|
|
cufftExecC2C(fft.plan_fwd_scale2, db.d_field_scale2, db.d_freq_scale2, CUFFT_FORWARD); |
|
|
k_intensity_magnitude_phase<<<(B * SCALE_2_SIZE + 255) / 256, 256>>>(db.d_freq_scale2, db.d_features_scale2, B * SCALE_2_SIZE); |
|
|
|
|
|
    // Scale 3: Quarter resolution 7x7 = 49 features (downsampled from the scale-2 feature map)
    // Stage in d_features_scale3 for the same aliasing reason as scale 2
    k_downsample_2x2<<<(B * SCALE_3_SIZE + 255) / 256, 256>>>(db.d_features_scale2, db.d_features_scale3, SCALE_2, SCALE_2, B);
    k_modulate<<<(B * SCALE_3_SIZE + 255) / 256, 256>>>(db.d_features_scale3, db.d_A, db.d_P, db.d_field_scale3, B * SCALE_3_SIZE);
|
|
cufftExecC2C(fft.plan_fwd_scale3, db.d_field_scale3, db.d_freq_scale3, CUFFT_FORWARD); |
|
|
k_intensity_magnitude_phase<<<(B * SCALE_3_SIZE + 255) / 256, 256>>>(db.d_freq_scale3, db.d_features_scale3, B * SCALE_3_SIZE); |
|
|
|
|
|
// Mirror processing: create horizontally flipped versions for enhanced features |
|
|
k_flip_horizontal<<<(B * SCALE_1_SIZE + 255) / 256, 256>>>(db.d_features_scale1, db.d_features_scale1_mirror, SCALE_1, SCALE_1, B); |
|
|
k_flip_horizontal<<<(B * SCALE_2_SIZE + 255) / 256, 256>>>(db.d_features_scale2, db.d_features_scale2_mirror, SCALE_2, SCALE_2, B); |
|
|
k_flip_horizontal<<<(B * SCALE_3_SIZE + 255) / 256, 256>>>(db.d_features_scale3, db.d_features_scale3_mirror, SCALE_3, SCALE_3, B); |
|
|
|
|
|
// INTELLIGENT SOLUTION: Enhanced 6-scale mirror with optimized FFT kernel |
|
|
k_concatenate_6scale_mirror<<<(B * MS + 255) / 256, 256>>>( |
|
|
db.d_features_scale1, db.d_features_scale2, db.d_features_scale3, |
|
|
db.d_features_scale1_mirror, db.d_features_scale2_mirror, db.d_features_scale3_mirror, |
|
|
db.d_multiscale_features, B, SCALE_1_SIZE, SCALE_2_SIZE, SCALE_3_SIZE); |
|
|
|
|
|
// C++ OPTIMIZATION: Use persistent GPU buffers (NO malloc/free per batch!) |
|
|
// Forward pass: Layer 1 (with ReLU) - Enhanced 2058 features with better FFT extraction |
|
|
k_linear_relu_forward<<<(B*H+255)/256, 256>>>(db.d_multiscale_features, db.d_W1, db.d_b1, db.d_hidden, B, MS, H); |
|
|
|
|
|
// Forward pass: Layer 2 (linear) |
|
|
k_linear_forward_mlp<<<(B*C+255)/256, 256>>>(db.d_hidden, db.d_W2, db.d_b2, db.d_logits, B, H, C); |
|
|
|
|
|
// CRITICAL: Real-time bottleneck detection - analyze information flow |
|
|
static int bottleneck_counter = 0; |
|
|
if (bottleneck_counter % 5 == 0) { // Every 5 batches for performance |
|
|
cudaMemset(db.d_bottleneck_metrics, 0, sizeof(float) * 4); |
|
|
        int max_threads = std::max(std::max(MS, H), C);
|
|
k_bottleneck_detector<<<(max_threads + 255) / 256, 256>>>( |
|
|
db.d_multiscale_features, db.d_hidden, db.d_logits, db.d_bottleneck_metrics, |
|
|
B, MS, H, C); |
|
|
|
|
|
// Download metrics and report critical bottlenecks |
|
|
float h_metrics[4] = {0}; |
|
|
cudaMemcpy(h_metrics, db.d_bottleneck_metrics, sizeof(float) * 4, cudaMemcpyDeviceToHost); |
|
|
|
|
|
float dead_features_pct = (h_metrics[0] / MS) * 100.0f; // % dead input features |
|
|
float dead_neurons_pct = (h_metrics[1] / H) * 100.0f; // % dead hidden neurons |
|
|
float saturated_neurons_pct = (h_metrics[2] / H) * 100.0f; // % saturated hidden neurons |
|
|
float poor_discrimination_pct = (h_metrics[3] / C) * 100.0f; // % poor output discrimination |
|
|
|
|
|
// ALERT: Critical bottleneck detection |
|
|
if (dead_features_pct > 20.0f || dead_neurons_pct > 30.0f || |
|
|
saturated_neurons_pct > 30.0f || poor_discrimination_pct > 40.0f) { |
|
|
printf("🚨 CRITICAL BOTTLENECK DETECTED:\n"); |
|
|
printf(" 📉 Dead Features: %.1f%% | Dead Neurons: %.1f%% | Saturated: %.1f%% | Poor Discrim: %.1f%%\n", |
|
|
dead_features_pct, dead_neurons_pct, saturated_neurons_pct, poor_discrimination_pct); |
|
|
} |
|
|
} |
|
|
bottleneck_counter++; |
|
|
|
|
|
// Loss computation (using persistent buffer) |
|
|
cudaMemset(db.d_loss_scalar, 0, sizeof(float)); |
|
|
k_softmax_xent_loss_grad<<<(B + 255) / 256, 256>>>(db.d_logits, (const uint8_t*)db.d_batch_lbl, db.d_grad_logits, db.d_loss_scalar, B, C); |
|
|
|
|
|
// Backward pass: Layer 2 gradients (using persistent buffers) |
|
|
cudaMemset(db.d_gW2, 0, sizeof(float)*C*H); |
|
|
cudaMemset(db.d_gb2, 0, sizeof(float)*C); |
|
|
k_accum_linear_grads<<<(C+255)/256, 256>>>(db.d_hidden, db.d_grad_logits, db.d_gW2, db.d_gb2, B, H, C); |
|
|
|
|
|
// Backward pass: Hidden layer gradient |
|
|
k_linear_backward_input<<<(B*H+255)/256, 256>>>(db.d_grad_logits, db.d_W2, db.d_grad_hidden, B, H, C); |
|
|
|
|
|
// Backward pass: ReLU gradient |
|
|
k_relu_backward<<<(B*H+255)/256, 256>>>(db.d_grad_hidden, db.d_hidden, db.d_grad_hidden, B*H); |
|
|
|
|
|
// Backward pass: Layer 1 gradients (using persistent buffers with multi-scale) |
|
|
cudaMemset(db.d_gW1, 0, sizeof(float)*H*MS); |
|
|
cudaMemset(db.d_gb1, 0, sizeof(float)*H); |
|
|
k_accum_linear_grads<<<(H+255)/256, 256>>>(db.d_multiscale_features, db.d_grad_hidden, db.d_gW1, db.d_gb1, B, MS, H); |
|
|
|
|
|
    // Backward pass: Multi-scale gradient for fungi (simplified: only the first S scale-1 features feed the grad map)
    k_linear_backward_input<<<(B*MS+255)/256, 256>>>(db.d_grad_hidden, db.d_W1, db.d_grad_multiscale, B, MS, H);
    k_reduce_grad_map<<<(S + 255) / 256, 256>>>(db.d_grad_multiscale, B, S, MS, db.d_grad_map); // Row stride MS: average |grad| over the scale-1 slice of each sample
|
|
|
|
|
// Adam updates for all parameters (using persistent buffers with multi-scale) |
|
|
adam_update(params.W1, params.m_W1, params.v_W1, db.d_gW1, H * MS, lr, wd, t_adam); |
|
|
adam_update(params.b1, params.m_b1, params.v_b1, db.d_gb1, H, lr, 0.0f, t_adam); |
|
|
adam_update(params.W2, params.m_W2, params.v_W2, db.d_gW2, C * H, lr, wd, t_adam); |
|
|
adam_update(params.b2, params.m_b2, params.v_b2, db.d_gb2, C, lr, 0.0f, t_adam); |
|
|
|
|
|
// BUGFIX: Upload updated weights back to GPU after Adam |
|
|
upload_params_to_gpu(params, db); |
|
|
|
|
|
float h_loss; |
|
|
check_cuda(cudaMemcpy(&h_loss, db.d_loss_scalar, sizeof(float), cudaMemcpyDeviceToHost), "D2H loss"); |
|
|
|
|
|
return h_loss / B; |
|
|
} |
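// Usage sketch (illustrative only; the training driver, data pointers, and hyperparameters
// below are assumptions, not part of this file):
//
//   int t_adam = 0;
//   for (int epoch = 0; epoch < num_epochs; ++epoch) {
//       for (int start = 0; start + B <= num_train; start += B) {
//           const float*   h_x = train_images + (size_t)start * IMG_SIZE;
//           const uint8_t* h_y = train_labels + start;
//           float loss = train_batch(h_x, h_y, B, fungi, params, db, fft, lr, wd, ++t_adam);
//       }
//   }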
|
|
|
|
|
// ================= INFERENCE =================
|
|
void infer_batch(const float* h_batch_in, int B, |
|
|
const FungiSoA& fungi, const OpticalParams& params, |
|
|
DeviceBuffers& db, FFTPlan& fft, |
|
|
std::vector<int>& out_predictions) { |
|
|
const int S = IMG_SIZE, H = HIDDEN_SIZE, C = NUM_CLASSES, MS = MULTISCALE_SIZE; |
|
|
check_cuda(cudaMemcpy(db.d_batch_in, h_batch_in, sizeof(float) * B * S, cudaMemcpyHostToDevice), "H2D infer input"); |
|
|
|
|
|
// Multi-scale optical processing for inference |
|
|
fungi_build_masks_GPU(const_cast<FungiSoA&>(fungi), db.d_A, db.d_P); |
|
|
|
|
|
// Scale 1: Full resolution 28x28 = 784 features |
|
|
k_modulate<<<(B * S + 255) / 256, 256>>>(db.d_batch_in, db.d_A, db.d_P, db.d_field_scale1, B * S); |
|
|
cufftExecC2C(fft.plan_fwd_scale1, db.d_field_scale1, db.d_freq_scale1, CUFFT_FORWARD); |
|
|
k_intensity_magnitude_phase<<<(B * SCALE_1_SIZE + 255) / 256, 256>>>(db.d_freq_scale1, db.d_features_scale1, B * SCALE_1_SIZE); |
|
|
|
|
|
// Scale 2: Half resolution 14x14 = 196 features |
|
|
    // Stage the downsampled image in d_features_scale2 so k_modulate does not read and write the same buffer
    k_downsample_2x2<<<(B * SCALE_2_SIZE + 255) / 256, 256>>>(db.d_batch_in, db.d_features_scale2, IMG_H, IMG_W, B);
    k_modulate<<<(B * SCALE_2_SIZE + 255) / 256, 256>>>(db.d_features_scale2, db.d_A, db.d_P, db.d_field_scale2, B * SCALE_2_SIZE);
|
|
cufftExecC2C(fft.plan_fwd_scale2, db.d_field_scale2, db.d_freq_scale2, CUFFT_FORWARD); |
|
|
k_intensity_magnitude_phase<<<(B * SCALE_2_SIZE + 255) / 256, 256>>>(db.d_freq_scale2, db.d_features_scale2, B * SCALE_2_SIZE); |
|
|
|
|
|
    // Scale 3: Quarter resolution 7x7 = 49 features (downsampled from the scale-2 feature map)
    // Stage in d_features_scale3 for the same aliasing reason as scale 2
    k_downsample_2x2<<<(B * SCALE_3_SIZE + 255) / 256, 256>>>(db.d_features_scale2, db.d_features_scale3, SCALE_2, SCALE_2, B);
    k_modulate<<<(B * SCALE_3_SIZE + 255) / 256, 256>>>(db.d_features_scale3, db.d_A, db.d_P, db.d_field_scale3, B * SCALE_3_SIZE);
|
|
cufftExecC2C(fft.plan_fwd_scale3, db.d_field_scale3, db.d_freq_scale3, CUFFT_FORWARD); |
|
|
k_intensity_magnitude_phase<<<(B * SCALE_3_SIZE + 255) / 256, 256>>>(db.d_freq_scale3, db.d_features_scale3, B * SCALE_3_SIZE); |
|
|
|
|
|
// Mirror processing: create horizontally flipped versions for enhanced features |
|
|
k_flip_horizontal<<<(B * SCALE_1_SIZE + 255) / 256, 256>>>(db.d_features_scale1, db.d_features_scale1_mirror, SCALE_1, SCALE_1, B); |
|
|
k_flip_horizontal<<<(B * SCALE_2_SIZE + 255) / 256, 256>>>(db.d_features_scale2, db.d_features_scale2_mirror, SCALE_2, SCALE_2, B); |
|
|
k_flip_horizontal<<<(B * SCALE_3_SIZE + 255) / 256, 256>>>(db.d_features_scale3, db.d_features_scale3_mirror, SCALE_3, SCALE_3, B); |
|
|
|
|
|
// INTELLIGENT SOLUTION: Enhanced 6-scale mirror with optimized FFT kernel (INFERENCE) |
|
|
k_concatenate_6scale_mirror<<<(B * MS + 255) / 256, 256>>>( |
|
|
db.d_features_scale1, db.d_features_scale2, db.d_features_scale3, |
|
|
db.d_features_scale1_mirror, db.d_features_scale2_mirror, db.d_features_scale3_mirror, |
|
|
db.d_multiscale_features, B, SCALE_1_SIZE, SCALE_2_SIZE, SCALE_3_SIZE); |
|
|
|
|
|
// C++ OPTIMIZATION: Use persistent GPU buffers for inference |
|
|
// Forward pass: Layer 1 (with ReLU) - Enhanced 2058 features with better FFT extraction |
|
|
k_linear_relu_forward<<<(B*H+255)/256, 256>>>(db.d_multiscale_features, db.d_W1, db.d_b1, db.d_hidden, B, MS, H); |
|
|
|
|
|
// Forward pass: Layer 2 (linear) |
|
|
k_linear_forward_mlp<<<(B*C+255)/256, 256>>>(db.d_hidden, db.d_W2, db.d_b2, db.d_logits, B, H, C); |
|
|
|
|
|
std::vector<float> h_logits(B * C); |
|
|
check_cuda(cudaMemcpy(h_logits.data(), db.d_logits, sizeof(float) * B * C, cudaMemcpyDeviceToHost), "D2H logits"); |
|
|
|
|
|
out_predictions.resize(B); |
|
|
for (int b = 0; b < B; ++b) { |
|
|
int best_class = 0; |
|
|
float max_logit = h_logits[b * C]; |
|
|
for (int c = 1; c < C; ++c) { |
|
|
if (h_logits[b * C + c] > max_logit) { |
|
|
max_logit = h_logits[b * C + c]; |
|
|
best_class = c; |
|
|
} |
|
|
} |
|
|
out_predictions[b] = best_class; |
|
|
} |
|
|
} |
|
|
|
|
|
// ================= OPTICAL FIELD / FFT KERNELS =================
|
|
__global__ void k_modulate(const float* x, const float* A, const float* P, cufftComplex* field, int N_pixels) { |
|
|
int i = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
if (i >= N_pixels) return; |
|
|
|
|
|
int pixel_idx = i % IMG_SIZE; |
|
|
|
|
|
float input_val = x[i]; |
|
|
float amp = A[pixel_idx]; |
|
|
float phase = P[pixel_idx]; |
|
|
|
|
|
field[i].x = input_val * amp * cosf(phase); |
|
|
field[i].y = input_val * amp * sinf(phase); |
|
|
} |
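// The kernel above encodes each input pixel as a complex optical field:
//   field = x * A * exp(i * P)  =>  Re = x*A*cos(P), Im = x*A*sin(P).
// Note that A and P are indexed with (i % IMG_SIZE), so buffers with fewer than IMG_SIZE
// elements per sample (the scale-2 and scale-3 paths) read a sliding window of the
// full-resolution masks rather than restarting at mask index 0 for every sample.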
|
|
|
|
|
__global__ void k_intensity_log1p(const cufftComplex* freq, float* y, int N_elements) { |
|
|
int i = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
if (i >= N_elements) return; |
|
|
|
|
|
float intensity = freq[i].x * freq[i].x + freq[i].y * freq[i].y; |
|
|
y[i] = log1pf(intensity); |
|
|
} |
|
|
|
|
|
// BOTTLENECK FIX: Enhanced extraction with magnitude and phase information |
|
|
__global__ void k_intensity_magnitude_phase(const cufftComplex* freq, float* y, int N_elements) { |
|
|
int i = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
if (i >= N_elements) return; |
|
|
|
|
|
float real = freq[i].x; |
|
|
float imag = freq[i].y; |
|
|
|
|
|
// Preserve both magnitude and phase information |
|
|
float magnitude = sqrtf(real * real + imag * imag); |
|
|
float phase = atan2f(imag, real); |
|
|
|
|
|
    // BOTTLENECK FIX: fold magnitude, phase, and sign cues into a single scalar per
    // frequency bin (richer than the plain log-intensity used by k_intensity_log1p)
|
|
y[i] = log1pf(magnitude) + 0.5f * tanhf(phase) + 0.2f * (real / (fabsf(real) + 1e-6f)) + 0.1f * (imag / (fabsf(imag) + 1e-6f)); |
|
|
} |
|
|
|
|
|
// Rich FFT extraction: separate magnitude and phase channels (doubles the per-element feature count)
|
|
__global__ void k_rich_fft_extraction(const cufftComplex* freq, float* magnitude_out, float* phase_out, int N_elements) { |
|
|
int i = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
if (i >= N_elements) return; |
|
|
|
|
|
float real = freq[i].x; |
|
|
float imag = freq[i].y; |
|
|
|
|
|
// Preserve magnitude with enhanced dynamic range |
|
|
float magnitude = sqrtf(real * real + imag * imag); |
|
|
magnitude_out[i] = log1pf(magnitude) + 0.1f * atan2f(magnitude, 1.0f); // Enhanced magnitude |
|
|
|
|
|
// Preserve phase with full resolution |
|
|
float phase = atan2f(imag, real); |
|
|
    phase_out[i] = tanhf(2.0f * phase / 3.14159f); // Squash phase (in [-pi, pi]) smoothly into roughly [-1, 1]
|
|
} |
|
|
|
|
|
// BREAKTHROUGH: Concatenate magnitude and phase channels into rich feature vector |
|
|
__global__ void k_concatenate_dual_channels(const float* magnitude_channel, const float* phase_channel, |
|
|
float* rich_features, int B, int channel_size) { |
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
int total_size = 2 * channel_size; // magnitude + phase = double size |
|
|
|
|
|
if (idx >= B * total_size) return; |
|
|
|
|
|
int batch_idx = idx / total_size; |
|
|
int feature_idx = idx % total_size; |
|
|
|
|
|
if (feature_idx < channel_size) { |
|
|
// First half: magnitude channel [0, channel_size) |
|
|
rich_features[idx] = magnitude_channel[batch_idx * channel_size + feature_idx]; |
|
|
} else { |
|
|
// Second half: phase channel [channel_size, 2*channel_size) |
|
|
int phase_idx = feature_idx - channel_size; |
|
|
rich_features[idx] = phase_channel[batch_idx * channel_size + phase_idx]; |
|
|
} |
|
|
} |
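// NOTE: the kernels below (k_linear_forward, k_backprop_y, k_accum_grads_Wb) appear to be
// retained from the original single-layer classifier; they are not called by the current
// two-layer multi-scale path in this file.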
|
|
|
|
|
__global__ void k_linear_forward(const float* y, const float* W, const float* b, float* logits, int B, int S, int C) { |
|
|
int batch_class = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
if (batch_class >= B * C) return; |
|
|
|
|
|
int batch_idx = batch_class / C; |
|
|
int class_idx = batch_class % C; |
|
|
|
|
|
float sum = b[class_idx]; |
|
|
for (int s = 0; s < S; ++s) { |
|
|
sum += W[class_idx * S + s] * y[batch_idx * S + s]; |
|
|
} |
|
|
logits[batch_class] = sum; |
|
|
} |
|
|
|
|
|
__global__ void k_softmax_xent_loss_grad(const float* logits, const uint8_t* labels, float* grad_logits, float* total_loss, int B, int C) { |
|
|
int b = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
if (b >= B) return; |
|
|
|
|
|
const float* b_logits = logits + b * C; |
|
|
float max_val = -1e20f; |
|
|
for (int c = 0; c < C; ++c) { |
|
|
if (b_logits[c] > max_val) max_val = b_logits[c]; |
|
|
} |
|
|
|
|
|
float exp_sum = 0.f; |
|
|
float exp_vals[10]; |
|
|
for (int c = 0; c < C; ++c) { |
|
|
exp_vals[c] = expf(b_logits[c] - max_val); |
|
|
exp_sum += exp_vals[c]; |
|
|
} |
|
|
|
|
|
uint8_t true_label = labels[b]; |
|
|
float* b_grad = grad_logits + b * C; |
|
|
float loss = 0.f; |
|
|
|
|
|
for (int c = 0; c < C; ++c) { |
|
|
float prob = exp_vals[c] / exp_sum; |
|
|
b_grad[c] = prob - (c == true_label ? 1.f : 0.f); |
|
|
if (c == true_label) { |
|
|
loss = -logf(fmaxf(prob, 1e-9f)); |
|
|
} |
|
|
} |
|
|
atomicAdd(total_loss, loss); |
|
|
} |
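// Gradient identity used above: d/dz_c [ -log softmax(z)_y ] = softmax(z)_c - 1{c == y}.
// Note: the fixed exp_vals[10] buffer assumes C <= 10 (i.e., NUM_CLASSES does not exceed 10).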
|
|
|
|
|
__global__ void k_backprop_y(const float* grad_logits, const float* W, float* grad_y, int B, int S, int C) { |
|
|
int batch_pixel = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
if (batch_pixel >= B * S) return; |
|
|
|
|
|
int batch_idx = batch_pixel / S; |
|
|
int pixel_idx = batch_pixel % S; |
|
|
|
|
|
float sum = 0.f; |
|
|
for (int c = 0; c < C; ++c) { |
|
|
sum += grad_logits[batch_idx * C + c] * W[c * S + pixel_idx]; |
|
|
} |
|
|
grad_y[batch_pixel] = sum; |
|
|
} |
|
|
|
|
|
__global__ void k_accum_grads_Wb(const float* y, const float* grad_logits, float* gW, float* gb, int B, int S, int C) { |
|
|
int class_idx = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
if (class_idx >= C) return; |
|
|
|
|
|
float gb_sum = 0.f; |
|
|
for (int b = 0; b < B; ++b) { |
|
|
gb_sum += grad_logits[b * C + class_idx]; |
|
|
} |
|
|
gb[class_idx] = gb_sum; |
|
|
|
|
|
for (int s = 0; s < S; ++s) { |
|
|
float gw_sum = 0.f; |
|
|
for (int b = 0; b < B; ++b) { |
|
|
gw_sum += grad_logits[b * C + class_idx] * y[b * S + s]; |
|
|
} |
|
|
gW[class_idx * S + s] = gw_sum; |
|
|
} |
|
|
} |
|
|
|
|
|
// Reduce per-sample gradients to a per-pixel map: grad_map[p] = mean_b |grad_y[b * row_stride + p]|
__global__ void k_reduce_grad_map(const float* grad_y, int B, int S, int row_stride, float* grad_map) {
|
|
int pixel = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
if (pixel >= S) return; |
|
|
|
|
|
float acc = 0.f; |
|
|
for (int b = 0; b < B; ++b) { |
|
|
        acc += fabsf(grad_y[b * row_stride + pixel]);
|
|
} |
|
|
grad_map[pixel] = acc / static_cast<float>(B); |
|
|
} |
|
|
|
|
|
__global__ void k_sigmoid(const float* logits, float* probs, int N) { |
|
|
int i = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
if (i >= N) return; |
|
|
probs[i] = 1.f / (1.f + expf(-logits[i])); |
|
|
} |
|
|
|
|
|
void create_fft_plan(FFTPlan& fft, int batch) { |
|
|
// Scale 1: 28x28 FFT plans |
|
|
int n1[2] = {SCALE_1, SCALE_1}; |
|
|
cufftPlanMany(&fft.plan_fwd_scale1, 2, n1, nullptr, 1, SCALE_1_SIZE, nullptr, 1, SCALE_1_SIZE, CUFFT_C2C, batch); |
|
|
cufftPlanMany(&fft.plan_inv_scale1, 2, n1, nullptr, 1, SCALE_1_SIZE, nullptr, 1, SCALE_1_SIZE, CUFFT_C2C, batch); |
|
|
|
|
|
// Scale 2: 14x14 FFT plans |
|
|
int n2[2] = {SCALE_2, SCALE_2}; |
|
|
cufftPlanMany(&fft.plan_fwd_scale2, 2, n2, nullptr, 1, SCALE_2_SIZE, nullptr, 1, SCALE_2_SIZE, CUFFT_C2C, batch); |
|
|
cufftPlanMany(&fft.plan_inv_scale2, 2, n2, nullptr, 1, SCALE_2_SIZE, nullptr, 1, SCALE_2_SIZE, CUFFT_C2C, batch); |
|
|
|
|
|
// Scale 3: 7x7 FFT plans |
|
|
int n3[2] = {SCALE_3, SCALE_3}; |
|
|
cufftPlanMany(&fft.plan_fwd_scale3, 2, n3, nullptr, 1, SCALE_3_SIZE, nullptr, 1, SCALE_3_SIZE, CUFFT_C2C, batch); |
|
|
cufftPlanMany(&fft.plan_inv_scale3, 2, n3, nullptr, 1, SCALE_3_SIZE, nullptr, 1, SCALE_3_SIZE, CUFFT_C2C, batch); |
|
|
} |
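// Minimal sketch of cuFFT error checking: the cufftPlanMany calls above ignore their
// cufftResult return values. "check_cufft" is a hypothetical helper introduced here for
// illustration (it is not part of utils.hpp); wrapping each plan/exec call with it would
// surface plan-creation failures instead of silently producing garbage transforms.
static inline void check_cufft(cufftResult r, const char* what) {
    if (r != CUFFT_SUCCESS) {
        std::cerr << "cuFFT error " << static_cast<int>(r) << " in " << what << std::endl;
        throw std::runtime_error(what);
    }
}
// Example: check_cufft(cufftPlanMany(&fft.plan_fwd_scale1, 2, n1, nullptr, 1, SCALE_1_SIZE,
//                                    nullptr, 1, SCALE_1_SIZE, CUFFT_C2C, batch), "plan_fwd_scale1");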
|
|
|
|
|
void destroy_fft_plan(FFTPlan& fft) { |
|
|
if (fft.plan_fwd_scale1) cufftDestroy(fft.plan_fwd_scale1); |
|
|
if (fft.plan_inv_scale1) cufftDestroy(fft.plan_inv_scale1); |
|
|
if (fft.plan_fwd_scale2) cufftDestroy(fft.plan_fwd_scale2); |
|
|
if (fft.plan_inv_scale2) cufftDestroy(fft.plan_inv_scale2); |
|
|
if (fft.plan_fwd_scale3) cufftDestroy(fft.plan_fwd_scale3); |
|
|
if (fft.plan_inv_scale3) cufftDestroy(fft.plan_inv_scale3); |
|
|
} |
|
|
|
|
|
// CHANGE LOG: Updated initialization for 6-scale mirror two-layer MLP |
|
|
// ORIGINAL: Single layer initialization (IMG_SIZE=784) |
|
|
// NEW: Xavier/Glorot initialization for both layers (MULTISCALE_SIZE=2058) |
|
|
void init_params(OpticalParams& p, unsigned seed) { |
|
|
std::mt19937 gen(seed); |
|
|
|
|
|
// Xavier initialization: std = sqrt(2 / (fan_in + fan_out)) |
|
|
float std_W1 = std::sqrt(2.0f / (MULTISCALE_SIZE + HIDDEN_SIZE)); |
|
|
float std_W2 = std::sqrt(2.0f / (HIDDEN_SIZE + NUM_CLASSES)); |
|
|
std::normal_distribution<float> dist_W1(0.f, std_W1); |
|
|
std::normal_distribution<float> dist_W2(0.f, std_W2); |
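    // Worked example (illustrative only; the real HIDDEN_SIZE value comes from the header):
    // with MULTISCALE_SIZE = 2058 and a hypothetical HIDDEN_SIZE of 256,
    //   std_W1 = sqrt(2 / (2058 + 256)) ~= 0.0294.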
|
|
|
|
|
// First layer: MULTISCALE_SIZE -> HIDDEN_SIZE |
|
|
size_t W1_size = HIDDEN_SIZE * MULTISCALE_SIZE; |
|
|
p.W1.resize(W1_size); |
|
|
p.b1.resize(HIDDEN_SIZE); |
|
|
p.m_W1.resize(W1_size); |
|
|
p.v_W1.resize(W1_size); |
|
|
p.m_b1.resize(HIDDEN_SIZE); |
|
|
p.v_b1.resize(HIDDEN_SIZE); |
|
|
|
|
|
for (size_t i = 0; i < W1_size; ++i) { |
|
|
p.W1[i] = dist_W1(gen); |
|
|
p.m_W1[i] = 0.f; |
|
|
p.v_W1[i] = 0.f; |
|
|
} |
|
|
for (size_t i = 0; i < HIDDEN_SIZE; ++i) { |
|
|
p.b1[i] = 0.f; |
|
|
p.m_b1[i] = 0.f; |
|
|
p.v_b1[i] = 0.f; |
|
|
} |
|
|
|
|
|
// Second layer: HIDDEN_SIZE -> NUM_CLASSES |
|
|
size_t W2_size = NUM_CLASSES * HIDDEN_SIZE; |
|
|
p.W2.resize(W2_size); |
|
|
p.b2.resize(NUM_CLASSES); |
|
|
p.m_W2.resize(W2_size); |
|
|
p.v_W2.resize(W2_size); |
|
|
p.m_b2.resize(NUM_CLASSES); |
|
|
p.v_b2.resize(NUM_CLASSES); |
|
|
|
|
|
for (size_t i = 0; i < W2_size; ++i) { |
|
|
p.W2[i] = dist_W2(gen); |
|
|
p.m_W2[i] = 0.f; |
|
|
p.v_W2[i] = 0.f; |
|
|
} |
|
|
for (size_t i = 0; i < NUM_CLASSES; ++i) { |
|
|
p.b2[i] = 0.f; |
|
|
p.m_b2[i] = 0.f; |
|
|
p.v_b2[i] = 0.f; |
|
|
} |
|
|
} |
|
|
|
|
|
// ================= TWO-LAYER MLP KERNELS =================
|
|
|
|
|
// Linear layer with ReLU activation: output = ReLU(W * input + b) |
|
|
__global__ void k_linear_relu_forward(const float* input, const float* W, const float* b, float* output, int B, int input_size, int output_size) { |
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
if (idx >= B * output_size) return; |
|
|
|
|
|
int batch_idx = idx / output_size; |
|
|
int out_idx = idx % output_size; |
|
|
|
|
|
float sum = b[out_idx]; |
|
|
for (int i = 0; i < input_size; ++i) { |
|
|
sum += W[out_idx * input_size + i] * input[batch_idx * input_size + i]; |
|
|
} |
|
|
output[idx] = fmaxf(0.0f, sum); // ReLU activation |
|
|
} |
|
|
|
|
|
// Linear layer without activation: output = W * input + b |
|
|
__global__ void k_linear_forward_mlp(const float* input, const float* W, const float* b, float* output, int B, int input_size, int output_size) { |
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
if (idx >= B * output_size) return; |
|
|
|
|
|
int batch_idx = idx / output_size; |
|
|
int out_idx = idx % output_size; |
|
|
|
|
|
float sum = b[out_idx]; |
|
|
for (int i = 0; i < input_size; ++i) { |
|
|
sum += W[out_idx * input_size + i] * input[batch_idx * input_size + i]; |
|
|
} |
|
|
output[idx] = sum; |
|
|
} |
|
|
|
|
|
// ReLU backward: grad_input = grad_output * (forward_output > 0) |
|
|
__global__ void k_relu_backward(const float* grad_output, const float* forward_output, float* grad_input, int N) { |
|
|
int i = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
if (i >= N) return; |
|
|
grad_input[i] = grad_output[i] * (forward_output[i] > 0.0f ? 1.0f : 0.0f); |
|
|
} |
|
|
|
|
|
// Linear backward (input gradients): grad_input = W^T * grad_output |
|
|
__global__ void k_linear_backward_input(const float* grad_output, const float* W, float* grad_input, int B, int input_size, int output_size) { |
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
if (idx >= B * input_size) return; |
|
|
|
|
|
int batch_idx = idx / input_size; |
|
|
int in_idx = idx % input_size; |
|
|
|
|
|
float sum = 0.0f; |
|
|
for (int o = 0; o < output_size; ++o) { |
|
|
sum += W[o * input_size + in_idx] * grad_output[batch_idx * output_size + o]; |
|
|
} |
|
|
grad_input[idx] = sum; |
|
|
} |
|
|
|
|
|
// Accumulate gradients for linear layer weights and biases |
|
|
__global__ void k_accum_linear_grads(const float* input, const float* grad_output, float* gW, float* gb, int B, int input_size, int output_size) { |
|
|
int out_idx = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
if (out_idx >= output_size) return; |
|
|
|
|
|
// Accumulate bias gradient |
|
|
float gb_sum = 0.0f; |
|
|
for (int b = 0; b < B; ++b) { |
|
|
gb_sum += grad_output[b * output_size + out_idx]; |
|
|
} |
|
|
gb[out_idx] = gb_sum; |
|
|
|
|
|
// Accumulate weight gradients |
|
|
for (int in_idx = 0; in_idx < input_size; ++in_idx) { |
|
|
float gw_sum = 0.0f; |
|
|
for (int b = 0; b < B; ++b) { |
|
|
gw_sum += grad_output[b * output_size + out_idx] * input[b * input_size + in_idx]; |
|
|
} |
|
|
gW[out_idx * input_size + in_idx] = gw_sum; |
|
|
} |
|
|
} |
|
|
|
|
|
// ================= MULTI-SCALE & MIRROR KERNELS =================
|
|
|
|
|
// Downsample 28x28 to 14x14 using 2x2 average pooling |
|
|
__global__ void k_downsample_2x2(const float* input, float* output, int input_h, int input_w, int B) { |
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
int output_h = input_h / 2; |
|
|
int output_w = input_w / 2; |
|
|
int output_size = output_h * output_w; |
|
|
|
|
|
if (idx >= B * output_size) return; |
|
|
|
|
|
int batch_idx = idx / output_size; |
|
|
int out_pixel = idx % output_size; |
|
|
int out_y = out_pixel / output_w; |
|
|
int out_x = out_pixel % output_w; |
|
|
|
|
|
// Average 2x2 region |
|
|
float sum = 0.0f; |
|
|
for (int dy = 0; dy < 2; ++dy) { |
|
|
for (int dx = 0; dx < 2; ++dx) { |
|
|
int in_y = out_y * 2 + dy; |
|
|
int in_x = out_x * 2 + dx; |
|
|
int in_pixel = in_y * input_w + in_x; |
|
|
sum += input[batch_idx * (input_h * input_w) + in_pixel]; |
|
|
} |
|
|
} |
|
|
output[idx] = sum * 0.25f; // Average of 4 pixels |
|
|
} |
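// Example: for a 28x28 input, output pixel (r, c) of the 14x14 map is the mean of input
// pixels (2r, 2c), (2r, 2c+1), (2r+1, 2c), (2r+1, 2c+1).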
|
|
|
|
|
|
|
|
// Concatenate multi-scale features: [scale1 | scale2 | scale3] |
|
|
__global__ void k_concatenate_features(const float* scale1, const float* scale2, const float* scale3, |
|
|
float* multiscale, int B, int s1_size, int s2_size, int s3_size) { |
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
int total_size = s1_size + s2_size + s3_size; |
|
|
|
|
|
if (idx >= B * total_size) return; |
|
|
|
|
|
int batch_idx = idx / total_size; |
|
|
int feature_idx = idx % total_size; |
|
|
|
|
|
if (feature_idx < s1_size) { |
|
|
// Copy from scale1 |
|
|
multiscale[idx] = scale1[batch_idx * s1_size + feature_idx]; |
|
|
} else if (feature_idx < s1_size + s2_size) { |
|
|
// Copy from scale2 |
|
|
int s2_idx = feature_idx - s1_size; |
|
|
multiscale[idx] = scale2[batch_idx * s2_size + s2_idx]; |
|
|
} else { |
|
|
// Copy from scale3 |
|
|
int s3_idx = feature_idx - s1_size - s2_size; |
|
|
multiscale[idx] = scale3[batch_idx * s3_size + s3_idx]; |
|
|
} |
|
|
} |
|
|
|
|
|
// Horizontal flip: mirror each row left-right (out of place: reads input, writes output)
|
|
__global__ void k_flip_horizontal(const float* input, float* output, int height, int width, int B) { |
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
int total_pixels = B * height * width; |
|
|
|
|
|
if (idx >= total_pixels) return; |
|
|
|
|
|
int batch_idx = idx / (height * width); |
|
|
int pixel_idx = idx % (height * width); |
|
|
int row = pixel_idx / width; |
|
|
int col = pixel_idx % width; |
|
|
|
|
|
// Flip column: new_col = width - 1 - col |
|
|
int flipped_col = width - 1 - col; |
|
|
int flipped_idx = batch_idx * (height * width) + row * width + flipped_col; |
|
|
|
|
|
output[idx] = input[flipped_idx]; |
|
|
} |
|
|
|
|
|
// Vertical flip: mirror each column top-bottom (out of place: reads input, writes output)
|
|
__global__ void k_flip_vertical(const float* input, float* output, int height, int width, int B) { |
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
int total_pixels = B * height * width; |
|
|
|
|
|
if (idx >= total_pixels) return; |
|
|
|
|
|
int batch_idx = idx / (height * width); |
|
|
int pixel_idx = idx % (height * width); |
|
|
int row = pixel_idx / width; |
|
|
int col = pixel_idx % width; |
|
|
|
|
|
// Flip row: new_row = height - 1 - row |
|
|
int flipped_row = height - 1 - row; |
|
|
int flipped_idx = batch_idx * (height * width) + flipped_row * width + col; |
|
|
|
|
|
output[idx] = input[flipped_idx]; |
|
|
} |
|
|
|
|
|
// 6-scale mirror concatenation: [s1 | s2 | s3 | s1_mir | s2_mir | s3_mir] |
|
|
__global__ void k_concatenate_6scale_mirror(const float* s1, const float* s2, const float* s3, |
|
|
const float* s1_mir, const float* s2_mir, const float* s3_mir, |
|
|
float* multiscale, int B, int s1_size, int s2_size, int s3_size) { |
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
int single_size = s1_size + s2_size + s3_size; // 1029 |
|
|
int total_size = 2 * single_size; // 2058 |
|
|
|
|
|
if (idx >= B * total_size) return; |
|
|
|
|
|
int batch_idx = idx / total_size; |
|
|
int feature_idx = idx % total_size; |
|
|
|
|
|
if (feature_idx < single_size) { |
|
|
// First half: normal features [s1 | s2 | s3] |
|
|
if (feature_idx < s1_size) { |
|
|
// Copy from scale1 |
|
|
multiscale[idx] = s1[batch_idx * s1_size + feature_idx]; |
|
|
} else if (feature_idx < s1_size + s2_size) { |
|
|
// Copy from scale2 |
|
|
int s2_idx = feature_idx - s1_size; |
|
|
multiscale[idx] = s2[batch_idx * s2_size + s2_idx]; |
|
|
} else { |
|
|
// Copy from scale3 |
|
|
int s3_idx = feature_idx - s1_size - s2_size; |
|
|
multiscale[idx] = s3[batch_idx * s3_size + s3_idx]; |
|
|
} |
|
|
} else { |
|
|
// Second half: mirrored features [s1_mir | s2_mir | s3_mir] |
|
|
int mirror_idx = feature_idx - single_size; |
|
|
if (mirror_idx < s1_size) { |
|
|
// Copy from mirrored scale1 |
|
|
multiscale[idx] = s1_mir[batch_idx * s1_size + mirror_idx]; |
|
|
} else if (mirror_idx < s1_size + s2_size) { |
|
|
// Copy from mirrored scale2 |
|
|
int s2_idx = mirror_idx - s1_size; |
|
|
multiscale[idx] = s2_mir[batch_idx * s2_size + s2_idx]; |
|
|
} else { |
|
|
// Copy from mirrored scale3 |
|
|
int s3_idx = mirror_idx - s1_size - s2_size; |
|
|
multiscale[idx] = s3_mir[batch_idx * s3_size + s3_idx]; |
|
|
} |
|
|
} |
|
|
} |
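// Resulting per-sample layout with the 784/196/49-element scales:
//   [0, 784)      s1        [784, 980)    s2        [980, 1029)   s3
//   [1029, 1813)  s1_mir    [1813, 2009)  s2_mir    [2009, 2058)  s3_mir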
|
|
|
|
|
|
|
|
// ================= BOTTLENECK ANALYSIS KERNELS ================= |
|
|
|
|
|
// Analyze activation saturation (ReLU dead neurons) |
|
|
__global__ void k_analyze_activation_saturation(const float* activations, float* stats, int N) { |
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
if (idx >= N) return; |
|
|
|
|
|
float val = activations[idx]; |
|
|
|
|
|
// Use atomic operations to gather statistics |
|
|
if (val <= 1e-6f) { |
|
|
atomicAdd(&stats[0], 1.0f); // Dead neurons (ReLU=0) |
|
|
} else if (val >= 0.99f) { |
|
|
atomicAdd(&stats[1], 1.0f); // Saturated neurons |
|
|
} |
|
|
atomicAdd(&stats[2], val); // Sum for mean |
|
|
atomicAdd(&stats[3], val*val); // Sum squares for variance |
|
|
} |
|
|
|
|
|
// Analyze gradient flow (vanishing/exploding gradients) |
|
|
__global__ void k_analyze_gradient_flow(const float* gradients, float* stats, int N) { |
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
if (idx >= N) return; |
|
|
|
|
|
float grad = gradients[idx]; |
|
|
float abs_grad = fabsf(grad); |
|
|
|
|
|
if (abs_grad < 1e-6f) { |
|
|
atomicAdd(&stats[0], 1.0f); // Vanishing gradients |
|
|
} else if (abs_grad > 10.0f) { |
|
|
atomicAdd(&stats[1], 1.0f); // Exploding gradients |
|
|
} |
|
|
atomicAdd(&stats[2], abs_grad); // Sum for mean |
|
|
} |
|
|
|
|
|
// CRITICAL: Real-time bottleneck detection - identifies where information is lost |
|
|
__global__ void k_bottleneck_detector(const float* input_features, const float* hidden_act, |
|
|
const float* logits, float* bottleneck_metrics, |
|
|
int batch_size, int input_size, int hidden_size, int output_size) { |
|
|
int tid = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
|
|
|
// Feature diversity analysis (input layer) - detect information collapse |
|
|
if (tid < input_size) { |
|
|
float feature_sum = 0.0f, feature_var = 0.0f; |
|
|
for (int b = 0; b < batch_size; b++) { |
|
|
float val = input_features[b * input_size + tid]; |
|
|
feature_sum += val; |
|
|
} |
|
|
float mean = feature_sum / batch_size; |
|
|
|
|
|
for (int b = 0; b < batch_size; b++) { |
|
|
float val = input_features[b * input_size + tid]; |
|
|
feature_var += (val - mean) * (val - mean); |
|
|
} |
|
|
feature_var /= batch_size; |
|
|
|
|
|
// Low variance = information loss (features all the same value) |
|
|
if (feature_var < 1e-4f) atomicAdd(&bottleneck_metrics[0], 1.0f); // Dead features count |
|
|
} |
|
|
|
|
|
// Hidden activation analysis - detect neural saturation |
|
|
if (tid < hidden_size) { |
|
|
float hidden_sum = 0.0f; |
|
|
for (int b = 0; b < batch_size; b++) { |
|
|
hidden_sum += hidden_act[b * hidden_size + tid]; |
|
|
} |
|
|
float hidden_mean = hidden_sum / batch_size; |
|
|
|
|
|
// Saturation detection (critical bottleneck indicators) |
|
|
if (hidden_mean < 0.01f) atomicAdd(&bottleneck_metrics[1], 1.0f); // Dead neurons |
|
|
if (hidden_mean > 0.99f) atomicAdd(&bottleneck_metrics[2], 1.0f); // Saturated neurons |
|
|
} |
|
|
|
|
|
// Logits analysis (output bottleneck) - detect poor class discrimination |
|
|
if (tid < output_size) { |
|
|
float logit_range = 0.0f; |
|
|
float min_logit = 1e10f, max_logit = -1e10f; |
|
|
|
|
|
for (int b = 0; b < batch_size; b++) { |
|
|
float val = logits[b * output_size + tid]; |
|
|
min_logit = fminf(min_logit, val); |
|
|
max_logit = fmaxf(max_logit, val); |
|
|
} |
|
|
logit_range = max_logit - min_logit; |
|
|
|
|
|
// Small range = poor discrimination ability (critical bottleneck) |
|
|
if (logit_range < 0.1f) atomicAdd(&bottleneck_metrics[3], 1.0f); // Poor discrimination count |
|
|
} |
|
|
} |
|
|
|
|
|
|