// Source: Pure_Optical_CUDA / fungi.cu
// Uploaded by Agnuxo ("Upload 36 files"), revision db3c893 (verified).
// ======================= Mycelial Evolution System ==========================
// Implements: reward->energy, gravity attraction, motion, growth/shrink,
// death, sexual reproduction with recombination (3 children), mutation.
// --- includes you likely already have ---
#include "fungi.hpp"
#include "fungi_Paremetres.hpp"
#include <cuda_runtime.h>
#include <curand_kernel.h>
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <random>
#include <stdexcept>
#include <string>
#include <vector>
/**
 * Check the status of a CUDA runtime call.
 * Throws std::runtime_error("[CUDA] <msg>: <cuda error string>") when `st`
 * is anything other than cudaSuccess; otherwise does nothing.
 */
static inline void CK(cudaError_t st, const char* msg){
    if (st == cudaSuccess) return;
    std::string what("[CUDA] ");
    what += msg;
    what += ": ";
    what += cudaGetErrorString(st);
    throw std::runtime_error(what);
}
// ------------------------- 1) Reward from gradient --------------------------
/**
 * Elliptical Gaussian weight of pixel (x, y) for a fungus centred at (cx, cy).
 *   s  : radius (sigma); both axis weights are divided by s*s
 *   a  : anisotropy; the rotated x-axis is weighted by a, the rotated y-axis by 1/a
 *   th : rotation angle in radians applied to the pixel offset
 * Returns exp(-q) with q = 0.5 * (xr^2 * a/s^2 + yr^2 * (1/a)/s^2), evaluated
 * with the fast __expf intrinsic.
 */
__device__ inline float d_phi_gauss(int x, int y, float cx, float cy,
                                    float s, float a, float th){
    // Offset of the pixel from the centre.
    const float offX = x - cx;
    const float offY = y - cy;
    const float cosT = cosf(th);
    const float sinT = sinf(th);
    // Per-axis inverse-variance weights.
    const float wU = a/(s*s);
    const float wV = (1.f/a)/(s*s);
    // Offset expressed in the rotated frame.
    const float u = cosT*offX + sinT*offY;
    const float v = -sinT*offX + cosT*offY;
    const float q = 0.5f*(u*u*wU + v*v*wV);
    return __expf(-q);
}
/**
 * Reward per fungus: R[h] = Σ_{x,y} |grad(x,y)| * φ_h(x,y).
 * High gradient magnitude under a fungus's Gaussian footprint gives a high reward.
 * One thread per fungus (1D launch, threads with h >= F exit); each thread
 * walks the whole H x W map, indexed row-major as grad[y*W + x].
 */
__global__ void k_reward_map(const float* grad, int H,int W,
                             const float* fx,const float* fy,const float* fs,
                             const float* fa,const float* fth,
                             float* R, int F){
    const int h = blockIdx.x*blockDim.x + threadIdx.x;
    if (h < F){
        const float centreX = fx[h];
        const float centreY = fy[h];
        const float radius  = fs[h];
        const float aniso   = fa[h];
        const float angle   = fth[h];
        float total = 0.f;
        for (int row=0; row<H; ++row){
            const float* gradRow = grad + row*W;
            for (int col=0; col<W; ++col){
                const float weight = d_phi_gauss(col,row,centreX,centreY,radius,aniso,angle);
                total += weight * fabsf(gradRow[col]);
            }
        }
        R[h] = total;   // positive sign: high gradients are rewarded
    }
}
// ------------------------- 2) Gravity & Motion ------------------------------
/**
 * Pairwise softened attraction between fungi, one thread per fungus.
 * Every thread loops over all F fungi (O(F^2) work in total) and writes the
 * resulting acceleration of fungus i to ax[i], ay[i].
 *   G    : gravitational constant
 *   eps2 : softening term added to the squared distance
 * The fungus's own mass is floored at 1e-3 before it is used as a divisor.
 */
__global__ void k_gravity_forces(const float* fx,const float* fy,const float* mass,
                                 float* ax,float* ay, int F, float G, float eps2){
    const int self = blockIdx.x*blockDim.x + threadIdx.x;
    if (self >= F) return;
    const float px = fx[self];
    const float py = fy[self];
    const float selfMass = fmaxf(mass[self], 1e-3f);
    float accX = 0.f;
    float accY = 0.f;
    for (int other=0; other<F; ++other){
        if (other != self){
            const float dx = fx[other]-px;
            const float dy = fy[other]-py;
            const float softR2 = dx*dx + dy*dy + eps2;
            const float invR = rsqrtf(softR2);
            const float invR3 = invR*invR*invR;
            // Force magnitude over r; dividing by selfMass below yields acceleration.
            const float force = G * selfMass * mass[other] * invR3;
            accX += force * dx / selfMass;
            accY += force * dy / selfMass;
        }
    }
    ax[self] = accX;
    ay[self] = accY;
}
/**
 * Damped Euler step with reflecting box bounds, one thread per fungus.
 * The velocity is updated first, v = (v + dt*a) * damp, and the position is
 * then advanced with the new velocity. Positions leaving [0, W-1] x [0, H-1]
 * are snapped to the border and the matching velocity component is negated.
 * vx / vy may be null: the velocity is then read as 0 and not written back.
 */
__global__ void k_apply_motion(float* fx,float* fy,
                               float* vx,float* vy,
                               const float* ax,const float* ay,
                               int F, int W,int H, float dt, float damp){
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i >= F) return;

    float velX = 0.f;
    float velY = 0.f;
    if (vx) velX = vx[i];
    if (vy) velY = vy[i];

    velX = (velX + dt*ax[i]) * damp;
    velY = (velY + dt*ay[i]) * damp;

    float posX = fx[i] + dt*velX;
    float posY = fy[i] + dt*velY;

    // Reflect at the borders (the low and high checks are independent, not else-if).
    if (posX < 0)   { posX = 0;   velX = -velX; }
    if (posX > W-1) { posX = W-1; velX = -velX; }
    if (posY < 0)   { posY = 0;   velY = -velY; }
    if (posY > H-1) { posY = H-1; velY = -velY; }

    fx[i] = posX;
    fy[i] = posY;
    if (vx) vx[i] = velX;
    if (vy) vy[i] = velY;
}
// -------------------- 3) Energy, growth/shrink, death -----------------------
/**
 * Per-fungus energy bookkeeping, one thread per fungus.
 *   energy <- energy*decay + food*R - cost*(1 + 0.01*sigma^2)
 *   age    <- age + 1
 *   sigma  <- max(1, sigma * (1 + 0.05*tanh(energy)))   (grow or shrink)
 *   deathFlag <- 1 when the new energy is below death_th, else 0
 * fx, fy, fa, fth, a0, p0 and mass are accepted but not touched by this kernel.
 */
__global__ void k_energy_growth_mark(float* fx,float* fy,float* fs,float* fa,float* fth,
                                     float* a0,float* p0,
                                     float* energy,float* mass,int* age,
                                     const float* R, int* deathFlag, int F,
                                     float food,float decay,float death_th,float cost){
    const int h = blockIdx.x*blockDim.x + threadIdx.x;
    if (h >= F) return;

    // Upkeep cost rises with the square of the radius.
    const float upkeep = cost*(1.f + 0.01f*fs[h]*fs[h]);
    const float e = energy[h]*decay + food*R[h] - upkeep;
    energy[h] = e;
    age[h] += 1;

    // Radius follows tanh(energy) and never drops below 1.
    const float growth = tanhf(e);
    fs[h] = fmaxf(1.0f, fs[h]*(1.f + 0.05f*growth));

    if (e < death_th) deathFlag[h] = 1;
    else              deathFlag[h] = 0;
}
// -------------------- 4) Host-side reproduction (pairing etc.) --------------
/**
 * Uniform crossover followed by Gaussian mutation of one scalar gene.
 *
 * Picks `a` or `b` with equal probability, then adds N(0, mut_std) noise.
 * std::normal_distribution requires a strictly positive standard deviation,
 * so a non-positive `mut_std` means "no mutation": the chosen parent value is
 * returned unchanged and no Gaussian sample is drawn.
 *
 * @param a, b     parent gene values
 * @param rng      random engine (advanced by this call)
 * @param mut_std  standard deviation of the mutation noise
 * @return the child gene value
 */
static inline float _xover_mut(float a, float b, std::mt19937& rng, float mut_std){
    std::uniform_real_distribution<float> U(0.f,1.f);
    const float t = U(rng) < 0.5f ? a : b;   // uniform crossover
    if (!(mut_std > 0.f)) return t;          // also catches NaN
    std::normal_distribution<float> N(0.f, mut_std);
    return t + N(rng);                       // mutation
}
/** Clamp v into [lo, hi]: values above hi become hi, values below lo become lo. */
static inline float _clip(float v, float lo, float hi){
    const float capped = (v < hi) ? v : hi;   // upper bound first
    return (lo < capped) ? capped : lo;       // then lower bound
}
/** Create one child from parents i,j; append to population (HOST memory). */
static void _spawn_child(const FungiSoA& P, int i, int j,
FungiSoA& out, std::mt19937& rng,
float mut_pos=0.5f, float mut_par=0.2f){
FungiSoA& R = out;
R.F += 1;
// Ensure capacity in vectors
auto reserveF = [&](std::vector<float>& v){ if((int)v.size()<R.F) v.resize(R.F); };
auto reserveI = [&](std::vector<int>& v){ if((int)v.size()<R.F) v.resize(R.F); };
reserveF(R.x); reserveF(R.y); reserveF(R.sigma); reserveF(R.alpha); reserveF(R.theta);
reserveF(R.a_base); reserveF(R.p_base); reserveF(R.energy); reserveF(R.mass); reserveI(R.age);
int k = R.F-1;
// genes
R.x[k] = _clip(_xover_mut(P.x[i], P.x[j], rng, mut_pos), 0.f, (float)(R.W-1));
R.y[k] = _clip(_xover_mut(P.y[i], P.y[j], rng, mut_pos), 0.f, (float)(R.H-1));
R.sigma[k] = _clip(_xover_mut(P.sigma[i], P.sigma[j], rng, mut_par), 1.0f, 10.0f);
R.alpha[k] = _clip(_xover_mut(P.alpha[i], P.alpha[j], rng, 0.1f), 0.3f, 3.0f);
R.theta[k] = _xover_mut(P.theta[i], P.theta[j], rng, 0.2f);
R.a_base[k] = _xover_mut(P.a_base[i],P.a_base[j], rng, 0.1f);
R.p_base[k] = _xover_mut(P.p_base[i],P.p_base[j], rng, 0.1f);
// newborn state
R.energy[k] = 0.0f;
R.mass[k] = 0.5f*(P.mass[i]+P.mass[j]) * 0.3f; // fraction of parents' mass
R.age[k] = 0;
}
/**
 * Pairing and reproduction on the HOST population.
 *
 * 1. Individuals are ranked by energy (descending). Rank-adjacent pairs whose
 *    energies are both positive and whose distance is at most evo.pair_dist
 *    reproduce, up to max(1, F/8) pairs.
 * 2. Each pair spawns evo.offspring_per_pair children (appended to a working
 *    copy of the population).
 * 3. Parents donate: mass *= 0.85, sigma *= 0.95, energy *= 0.7.
 *    The donation is applied both to `P` (so later pairings read the shrunk
 *    parent) and to the working copy. Previously it was applied to `P` only,
 *    and was then lost when `P` was overwritten by the working copy.
 * 4. Individuals with energy < -2 or mass < 0.05 are dropped, and the
 *    population is capped at evo.max_population (or 1.5 * F when that is
 *    not positive), keeping the highest-energy individuals.
 *
 * NOTE(review): the RNG is re-seeded from evo.seed on every call, so the
 * mutation noise repeats across generations unless the caller changes the
 * seed. Confirm whether that is intended.
 */
static void _pair_and_reproduce(FungiSoA& P, const EvoParams& evo){
    std::vector<int> ids(P.F); std::iota(ids.begin(), ids.end(), 0);
    std::sort(ids.begin(), ids.end(), [&](int a,int b){ return P.energy[a] > P.energy[b]; });
    std::mt19937 rng(evo.seed ^ 0xBADC0DEu);
    FungiSoA out = P;   // working copy; children are appended here

    auto dist = [&](int i,int j){
        float dx=P.x[i]-P.x[j], dy=P.y[i]-P.y[j];
        return std::sqrt(dx*dx+dy*dy);
    };
    // Parent p keeps index p in `out`, because children are only appended.
    auto donate = [&](int p){
        P.mass[p]   *= 0.85f;  out.mass[p]   = P.mass[p];
        P.sigma[p]  *= 0.95f;  out.sigma[p]  = P.sigma[p];
        P.energy[p] *= 0.7f;   out.energy[p] = P.energy[p];
    };

    const int maxPairs = std::max(1, P.F/8);
    int made=0;
    for (size_t idx=0; idx+1<ids.size() && made<maxPairs; ++idx){
        // ids is a permutation, so i and j are always distinct.
        const int i = ids[idx], j = ids[idx+1];
        // Consider as adults only if energy is positive.
        if (P.energy[i] <= 0.f || P.energy[j] <= 0.f) continue;
        if (dist(i,j) > evo.pair_dist) continue;
        // Spawn children.
        for (int c=0; c<evo.offspring_per_pair; ++c){
            _spawn_child(P, i, j, out, rng);
        }
        // Parents donate some mass/energy and shrink a bit.
        donate(i);
        donate(j);
        made++;
    }

    // Drop "dead" individuals: strongly negative energy or negligible mass.
    std::vector<int> order; order.reserve(out.F);
    for(int h=0; h<out.F; ++h){
        const bool dead = (out.energy[h] < -2.0f || out.mass[h] < 0.05f);
        if (!dead) order.push_back(h);
    }

    // Cap by energy (keep the best) if necessary.
    const int cap = evo.max_population>0 ? evo.max_population : (int)std::floor(1.5 * (double)P.F);
    if ((int)order.size() > cap){
        std::sort(order.begin(), order.end(), [&](int a,int b){ return out.energy[a] > out.energy[b]; });
        order.resize(cap);
    }

    // Rewrite every column with the survivors listed in `order`.
    auto pick = [&](std::vector<float>& v){
        std::vector<float> nv; nv.reserve(order.size());
        for(int id: order) nv.push_back(v[id]);
        v.swap(nv);
    };
    auto picki = [&](std::vector<int>& v){
        std::vector<int> nv; nv.reserve(order.size());
        for(int id: order) nv.push_back(v[id]);
        v.swap(nv);
    };
    pick(out.x); pick(out.y); pick(out.sigma); pick(out.alpha); pick(out.theta);
    pick(out.a_base); pick(out.p_base); pick(out.energy); pick(out.mass); picki(out.age);
    out.F = (int)order.size();
    P = std::move(out);
}
// ---------------------- 5) Full ecology step (public) -----------------------
/**
 * One full ecology step for the population `pop`.
 *
 * The host SoA is uploaded, then on the device:
 *   1) rewards are computed from the gradient map (k_reward_map),
 *   2) gravity accelerations and one motion step are applied; velocities
 *      are zero-initialised on every call, so no momentum is carried over,
 *   3) energy, growth and death flags are updated (k_energy_growth_mark).
 * The arrays are downloaded, flagged individuals are removed on the host and
 * _pair_and_reproduce() is run.
 *
 * @param pop         population; modified in place (F may change)
 * @param d_grad_map  DEVICE pointer to an H x W float map (passed to the
 *                    kernel unchanged — must be device-accessible)
 * @param evo         evolution parameters
 * @throws std::runtime_error on any CUDA API or kernel-launch error. Device
 *         buffers are released on every path, including when an error is thrown.
 */
void fungi_ecology_step(FungiSoA& pop, const float* d_grad_map, const EvoParams& evo){
    const int F=pop.F, H=pop.H, W=pop.W;
    if (F<=0) return;

    // Owns one device allocation; freed when it goes out of scope, so an
    // exception thrown by CK() cannot leak device memory.
    struct DeviceBuffer {
        void* ptr = nullptr;
        DeviceBuffer(size_t bytes, const char* what){ CK(cudaMalloc(&ptr, bytes), what); }
        ~DeviceBuffer(){ cudaFree(ptr); }
        DeviceBuffer(const DeviceBuffer&) = delete;
        DeviceBuffer& operator=(const DeviceBuffer&) = delete;
        float* f() const { return static_cast<float*>(ptr); }
        int*   i() const { return static_cast<int*>(ptr); }
    };

    const size_t fBytes = sizeof(float)*F;
    const size_t iBytes = sizeof(int)*F;
    const int threads = 256;
    const int blocks  = (F + threads - 1)/threads;

    std::vector<int> hDead(F);
    {
        // Device copies of the SoA.
        DeviceBuffer b_fx(fBytes,"alloc fx");
        DeviceBuffer b_fy(fBytes,"alloc fy");
        DeviceBuffer b_fs(fBytes,"alloc fs");
        DeviceBuffer b_fa(fBytes,"alloc fa");
        DeviceBuffer b_fth(fBytes,"alloc th");
        DeviceBuffer b_a0(fBytes,"alloc a0");
        DeviceBuffer b_p0(fBytes,"alloc p0");
        DeviceBuffer b_E(fBytes,"alloc E");
        DeviceBuffer b_M(fBytes,"alloc M");
        DeviceBuffer b_Age(iBytes,"alloc Age");
        // Temporaries: reward, acceleration, velocity, death flags.
        DeviceBuffer b_R(fBytes,"alloc R");
        DeviceBuffer b_ax(fBytes,"alloc ax");
        DeviceBuffer b_ay(fBytes,"alloc ay");
        DeviceBuffer b_vx(fBytes,"alloc vx");
        DeviceBuffer b_vy(fBytes,"alloc vy");
        DeviceBuffer b_dead(iBytes,"alloc deadFlag");

        float *fx=b_fx.f(), *fy=b_fy.f(), *fs=b_fs.f(), *fa=b_fa.f(), *fth=b_fth.f();
        float *a0=b_a0.f(), *p0=b_p0.f(), *E=b_E.f(), *M=b_M.f();
        int   *Age=b_Age.i();
        float *dR=b_R.f(), *ax=b_ax.f(), *ay=b_ay.f(), *vx=b_vx.f(), *vy=b_vy.f();
        int   *dDead=b_dead.i();

        // Upload SoA to device.
        CK(cudaMemcpy(fx, pop.x.data(),      fBytes, cudaMemcpyHostToDevice),"H2D fx");
        CK(cudaMemcpy(fy, pop.y.data(),      fBytes, cudaMemcpyHostToDevice),"H2D fy");
        CK(cudaMemcpy(fs, pop.sigma.data(),  fBytes, cudaMemcpyHostToDevice),"H2D fs");
        CK(cudaMemcpy(fa, pop.alpha.data(),  fBytes, cudaMemcpyHostToDevice),"H2D fa");
        CK(cudaMemcpy(fth,pop.theta.data(),  fBytes, cudaMemcpyHostToDevice),"H2D th");
        CK(cudaMemcpy(a0, pop.a_base.data(), fBytes, cudaMemcpyHostToDevice),"H2D a0");
        CK(cudaMemcpy(p0, pop.p_base.data(), fBytes, cudaMemcpyHostToDevice),"H2D p0");
        CK(cudaMemcpy(E,  pop.energy.data(), fBytes, cudaMemcpyHostToDevice),"H2D E");
        CK(cudaMemcpy(M,  pop.mass.data(),   fBytes, cudaMemcpyHostToDevice),"H2D M");
        CK(cudaMemcpy(Age,pop.age.data(),    iBytes, cudaMemcpyHostToDevice),"H2D Age");

        // 1) Reward
        k_reward_map<<<blocks,threads>>>(d_grad_map, H,W, fx,fy,fs,fa,fth, dR, F);
        CK(cudaGetLastError(),"launch k_reward_map");

        // 2) Gravity & motion (velocities start from zero each step)
        CK(cudaMemset(vx,0,fBytes),"zero vx");
        CK(cudaMemset(vy,0,fBytes),"zero vy");
        k_gravity_forces<<<blocks,threads>>>(fx,fy,M, ax,ay, F, evo.G, evo.eps2);
        CK(cudaGetLastError(),"launch k_gravity_forces");
        k_apply_motion<<<blocks,threads>>>(fx,fy, vx,vy, ax,ay, F, W,H, evo.dt, evo.damp);
        CK(cudaGetLastError(),"launch k_apply_motion");

        // 3) Energy/growth + death flags
        CK(cudaMemset(dDead,0,iBytes),"zero deadFlag");
        k_energy_growth_mark<<<blocks,threads>>>(fx,fy,fs,fa,fth, a0,p0, E,M,Age,
                                                 dR, dDead, F,
                                                 evo.food,evo.decay,evo.death_th,evo.cost);
        CK(cudaGetLastError(),"launch k_energy_growth_mark");

        // Download updated arrays (blocking copies; kernel faults surface here).
        CK(cudaMemcpy(pop.x.data(),      fx,  fBytes, cudaMemcpyDeviceToHost),"D2H fx");
        CK(cudaMemcpy(pop.y.data(),      fy,  fBytes, cudaMemcpyDeviceToHost),"D2H fy");
        CK(cudaMemcpy(pop.sigma.data(),  fs,  fBytes, cudaMemcpyDeviceToHost),"D2H fs");
        CK(cudaMemcpy(pop.alpha.data(),  fa,  fBytes, cudaMemcpyDeviceToHost),"D2H fa");
        CK(cudaMemcpy(pop.theta.data(),  fth, fBytes, cudaMemcpyDeviceToHost),"D2H th");
        CK(cudaMemcpy(pop.a_base.data(), a0,  fBytes, cudaMemcpyDeviceToHost),"D2H a0");
        CK(cudaMemcpy(pop.p_base.data(), p0,  fBytes, cudaMemcpyDeviceToHost),"D2H p0");
        CK(cudaMemcpy(pop.energy.data(), E,   fBytes, cudaMemcpyDeviceToHost),"D2H E");
        CK(cudaMemcpy(pop.mass.data(),   M,   fBytes, cudaMemcpyDeviceToHost),"D2H M");
        CK(cudaMemcpy(pop.age.data(),    Age, iBytes, cudaMemcpyDeviceToHost),"D2H Age");
        CK(cudaMemcpy(hDead.data(),      dDead, iBytes, cudaMemcpyDeviceToHost),"D2H deadFlag");
    }   // all device buffers are released here

    // Remove individuals marked dead, on the HOST.
    std::vector<int> keep; keep.reserve(pop.F);
    for (int i=0;i<pop.F;i++) if(!hDead[i]) keep.push_back(i);
    auto pick = [&](std::vector<float>& v){
        std::vector<float> nv; nv.reserve(keep.size());
        for(int id: keep) nv.push_back(v[id]);
        v.swap(nv);
    };
    auto picki=[&](std::vector<int>& v){
        std::vector<int> nv; nv.reserve(keep.size());
        for(int id: keep) nv.push_back(v[id]);
        v.swap(nv);
    };
    pick(pop.x); pick(pop.y); pick(pop.sigma); pick(pop.alpha); pick(pop.theta);
    pick(pop.a_base); pick(pop.p_base); pick(pop.energy); pick(pop.mass); picki(pop.age);
    pop.F = (int)keep.size();

    // 4) Pairing & reproduction
    _pair_and_reproduce(pop, evo);
}
#include "fungi.hpp"
#include <cuda_runtime.h>
#include <curand_kernel.h>
#include <algorithm>
#include <numeric>
#include <cmath>
#include <stdexcept>
/**
 * Turn a failed CUDA runtime status into an exception.
 * On anything other than cudaSuccess, throws
 * std::runtime_error("[CUDA] <msg>: <cuda error string>").
 */
static inline void ck(cudaError_t st, const char* msg){
    if (st == cudaSuccess) return;
    std::string text = "[CUDA] ";
    text += msg;
    text += ": ";
    text += cudaGetErrorString(st);
    throw std::runtime_error(text);
}
void FungiSoA::resize(int F_, int H_, int W_) {
F = F_; H=H_; W=W_;
x.resize(F); y.resize(F); sigma.resize(F); alpha.resize(F); theta.resize(F);
a_base.resize(F); p_base.resize(F);
energy.assign(F, 0.f); mass.assign(F, 1.f); age.assign(F, 0);
}
/**
 * Randomise the genes of all F individuals from a seeded Mersenne Twister.
 *   x, y    : uniform over [0, W-1] x [0, H-1]
 *   sigma   : uniform between sigma_min and sigma_max
 *   alpha   : uniform in [0.7, 1.3]
 *   theta   : uniform in [-pi, pi]
 *   a_base, p_base : normal with mean 0 and std-dev 0.15
 * energy, mass and age are not modified.
 *
 * std::uniform_real_distribution requires lower <= upper. The position range
 * collapses to [0, 0] when W or H is not positive (the same guard that
 * adjust_population uses), and a reversed sigma range is swapped. Draws are
 * unchanged for valid inputs.
 */
void FungiSoA::init_random(unsigned seed, float sigma_min, float sigma_max) {
    std::mt19937 rng(seed);
    const float xMax  = (float)(W > 0 ? W-1 : 0);
    const float yMax  = (float)(H > 0 ? H-1 : 0);
    const float sigLo = std::min(sigma_min, sigma_max);
    const float sigHi = std::max(sigma_min, sigma_max);
    std::uniform_real_distribution<float> Ux(0.f, xMax);
    std::uniform_real_distribution<float> Uy(0.f, yMax);
    std::uniform_real_distribution<float> Us(sigLo, sigHi);
    std::uniform_real_distribution<float> Ua(0.7f, 1.3f);
    std::uniform_real_distribution<float> Ut(-3.1415926f, 3.1415926f);
    std::normal_distribution<float> N01(0.f, 0.15f);
    for (int i=0;i<F;i++){
        // Statement order fixes the RNG draw order; keep it for reproducibility.
        x[i]=Ux(rng); y[i]=Uy(rng); sigma[i]=Us(rng);
        alpha[i]=Ua(rng); theta[i]=Ut(rng);
        a_base[i]=N01(rng); p_base[i]=N01(rng);
    }
}
void FungiSoA::adjust_population(int newF, unsigned seed) {
if (newF < 1) newF = 1;
if (newF == F) return;
if (newF < F) {
std::vector<int> idx(F);
std::iota(idx.begin(), idx.end(), 0);
std::partial_sort(idx.begin(), idx.begin() + newF, idx.end(), [&](int a, int b){
return energy[a] > energy[b];
});
idx.resize(newF);
auto reorderF = [&](std::vector<float>& v){
std::vector<float> nv; nv.reserve(newF);
for (int id : idx) nv.push_back(v[id]);
v.swap(nv);
};
auto reorderI = [&](std::vector<int>& v){
std::vector<int> nv; nv.reserve(newF);
for (int id : idx) nv.push_back(v[id]);
v.swap(nv);
};
reorderF(x); reorderF(y); reorderF(sigma); reorderF(alpha); reorderF(theta);
reorderF(a_base); reorderF(p_base); reorderF(energy); reorderF(mass);
reorderI(age);
F = newF;
return;
}
int oldF = F;
x.resize(newF); y.resize(newF); sigma.resize(newF); alpha.resize(newF); theta.resize(newF);
a_base.resize(newF); p_base.resize(newF); energy.resize(newF); mass.resize(newF); age.resize(newF);
std::mt19937 rng(seed ^ (unsigned)newF);
std::uniform_real_distribution<float> Ux(0.f, (float)(W>0?W-1:0));
std::uniform_real_distribution<float> Uy(0.f, (float)(H>0?H-1:0));
std::uniform_real_distribution<float> Us(1.5f, 6.5f);
std::uniform_real_distribution<float> Ua(0.5f, 1.6f);
std::uniform_real_distribution<float> Ut(-3.1415926f, 3.1415926f);
std::normal_distribution<float> Namp(0.f, 0.3f);
for (int i = oldF; i < newF; ++i) {
x[i] = (W > 0) ? Ux(rng) : 0.f;
y[i] = (H > 0) ? Uy(rng) : 0.f;
sigma[i] = Us(rng);
alpha[i] = Ua(rng);
theta[i] = Ut(rng);
a_base[i] = Namp(rng);
p_base[i] = Namp(rng);
energy[i] = 0.f;
mass[i] = 1.f;
age[i] = 0;
}
F = newF;
}
// ------------------ GPU kernels ------------------
/** Fill A[0..N) with v (default 0), one thread per element; threads with i >= N do nothing. */
__global__ void k_clear(float* A, int N, float v=0.f){
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i >= N) return;
    A[i] = v;
}
/**
 * Elliptical Gaussian basis φ_h(x,y) for one fungus.
 * The pixel offset from (cx, cy) is rotated by `th`; the rotated x-axis is
 * weighted by a/s^2 and the rotated y-axis by (1/a)/s^2. Returns
 * exp(-0.5 * weighted squared distance), via the fast __expf intrinsic.
 */
__device__ inline float phi_gauss(int x, int y, float cx, float cy, float s, float a, float th){
    const float relX = x - cx;
    const float relY = y - cy;
    const float cs = cosf(th);
    const float sn = sinf(th);
    const float weightX = a/(s*s);
    const float weightY = (1.f/a)/(s*s);
    // Offset in the rotated frame.
    const float rotX = cs*relX + sn*relY;
    const float rotY = -sn*relX + cs*relY;
    const float halfDist = 0.5f*(rotX*rotX*weightX + rotY*rotY*weightY);
    return __expf(-halfDist);
}
/**
 * Build the amplitude mask A and phase mask P, one thread per pixel
 * (flat index idx -> y = idx / W, x = idx % W; threads with idx >= H*W exit).
 * Each pixel sums the contribution of all F fungi:
 *   sa = Σ a_base[h] * φ_h(x,y),   sp = Σ p_base[h] * φ_h(x,y)
 * and maps them with clamping to avoid oversaturation:
 *   A = min(softplus(min(sa, 2)), 2) + 1e-3
 *   P = pi * tanh(clamp(sp, -1, 1))
 */
__global__ void k_build_masks(const float* fx,const float* fy,const float* fs,const float* fa,const float* fth,
                              const float* fa0,const float* fp0, int F,
                              float* A, float* P, int H, int W){
    const int pix = blockIdx.x * blockDim.x + threadIdx.x;
    const int total = H*W;
    if (pix >= total) return;
    const int row = pix / W;
    const int col = pix % W;

    float ampSum = 0.f;
    float phaseSum = 0.f;
    #pragma unroll 4
    for (int h=0; h<F; ++h){
        const float w = phi_gauss(col,row, fx[h],fy[h],fs[h],fa[h],fth[h]);
        ampSum   += fa0[h] * w;
        phaseSum += fp0[h] * w;
    }

    // Amplitude: softplus of the clamped sum, itself clamped to 2, plus a small floor.
    const float softplus = log1pf(expf(fminf(ampSum, 2.0f)));
    const float amplitude = fminf(softplus, 2.0f) + 1e-3f;
    // Phase: the input is clamped to [-1, 1] before tanh.
    const float phaseIn = fminf(fmaxf(phaseSum, -1.0f), 1.0f);
    const float phase = 3.1415926f * tanhf(phaseIn);

    A[pix] = amplitude;
    P[pix] = phase;
}
void fungi_build_masks_GPU(const FungiSoA& pop,
float* d_A, float* d_P,
int /*tiles_y*/, int /*tiles_x*/) {
// Upload minimal arrays (F is small ~128-512)
float *dx,*dy,*ds,*da,*dt,*dab,*dpb;
ck(cudaMalloc(&dx, sizeof(float)*pop.F), "alloc fx");
ck(cudaMalloc(&dy, sizeof(float)*pop.F), "alloc fy");
ck(cudaMalloc(&ds, sizeof(float)*pop.F), "alloc fs");
ck(cudaMalloc(&da, sizeof(float)*pop.F), "alloc fa");
ck(cudaMalloc(&dt, sizeof(float)*pop.F), "alloc fth");
ck(cudaMalloc(&dab,sizeof(float)*pop.F), "alloc a0");
ck(cudaMalloc(&dpb,sizeof(float)*pop.F), "alloc p0");
ck(cudaMemcpy(dx, pop.x.data(), sizeof(float)*pop.F, cudaMemcpyHostToDevice), "H2D fx");
ck(cudaMemcpy(dy, pop.y.data(), sizeof(float)*pop.F, cudaMemcpyHostToDevice), "H2D fy");
ck(cudaMemcpy(ds, pop.sigma.data(), sizeof(float)*pop.F, cudaMemcpyHostToDevice), "H2D fs");
ck(cudaMemcpy(da, pop.alpha.data(), sizeof(float)*pop.F, cudaMemcpyHostToDevice), "H2D fa");
ck(cudaMemcpy(dt, pop.theta.data(), sizeof(float)*pop.F, cudaMemcpyHostToDevice), "H2D fth");
ck(cudaMemcpy(dab,pop.a_base.data(), sizeof(float)*pop.F, cudaMemcpyHostToDevice), "H2D a0");
ck(cudaMemcpy(dpb,pop.p_base.data(), sizeof(float)*pop.F, cudaMemcpyHostToDevice), "H2D p0");
int N = pop.H*pop.W;
k_clear<<<(N+255)/256,256>>>(d_A, N, 0.f);
k_clear<<<(N+255)/256,256>>>(d_P, N, 0.f);
k_build_masks<<<(N+255)/256,256>>>(dx,dy,ds,da,dt, dab,dpb,pop.F, d_A,d_P, pop.H,pop.W);
cudaFree(dx); cudaFree(dy); cudaFree(ds); cudaFree(da); cudaFree(dt); cudaFree(dab); cudaFree(dpb);
}
// --- simple RNG on device for evolution ---
/**
 * Initialise one Philox RNG state per thread: state i uses the shared `seed`,
 * subsequence i and offset 0. Threads with i >= n do nothing.
 */
__global__ void k_rng_init(curandStatePhilox4_32_10_t* st, unsigned seed, int n){
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i >= n) return;
    curand_init(seed, i, 0, &st[i]);
}
/**
 * Reward accumulation: R_h = + Σ_{x,y} φ_h(x,y) * |grad(x,y)|.
 * The sign is positive, so fungi sitting on HIGH gradient magnitude are rewarded.
 * One thread per fungus (threads with h >= F exit); each thread scans the
 * whole H x W map, indexed row-major as grad[y*W + x].
 */
__global__ void k_reward(const float* grad, int H, int W,
                         const float* fx,const float* fy,const float* fs,const float* fa,const float* fth,
                         float* R, int F){
    const int h = blockIdx.x*blockDim.x+threadIdx.x;
    if (h < F){
        const float centreX = fx[h];
        const float centreY = fy[h];
        const float radius  = fs[h];
        const float aniso   = fa[h];
        const float angle   = fth[h];
        float sum = 0.f;
        for (int row=0; row<H; ++row){
            const float* gradRow = grad + row*W;
            for (int col=0; col<W; ++col){
                const float weight = phi_gauss(col,row,centreX,centreY,radius,aniso,angle);
                sum += weight * fabsf(gradRow[col]);   // gradient magnitude
            }
        }
        R[h] = sum;
    }
}
/**
 * In-place evolution step, one thread per fungus (threads with h >= F exit).
 *   1. energy <- energy*decay + food*R - cost*(1 + 0.01*sigma^2); age += 1
 *   2. sigma  <- max(1, sigma * (1 + 0.05*tanh(energy)))
 *   3. position gets a random drift in each axis, clamped to the image
 *   4. if the new energy is below death_th the fungus is "rejuvenated":
 *      position, sigma, alpha and theta are re-drawn, a_base / p_base are
 *      nudged, and energy, mass, age are reset to 0, 1, 0.
 * The per-thread RNG state is loaded from rng[h] and stored back at the end.
 */
__global__ void k_evolve(float* fx,float* fy,float* fs,float* fa,float* fth,
                         float* a0,float* p0,float* energy,float* mass,int* age,
                         const float* R, int F, int H, int W,
                         float food,float decay,float death_th,float cost,
                         curandStatePhilox4_32_10_t* rng){
    const int h = blockIdx.x*blockDim.x+threadIdx.x;
    if (h >= F) return;
    curandStatePhilox4_32_10_t local = rng[h];

    // Energy balance; the cost term penalises a large radius.
    const float e = energy[h]*decay + food*R[h] - cost*(1.f + 0.01f*fs[h]*fs[h]);
    energy[h] = e;
    age[h] += 1;

    // Growth / shrink driven by tanh(e) in [-1, 1].
    const float growth = tanhf(e);
    fs[h] = fmaxf(1.0f, fs[h] * (1.f + 0.05f*growth));

    // Small random drift (Brownian); x is drawn before y.
    const float driftX = (curand_uniform(&local)-0.5f)*1.0f;
    const float driftY = (curand_uniform(&local)-0.5f)*1.0f;
    fx[h] = fminf(fmaxf(fx[h] + driftX, 0.f), (float)(W-1));
    fy[h] = fminf(fmaxf(fy[h] + driftY, 0.f), (float)(H-1));

    // Survivors are done; written as !(e < th) so a NaN energy is not rejuvenated.
    if (!(e < death_th)){
        rng[h] = local;
        return;
    }

    // Rejuvenate a failing fungus (draw order matters for reproducibility).
    fx[h]  = curand_uniform(&local)*(W-1.f);
    fy[h]  = curand_uniform(&local)*(H-1.f);
    fs[h]  = 1.5f + 4.0f*curand_uniform(&local);
    fa[h]  = 0.8f + 0.6f*curand_uniform(&local);
    fth[h] = (curand_uniform(&local)*2.f-1.f)*3.1415926f;
    a0[h] += (curand_uniform(&local)-0.5f)*0.2f;
    p0[h] += (curand_uniform(&local)-0.5f)*0.2f;
    energy[h] = 0.f;
    mass[h]   = 1.f;
    age[h]    = 0;

    rng[h] = local;
}
/**
 * Reward-driven evolution step executed on the GPU.
 *
 * Uploads the population, computes per-fungus rewards from the gradient map
 * (k_reward), initialises one RNG state per fungus from `seed`, runs
 * k_evolve, and downloads the updated arrays into `pop`. The population size
 * does not change.
 *
 * @param pop         population, updated in place
 * @param d_grad_map  DEVICE pointer to a pop.H x pop.W float map (passed to
 *                    the kernel unchanged — must be device-accessible)
 * @param evo_pairs   unused (pairing/attraction is not performed here)
 * @param food, decay, death_th, cost  energy-model parameters forwarded to k_evolve
 * @param seed        RNG seed
 * @throws std::runtime_error on any CUDA API or kernel-launch error; device
 *         buffers are released on every path.
 *
 * An empty population (pop.F <= 0) is a no-op, matching fungi_ecology_step.
 */
void fungi_evolve_GPU(FungiSoA& pop,
                      const float* d_grad_map,
                      int evo_pairs,
                      float food, float decay, float death_th,
                      float cost, unsigned seed){
    (void)evo_pairs; // (pairing/attraction left minimal; reward-based dynamics are cheap & robust)
    const int F = pop.F;
    if (F <= 0) return;   // a zero-block launch would be an invalid configuration

    // Owns one device allocation; freed on scope exit, including during unwinding.
    struct DeviceBuffer {
        void* ptr = nullptr;
        DeviceBuffer(size_t bytes, const char* what){ ck(cudaMalloc(&ptr, bytes), what); }
        ~DeviceBuffer(){ cudaFree(ptr); }
        DeviceBuffer(const DeviceBuffer&) = delete;
        DeviceBuffer& operator=(const DeviceBuffer&) = delete;
        float* f() const { return static_cast<float*>(ptr); }
        int*   i() const { return static_cast<int*>(ptr); }
    };

    const size_t fBytes = sizeof(float)*F;
    const size_t iBytes = sizeof(int)*F;
    const int threads = 256;
    const int blocks  = (F + threads - 1)/threads;

    // Device copies of the SoA plus reward and RNG scratch.
    DeviceBuffer b_fx(fBytes,"alloc fx");
    DeviceBuffer b_fy(fBytes,"alloc fy");
    DeviceBuffer b_fs(fBytes,"alloc fs");
    DeviceBuffer b_fa(fBytes,"alloc fa");
    DeviceBuffer b_fth(fBytes,"alloc th");
    DeviceBuffer b_a0(fBytes,"alloc a0");
    DeviceBuffer b_p0(fBytes,"alloc p0");
    DeviceBuffer b_E(fBytes,"alloc E");
    DeviceBuffer b_M(fBytes,"alloc M");
    DeviceBuffer b_Age(iBytes,"alloc age");
    DeviceBuffer b_R(fBytes,"alloc R");
    DeviceBuffer b_rng(sizeof(curandStatePhilox4_32_10_t)*F,"alloc rng");

    float *fx=b_fx.f(), *fy=b_fy.f(), *fs=b_fs.f(), *fa=b_fa.f(), *fth=b_fth.f();
    float *a0=b_a0.f(), *p0=b_p0.f(), *E=b_E.f(), *M=b_M.f(), *dR=b_R.f();
    int   *Age=b_Age.i();
    curandStatePhilox4_32_10_t* rng = static_cast<curandStatePhilox4_32_10_t*>(b_rng.ptr);

    // Upload SoA
    ck(cudaMemcpy(fx, pop.x.data(),      fBytes, cudaMemcpyHostToDevice),"H2D fx");
    ck(cudaMemcpy(fy, pop.y.data(),      fBytes, cudaMemcpyHostToDevice),"H2D fy");
    ck(cudaMemcpy(fs, pop.sigma.data(),  fBytes, cudaMemcpyHostToDevice),"H2D fs");
    ck(cudaMemcpy(fa, pop.alpha.data(),  fBytes, cudaMemcpyHostToDevice),"H2D fa");
    ck(cudaMemcpy(fth,pop.theta.data(),  fBytes, cudaMemcpyHostToDevice),"H2D th");
    ck(cudaMemcpy(a0, pop.a_base.data(), fBytes, cudaMemcpyHostToDevice),"H2D a0");
    ck(cudaMemcpy(p0, pop.p_base.data(), fBytes, cudaMemcpyHostToDevice),"H2D p0");
    ck(cudaMemcpy(E,  pop.energy.data(), fBytes, cudaMemcpyHostToDevice),"H2D E");
    ck(cudaMemcpy(M,  pop.mass.data(),   fBytes, cudaMemcpyHostToDevice),"H2D M");
    ck(cudaMemcpy(Age,pop.age.data(),    iBytes, cudaMemcpyHostToDevice),"H2D age");

    // Reward
    k_reward<<<blocks,threads>>>(d_grad_map, pop.H,pop.W, fx,fy,fs,fa,fth, dR, F);
    ck(cudaGetLastError(),"launch k_reward");

    // RNG & evolve
    k_rng_init<<<blocks,threads>>>(rng, seed, F);
    ck(cudaGetLastError(),"launch k_rng_init");
    k_evolve<<<blocks,threads>>>(fx,fy,fs,fa,fth, a0,p0,E,M,Age, dR, F, pop.H,pop.W,
                                 food,decay,death_th,cost, rng);
    ck(cudaGetLastError(),"launch k_evolve");

    // Download (blocking copies; kernel execution faults surface here)
    ck(cudaMemcpy(pop.x.data(),      fx,  fBytes, cudaMemcpyDeviceToHost),"D2H fx");
    ck(cudaMemcpy(pop.y.data(),      fy,  fBytes, cudaMemcpyDeviceToHost),"D2H fy");
    ck(cudaMemcpy(pop.sigma.data(),  fs,  fBytes, cudaMemcpyDeviceToHost),"D2H fs");
    ck(cudaMemcpy(pop.alpha.data(),  fa,  fBytes, cudaMemcpyDeviceToHost),"D2H fa");
    ck(cudaMemcpy(pop.theta.data(),  fth, fBytes, cudaMemcpyDeviceToHost),"D2H th");
    ck(cudaMemcpy(pop.a_base.data(), a0,  fBytes, cudaMemcpyDeviceToHost),"D2H a0");
    ck(cudaMemcpy(pop.p_base.data(), p0,  fBytes, cudaMemcpyDeviceToHost),"D2H p0");
    ck(cudaMemcpy(pop.energy.data(), E,   fBytes, cudaMemcpyDeviceToHost),"D2H E");
    ck(cudaMemcpy(pop.mass.data(),   M,   fBytes, cudaMemcpyDeviceToHost),"D2H M");
    ck(cudaMemcpy(pop.age.data(),    Age, iBytes, cudaMemcpyDeviceToHost),"D2H age");
}
void download_mask(float* h, const float* d, int HW){
ck(cudaMemcpy(h,d,sizeof(float)*HW,cudaMemcpyDeviceToHost),"D2H mask");
}
// ================== DIAGNOSTIC TOOLS ==================
/**
 * Fill the host buffer h_pattern (H x W, row-major) with a synthetic test image.
 *
 * pattern_type:
 *   0  checkerboard with 4x4 cells (values 0 / 1)
 *   1  horizontal gradient from 0 (x = 0) to 1 (x = W-1)
 *   2  concentric circles around (W/2, H/2): 0.5 + 0.5*sin(0.5*r)
 *   3  single impulse of 1 at (W/2, H/2), 0 elsewhere
 *   any other value: all zeros
 *
 * A single-column image (W == 1) gets 0 for the gradient pattern rather than
 * the NaN a 0/0 division would produce.
 */
void fungi_create_test_pattern(float* h_pattern, int H, int W, int pattern_type) {
    // Gradient denominator; 1 for W <= 1 so x / denom stays finite.
    const float gradDenom = (W > 1) ? (float)(W-1) : 1.0f;
    const float cx = W/2.0f, cy = H/2.0f;   // centre used by the circles pattern

    for (int y = 0; y < H; y++) {
        for (int x = 0; x < W; x++) {
            float val = 0.0f;
            switch (pattern_type) {
                case 0: // Checkerboard 4x4
                    val = ((x/4 + y/4) % 2) ? 1.0f : 0.0f;
                    break;
                case 1: // Gradient horizontal
                    val = (float)x / gradDenom;
                    break;
                case 2: // Concentric circles
                {
                    float r = sqrtf((x-cx)*(x-cx) + (y-cy)*(y-cy));
                    val = 0.5f + 0.5f * sinf(r * 0.5f);
                }
                    break;
                case 3: // Delta impulse (center)
                    val = (x == W/2 && y == H/2) ? 1.0f : 0.0f;
                    break;
                default: // unknown pattern: leave zeros
                    break;
            }
            h_pattern[y*W + x] = val;
        }
    }
}
/**
 * Download the amplitude and phase masks (HW floats each) from the device and
 * print min / max / mean for both, plus two sanity flags:
 *   - amplitude oversaturation when max(A) > 1.9
 *   - phase out of range when any value lies outside [-3.2, 3.2]
 *
 * @param d_A, d_P  DEVICE pointers to the masks
 * @param HW        number of elements in each mask
 * A non-positive HW prints a notice and returns without touching the device
 * (min/max of an empty range and the division by HW are undefined otherwise).
 * @throws std::runtime_error if a device-to-host copy fails.
 */
void fungi_analyze_mask_statistics(const float* d_A, const float* d_P, int HW) {
    if (HW <= 0) {
        printf("FUNGI MASK STATISTICS: empty mask (HW=%d), nothing to analyze\n", HW);
        return;
    }

    std::vector<float> h_A(HW), h_P(HW);
    ck(cudaMemcpy(h_A.data(), d_A, sizeof(float)*HW, cudaMemcpyDeviceToHost), "D2H A");
    ck(cudaMemcpy(h_P.data(), d_P, sizeof(float)*HW, cudaMemcpyDeviceToHost), "D2H P");

    // Amplitude statistics (sum accumulated in double to limit rounding on large masks)
    float A_min = *std::min_element(h_A.begin(), h_A.end());
    float A_max = *std::max_element(h_A.begin(), h_A.end());
    float A_mean = (float)(std::accumulate(h_A.begin(), h_A.end(), 0.0) / HW);

    // Phase statistics
    float P_min = *std::min_element(h_P.begin(), h_P.end());
    float P_max = *std::max_element(h_P.begin(), h_P.end());
    float P_mean = (float)(std::accumulate(h_P.begin(), h_P.end(), 0.0) / HW);

    printf("🔍 FUNGI MASK STATISTICS:\n");
    printf(" Amplitude: min=%.3f, max=%.3f, mean=%.3f\n", A_min, A_max, A_mean);
    printf(" Phase: min=%.3f, max=%.3f, mean=%.3f\n", P_min, P_max, P_mean);
    printf(" A oversaturation: %s\n", (A_max > 1.9f) ? "⚠️ YES" : "✅ NO");
    printf(" P range check: %s\n", (P_max > 3.2f || P_min < -3.2f) ? "⚠️ OUT OF RANGE" : "✅ OK");
}