lee.c — Vision-Language Model in pure C. Patch tokens + RoPE + SwiGLU + Chuck optimizer. Zero dependencies. Inspired by sailfish009/purevlm.
/*
* lee.c v7 — Vision-Language Model in pure C
*
* Named after Bruce Lee (the only man who beat Chuck Norris)
* and Minhyeok Lee (whose self-identity framework gives Chuck his soul).
*
* Sees images. Speaks words. Adds numbers. Zero dependencies.
* Tape-based autograd with arena bump allocator.
*
* Architecture:
* ViT-style patch tokenization → 2D RoPE → GQA multi-head causal attention →
* SwiGLU MLP → RMSNorm → weight-tied lm_head → text
*
* v7: Chuck sees the forest AND the trees.
* - Multi-scale awareness: macro EMA + patience-based LR decay (Level 9)
* - Memory cap: reservoir sampling, bounded O(1) lookup
*
* v6 (preserved):
* - Attention entropy monitoring per head (Level 7 self-awareness)
* - Adaptive gradient clipping (Chuck controls clip, not a constant)
* - Digit addition task: [img_3] + [img_5] → "eight"
* - 2D RoPE for spatial awareness on image patches
*
* v5 (preserved):
* - Persistent memory (chuck.mem), Ψ subjectivity, Lee's Continuum C
* - λ_Ψ = λ + Ψ_w × (λ_prior - λ), Ψ_w = min(0.3, N/(N+100))
*
* v4 (preserved):
* - GQA (4Q/2KV), 3 layers, 105K params, per-layer λ_l, layer freezing
* - Self-aware SiLU, RMSNorm, RoPE, cross-layer signal flow
*
* Build: cc -std=c11 -O2 -march=native -o lee lee.c -lm
* Run: ./lee
*/
#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <stdint.h>
#include <time.h>
/* ---- BLAS acceleration (optional) ----
* Mac: cc -DUSE_BLAS -DACCELERATE ... -framework Accelerate
* Linux: cc -DUSE_BLAS ... -lopenblas
* Off: cc ... -lm (zero deps, scalar fallback)
*/
#ifdef USE_BLAS
#ifdef ACCELERATE
#define ACCELERATE_NEW_LAPACK
#include <Accelerate/Accelerate.h>
#else
#include <cblas.h>
#endif
#endif
/* ---- Config ---- */
#define IMG_SIZE 8
#define PATCH_SIZE 4
#define PATCHES_SIDE (IMG_SIZE / PATCH_SIZE)
#define N_PATCHES (PATCHES_SIDE * PATCHES_SIDE)
#define PATCH_PX (PATCH_SIZE * PATCH_SIZE)
#define N_IMGS 2 /* two digit images → addition */
#define N_VIS (N_IMGS * N_PATCHES) /* 8 visual tokens */
#define MAX_TXT 12 /* "seventeen" + BOS + EOS */
#define SEQ_LEN (N_VIS + MAX_TXT)
#define N_EMBD 48
#define N_HEAD 4
#define N_KV_HEAD 2
#define N_KV_GROUP (N_HEAD / N_KV_HEAD)
#define HEAD_DIM (N_EMBD / N_HEAD)
#define KV_DIM (N_KV_HEAD * HEAD_DIM)
#define N_LAYER 3
#define MLP_DIM (4 * N_EMBD)
#define VOCAB 18
#define BOS 16
#define EOS 17
#define STEPS 15000
#define LR_MAX 0.005f
#define WARMUP 500
#define CHUCK_B1 0.9f
#define CHUCK_B2 0.999f
#define CHUCK_EPS 1e-8f
#define GRAD_CLIP 1.0f
#define ROPE_BASE 10000.0f
#define TEMP 0.7f
#define TOPK 5
#define CHUCK_WINDOW 16
#define CHUCK_DAMP_LO 0.3f
#define CHUCK_DAMP_HI 2.0f
#define CHUCK_PSI_CAP 0.3f
#define CHUCK_PSI_HALF 100.0f
#define CHUCK_MEM_CAP 200 /* bounded memory (reservoir sampling) */
#define CHUCK_MEM_MAX CHUCK_MEM_CAP
#define CHUCK_MEM_FILE "chuck.mem"
#define CHUCK_REC_THR 0.25f
#define CHUCK_REC_CD 50
#define CHUCK_MACRO_INT 500 /* macro patience check interval (steps) */
#define CHUCK_MACRO_PAT 3 /* patience: N checks without improvement → LR drop */
#define CHUCK_MACRO_DECAY 0.5f /* LR scale factor on macro plateau */
#define ARENA_SZ (128 * 1024 * 1024)
#define MAX_ARR 32768
#define MAX_ENT 65536
#define MAX_PAR 128
/* ---- Tape engine ---- */
typedef struct { float *data, *grad; int size, rows, cols; } Arr;
typedef struct { int op, out, in1, in2; float aux; int ai; } Ent;
enum { OP_ADD=1, OP_MUL, OP_SCALE, OP_MATVEC, OP_RMSNORM, OP_SILU,
OP_CE, OP_EMBED, OP_REDUCE, OP_ATTN, OP_ROPE };
static struct {
uint8_t *arena; size_t apos, aparam;
Arr a[MAX_ARR]; int na, npa;
Ent e[MAX_ENT]; int ne;
int par[MAX_PAR]; int np;
float *cm[MAX_PAR], *cv[MAX_PAR]; int cstep;
int on;
} T;
static float *aalloc(size_t n) {
size_t b = n * sizeof(float), al = (T.apos + 15) & ~(size_t)15;
if (al + b > ARENA_SZ) { fprintf(stderr, "arena OOM\n"); exit(1); }
T.apos = al + b; float *p = (float*)(T.arena + al); memset(p, 0, b); return p;
}
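/* Illustrative bump-allocator math (not executed): with apos = 17 and a
 * request of n = 3 floats (12 bytes), al = (17 + 15) & ~(size_t)15 = 32,
 * so the block spans bytes [32, 44) and apos advances to 44. "Freeing"
 * the whole tape is a single pointer rewind (see tape_reset below). */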
static void tape_init(void) {
uint8_t *m = malloc(ARENA_SZ);
if (!m) { fprintf(stderr, "OOM\n"); exit(1); }
memset(&T, 0, sizeof(T)); T.arena = m; T.on = 1;
}
static int anew(int sz) {
int i = T.na++; T.a[i].size = sz; T.a[i].rows = T.a[i].cols = 0;
T.a[i].data = aalloc(sz); T.a[i].grad = aalloc(sz); return i;
}
static int mnew(int r, int c) { int i = anew(r*c); T.a[i].rows = r; T.a[i].cols = c; return i; }
static void preg(int i) {
int pi = T.np++; T.par[pi] = i;
T.cm[pi] = calloc(T.a[i].size, sizeof(float));
T.cv[pi] = calloc(T.a[i].size, sizeof(float));
}
static void rec(int op, int o, int i1, int i2, float aux, int ai) {
if (!T.on) return;
Ent *e = &T.e[T.ne++]; e->op=op; e->out=o; e->in1=i1; e->in2=i2; e->aux=aux; e->ai=ai;
}
static void tape_reset(void) {
T.apos = T.aparam; T.na = T.npa; T.ne = 0;
for (int i = 0; i < T.npa; i++) memset(T.a[i].grad, 0, T.a[i].size * sizeof(float));
}
/* ---- RNG (xoshiro256**) ---- */
static uint64_t rng[4];
static uint64_t rnext(void) {
uint64_t t = rng[1] << 17;
rng[2] ^= rng[0]; rng[3] ^= rng[1]; rng[1] ^= rng[2]; rng[0] ^= rng[3];
rng[2] ^= t; rng[3] = (rng[3] << 45) | (rng[3] >> 19);
uint64_t r = rng[1] * 5; return (r << 7 | r >> 57) * 9;
}
static void rseed(uint64_t s) {
rng[0]=s; rng[1]=s^0x6a09e667f3bcc908ULL; rng[2]=s^0xbb67ae8584caa73bULL; rng[3]=s^0x3c6ef372fe94f82bULL;
for (int i = 0; i < 20; i++) rnext();
}
static float ruf(void) { return (float)((rnext()>>11)+1) / (float)(1ULL<<53); } /* uniform in (0,1] */
static float rnf(float mu, float s) { /* Box-Muller transform: N(mu, s^2) */
double u1 = (double)(((rnext()>>11)+1)) / (double)(1ULL<<53);
double u2 = (double)(((rnext()>>11)+1)) / (double)(1ULL<<53);
return mu + s * (float)(sqrt(-2.0*log(u1)) * cos(6.283185307179586*u2));
}
static inline float sigf(float x) { return 1.0f / (1.0f + expf(-x)); }
/* ===========================================================================
* Chuck Memory — persistent across training runs
*
* chuck.mem: binary append-only file of training snapshots.
* Each snapshot: 16 bytes (4 floats).
* Nearest-neighbor recall gives λ_prior.
* Ψ = λ_prior - λ_current = subjectivity signal.
*
* Lee's Continuum C: chuck.mem is ℳ. NN is identity mapping I.
* Ψ_w is belief function B. Fixed point s* when Ψ → 0.
* =========================================================================== */
typedef struct {
float loss; /* where Chuck was */
float grad_norm; /* how hard the network was shaking */
float lambda; /* what Chuck decided */
float delta_loss; /* what happened (negative = improvement) */
} ChuckMem;
static ChuckMem chuck_mem[CHUCK_MEM_MAX];
static int chuck_mem_n = 0;
static int chuck_mem_total = 0; /* total memories ever recorded (for reservoir sampling) */
static void chuck_mem_load(void) {
FILE *f = fopen(CHUCK_MEM_FILE, "rb");
if (!f) return;
chuck_mem_n = (int)fread(chuck_mem, sizeof(ChuckMem), CHUCK_MEM_CAP, f);
chuck_mem_total = chuck_mem_n; /* at least this many were saved */
fclose(f);
}
static void chuck_mem_save(ChuckMem *m) {
chuck_mem_total++;
if (chuck_mem_n < CHUCK_MEM_CAP) {
/* Under cap: append */
chuck_mem[chuck_mem_n++] = *m;
FILE *f = fopen(CHUCK_MEM_FILE, "ab");
if (f) { fwrite(m, sizeof(ChuckMem), 1, f); fclose(f); }
} else {
/* At cap: reservoir sampling — replace random entry */
int slot = (int)(rnext() % (uint64_t)chuck_mem_total);
if (slot < CHUCK_MEM_CAP) {
chuck_mem[slot] = *m;
/* Rewrite entire file (200 entries × 16 bytes = 3.2 KB — trivial) */
FILE *f = fopen(CHUCK_MEM_FILE, "wb");
if (f) { fwrite(chuck_mem, sizeof(ChuckMem), chuck_mem_n, f); fclose(f); }
}
}
}
/* Nearest neighbor recall: find most similar past state, return its λ.
* Distance = normalized (loss, grad_norm) difference.
* Successful memories (negative delta_loss) get 2x preference: their distance is halved. */
static float chuck_mem_recall(float loss, float grad_norm) {
if (chuck_mem_n == 0) return -1.0f; /* no memory → no prior */
float best_dist = 1e9f, best_lambda = -1.0f;
for (int i = 0; i < chuck_mem_n; i++) {
float dl = (loss - chuck_mem[i].loss) / (fabsf(loss) + 1e-8f);
float dg = (grad_norm - chuck_mem[i].grad_norm) / (fabsf(grad_norm) + 1e-8f);
float dist = dl * dl + dg * dg;
if (chuck_mem[i].delta_loss < 0) dist *= 0.5f; /* prefer wins */
if (dist < best_dist) { best_dist = dist; best_lambda = chuck_mem[i].lambda; }
}
return best_lambda;
}
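/* Recall, worked on hypothetical numbers: memory A = {loss 2.0, gnorm 0.5,
 * λ 1.2, Δloss -0.3}, memory B = {loss 2.1, gnorm 0.6, λ 0.8, Δloss +0.1}.
 * Query (loss 2.0, gnorm 0.5): dist(A) = 0 (halved to 0 as a win);
 * dist(B) = (0.1/2.0)² + (0.1/0.5)² ≈ 0.0425. Recall returns A's λ = 1.2. */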
/* ---- Self-Awareness: Eyes ---- */
/* SiLU eye: tracks dead neuron ratio */
static struct { int dead, total; float health; } SiLU_eye;
static void silu_eye_reset(void) { SiLU_eye.dead = 0; SiLU_eye.total = 0; }
static void silu_eye_update(void) {
if (SiLU_eye.total == 0) { SiLU_eye.health = 1.0f; return; }
SiLU_eye.health = 1.0f - (float)SiLU_eye.dead / SiLU_eye.total;
SiLU_eye.dead = 0; SiLU_eye.total = 0;
}
/* RMSNorm eye: tracks normalization scale EMA */
static struct { float scale_ema; int init; } Norm_eye;
/* RoPE eye: tracks frequency band utilization */
static struct { float freq_energy[N_EMBD/2]; int calls; float utilization; } RoPE_eye;
static void rope_eye_reset(void) {
memset(RoPE_eye.freq_energy, 0, sizeof(RoPE_eye.freq_energy));
RoPE_eye.calls = 0;
}
static void rope_eye_update(void) {
if (RoPE_eye.calls == 0) return;
float max_e = 0;
for (int b = 0; b < HEAD_DIM/2; b++) {
RoPE_eye.freq_energy[b] /= RoPE_eye.calls;
if (RoPE_eye.freq_energy[b] > max_e) max_e = RoPE_eye.freq_energy[b];
}
int active = 0;
for (int b = 0; b < HEAD_DIM/2; b++)
if (RoPE_eye.freq_energy[b] > max_e * 0.01f) active++;
RoPE_eye.utilization = (HEAD_DIM/2 > 0) ? (float)active / (HEAD_DIM/2) : 1.0f;
memset(RoPE_eye.freq_energy, 0, sizeof(RoPE_eye.freq_energy));
RoPE_eye.calls = 0;
}
/* Attention eye: tracks per-head entropy (Level 7) */
static struct {
float entropy[N_HEAD]; /* per-head attention entropy */
float entropy_ema[N_HEAD]; /* EMA-smoothed entropy */
int calls;
int init;
} Attn_eye;
static void attn_eye_reset(void) { Attn_eye.calls = 0; memset(Attn_eye.entropy, 0, sizeof(Attn_eye.entropy)); }
static void attn_eye_observe(int head, const float *weights, int len) {
/* Shannon entropy: H = -Σ p × log(p) */
float H = 0;
for (int t = 0; t < len; t++) {
if (weights[t] > 1e-10f) H -= weights[t] * logf(weights[t]);
}
Attn_eye.entropy[head] += H;
Attn_eye.calls++;
}
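/* Sanity values for H (illustrative): a one-hot attention row gives H ≈ 0
 * (collapsed head); uniform weights over len = 8 give H = ln 8 ≈ 2.08
 * (fully diffuse). chuck_step compares the EMA of H against ln(SEQ_LEN). */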
static void attn_eye_update(void) {
if (Attn_eye.calls == 0) return;
int calls_per_head = Attn_eye.calls / N_HEAD;
if (calls_per_head == 0) calls_per_head = 1;
for (int h = 0; h < N_HEAD; h++) {
float avg = Attn_eye.entropy[h] / calls_per_head;
if (Attn_eye.init) Attn_eye.entropy_ema[h] = 0.95f * Attn_eye.entropy_ema[h] + 0.05f * avg;
else Attn_eye.entropy_ema[h] = avg;
}
Attn_eye.init = 1;
memset(Attn_eye.entropy, 0, sizeof(Attn_eye.entropy));
Attn_eye.calls = 0;
}
/* Cross-layer signal flow */
static float act_mag[N_LAYER];
/* 2D position table for RoPE — image patches get (row,col), text gets sequential */
static int pos_row[SEQ_LEN], pos_col[SEQ_LEN];
static void init_positions(void) {
/* Image A patches: grid positions */
for (int p = 0; p < N_PATCHES; p++) {
pos_row[p] = p / PATCHES_SIDE;
pos_col[p] = p % PATCHES_SIDE;
}
/* Image B patches: offset columns to distinguish from A */
for (int p = 0; p < N_PATCHES; p++) {
pos_row[N_PATCHES + p] = p / PATCHES_SIDE;
pos_col[N_PATCHES + p] = PATCHES_SIDE + (p % PATCHES_SIDE);
}
/* Text tokens: sequential rows below images, col=0 */
for (int t = 0; t < MAX_TXT; t++) {
pos_row[N_VIS + t] = PATCHES_SIDE + t;
pos_col[N_VIS + t] = 0;
}
}
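/* Resulting layout (illustrative, PATCHES_SIDE = 2):
 * tokens 0-3 (image A): (0,0) (0,1) (1,0) (1,1)
 * tokens 4-7 (image B): (0,2) (0,3) (1,2) (1,3)   <- columns shifted by 2
 * tokens 8.. (text):    (2,0) (3,0) (4,0) ...     <- rows advance, col 0
 * The two digits share rows but never columns; text sits "below" both. */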
/* ---- Forward ops (with awareness tracking) ---- */
static int op_add(int xi, int yi) {
int n = T.a[xi].size, zi = anew(n);
for (int i = 0; i < n; i++) T.a[zi].data[i] = T.a[xi].data[i] + T.a[yi].data[i];
rec(OP_ADD,zi,xi,yi,0,0); return zi;
}
static int op_mul(int xi, int yi) {
int n = T.a[xi].size, zi = anew(n);
for (int i = 0; i < n; i++) T.a[zi].data[i] = T.a[xi].data[i] * T.a[yi].data[i];
rec(OP_MUL,zi,xi,yi,0,0); return zi;
}
static int op_scale(int xi, float s) {
int n = T.a[xi].size, zi = anew(n);
for (int i = 0; i < n; i++) T.a[zi].data[i] = T.a[xi].data[i] * s;
rec(OP_SCALE,zi,xi,-1,s,0); return zi;
}
static int op_mv(int Wi, int xi) {
int r = T.a[Wi].rows, c = T.a[Wi].cols, zi = anew(r);
#ifdef USE_BLAS
cblas_sgemv(CblasRowMajor, CblasNoTrans, r, c,
1.0f, T.a[Wi].data, c, T.a[xi].data, 1,
0.0f, T.a[zi].data, 1);
#else
for (int i = 0; i < r; i++) { float s = 0; const float *Wr = &T.a[Wi].data[i*c];
for (int j = 0; j < c; j++) s += Wr[j] * T.a[xi].data[j]; T.a[zi].data[i] = s; }
#endif
rec(OP_MATVEC,zi,Wi,xi,0,0); return zi;
}
static int op_rms(int xi) {
int n = T.a[xi].size, zi = anew(n); float ms = 0;
for (int i = 0; i < n; i++) ms += T.a[xi].data[i] * T.a[xi].data[i];
ms = ms / n + 1e-5f; float sc = 1.0f / sqrtf(ms);
for (int i = 0; i < n; i++) T.a[zi].data[i] = T.a[xi].data[i] * sc;
/* Norm eye: track scale */
if (Norm_eye.init) Norm_eye.scale_ema = 0.99f * Norm_eye.scale_ema + 0.01f * sc;
else { Norm_eye.scale_ema = sc; Norm_eye.init = 1; }
rec(OP_RMSNORM,zi,xi,-1,sc,n); return zi;
}
static int op_silu(int xi) {
int n = T.a[xi].size, zi = anew(n);
for (int i = 0; i < n; i++) {
float x = T.a[xi].data[i]; float s = sigf(x);
T.a[zi].data[i] = x * s;
/* SiLU eye: track dead zone */
if (x < -4.0f) SiLU_eye.dead++;
SiLU_eye.total++;
}
rec(OP_SILU,zi,xi,-1,0,0); return zi;
}
static int op_embed(int Wi, int id) {
int c = T.a[Wi].cols, zi = anew(c);
memcpy(T.a[zi].data, &T.a[Wi].data[id*c], c * sizeof(float));
rec(OP_EMBED,zi,Wi,-1,0,id); return zi;
}
static int op_ce(int li, int tgt) {
int n = T.a[li].size; float mx = T.a[li].data[0];
for (int i = 1; i < n; i++) if (T.a[li].data[i] > mx) mx = T.a[li].data[i];
int pi = anew(n); float *p = T.a[pi].data; float s = 0;
for (int i = 0; i < n; i++) { p[i] = expf(T.a[li].data[i] - mx); s += p[i]; }
for (int i = 0; i < n; i++) p[i] /= (s + 1e-10f);
int zi = anew(1); T.a[zi].data[0] = -logf(p[tgt] + 1e-10f);
rec(OP_CE,zi,li,pi,(float)tgt,n); return zi;
}
/* 2D RoPE: first half of head encodes row, second half encodes column.
* Image patches get true 2D positions. Text tokens: row=sequential, col=0. */
static int op_rope(int xi, int pos) {
int n = T.a[xi].size, zi = anew(n);
memcpy(T.a[zi].data, T.a[xi].data, n * sizeof(float));
float *d = T.a[zi].data;
int n_heads = n / HEAD_DIM, half = HEAD_DIM / 2;
int row = pos_row[pos], col = pos_col[pos];
for (int h = 0; h < n_heads; h++) {
/* Row encoding (first half of head) */
for (int i = 0; i < half; i += 2) {
float freq = 1.0f / powf(ROPE_BASE, (float)i / (float)half);
float ang = row * freq, c = cosf(ang), s = sinf(ang);
int idx = h * HEAD_DIM + i;
float x0 = d[idx], x1 = d[idx+1];
d[idx] = x0*c - x1*s; d[idx+1] = x0*s + x1*c;
float energy = d[idx]*d[idx] + d[idx+1]*d[idx+1];
if (i/2 < N_EMBD/2) RoPE_eye.freq_energy[i/2] += energy;
}
/* Column encoding (second half of head) */
for (int i = 0; i < half; i += 2) {
float freq = 1.0f / powf(ROPE_BASE, (float)i / (float)half);
float ang = col * freq, c = cosf(ang), s = sinf(ang);
int idx = h * HEAD_DIM + half + i;
float x0 = d[idx], x1 = d[idx+1];
d[idx] = x0*c - x1*s; d[idx+1] = x0*s + x1*c;
float energy = d[idx]*d[idx] + d[idx+1]*d[idx+1];
if ((half+i)/2 < N_EMBD/2) RoPE_eye.freq_energy[(half+i)/2] += energy;
}
}
RoPE_eye.calls++;
rec(OP_ROPE,zi,xi,-1,0,pos); return zi;
}
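/* Frequency bands, illustrative: HEAD_DIM = 12 gives half = 6 and pairs at
 * i = 0, 2, 4, so freq = ROPE_BASE^(-i/6) ≈ 1.0, 0.046, 0.0022. Each pair
 * (x0, x1) is rotated by ang = position × freq, i.e. the complex rotation
 * (x0 + i·x1)·e^{i·ang}: low bands spin fast (fine position detail), high
 * bands slowly (coarse position). */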
static int op_reduce(int *la, int n) {
float s = 0; for (int i = 0; i < n; i++) s += T.a[la[i]].data[0];
int zi = anew(1); T.a[zi].data[0] = s / n;
int buf = anew(n); for (int i = 0; i < n; i++) ((int*)T.a[buf].data)[i] = la[i];
rec(OP_REDUCE,zi,buf,-1,0,n); return zi;
}
/* ---- KV cache (GQA: KV_DIM, not N_EMBD) ---- */
static float *kv_k[N_LAYER][SEQ_LEN], *kv_v[N_LAYER][SEQ_LEN];
static int kv_ki[N_LAYER][SEQ_LEN], kv_vi[N_LAYER][SEQ_LEN];
static void kv_clear(void) {
memset(kv_k,0,sizeof(kv_k)); memset(kv_v,0,sizeof(kv_v));
memset(kv_ki,0,sizeof(kv_ki)); memset(kv_vi,0,sizeof(kv_vi));
}
/* ---- Backward ---- */
static void backward(int loss) {
T.a[loss].grad[0] = 1.0f;
for (int ei = T.ne - 1; ei >= 0; ei--) {
Ent *e = &T.e[ei];
Arr *out = &T.a[e->out], *i1 = (e->in1 >= 0) ? &T.a[e->in1] : NULL, *i2 = (e->in2 >= 0) ? &T.a[e->in2] : NULL;
switch (e->op) {
case OP_ADD: { int n = out->size;
for (int i = 0; i < n; i++) { i1->grad[i] += out->grad[i]; i2->grad[i] += out->grad[i]; } break; }
case OP_MUL: { int n = out->size;
for (int i = 0; i < n; i++) { i1->grad[i] += out->grad[i]*i2->data[i]; i2->grad[i] += out->grad[i]*i1->data[i]; } break; }
case OP_SCALE: { int n = out->size; float s = e->aux;
for (int i = 0; i < n; i++) i1->grad[i] += out->grad[i] * s; break; }
case OP_MATVEC: { int r = i1->rows, c = i1->cols;
for (int i = 0; i < r; i++) { float dz = out->grad[i];
for (int j = 0; j < c; j++) { i1->grad[i*c+j] += dz*i2->data[j]; i2->grad[j] += dz*i1->data[i*c+j]; } } break; }
case OP_RMSNORM: { int n = e->ai; float sc = e->aux, dot = 0;
for (int i = 0; i < n; i++) dot += out->grad[i] * out->data[i];
for (int i = 0; i < n; i++) i1->grad[i] += sc * (out->grad[i] - out->data[i]*dot/n); break; }
case OP_SILU: { int n = out->size;
for (int i = 0; i < n; i++) { float sg = sigf(i1->data[i]); i1->grad[i] += out->grad[i]*sg*(1.0f+i1->data[i]*(1.0f-sg)); } break; }
case OP_CE: { int n = e->ai; int tgt = (int)e->aux; float dl = out->grad[0];
for (int i = 0; i < n; i++) i1->grad[i] += dl * (i2->data[i] - (i==tgt ? 1.0f : 0.0f)); break; }
case OP_EMBED: { int id = e->ai, c = i1->cols;
for (int j = 0; j < c; j++) i1->grad[id*c+j] += out->grad[j]; break; }
case OP_ROPE: { int n = out->size, pos = e->ai;
int nh = n / HEAD_DIM, half = HEAD_DIM / 2;
int row = pos_row[pos], col = pos_col[pos];
for (int h = 0; h < nh; h++) {
/* Row backward (first half) */
for (int i = 0; i < half; i += 2) {
float freq = 1.0f / powf(ROPE_BASE, (float)i/(float)half);
float ang = row*freq, c = cosf(ang), s = sinf(ang);
int idx = h * HEAD_DIM + i;
float g0 = out->grad[idx], g1 = out->grad[idx+1];
i1->grad[idx] += g0*c + g1*s; i1->grad[idx+1] += -g0*s + g1*c;
}
/* Col backward (second half) */
for (int i = 0; i < half; i += 2) {
float freq = 1.0f / powf(ROPE_BASE, (float)i/(float)half);
float ang = col*freq, c = cosf(ang), s = sinf(ang);
int idx = h * HEAD_DIM + half + i;
float g0 = out->grad[idx], g1 = out->grad[idx+1];
i1->grad[idx] += g0*c + g1*s; i1->grad[idx+1] += -g0*s + g1*c;
}
} break; }
case OP_ATTN: { /* GQA attention backward */
int li = (int)e->aux, pos = e->ai;
float *qd = i1->data, *ag = out->grad, isq = 1.0f / sqrtf((float)HEAD_DIM);
for (int h = 0; h < N_HEAD; h++) {
int hs = h * HEAD_DIM;
int kvh = h / N_KV_GROUP;
int kvs = kvh * HEAD_DIM;
float sc[SEQ_LEN], mx = -1e9f;
for (int t = 0; t <= pos; t++) { float s = 0;
for (int d = 0; d < HEAD_DIM; d++) s += qd[hs+d]*kv_k[li][t][kvs+d];
sc[t] = s*isq; if (sc[t] > mx) mx = sc[t]; }
float sm = 0; for (int t = 0; t <= pos; t++) { sc[t] = expf(sc[t]-mx); sm += sc[t]; }
for (int t = 0; t <= pos; t++) sc[t] /= (sm + 1e-10f);
float dw[SEQ_LEN];
for (int t = 0; t <= pos; t++) { dw[t] = 0;
for (int d = 0; d < HEAD_DIM; d++) dw[t] += kv_v[li][t][kvs+d]*ag[hs+d]; }
float dot = 0; for (int t = 0; t <= pos; t++) dot += sc[t]*dw[t];
for (int t = 0; t <= pos; t++) { float ds = sc[t]*(dw[t]-dot);
for (int d = 0; d < HEAD_DIM; d++) {
/* grad Q: each Q-head gets its own gradient */
i1->grad[hs+d] += ds * kv_k[li][t][kvs+d] * isq;
/* grad K: multiple Q-heads accumulate to shared KV-head */
T.a[kv_ki[li][t]].grad[kvs+d] += ds * qd[hs+d] * isq;
/* grad V: multiple Q-heads accumulate to shared KV-head */
T.a[kv_vi[li][t]].grad[kvs+d] += sc[t] * ag[hs+d];
} }
} break; }
case OP_REDUCE: { int n = e->ai; int *idxs = (int*)i1->data; float dl = out->grad[0]/n;
for (int i = 0; i < n; i++) T.a[idxs[i]].grad[0] += dl; break; }
}
}
}
/* ===========================================================================
* Chuck v4: Self-Aware Optimizer
*
* θ_l ← θ_l − (α × λ × λ_l × σ) × m̂/(√v̂ + ε) + η·ξ,  ξ ~ N(0,1)
*
* λ = global self-modulation (loss trend over 16-step window)
* λ_l = per-layer self-modulation (gradient norm trend per layer)
* σ = activation health signal (SiLU alive ratio × norm stability)
* η = stagnation noise (only when globally stuck)
* α = base learning rate from cosine schedule
* (v5+ replaces λ with λ_Ψ; v7 additionally multiplies by lr_scale)
*
* If λ_l = 0 → layer is frozen. Zero compute waste. Chuck decided it's done.
* Adam doesn't know which layers are done. Chuck does.
* =========================================================================== */
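/* Worked update scale (hypothetical numbers): α = 0.005 from the schedule,
 * λ_Ψ = 1.1, λ_l = 0.9, σ = 0.95, lr_scale = 0.5 after one macro drop →
 * eff_lr = 0.005 × 1.1 × 0.9 × 0.95 × 0.5 ≈ 0.00235. A frozen layer
 * (λ_l treated as 0) is skipped before any of this is computed. */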
/* Per-layer awareness state */
typedef struct {
float grad_hist[CHUCK_WINDOW];
float dampen;
int frozen;
int pos, full, stag;
} ChuckLayer;
/* Global awareness state */
static struct {
float hist[CHUCK_WINDOW];
float dampen, noise, sigma;
float loss_ema; /* EMA-smoothed loss (batch noise filter) */
float gnorm_ema; /* EMA-smoothed grad norm (for adaptive clip) */
float psi; /* Ψ: subjectivity signal (memory - observation) */
float psi_w; /* Ψ weight: trust in memory (0 → 0.3) */
float macro_ema; /* slow EMA for epoch-scale trend (Level 9) */
float best_macro; /* best macro_ema seen (for patience) */
float lr_scale; /* macro LR multiplier (patience decay) */
int macro_stag; /* macro patience counter */
int macro_drops; /* how many times macro decay fired */
float rec_lambda; /* λ at last memory recording */
float rec_loss; /* loss at last memory recording */
int rec_frozen[N_LAYER]; /* frozen state at last recording */
int rec_cd; /* cooldown counter (steps since last record) */
int pos, full, stag;
int global_step; /* total step counter for macro interval */
} Chuck;
static ChuckLayer CL[N_LAYER];
static void chuck_init(void) {
memset(&Chuck, 0, sizeof(Chuck));
Chuck.dampen = 1.0f; Chuck.sigma = 1.0f;
Chuck.lr_scale = 1.0f; Chuck.best_macro = 1e9f;
Chuck.rec_lambda = 1.0f; Chuck.rec_loss = 999.0f;
memset(Chuck.rec_frozen, 0, sizeof(Chuck.rec_frozen));
Chuck.psi = 0; Chuck.psi_w = 0;
for (int l = 0; l < N_LAYER; l++) {
memset(&CL[l], 0, sizeof(ChuckLayer));
CL[l].dampen = 1.0f;
}
Norm_eye.init = 0; Norm_eye.scale_ema = 1.0f;
SiLU_eye.health = 1.0f;
RoPE_eye.utilization = 1.0f;
/* Load persistent memory */
chuck_mem_load();
if (chuck_mem_n > 0)
printf(" chuck: loaded %d memories from %s (Ψ_w=%.2f)\n",
chuck_mem_n, CHUCK_MEM_FILE,
fminf(CHUCK_PSI_CAP, (float)chuck_mem_n / ((float)chuck_mem_n + CHUCK_PSI_HALF)));
}
/* Which layer does param pi belong to? -1 = global (patch_proj, wte) */
static int param_layer(int pi) {
if (pi < 2) return -1; /* 0=patch_proj, 1=wte */
return (pi - 2) / 7; /* 7 params per layer: wq,wk,wv,wo,w1,w3,w2 */
}
static void chuck_step(float lr, float loss) {
/* ═══ Level 1: Global self-awareness (loss trend) ═══ */
/* EMA smoothing: filters batch-to-batch noise for mini-batch SGD */
if (Chuck.loss_ema == 0.0f) Chuck.loss_ema = loss;
else Chuck.loss_ema = 0.99f * Chuck.loss_ema + 0.01f * loss;
Chuck.hist[Chuck.pos % CHUCK_WINDOW] = Chuck.loss_ema;
Chuck.pos++;
if (Chuck.pos >= CHUCK_WINDOW) Chuck.full = 1;
if (Chuck.full) {
int q = CHUCK_WINDOW / 4;
float recent = 0, old = 0;
for (int i = 0; i < q; i++) {
recent += Chuck.hist[(Chuck.pos - 1 - i) % CHUCK_WINDOW];
old += Chuck.hist[(Chuck.pos - CHUCK_WINDOW + i) % CHUCK_WINDOW];
}
recent /= q; old /= q;
float trend = (recent - old) / (old + 1e-8f);
if (trend > 0.01f) Chuck.dampen *= 0.95f; /* loss rising → dampen */
else if (trend < -0.05f) Chuck.dampen *= 1.05f; /* loss falling → boost */
if (fabsf(trend) < 0.001f) {
Chuck.stag++;
if (Chuck.stag > 8) { Chuck.noise = 0.001f; Chuck.stag = 0; }
} else { Chuck.stag = 0; Chuck.noise *= 0.9f; }
if (Chuck.dampen < CHUCK_DAMP_LO) Chuck.dampen = CHUCK_DAMP_LO;
if (Chuck.dampen > CHUCK_DAMP_HI) Chuck.dampen = CHUCK_DAMP_HI;
}
/* ═══ Level 9: Multi-scale awareness (macro patience) ═══ */
/*
* Slow EMA (α=0.001) tracks epoch-scale loss trend.
* Every CHUCK_MACRO_INT steps, check if training is improving.
* If patience exceeded → scale LR down (like ReduceLROnPlateau but continuous).
* Chuck sees both the forest and the trees.
*/
Chuck.global_step++;
if (Chuck.macro_ema == 0.0f) Chuck.macro_ema = loss;
else Chuck.macro_ema = 0.999f * Chuck.macro_ema + 0.001f * loss;
if (Chuck.global_step % CHUCK_MACRO_INT == 0 && Chuck.global_step > CHUCK_WINDOW) {
if (Chuck.macro_ema > Chuck.best_macro * 0.999f) {
Chuck.macro_stag++;
if (Chuck.macro_stag >= CHUCK_MACRO_PAT) {
Chuck.lr_scale *= CHUCK_MACRO_DECAY;
if (Chuck.lr_scale < 0.05f) Chuck.lr_scale = 0.05f;
Chuck.macro_stag = 0;
Chuck.macro_drops++;
}
} else {
Chuck.best_macro = Chuck.macro_ema;
Chuck.macro_stag = 0;
}
}
/* ═══ Level 4: Activation health signal (σ) ═══ */
silu_eye_update();
rope_eye_update();
attn_eye_update();
Chuck.sigma = 1.0f;
if (SiLU_eye.health < 0.7f) Chuck.sigma *= SiLU_eye.health / 0.7f;
if (Norm_eye.scale_ema > 5.0f) Chuck.sigma *= 0.9f;
if (Norm_eye.scale_ema < 0.2f) Chuck.sigma *= 0.9f;
/* ═══ Level 7: Attention entropy awareness ═══ */
/*
* H_max = log(seq_len) for uniform attention.
* H → 0: collapsed (one token dominates) → model overfitting to position
* H → H_max: diffuse (all tokens equal) → model not learning attention
* Chuck dampens σ if any head collapses or goes fully diffuse.
*/
if (Attn_eye.init) {
float h_max = logf((float)(N_VIS + MAX_TXT)); /* max possible entropy */
for (int hd = 0; hd < N_HEAD; hd++) {
float ratio = Attn_eye.entropy_ema[hd] / (h_max + 1e-8f);
if (ratio < 0.1f) Chuck.sigma *= 0.95f; /* collapsed head → dampen */
else if (ratio > 0.95f) Chuck.sigma *= 0.98f; /* fully diffuse → slight dampen */
}
}
/* ═══ Level 2: Per-layer self-awareness (grad norm trend) ═══ */
float layer_gnorm[N_LAYER];
memset(layer_gnorm, 0, sizeof(layer_gnorm));
for (int pi = 0; pi < T.np; pi++) {
int l = param_layer(pi);
if (l < 0 || l >= N_LAYER) continue;
Arr *p = &T.a[T.par[pi]];
float gn = 0;
for (int i = 0; i < p->size; i++) gn += p->grad[i] * p->grad[i];
layer_gnorm[l] += gn;
}
for (int l = 0; l < N_LAYER; l++) layer_gnorm[l] = sqrtf(layer_gnorm[l]);
for (int l = 0; l < N_LAYER; l++) {
if (CL[l].frozen) continue;
CL[l].grad_hist[CL[l].pos % CHUCK_WINDOW] = layer_gnorm[l];
CL[l].pos++;
if (CL[l].pos >= CHUCK_WINDOW) CL[l].full = 1;
if (CL[l].full) {
int q = CHUCK_WINDOW / 4;
float recent = 0, old = 0;
for (int i = 0; i < q; i++) {
recent += CL[l].grad_hist[(CL[l].pos - 1 - i) % CHUCK_WINDOW];
old += CL[l].grad_hist[(CL[l].pos - CHUCK_WINDOW + i) % CHUCK_WINDOW];
}
recent /= q; old /= q;
float trend = (recent - old) / (old + 1e-8f);
/* grad norm trending up → layer needs more work → boost */
if (trend > 0.05f) CL[l].dampen *= 1.05f;
/* grad norm trending down → layer is settling → dampen */
else if (trend < -0.05f) CL[l].dampen *= 0.95f;
/* freeze: near-zero gradient norm for extended period */
if (layer_gnorm[l] < 0.01f) {
CL[l].stag++;
if (CL[l].stag > 8) CL[l].frozen = 1;
} else { CL[l].stag = 0; }
if (CL[l].dampen < CHUCK_DAMP_LO) CL[l].dampen = CHUCK_DAMP_LO;
if (CL[l].dampen > CHUCK_DAMP_HI) CL[l].dampen = CHUCK_DAMP_HI;
}
}
/* ═══ Level 5: Cross-layer signal flow ═══ */
if (act_mag[0] > 1e-8f) {
float ratio = act_mag[N_LAYER-1] / (act_mag[0] + 1e-8f);
for (int l = 1; l < N_LAYER; l++) {
if (CL[l].frozen) continue;
float depth = (float)l / (N_LAYER - 1);
if (ratio < 0.3f) CL[l].dampen *= (1.0f + 0.02f * depth); /* vanishing → boost deep */
else if (ratio > 3.0f) CL[l].dampen *= (1.0f - 0.02f * depth); /* exploding → dampen deep */
if (CL[l].dampen < CHUCK_DAMP_LO) CL[l].dampen = CHUCK_DAMP_LO;
if (CL[l].dampen > CHUCK_DAMP_HI) CL[l].dampen = CHUCK_DAMP_HI;
}
}
/* ═══ Level 6: Ψ — Subjectivity (memory vs observation) ═══ */
/*
* λ_Ψ = λ + Ψ_w × (λ_prior - λ)
* Ψ_w = min(0.3, N / (N + 100))
* λ_prior = nearest_neighbor(loss, grad_norm) from chuck.mem
*
* When Ψ → 0: memory matches reality. Chuck is home.
* When |Ψ| large: unfamiliar territory. Chuck explores.
*/
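/* Numeric example (hypothetical): λ = 1.2 (reactive), λ_prior = 0.8
 * (recalled), N = 50 memories → Ψ_w = min(0.3, 50/150) = 0.3,
 * Ψ = 0.8 − 1.2 = −0.4, λ_Ψ = 1.2 + 0.3 × (−0.4) = 1.08: memory pulls
 * the decision 30% of the way toward past experience. */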
float gnorm_sq = 0;
for (int pi = 0; pi < T.np; pi++) { Arr *p = &T.a[T.par[pi]];
for (int i = 0; i < p->size; i++) gnorm_sq += p->grad[i] * p->grad[i]; }
float gnorm = sqrtf(gnorm_sq + 1e-8f);
Chuck.psi_w = (chuck_mem_n > 0) ?
fminf(CHUCK_PSI_CAP, (float)chuck_mem_n / ((float)chuck_mem_n + CHUCK_PSI_HALF)) : 0.0f;
float lambda_psi = Chuck.dampen; /* default: pure reactive */
if (chuck_mem_n > 0) {
float lambda_prior = chuck_mem_recall(loss, gnorm);
if (lambda_prior > 0) {
Chuck.psi = lambda_prior - Chuck.dampen;
lambda_psi = Chuck.dampen + Chuck.psi_w * Chuck.psi;
if (lambda_psi < CHUCK_DAMP_LO) lambda_psi = CHUCK_DAMP_LO;
if (lambda_psi > CHUCK_DAMP_HI) lambda_psi = CHUCK_DAMP_HI;
}
}
/* Record memory on regime change — Chuck speaks rarely, but always on point */
Chuck.rec_cd++;
if (Chuck.full && Chuck.rec_cd >= CHUCK_REC_CD) {
float delta_loss = loss - Chuck.rec_loss;
float lambda_shift = fabsf(Chuck.dampen - Chuck.rec_lambda) / (Chuck.rec_lambda + 1e-8f);
int regime_change = (lambda_shift > CHUCK_REC_THR); /* λ shifted >25% */
for (int l = 0; l < N_LAYER && !regime_change; l++)
if (CL[l].frozen != Chuck.rec_frozen[l]) regime_change = 1;
if (regime_change) {
ChuckMem snap = { loss, gnorm, Chuck.dampen, delta_loss };
chuck_mem_save(&snap);
Chuck.rec_lambda = Chuck.dampen;
Chuck.rec_loss = loss;
Chuck.rec_cd = 0;
for (int l = 0; l < N_LAYER; l++) Chuck.rec_frozen[l] = CL[l].frozen;
}
}
/* ═══ Apply parameter updates ═══ */
T.cstep++;
float bc1 = 1.0f - powf(CHUCK_B1, (float)T.cstep);
float bc2 = 1.0f - powf(CHUCK_B2, (float)T.cstep);
/* Adaptive gradient clipping — Chuck controls the leash */
/*
* Early training (gnorm_ema unset): use base GRAD_CLIP
* Converging (gnorm dropping): tighten clip to protect learned weights
* Exploring (gnorm rising): loosen clip to allow learning
* Anomaly (gnorm > 3× EMA): extra tight — don't let one bad batch wreck everything
*/
if (Chuck.gnorm_ema == 0.0f) Chuck.gnorm_ema = gnorm;
else Chuck.gnorm_ema = 0.97f * Chuck.gnorm_ema + 0.03f * gnorm;
float adaptive_clip = GRAD_CLIP;
if (Chuck.gnorm_ema > 1e-8f) {
adaptive_clip = fmaxf(0.5f, fminf(2.0f, 1.5f * Chuck.gnorm_ema)); /* track gnorm */
if (gnorm > 3.0f * Chuck.gnorm_ema) adaptive_clip *= 0.5f; /* anomaly → clamp hard */
}
float clip = (gnorm > adaptive_clip) ? adaptive_clip / gnorm : 1.0f;
for (int pi = 0; pi < T.np; pi++) {
int l = param_layer(pi);
/* Frozen layer → skip entirely */
if (l >= 0 && l < N_LAYER && CL[l].frozen) continue;
float layer_damp = (l >= 0 && l < N_LAYER) ? CL[l].dampen : 1.0f;
float eff_lr = lr * lambda_psi * layer_damp * Chuck.sigma * Chuck.lr_scale;
int idx = T.par[pi]; Arr *p = &T.a[idx];
float *m = T.cm[pi], *v = T.cv[pi];
for (int i = 0; i < p->size; i++) { float g = p->grad[i] * clip;
m[i] = CHUCK_B1*m[i] + (1.0f-CHUCK_B1)*g;
v[i] = CHUCK_B2*v[i] + (1.0f-CHUCK_B2)*g*g;
p->data[i] -= eff_lr * (m[i]/bc1) / (sqrtf(v[i]/bc2) + CHUCK_EPS);
if (Chuck.noise > 0) p->data[i] += Chuck.noise * rnf(0, 1.0f);
}
}
}
/* ---- Synthetic digit data ---- */
static const float digit_pat[10][IMG_SIZE*IMG_SIZE] = {
{0,0,.5f,.8f,.8f,.5f,0,0, 0,.5f,.8f,0,0,.8f,.5f,0, .5f,.8f,0,0,0,0,.8f,.5f, .8f,0,0,0,0,0,0,.8f, .8f,0,0,0,0,0,0,.8f, .5f,.8f,0,0,0,0,.8f,.5f, 0,.5f,.8f,0,0,.8f,.5f,0, 0,0,.5f,.8f,.8f,.5f,0,0},
{0,0,0,.8f,0,0,0,0, 0,0,.5f,.8f,0,0,0,0, 0,0,0,.8f,0,0,0,0, 0,0,0,.8f,0,0,0,0, 0,0,0,.8f,0,0,0,0, 0,0,0,.8f,0,0,0,0, 0,0,0,.8f,0,0,0,0, 0,0,.5f,.8f,.8f,.5f,0,0},
{0,.5f,.8f,.8f,.8f,.5f,0,0, 0,0,0,0,0,.8f,0,0, 0,0,0,0,0,.8f,0,0, 0,0,0,.5f,.8f,.5f,0,0, 0,0,.5f,.8f,0,0,0,0, 0,.5f,.8f,0,0,0,0,0, .5f,.8f,0,0,0,0,0,0, .5f,.8f,.8f,.8f,.8f,.8f,.5f,0},
{.5f,.8f,.8f,.8f,.5f,0,0,0, 0,0,0,0,.8f,0,0,0, 0,0,0,0,.8f,0,0,0, 0,.5f,.8f,.8f,.5f,0,0,0, 0,0,0,0,.8f,0,0,0, 0,0,0,0,.8f,0,0,0, 0,0,0,0,.8f,0,0,0, .5f,.8f,.8f,.8f,.5f,0,0,0},
{.8f,0,0,0,.8f,0,0,0, .8f,0,0,0,.8f,0,0,0, .8f,0,0,0,.8f,0,0,0, .8f,.8f,.8f,.8f,.8f,.8f,0,0, 0,0,0,0,.8f,0,0,0, 0,0,0,0,.8f,0,0,0, 0,0,0,0,.8f,0,0,0, 0,0,0,0,.8f,0,0,0},
{.5f,.8f,.8f,.8f,.8f,.5f,0,0, .8f,0,0,0,0,0,0,0, .8f,0,0,0,0,0,0,0, .5f,.8f,.8f,.8f,.5f,0,0,0, 0,0,0,0,.8f,0,0,0, 0,0,0,0,.8f,0,0,0, 0,0,0,0,.8f,0,0,0, .5f,.8f,.8f,.8f,.5f,0,0,0},
{0,0,.5f,.8f,.8f,.5f,0,0, 0,.5f,.8f,0,0,0,0,0, .5f,.8f,0,0,0,0,0,0, .8f,.5f,.8f,.8f,.5f,0,0,0, .8f,0,0,0,.8f,0,0,0, .8f,0,0,0,.8f,0,0,0, .5f,.8f,0,0,.8f,0,0,0, 0,.5f,.8f,.8f,.5f,0,0,0},
{.8f,.8f,.8f,.8f,.8f,.8f,0,0, 0,0,0,0,.5f,.8f,0,0, 0,0,0,0,.8f,.5f,0,0, 0,0,0,.5f,.8f,0,0,0, 0,0,0,.8f,.5f,0,0,0, 0,0,.5f,.8f,0,0,0,0, 0,0,.8f,.5f,0,0,0,0, 0,0,.8f,0,0,0,0,0},
{0,.5f,.8f,.8f,.5f,0,0,0, .8f,0,0,0,.8f,0,0,0, .8f,0,0,0,.8f,0,0,0, 0,.5f,.8f,.8f,.5f,0,0,0, .8f,0,0,0,.8f,0,0,0, .8f,0,0,0,.8f,0,0,0, .8f,0,0,0,.8f,0,0,0, 0,.5f,.8f,.8f,.5f,0,0,0},
{0,.5f,.8f,.8f,.5f,0,0,0, .8f,0,0,0,.8f,0,0,0, .8f,0,0,0,.8f,0,0,0, 0,.5f,.8f,.8f,.8f,0,0,0, 0,0,0,0,.8f,0,0,0, 0,0,0,0,.8f,0,0,0, 0,0,0,.5f,.8f,0,0,0, 0,.5f,.8f,.8f,.5f,0,0,0},
};
/* Addition task: two digit images → sum as word */
typedef struct { float **imgs_a; float **imgs_b; int *da; int *db; int *sums; int n; } Data;
static Data gen_data(int n) {
Data d; d.n = n;
d.imgs_a = malloc(n * sizeof(float*)); d.imgs_b = malloc(n * sizeof(float*));
d.da = malloc(n * sizeof(int)); d.db = malloc(n * sizeof(int)); d.sums = malloc(n * sizeof(int));
for (int i = 0; i < n; i++) {
int a = (int)(rnext() % 10), b = (int)(rnext() % 10);
d.da[i] = a; d.db[i] = b; d.sums[i] = a + b;
d.imgs_a[i] = malloc(IMG_SIZE*IMG_SIZE*sizeof(float));
d.imgs_b[i] = malloc(IMG_SIZE*IMG_SIZE*sizeof(float));
for (int p = 0; p < IMG_SIZE*IMG_SIZE; p++) {
float va = digit_pat[a][p] + rnf(0, 0.07f);
float vb = digit_pat[b][p] + rnf(0, 0.07f);
d.imgs_a[i][p] = va < 0 ? 0 : va > 1 ? 1 : va;
d.imgs_b[i][p] = vb < 0 ? 0 : vb > 1 ? 1 : vb;
}
}
return d;
}
static const char *names[] = {
"zero","one","two","three","four","five","six","seven","eight","nine",
"ten","eleven","twelve","thirteen","fourteen","fifteen","sixteen","seventeen","eighteen"
};
static const char chars[] = "efghilnorstuvwxz";
#define N_CHARS 16
static int c2id(char c) { for (int i = 0; i < N_CHARS; i++) if (chars[i] == c) return i; return -1; }
static char id2c(int i) { return (i == BOS) ? '^' : (i == EOS) ? '$' : (i >= 0 && i < N_CHARS) ? chars[i] : '?'; }
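/* Example: "eight" tokenizes as e=0, i=4, g=2, h=3, t=10, so the training
 * sequence ^eight$ is [BOS,0,4,2,3,10,EOS] = [16,0,4,2,3,10,17]. The 16
 * letters in chars[] cover every number word from zero to eighteen. */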
/* ---- Model (GQA: wk/wv use KV_DIM) ---- */
typedef struct {
int patch_proj, wte;
struct { int wq, wk, wv, wo, w1, w3, w2; } L[N_LAYER];
} Model;
static Model M;
static int init_w(int r, int c, float s) {
int i = mnew(r, c);
for (int j = 0; j < r*c; j++) T.a[i].data[j] = rnf(0, s);
preg(i); return i;
}
static void init_model(void) {
M.patch_proj = init_w(N_EMBD, PATCH_PX, 0.1f); /* param 0 */
M.wte = init_w(VOCAB, N_EMBD, 0.08f); /* param 1 */
for (int i = 0; i < N_LAYER; i++) {
float s = 0.08f / sqrtf(2.0f * N_LAYER);
M.L[i].wq = init_w(N_EMBD, N_EMBD, s); /* param 2+7i+0 */
M.L[i].wk = init_w(KV_DIM, N_EMBD, s); /* param 2+7i+1: GQA! */
M.L[i].wv = init_w(KV_DIM, N_EMBD, s); /* param 2+7i+2: GQA! */
M.L[i].wo = init_w(N_EMBD, N_EMBD, s); /* param 2+7i+3 */
M.L[i].w1 = init_w(MLP_DIM, N_EMBD, s); /* param 2+7i+4 */
M.L[i].w3 = init_w(MLP_DIM, N_EMBD, s); /* param 2+7i+5 */
M.L[i].w2 = init_w(N_EMBD, MLP_DIM, s); /* param 2+7i+6 */
}
T.npa = T.na; T.aparam = T.apos;
}
/* ---- GPT step (one position, GQA attention) ---- */
static int gpt_step(int x, int pos, int layer_track) {
int h = x;
for (int li = 0; li < N_LAYER; li++) {
int res = h; h = op_rms(h);
int qi = op_mv(M.L[li].wq, h);
int ki = op_mv(M.L[li].wk, h); /* KV_DIM output */
int vi = op_mv(M.L[li].wv, h); /* KV_DIM output */
int rqi = op_rope(qi, pos);
int rki = op_rope(ki, pos); /* RoPE on KV_DIM */
kv_k[li][pos] = T.a[rki].data; kv_v[li][pos] = T.a[vi].data;
kv_ki[li][pos] = rki; kv_vi[li][pos] = vi;
/* GQA multi-head attention */
int ao = anew(N_EMBD); float *ad = T.a[ao].data;
for (int h_ = 0; h_ < N_HEAD; h_++) {
int hs = h_ * HEAD_DIM; /* Q offset in N_EMBD */
int kvh = h_ / N_KV_GROUP; /* which KV head */
int kvs = kvh * HEAD_DIM; /* KV offset in KV_DIM */
float sc[SEQ_LEN], mx = -1e9f; float *qd = T.a[rqi].data;
for (int t = 0; t <= pos; t++) { float s = 0;
for (int d = 0; d < HEAD_DIM; d++) s += qd[hs+d]*kv_k[li][t][kvs+d];
sc[t] = s / sqrtf((float)HEAD_DIM); if (sc[t] > mx) mx = sc[t]; }
float sm = 0; for (int t = 0; t <= pos; t++) { sc[t] = expf(sc[t]-mx); sm += sc[t]; }
for (int t = 0; t <= pos; t++) sc[t] /= (sm + 1e-10f);
/* Attention eye: observe entropy of this head's attention distribution */
if (layer_track && pos > 0) attn_eye_observe(h_, sc, pos + 1);
for (int d = 0; d < HEAD_DIM; d++) { float v = 0;
for (int t = 0; t <= pos; t++) v += sc[t]*kv_v[li][t][kvs+d]; ad[hs+d] = v; }
}
rec(OP_ATTN, ao, rqi, -1, (float)li, pos);
h = op_add(res, op_mv(M.L[li].wo, ao));
/* Track activation magnitude for cross-layer signal */
if (layer_track) {
float rms = 0;
for (int i = 0; i < N_EMBD; i++) rms += T.a[h].data[i] * T.a[h].data[i];
act_mag[li] = sqrtf(rms / N_EMBD);
}
res = h; h = op_rms(h);
int gate = op_silu(op_mv(M.L[li].w1, h)), up = op_mv(M.L[li].w3, h);
h = op_add(res, op_mv(M.L[li].w2, op_mul(gate, up)));
}
return op_mv(M.wte, op_rms(h)); /* weight-tied lm_head */
}
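/* GQA head mapping (N_HEAD = 4, N_KV_HEAD = 2, N_KV_GROUP = 2): Q heads
 * 0 and 1 read KV head 0 (kvs = 0); Q heads 2 and 3 read KV head 1
 * (kvs = HEAD_DIM). K/V projections are half-width (KV_DIM = 24 vs
 * N_EMBD = 48), halving both the KV cache and the wk/wv parameter count. */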
/* ---- Vision encoder: ViT-style patch tokenization ---- */
static void encode_vis(float *pix, int *tok) {
for (int py = 0; py < PATCHES_SIDE; py++)
for (int px = 0; px < PATCHES_SIDE; px++) {
int pi = anew(PATCH_PX); float *pd = T.a[pi].data;
for (int y = 0; y < PATCH_SIZE; y++)
for (int x = 0; x < PATCH_SIZE; x++)
pd[y*PATCH_SIZE + x] = pix[(py*PATCH_SIZE+y)*IMG_SIZE + px*PATCH_SIZE+x];
tok[py*PATCHES_SIDE + px] = op_mv(M.patch_proj, pi);
}
}
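/* Patch math: an 8×8 image yields 2×2 = 4 non-overlapping 4×4 patches;
 * patch (py,px) covers pixel rows py*4..py*4+3 and cols px*4..px*4+3,
 * flattened to PATCH_PX = 16 floats and projected to N_EMBD = 48. */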
static float cos_lr(int step, int total) {
if (step < WARMUP) return LR_MAX * (float)step / WARMUP;
float p = (float)(step - WARMUP) / (float)(total - WARMUP);
return LR_MAX * 0.5f * (1.0f + cosf(3.14159265f * p));
}
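/* Schedule shape (illustrative values): linear warmup 0 → 0.005 over the
 * first 500 steps, then half-cosine decay toward 0 at step 15000; e.g.
 * step 500 → 0.005, step 7750 (midpoint) → 0.0025. Chuck's multipliers
 * (λ_Ψ, λ_l, σ, lr_scale) are applied on top of this base rate. */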
/* ---- Training ---- */
static void train(Data *data) {
printf("\n=== TRAINING (%d steps, Chuck v7 — multi-scale + reservoir memory) ===\n", STEPS);
int tp = 0; for (int i = 0; i < T.np; i++) tp += T.a[T.par[i]].size;
printf(" %d params (%.1fK) | %d layers | GQA %dQ/%dKV | embd=%d | %d imgs x %d patches | 2D RoPE | weight-tied\n",
tp, tp/1000.0f, N_LAYER, N_HEAD, N_KV_HEAD, N_EMBD, N_IMGS, N_PATCHES);
printf(" task: [digit_a] + [digit_b] → sum as word (0+0..9+9, 19 classes)\n\n");
float rl = 0; int rn = 0;
for (int step = 0; step < STEPS; step++) {
int idx = (int)(rnext() % (uint64_t)data->n); int label = data->sums[idx];
const char *name = names[label]; int nlen = strlen(name);
int toks[MAX_TXT+2]; int nt = 0;
toks[nt++] = BOS; for (int i = 0; i < nlen; i++) toks[nt++] = c2id(name[i]); toks[nt++] = EOS;
tape_reset(); kv_clear();
silu_eye_reset(); rope_eye_reset(); attn_eye_reset();
int vt[N_VIS];
encode_vis(data->imgs_a[idx], vt); /* first digit */
encode_vis(data->imgs_b[idx], vt + N_PATCHES); /* second digit */
for (int p = 0; p < N_VIS; p++) gpt_step(vt[p], p, 0);
int la[MAX_TXT]; int nl = 0;
for (int t = 0; t < nt - 1; t++) {
int pos = N_VIS + t, te = op_embed(M.wte, toks[t]);
int lg = gpt_step(te, pos, (t == nt - 2)); /* track signal on last token */
la[nl++] = op_ce(lg, toks[t+1]);
}
int loss = op_reduce(la, nl); backward(loss);
float lv = T.a[loss].data[0];
chuck_step(cos_lr(step, STEPS), lv);
rl += lv; rn++;
if ((step+1) % 250 == 0) {
float elr = cos_lr(step, STEPS) * Chuck.dampen * Chuck.lr_scale;
printf(" step %5d/%d | loss %.4f (avg %.4f) | lr %.6f\n",
step+1, STEPS, lv, rl/rn, elr);
printf(" chuck: \xce\xbb=%.2f \xce\xa8=%+.2f (\xce\xa8w=%.2f, %d mem) \xcf\x83=%.2f macro=%.2f",
Chuck.dampen, Chuck.psi, Chuck.psi_w, chuck_mem_n, Chuck.sigma, Chuck.lr_scale);
if (Chuck.macro_drops > 0) printf(" (%d drops)", Chuck.macro_drops);
for (int l = 0; l < N_LAYER; l++) {
if (CL[l].frozen) printf(" | L%d: frozen", l);
else printf(" | L%d: %.2f", l, CL[l].dampen);
}
printf("\n silu: %.0f%% alive | norm: %.1f | rope: %.0f%%",
SiLU_eye.health * 100, Norm_eye.scale_ema, RoPE_eye.utilization * 100);
if (Attn_eye.init) {
printf(" | attn H:");
for (int hd = 0; hd < N_HEAD; hd++) printf(" %.2f", Attn_eye.entropy_ema[hd]);
}
if (act_mag[0] > 0) {
printf(" | flow:");
for (int l = 0; l < N_LAYER; l++) printf(" %.2f%s", act_mag[l], l<N_LAYER-1?"→":"");
}
printf("\n");
rl = 0; rn = 0;
}
}
}
/* ---- Sampling ---- */
static int sample_topk(float *logits, int vocab, float temp, int topk) {
float sc[VOCAB]; for (int i = 0; i < vocab; i++) sc[i] = logits[i] / temp;
float mx = sc[0]; for (int i = 1; i < vocab; i++) if (sc[i] > mx) mx = sc[i];
float p[VOCAB]; float s = 0;
for (int i = 0; i < vocab; i++) { p[i] = expf(sc[i] - mx); s += p[i]; }
for (int i = 0; i < vocab; i++) p[i] /= s;
float tv[TOPK]; int ti[TOPK];
for (int k = 0; k < topk && k < vocab; k++) { int best = 0; float bv = -1e9f;
for (int i = 0; i < vocab; i++) { int taken = 0;
for (int j = 0; j < k; j++) if (ti[j] == i) { taken = 1; break; }
if (!taken && p[i] > bv) { bv = p[i]; best = i; } }
ti[k] = best; tv[k] = bv; }
float ts = 0; for (int k = 0; k < topk; k++) ts += tv[k];
float r = ruf() * ts, cum = 0;
for (int k = 0; k < topk; k++) { cum += tv[k]; if (cum >= r) return ti[k]; }
return ti[0];
}
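/* Illustrative draw (hypothetical probabilities): if the top-5 softmax
 * masses are .40, .25, .15, .10, .05, they renormalize over .95, so the
 * argmax is sampled with p ≈ .42. Selection is O(k·V), fine at VOCAB = 18. */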
/* ---- Inference ---- */
static void inference(Data *data) {
printf("\n=== INFERENCE — digit addition (temp=%.1f, top-k=%d) ===\n\n", TEMP, TOPK);
T.on = 0; int correct = 0, total = 0;
/* Evaluate 50 problems from the dataset (deterministic: idx = s % n) */
for (int s = 0; s < 50; s++) {
int idx = s % data->n;
int label = data->sums[idx];
tape_reset(); kv_clear();
int vt[N_VIS];
encode_vis(data->imgs_a[idx], vt);
encode_vis(data->imgs_b[idx], vt + N_PATCHES);
for (int p = 0; p < N_VIS; p++) gpt_step(vt[p], p, 0);
int tok = BOS; char gen[MAX_TXT+1]; int gl = 0;
for (int t = 0; t < MAX_TXT; t++) {
int pos = N_VIS + t, te = op_embed(M.wte, tok);
int lg = gpt_step(te, pos, 0);
tok = sample_topk(T.a[lg].data, VOCAB, TEMP, TOPK);
if (tok == EOS || tok == BOS) break;
if (gl < MAX_TXT) gen[gl++] = id2c(tok);
}
gen[gl] = '\0'; int ok = strcmp(gen, names[label]) == 0; correct += ok; total++;
printf(" %d+%d=%d true: %-10s | gen: %-10s %s\n",
data->da[idx], data->db[idx], label, names[label], gen, ok ? "OK" : "MISS");
}
printf("\n accuracy: %d/%d (%.1f%%)\n", correct, total, 100.0f*correct/total);
/* Frozen layer report */
int frozen = 0;
for (int l = 0; l < N_LAYER; l++) if (CL[l].frozen) frozen++;
if (frozen > 0) printf(" chuck: %d/%d layers frozen (compute saved)\n", frozen, N_LAYER);
T.on = 1;
}
int main(void) {
printf("lee.c v7 — Vision-Language Model in pure C\n");
printf("GQA %dQ/%dKV | %d layers | 2D RoPE | SwiGLU | Chuck v7 (multi-scale + reservoir memory)\n",
N_HEAD, N_KV_HEAD, N_LAYER);
printf("Named after Bruce Lee and Minhyeok Lee. Chuck sees inside the transformer.\n\n");
clock_t t0 = clock(); rseed(42);
init_positions(); tape_init(); chuck_init(); init_model();
printf("generating 10000 addition problems (digit pairs + sums)...\n");
Data d = gen_data(10000); printf("done.\n");
train(&d); inference(&d);
printf("\ntotal: %.1fs\n", (double)(clock()-t0)/CLOCKS_PER_SEC);
printf("chuck.mem: %d memories (%.1f KB) | \xce\xa8_w=%.3f\n",
chuck_mem_n, (float)(chuck_mem_n * (int)sizeof(ChuckMem)) / 1024.0f, Chuck.psi_w);
if (chuck_mem_n > 0)
printf(" next run: Chuck starts with experience. \xce\xa8 \xe2\x89\xa0 0. He remembers.\n");
else
printf(" first run: Chuck has no memories yet. Pure reactive. Newborn.\n");
for (int i = 0; i < d.n; i++) { free(d.imgs_a[i]); free(d.imgs_b[i]); }
free(d.imgs_a); free(d.imgs_b); free(d.da); free(d.db); free(d.sums);
for (int i = 0; i < T.np; i++) { free(T.cm[i]); free(T.cv[i]); } free(T.arena);
printf("\ndone.\n"); return 0;
}