lee.c — Vision-Language Model in pure C. Patch tokens + RoPE + SwiGLU + Chuck optimizer. Zero dependencies. Inspired by sailfish009/purevlm.
/*
* lee.c v7 — Vision-Language Model in pure C
*
* Named after Bruce Lee (the only man who beat Chuck Norris)
* and Minhyeok Lee (whose self-identity framework gives Chuck his soul).
*
* Sees images. Speaks words. Adds numbers. Zero dependencies.
* Tape-based autograd with arena bump allocator.
*
* Architecture:
* ViT-style patch tokenization → 2D RoPE → GQA multi-head causal attention →
* SwiGLU MLP → RMSNorm → weight-tied lm_head → text
*
* v7: Chuck sees the forest AND the trees.
* - Multi-scale awareness: macro EMA + patience-based LR decay (Level 9)
* - Memory cap: reservoir sampling, bounded O(1) lookup
*
* v6 (preserved):
* - Attention entropy monitoring per head (Level 7 self-awareness)
* - Adaptive gradient clipping (Chuck controls clip, not a constant)
* - Digit addition task: [img_3] + [img_5] → "eight"
* - 2D RoPE for spatial awareness on image patches
*
* v5 (preserved):
* - Persistent memory (chuck.mem), Ψ subjectivity, Lee's Continuum C
* - λ_Ψ = λ + Ψ_w × (λ_prior - λ), Ψ_w = min(0.3, N/(N+100))
*
* v4 (preserved):
* - GQA (4Q/2KV), 3 layers, 105K params, per-layer λ_l, layer freezing
* - Self-aware SiLU, RMSNorm, RoPE, cross-layer signal flow
*
* Build: cc -std=c11 -O2 -march=native -o lee lee.c -lm
* Run: ./lee
*/
#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <stdint.h>
#include <time.h>
/* ---- BLAS acceleration (optional) ----
* Mac: cc -DUSE_BLAS -DACCELERATE ... -framework Accelerate
* Linux: cc -DUSE_BLAS ... -lopenblas
* Off: cc ... -lm (zero deps, scalar fallback)
*/
#ifdef USE_BLAS
#ifdef ACCELERATE
#define ACCELERATE_NEW_LAPACK
#include <Accelerate/Accelerate.h>
#else
#include <cblas.h>
#endif
#endif
/* ---- Config ---- */
#define IMG_SIZE 8
#define PATCH_SIZE 4
#define PATCHES_SIDE (IMG_SIZE / PATCH_SIZE)
#define N_PATCHES (PATCHES_SIDE * PATCHES_SIDE)
#define PATCH_PX (PATCH_SIZE * PATCH_SIZE)
#define N_IMGS 2 /* two digit images → addition */
#define N_VIS (N_IMGS * N_PATCHES) /* 8 visual tokens */
#define MAX_TXT 12 /* "seventeen" + BOS + EOS */
#define SEQ_LEN (N_VIS + MAX_TXT)
#define N_EMBD 48
#define N_HEAD 4
#define N_KV_HEAD 2
#define N_KV_GROUP (N_HEAD / N_KV_HEAD)
#define HEAD_DIM (N_EMBD / N_HEAD)
#define KV_DIM (N_KV_HEAD * HEAD_DIM)
#define N_LAYER 3
#define MLP_DIM (4 * N_EMBD)
#define VOCAB 18
#define BOS 16
#define EOS 17
#define STEPS 15000
#define LR_MAX 0.005f
#define WARMUP 500
#define CHUCK_B1 0.9f
#define CHUCK_B2 0.999f
#define CHUCK_EPS 1e-8f
#define GRAD_CLIP 1.0f
#define ROPE_BASE 10000.0f
#define TEMP 0.7f
#define TOPK 5
#define CHUCK_WINDOW 16
#define CHUCK_DAMP_LO 0.3f
#define CHUCK_DAMP_HI 2.0f
#define CHUCK_PSI_CAP 0.3f
#define CHUCK_PSI_HALF 100.0f
#define CHUCK_MEM_CAP 200 /* bounded memory (reservoir sampling) */
#define CHUCK_MEM_MAX CHUCK_MEM_CAP
#define CHUCK_MEM_FILE "chuck.mem"
#define CHUCK_REC_THR 0.25f
#define CHUCK_REC_CD 50
#define CHUCK_MACRO_INT 500 /* macro patience check interval (steps) */
#define CHUCK_MACRO_PAT 3 /* patience: N checks without improvement → LR drop */
#define CHUCK_MACRO_DECAY 0.5f /* LR scale factor on macro plateau */
#define ARENA_SZ (128 * 1024 * 1024)
#define MAX_ARR 32768
#define MAX_ENT 65536
#define MAX_PAR 128
/* ---- Tape engine ---- */
typedef struct { float *data, *grad; int size, rows, cols; } Arr;
typedef struct { int op, out, in1, in2; float aux; int ai; } Ent;
enum { OP_ADD=1, OP_MUL, OP_SCALE, OP_MATVEC, OP_RMSNORM, OP_SILU,
OP_CE, OP_EMBED, OP_REDUCE, OP_ATTN, OP_ROPE };
static struct {
uint8_t *arena; size_t apos, aparam;
Arr a[MAX_ARR]; int na, npa;
Ent e[MAX_ENT]; int ne;
int par[MAX_PAR]; int np;
float *cm[MAX_PAR], *cv[MAX_PAR]; int cstep;
int on;
} T;
static float *aalloc(size_t n) {
size_t b = n * sizeof(float), al = (T.apos + 15) & ~(size_t)15;
if (al + b > ARENA_SZ) { fprintf(stderr, "arena OOM\n"); exit(1); }
T.apos = al + b; float *p = (float*)(T.arena + al); memset(p, 0, b); return p;
}
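/* Illustrative bump-allocator math (not executed): with apos = 17 and a
 * request of n = 3 floats (12 bytes), al = (17 + 15) & ~(size_t)15 = 32,
 * so the block spans bytes [32, 44) and apos advances to 44. "Freeing"
 * the whole tape is a single pointer rewind (see tape_reset below). */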
static void tape_init(void) {
uint8_t *m = malloc(ARENA_SZ);
if (!m) { fprintf(stderr, "OOM\n"); exit(1); }
memset(&T, 0, sizeof(T)); T.arena = m; T.on = 1;
}
static int anew(int sz) {
int i = T.na++; T.a[i].size = sz; T.a[i].rows = T.a[i].cols = 0;
T.a[i].data = aalloc(sz); T.a[i].grad = aalloc(sz); return i;
}
static int mnew(int r, int c) { int i = anew(r*c); T.a[i].rows = r; T.a[i].cols = c; return i; }
static void preg(int i) {
int pi = T.np++; T.par[pi] = i;
T.cm[pi] = calloc(T.a[i].size, sizeof(float));
T.cv[pi] = calloc(T.a[i].size, sizeof(float));
}
static void rec(int op, int o, int i1, int i2, float aux, int ai) {
if (!T.on) return;
Ent *e = &T.e[T.ne++]; e->op=op; e->out=o; e->in1=i1; e->in2=i2; e->aux=aux; e->ai=ai;
}
static void tape_reset(void) {
T.apos = T.aparam; T.na = T.npa; T.ne = 0;
for (int i = 0; i < T.npa; i++) memset(T.a[i].grad, 0, T.a[i].size * sizeof(float));
}
/* ---- RNG (xoshiro256**) ---- */
static uint64_t rng[4];
static uint64_t rnext(void) {
uint64_t t = rng[1] << 17;
rng[2] ^= rng[0]; rng[3] ^= rng[1]; rng[1] ^= rng[2]; rng[0] ^= rng[3];
rng[2] ^= t; rng[3] = (rng[3] << 45) | (rng[3] >> 19);
uint64_t r = rng[1] * 5; return (r << 7 | r >> 57) * 9;
}
static void rseed(uint64_t s) {
rng[0]=s; rng[1]=s^0x6a09e667f3bcc908ULL; rng[2]=s^0xbb67ae8584caa73bULL; rng[3]=s^0x3c6ef372fe94f82bULL;
for (int i = 0; i < 20; i++) rnext();
}
static float ruf(void) { return (float)((rnext()>>11)+1) / (float)(1ULL<<53); } /* uniform in (0,1] */
static float rnf(float mu, float s) { /* Box-Muller transform: N(mu, s^2) */
double u1 = (double)(((rnext()>>11)+1)) / (double)(1ULL<<53);
double u2 = (double)(((rnext()>>11)+1)) / (double)(1ULL<<53);
return mu + s * (float)(sqrt(-2.0*log(u1)) * cos(6.283185307179586*u2));
}
static inline float sigf(float x) { return 1.0f / (1.0f + expf(-x)); }
/* ===========================================================================
* Chuck Memory — persistent across training runs
*
* chuck.mem: binary append-only file of training snapshots.
* Each snapshot: 16 bytes (4 floats).
* Nearest-neighbor recall gives λ_prior.
* Ψ = λ_prior - λ_current = subjectivity signal.
*
* Lee's Continuum C: chuck.mem is ℳ. NN is identity mapping I.
* Ψ_w is belief function B. Fixed point s* when Ψ → 0.
* =========================================================================== */
typedef struct {
float loss; /* where Chuck was */
float grad_norm; /* how hard the network was shaking */
float lambda; /* what Chuck decided */
float delta_loss; /* what happened (negative = improvement) */
} ChuckMem;
static ChuckMem chuck_mem[CHUCK_MEM_MAX];
static int chuck_mem_n = 0;
static int chuck_mem_total = 0; /* total memories ever recorded (for reservoir sampling) */
static void chuck_mem_load(void) {
FILE *f = fopen(CHUCK_MEM_FILE, "rb");
if (!f) return;
chuck_mem_n = (int)fread(chuck_mem, sizeof(ChuckMem), CHUCK_MEM_CAP, f);
chuck_mem_total = chuck_mem_n; /* at least this many were saved */
fclose(f);
}
static void chuck_mem_save(ChuckMem *m) {
chuck_mem_total++;
if (chuck_mem_n < CHUCK_MEM_CAP) {
/* Under cap: append */
chuck_mem[chuck_mem_n++] = *m;
FILE *f = fopen(CHUCK_MEM_FILE, "ab");
if (f) { fwrite(m, sizeof(ChuckMem), 1, f); fclose(f); }
} else {
/* At cap: reservoir sampling — replace random entry */
int slot = (int)(rnext() % (uint64_t)chuck_mem_total);
if (slot < CHUCK_MEM_CAP) {
chuck_mem[slot] = *m;
/* Rewrite entire file (200 entries × 16 bytes = 3.2 KB — trivial) */
FILE *f = fopen(CHUCK_MEM_FILE, "wb");
if (f) { fwrite(chuck_mem, sizeof(ChuckMem), chuck_mem_n, f); fclose(f); }
}
}
}
/* Nearest neighbor recall: find most similar past state, return its λ.
* Distance = normalized (loss, grad_norm) difference.
* Successful memories (negative delta_loss) get 2x preference: their distance is halved. */
static float chuck_mem_recall(float loss, float grad_norm) {
if (chuck_mem_n == 0) return -1.0f; /* no memory → no prior */
float best_dist = 1e9f, best_lambda = -1.0f;
for (int i = 0; i < chuck_mem_n; i++) {
float dl = (loss - chuck_mem[i].loss) / (fabsf(loss) + 1e-8f);
float dg = (grad_norm - chuck_mem[i].grad_norm) / (fabsf(grad_norm) + 1e-8f);
float dist = dl * dl + dg * dg;
if (chuck_mem[i].delta_loss < 0) dist *= 0.5f; /* prefer wins */
if (dist < best_dist) { best_dist = dist; best_lambda = chuck_mem[i].lambda; }
}
return best_lambda;
}
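/* Recall, worked on hypothetical numbers: memory A = {loss 2.0, gnorm 0.5,
 * λ 1.2, Δloss -0.3}, memory B = {loss 2.1, gnorm 0.6, λ 0.8, Δloss +0.1}.
 * Query (loss 2.0, gnorm 0.5): dist(A) = 0 (halved to 0 as a win);
 * dist(B) = (0.1/2.0)² + (0.1/0.5)² ≈ 0.0425. Recall returns A's λ = 1.2. */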
/* ---- Self-Awareness: Eyes ---- */
/* SiLU eye: tracks dead neuron ratio */
static struct { int dead, total; float health; } SiLU_eye;
static void silu_eye_reset(void) { SiLU_eye.dead = 0; SiLU_eye.total = 0; }
static void silu_eye_update(void) {
if (SiLU_eye.total == 0) { SiLU_eye.health = 1.0f; return; }
SiLU_eye.health = 1.0f - (float)SiLU_eye.dead / SiLU_eye.total;
SiLU_eye.dead = 0; SiLU_eye.total = 0;
}
/* RMSNorm eye: tracks normalization scale EMA */
static struct { float scale_ema; int init; } Norm_eye;
/* RoPE eye: tracks frequency band utilization */
static struct { float freq_energy[N_EMBD/2]; int calls; float utilization; } RoPE_eye;
static void rope_eye_reset(void) {
memset(RoPE_eye.freq_energy, 0, sizeof(RoPE_eye.freq_energy));
RoPE_eye.calls = 0;
}
static void rope_eye_update(void) {
if (RoPE_eye.calls == 0) return;
float max_e = 0;
for (int b = 0; b < HEAD_DIM/2; b++) {
RoPE_eye.freq_energy[b] /= RoPE_eye.calls;
if (RoPE_eye.freq_energy[b] > max_e) max_e = RoPE_eye.freq_energy[b];
}
int active = 0;
for (int b = 0; b < HEAD_DIM/2; b++)
if (RoPE_eye.freq_energy[b] > max_e * 0.01f) active++;
RoPE_eye.utilization = (HEAD_DIM/2 > 0) ? (float)active / (HEAD_DIM/2) : 1.0f;
memset(RoPE_eye.freq_energy, 0, sizeof(RoPE_eye.freq_energy));
RoPE_eye.calls = 0;
}
/* Attention eye: tracks per-head entropy (Level 7) */
static struct {
float entropy[N_HEAD]; /* per-head attention entropy */
float entropy_ema[N_HEAD]; /* EMA-smoothed entropy */
int calls;
int init;
} Attn_eye;
static void attn_eye_reset(void) { Attn_eye.calls = 0; memset(Attn_eye.entropy, 0, sizeof(Attn_eye.entropy)); }
static void attn_eye_observe(int head, const float *weights, int len) {
/* Shannon entropy: H = -Σ p × log(p) */
float H = 0;
for (int t = 0; t < len; t++) {
if (weights[t] > 1e-10f) H -= weights[t] * logf(weights[t]);
}
Attn_eye.entropy[head] += H;
Attn_eye.calls++;
}
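/* Sanity values for H (illustrative): a one-hot attention row gives H ≈ 0
 * (collapsed head); uniform weights over len = 8 give H = ln 8 ≈ 2.08
 * (fully diffuse). chuck_step compares the EMA of H against ln(SEQ_LEN). */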
static void attn_eye_update(void) {
if (Attn_eye.calls == 0) return;
int calls_per_head = Attn_eye.calls / N_HEAD;
if (calls_per_head == 0) calls_per_head = 1;
for (int h = 0; h < N_HEAD; h++) {
float avg = Attn_eye.entropy[h] / calls_per_head;
if (Attn_eye.init) Attn_eye.entropy_ema[h] = 0.95f * Attn_eye.entropy_ema[h] + 0.05f * avg;
else Attn_eye.entropy_ema[h] = avg;
}
Attn_eye.init = 1;
memset(Attn_eye.entropy, 0, sizeof(Attn_eye.entropy));
Attn_eye.calls = 0;
}
/* Cross-layer signal flow */
static float act_mag[N_LAYER];
/* 2D position table for RoPE — image patches get (row,col), text gets sequential */
static int pos_row[SEQ_LEN], pos_col[SEQ_LEN];
static void init_positions(void) {
/* Image A patches: grid positions */
for (int p = 0; p < N_PATCHES; p++) {
pos_row[p] = p / PATCHES_SIDE;
pos_col[p] = p % PATCHES_SIDE;
}
/* Image B patches: offset columns to distinguish from A */
for (int p = 0; p < N_PATCHES; p++) {
pos_row[N_PATCHES + p] = p / PATCHES_SIDE;
pos_col[N_PATCHES + p] = PATCHES_SIDE + (p % PATCHES_SIDE);
}
/* Text tokens: sequential rows below images, col=0 */
for (int t = 0; t < MAX_TXT; t++) {
pos_row[N_VIS + t] = PATCHES_SIDE + t;
pos_col[N_VIS + t] = 0;
}
}
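/* Resulting layout (illustrative, PATCHES_SIDE = 2):
 * tokens 0-3 (image A): (0,0) (0,1) (1,0) (1,1)
 * tokens 4-7 (image B): (0,2) (0,3) (1,2) (1,3)   <- columns shifted by 2
 * tokens 8.. (text):    (2,0) (3,0) (4,0) ...     <- rows advance, col 0
 * The two digits share rows but never columns; text sits "below" both. */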
/* ---- Forward ops (with awareness tracking) ---- */
static int op_add(int xi, int yi) {
int n = T.a[xi].size, zi = anew(n);
for (int i = 0; i < n; i++) T.a[zi].data[i] = T.a[xi].data[i] + T.a[yi].data[i];
rec(OP_ADD,zi,xi,yi,0,0); return zi;
}
static int op_mul(int xi, int yi) {
int n = T.a[xi].size, zi = anew(n);
for (int i = 0; i < n; i++) T.a[zi].data[i] = T.a[xi].data[i] * T.a[yi].data[i];
rec(OP_MUL,zi,xi,yi,0,0); return zi;
}
static int op_scale(int xi, float s) {
int n = T.a[xi].size, zi = anew(n);
for (int i = 0; i < n; i++) T.a[zi].data[i] = T.a[xi].data[i] * s;
rec(OP_SCALE,zi,xi,-1,s,0); return zi;
}
static int op_mv(int Wi, int xi) {
int r = T.a[Wi].rows, c = T.a[Wi].cols, zi = anew(r);
#ifdef USE_BLAS
cblas_sgemv(CblasRowMajor, CblasNoTrans, r, c,
1.0f, T.a[Wi].data, c, T.a[xi].data, 1,
0.0f, T.a[zi].data, 1);
#else
for (int i = 0; i < r; i++) { float s = 0; const float *Wr = &T.a[Wi].data[i*c];
for (int j = 0; j < c; j++) s += Wr[j] * T.a[xi].data[j]; T.a[zi].data[i] = s; }
#endif
rec(OP_MATVEC,zi,Wi,xi,0,0); return zi;
}
static int op_rms(int xi) {
int n = T.a[xi].size, zi = anew(n); float ms = 0;
for (int i = 0; i < n; i++) ms += T.a[xi].data[i] * T.a[xi].data[i];
ms = ms / n + 1e-5f; float sc = 1.0f / sqrtf(ms);
for (int i = 0; i < n; i++) T.a[zi].data[i] = T.a[xi].data[i] * sc;
/* Norm eye: track scale */
if (Norm_eye.init) Norm_eye.scale_ema = 0.99f * Norm_eye.scale_ema + 0.01f * sc;
else { Norm_eye.scale_ema = sc; Norm_eye.init = 1; }
rec(OP_RMSNORM,zi,xi,-1,sc,n); return zi;
}
static int op_silu(int xi) {
int n = T.a[xi].size, zi = anew(n);
for (int i = 0; i < n; i++) {
float x = T.a[xi].data[i]; float s = sigf(x);
T.a[zi].data[i] = x * s;
/* SiLU eye: track dead zone */
if (x < -4.0f) SiLU_eye.dead++;
SiLU_eye.total++;
}
rec(OP_SILU,zi,xi,-1,0,0); return zi;
}
static int op_embed(int Wi, int id) {
int c = T.a[Wi].cols, zi = anew(c);
memcpy(T.a[zi].data, &T.a[Wi].data[id*c], c * sizeof(float));
rec(OP_EMBED,zi,Wi,-1,0,id); return zi;
}
static int op_ce(int li, int tgt) {
int n = T.a[li].size; float mx = T.a[li].data[0];
for (int i = 1; i < n; i++) if (T.a[li].data[i] > mx) mx = T.a[li].data[i];
int pi = anew(n); float *p = T.a[pi].data; float s = 0;
for (int i = 0; i < n; i++) { p[i] = expf(T.a[li].data[i] - mx); s += p[i]; }
for (int i = 0; i < n; i++) p[i] /= (s + 1e-10f);
int zi = anew(1); T.a[zi].data[0] = -logf(p[tgt] + 1e-10f);
rec(OP_CE,zi,li,pi,(float)tgt,n); return zi;
}
/* 2D RoPE: first half of head encodes row, second half encodes column.
* Image patches get true 2D positions. Text tokens: row=sequential, col=0. */
static int op_rope(int xi, int pos) {
int n = T.a[xi].size, zi = anew(n);
memcpy(T.a[zi].data, T.a[xi].data, n * sizeof(float));
float *d = T.a[zi].data;
int n_heads = n / HEAD_DIM, half = HEAD_DIM / 2;
int row = pos_row[pos], col = pos_col[pos];
for (int h = 0; h < n_heads; h++) {
/* Row encoding (first half of head) */
for (int i = 0; i < half; i += 2) {
float freq = 1.0f / powf(ROPE_BASE, (float)i / (float)half);
float ang = row * freq, c = cosf(ang), s = sinf(ang);
int idx = h * HEAD_DIM + i;
float x0 = d[idx], x1 = d[idx+1];
d[idx] = x0*c - x1*s; d[idx+1] = x0*s + x1*c;
float energy = d[idx]*d[idx] + d[idx+1]*d[idx+1];
if (i/2 < N_EMBD/2) RoPE_eye.freq_energy[i/2] += energy;
}
/* Column encoding (second half of head) */
for (int i = 0; i < half; i += 2) {
float freq = 1.0f / powf(ROPE_BASE, (float)i / (float)half);
float ang = col * freq, c = cosf(ang), s = sinf(ang);
int idx = h * HEAD_DIM + half + i;
float x0 = d[idx], x1 = d[idx+1];
d[idx] = x0*c - x1*s; d[idx+1] = x0*s + x1*c;
float energy = d[idx]*d[idx] + d[idx+1]*d[idx+1];
if ((half+i)/2 < N_EMBD/2) RoPE_eye.freq_energy[(half+i)/2] += energy;
}
}
RoPE_eye.calls++;
rec(OP_ROPE,zi,xi,-1,0,pos); return zi;
}
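/* Frequency bands, illustrative: HEAD_DIM = 12 gives half = 6 and pairs at
 * i = 0, 2, 4, so freq = ROPE_BASE^(-i/6) ≈ 1.0, 0.046, 0.0022. Each pair
 * (x0, x1) is rotated by ang = position × freq, i.e. the complex rotation
 * (x0 + i·x1)·e^{i·ang}: low bands spin fast (fine position detail), high
 * bands slowly (coarse position). */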
static int op_reduce(int *la, int n) {
float s = 0; for (int i = 0; i < n; i++) s += T.a[la[i]].data[0];
int zi = anew(1); T.a[zi].data[0] = s / n;
int buf = anew(n); for (int i = 0; i < n; i++) ((int*)T.a[buf].data)[i] = la[i];
rec(OP_REDUCE,zi,buf,-1,0,n); return zi;
}
/* ---- KV cache (GQA: KV_DIM, not N_EMBD) ---- */
static float *kv_k[N_LAYER][SEQ_LEN], *kv_v[N_LAYER][SEQ_LEN];
static int kv_ki[N_LAYER][SEQ_LEN], kv_vi[N_LAYER][SEQ_LEN];
static void kv_clear(void) {
memset(kv_k,0,sizeof(kv_k)); memset(kv_v,0,sizeof(kv_v));
memset(kv_ki,0,sizeof(kv_ki)); memset(kv_vi,0,sizeof(kv_vi));
}
/* ---- Backward ---- */
static void backward(int loss) {
T.a[loss].grad[0] = 1.0f;
for (int ei = T.ne - 1; ei >= 0; ei--) {
Ent *e = &T.e[ei];
Arr *out = &T.a[e->out], *i1 = (e->in1 >= 0) ? &T.a[e->in1] : NULL, *i2 = (e->in2 >= 0) ? &T.a[e->in2] : NULL;
switch (e->op) {
case OP_ADD: { int n = out->size;
for (int i = 0; i < n; i++) { i1->grad[i] += out->grad[i]; i2->grad[i] += out->grad[i]; } break; }
case OP_MUL: { int n = out->size;
for (int i = 0; i < n; i++) { i1->grad[i] += out->grad[i]*i2->data[i]; i2->grad[i] += out->grad[i]*i1->data[i]; } break; }
case OP_SCALE: { int n = out->size; float s = e->aux;
for (int i = 0; i < n; i++) i1->grad[i] += out->grad[i] * s; break; }
case OP_MATVEC: { int r = i1->rows, c = i1->cols;
for (int i = 0; i < r; i++) { float dz = out->grad[i];
for (int j = 0; j < c; j++) { i1->grad[i*c+j] += dz*i2->data[j]; i2->grad[j] += dz*i1->data[i*c+j]; } } break; }
case OP_RMSNORM: { int n = e->ai; float sc = e->aux, dot = 0;
for (int i = 0; i < n; i++) dot += out->grad[i] * out->data[i];
for (int i = 0; i < n; i++) i1->grad[i] += sc * (out->grad[i] - out->data[i]*dot/n); break; }
case OP_SILU: { int n = out->size;
for (int i = 0; i < n; i++) { float sg = sigf(i1->data[i]); i1->grad[i] += out->grad[i]*sg*(1.0f+i1->data[i]*(1.0f-sg)); } break; }
case OP_CE: { int n = e->ai; int tgt = (int)e->aux; float dl = out->grad[0];
for (int i = 0; i < n; i++) i1->grad[i] += dl * (i2->data[i] - (i==tgt ? 1.0f : 0.0f)); break; }
case OP_EMBED: { int id = e->ai, c = i1->cols;
for (int j = 0; j < c; j++) i1->grad[id*c+j] += out->grad[j]; break; }
case OP_ROPE: { int n = out->size, pos = e->ai;
int nh = n / HEAD_DIM, half = HEAD_DIM / 2;
int row = pos_row[pos], col = pos_col[pos];
for (int h = 0; h < nh; h++) {
/* Row backward (first half) */
for (int i = 0; i < half; i += 2) {
float freq = 1.0f / powf(ROPE_BASE, (float)i/(float)half);
float ang = row*freq, c = cosf(ang), s = sinf(ang);
int idx = h * HEAD_DIM + i;
float g0 = out->grad[idx], g1 = out->grad[idx+1];
i1->grad[idx] += g0*c + g1*s; i1->grad[idx+1] += -g0*s + g1*c;
}
/* Col backward (second half) */
for (int i = 0; i < half; i += 2) {
float freq = 1.0f / powf(ROPE_BASE, (float)i/(float)half);
float ang = col*freq, c = cosf(ang), s = sinf(ang);
int idx = h * HEAD_DIM + half + i;
float g0 = out->grad[idx], g1 = out->grad[idx+1];
i1->grad[idx] += g0*c + g1*s; i1->grad[idx+1] += -g0*s + g1*c;
}
} break; }
case OP_ATTN: { /* GQA attention backward */
int li = (int)e->aux, pos = e->ai;
float *qd = i1->data, *ag = out->grad, isq = 1.0f / sqrtf((float)HEAD_DIM);
for (int h = 0; h < N_HEAD; h++) {
int hs = h * HEAD_DIM;
int kvh = h / N_KV_GROUP;
int kvs = kvh * HEAD_DIM;
float sc[SEQ_LEN], mx = -1e9f;
for (int t = 0; t <= pos; t++) { float s = 0;
for (int d = 0; d < HEAD_DIM; d++) s += qd[hs+d]*kv_k[li][t][kvs+d];
sc[t] = s*isq; if (sc[t] > mx) mx = sc[t]; }
float sm = 0; for (int t = 0; t <= pos; t++) { sc[t] = expf(sc[t]-mx); sm += sc[t]; }
for (int t = 0; t <= pos; t++) sc[t] /= (sm + 1e-10f);
float dw[SEQ_LEN];
for (int t = 0; t <= pos; t++) { dw[t] = 0;
for (int d = 0; d < HEAD_DIM; d++) dw[t] += kv_v[li][t][kvs+d]*ag[hs+d]; }
float dot = 0; for (int t = 0; t <= pos; t++) dot += sc[t]*dw[t];
for (int t = 0; t <= pos; t++) { float ds = sc[t]*(dw[t]-dot);
for (int d = 0; d < HEAD_DIM; d++) {
/* grad Q: each Q-head gets its own gradient */
i1->grad[hs+d] += ds * kv_k[li][t][kvs+d] * isq;
/* grad K: multiple Q-heads accumulate to shared KV-head */
T.a[kv_ki[li][t]].grad[kvs+d] += ds * qd[hs+d] * isq;
/* grad V: multiple Q-heads accumulate to shared KV-head */
T.a[kv_vi[li][t]].grad[kvs+d] += sc[t] * ag[hs+d];
} }
} break; }
case OP_REDUCE: { int n = e->ai; int *idxs = (int*)i1->data; float dl = out->grad[0]/n;
for (int i = 0; i < n; i++) T.a[idxs[i]].grad[0] += dl; break; }
}
}
}
/* ===========================================================================
* Chuck v4: Self-Aware Optimizer
*
* θ_l ← θ_l − (α × λ × λ_l × σ) × m̂/(√v̂ + ε) + η·ξ,  ξ ~ N(0,1)
*
* λ = global self-modulation (loss trend over 16-step window)
* λ_l = per-layer self-modulation (gradient norm trend per layer)
* σ = activation health signal (SiLU alive ratio × norm stability)
* η = stagnation noise (only when globally stuck)
* α = base learning rate from cosine schedule
* (v5+ replaces λ with λ_Ψ; v7 additionally multiplies by lr_scale)
*
* If λ_l = 0 → layer is frozen. Zero compute waste. Chuck decided it's done.
* Adam doesn't know which layers are done. Chuck does.
* =========================================================================== */
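/* Worked update scale (hypothetical numbers): α = 0.005 from the schedule,
 * λ_Ψ = 1.1, λ_l = 0.9, σ = 0.95, lr_scale = 0.5 after one macro drop →
 * eff_lr = 0.005 × 1.1 × 0.9 × 0.95 × 0.5 ≈ 0.00235. A frozen layer
 * (λ_l treated as 0) is skipped before any of this is computed. */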
/* Per-layer awareness state */
typedef struct {
float grad_hist[CHUCK_WINDOW];
float dampen;
int frozen;
int pos, full, stag;
} ChuckLayer;
/* Global awareness state */
static struct {
float hist[CHUCK_WINDOW];
float dampen, noise, sigma;
float loss_ema; /* EMA-smoothed loss (batch noise filter) */
float gnorm_ema; /* EMA-smoothed grad norm (for adaptive clip) */
float psi; /* Ψ: subjectivity signal (memory - observation) */
float psi_w; /* Ψ weight: trust in memory (0 → 0.3) */
float macro_ema; /* slow EMA for epoch-scale trend (Level 9) */
float best_macro; /* best macro_ema seen (for patience) */
float lr_scale; /* macro LR multiplier (patience decay) */
int macro_stag; /* macro patience counter */
int macro_drops; /* how many times macro decay fired */
float rec_lambda; /* λ at last memory recording */
float rec_loss; /* loss at last memory recording */
int rec_frozen[N_LAYER]; /* frozen state at last recording */
int rec_cd; /* cooldown counter (steps since last record) */
int pos, full, stag;
int global_step; /* total step counter for macro interval */
} Chuck;
static ChuckLayer CL[N_LAYER];
static void chuck_init(void) {
memset(&Chuck, 0, sizeof(Chuck));
Chuck.dampen = 1.0f; Chuck.sigma = 1.0f;
Chuck.lr_scale = 1.0f; Chuck.best_macro = 1e9f;
Chuck.rec_lambda = 1.0f; Chuck.rec_loss = 999.0f;
memset(Chuck.rec_frozen, 0, sizeof(Chuck.rec_frozen));
Chuck.psi = 0; Chuck.psi_w = 0;
for (int l = 0; l < N_LAYER; l++) {
memset(&CL[l], 0, sizeof(ChuckLayer));
CL[l].dampen = 1.0f;
}
Norm_eye.init = 0; Norm_eye.scale_ema = 1.0f;
SiLU_eye.health = 1.0f;
RoPE_eye.utilization = 1.0f;
/* Load persistent memory */
chuck_mem_load();
if (chuck_mem_n > 0)
printf(" chuck: loaded %d memories from %s (Ψ_w=%.2f)\n",
chuck_mem_n, CHUCK_MEM_FILE,
fminf(CHUCK_PSI_CAP, (float)chuck_mem_n / ((float)chuck_mem_n + CHUCK_PSI_HALF)));
}
/* Which layer does param pi belong to? -1 = global (patch_proj, wte) */
static int param_layer(int pi) {
if (pi < 2) return -1; /* 0=patch_proj, 1=wte */
return (pi - 2) / 7; /* 7 params per layer: wq,wk,wv,wo,w1,w3,w2 */
}
static void chuck_step(float lr, float loss) {
/* ═══ Level 1: Global self-awareness (loss trend) ═══ */
/* EMA smoothing: filters batch-to-batch noise for mini-batch SGD */
if (Chuck.loss_ema == 0.0f) Chuck.loss_ema = loss;
else Chuck.loss_ema = 0.99f * Chuck.loss_ema + 0.01f * loss;
Chuck.hist[Chuck.pos % CHUCK_WINDOW] = Chuck.loss_ema;
Chuck.pos++;
if (Chuck.pos >= CHUCK_WINDOW) Chuck.full = 1;
if (Chuck.full) {
int q = CHUCK_WINDOW / 4;
float recent = 0, old = 0;
for (int i = 0; i < q; i++) {
recent += Chuck.hist[(Chuck.pos - 1 - i) % CHUCK_WINDOW];
old += Chuck.hist[(Chuck.pos - CHUCK_WINDOW + i) % CHUCK_WINDOW];
}
recent /= q; old /= q;
float trend = (recent - old) / (old + 1e-8f);
if (trend > 0.01f) Chuck.dampen *= 0.95f; /* loss rising → dampen */
else if (trend < -0.05f) Chuck.dampen *= 1.05f; /* loss falling → boost */
if (fabsf(trend) < 0.001f) {
Chuck.stag++;
if (Chuck.stag > 8) { Chuck.noise = 0.001f; Chuck.stag = 0; }
} else { Chuck.stag = 0; Chuck.noise *= 0.9f; }
if (Chuck.dampen < CHUCK_DAMP_LO) Chuck.dampen = CHUCK_DAMP_LO;
if (Chuck.dampen > CHUCK_DAMP_HI) Chuck.dampen = CHUCK_DAMP_HI;
}
/* ═══ Level 9: Multi-scale awareness (macro patience) ═══ */
/*
* Slow EMA (α=0.001) tracks epoch-scale loss trend.
* Every CHUCK_MACRO_INT steps, check if training is improving.
* If patience exceeded → scale LR down (like ReduceLROnPlateau but continuous).
* Chuck sees both the forest and the trees.
*/
Chuck.global_step++;
if (Chuck.macro_ema == 0.0f) Chuck.macro_ema = loss;
else Chuck.macro_ema = 0.999f * Chuck.macro_ema + 0.001f * loss;
if (Chuck.global_step % CHUCK_MACRO_INT == 0 && Chuck.global_step > CHUCK_WINDOW) {
if (Chuck.macro_ema > Chuck.best_macro * 0.999f) {
Chuck.macro_stag++;
if (Chuck.macro_stag >= CHUCK_MACRO_PAT) {
Chuck.lr_scale *= CHUCK_MACRO_DECAY;
if (Chuck.lr_scale < 0.05f) Chuck.lr_scale = 0.05f;
Chuck.macro_stag = 0;
Chuck.macro_drops++;
}
} else {
Chuck.best_macro = Chuck.macro_ema;
Chuck.macro_stag = 0;
}
}
/* ═══ Level 4: Activation health signal (σ) ═══ */
silu_eye_update();
rope_eye_update();
attn_eye_update();
Chuck.sigma = 1.0f;
if (SiLU_eye.health < 0.7f) Chuck.sigma *= SiLU_eye.health / 0.7f;
if (Norm_eye.scale_ema > 5.0f) Chuck.sigma *= 0.9f;
if (Norm_eye.scale_ema < 0.2f) Chuck.sigma *= 0.9f;
/* ═══ Level 7: Attention entropy awareness ═══ */
/*
* H_max = log(seq_len) for uniform attention.
* H → 0: collapsed (one token dominates) → model overfitting to position
* H → H_max: diffuse (all tokens equal) → model not learning attention
* Chuck dampens σ if any head collapses or goes fully diffuse.
*/
if (Attn_eye.init) {
float h_max = logf((float)(N_VIS + MAX_TXT)); /* max possible entropy */
for (int hd = 0; hd < N_HEAD; hd++) {
float ratio = Attn_eye.entropy_ema[hd] / (h_max + 1e-8f);
if (ratio < 0.1f) Chuck.sigma *= 0.95f; /* collapsed head → dampen */
else if (ratio > 0.95f) Chuck.sigma *= 0.98f; /* fully diffuse → slight dampen */
}
}
/* ═══ Level 2: Per-layer self-awareness (grad norm trend) ═══ */
float layer_gnorm[N_LAYER];
memset(layer_gnorm, 0, sizeof(layer_gnorm));
for (int pi = 0; pi < T.np; pi++) {
int l = param_layer(pi);
if (l < 0 || l >= N_LAYER) continue;
Arr *p = &T.a[T.par[pi]];
float gn = 0;
for (int i = 0; i < p->size; i++) gn += p->grad[i] * p->grad[i];
layer_gnorm[l] += gn;
}
for (int l = 0; l < N_LAYER; l++) layer_gnorm[l] = sqrtf(layer_gnorm[l]);
for (int l = 0; l < N_LAYER; l++) {
if (CL[l].frozen) continue;
CL[l].grad_hist[CL[l].pos % CHUCK_WINDOW] = layer_gnorm[l];
CL[l].pos++;
if (CL[l].pos >= CHUCK_WINDOW) CL[l].full = 1;
if (CL[l].full) {
int q = CHUCK_WINDOW / 4;
float recent = 0, old = 0;
for (int i = 0; i < q; i++) {
recent += CL[l].grad_hist[(CL[l].pos - 1 - i) % CHUCK_WINDOW];
old += CL[l].grad_hist[(CL[l].pos - CHUCK_WINDOW + i) % CHUCK_WINDOW];
}
recent /= q; old /= q;
float trend = (recent - old) / (old + 1e-8f);
/* grad norm trending up → layer needs more work → boost */
if (trend > 0.05f) CL[l].dampen *= 1.05f;
/* grad norm trending down → layer is settling → dampen */
else if (trend < -0.05f) CL[l].dampen *= 0.95f;
/* freeze: near-zero gradient norm for extended period */
if (layer_gnorm[l] < 0.01f) {
CL[l].stag++;
if (CL[l].stag > 8) CL[l].frozen = 1;
} else { CL[l].stag = 0; }
if (CL[l].dampen < CHUCK_DAMP_LO) CL[l].dampen = CHUCK_DAMP_LO;
if (CL[l].dampen > CHUCK_DAMP_HI) CL[l].dampen = CHUCK_DAMP_HI;
}
}
/* ═══ Level 5: Cross-layer signal flow ═══ */
if (act_mag[0] > 1e-8f) {
float ratio = act_mag[N_LAYER-1] / (act_mag[0] + 1e-8f);
for (int l = 1; l < N_LAYER; l++) {
if (CL[l].frozen) continue;
float depth = (float)l / (N_LAYER - 1);
if (ratio < 0.3f) CL[l].dampen *= (1.0f + 0.02f * depth); /* vanishing → boost deep */
else if (ratio > 3.0f) CL[l].dampen *= (1.0f - 0.02f * depth); /* exploding → dampen deep */
if (CL[l].dampen < CHUCK_DAMP_LO) CL[l].dampen = CHUCK_DAMP_LO;
if (CL[l].dampen > CHUCK_DAMP_HI) CL[l].dampen = CHUCK_DAMP_HI;
}
}
/* ═══ Level 6: Ψ — Subjectivity (memory vs observation) ═══ */
/*
* λ_Ψ = λ + Ψ_w × (λ_prior - λ)
* Ψ_w = min(0.3, N / (N + 100))
* λ_prior = nearest_neighbor(loss, grad_norm) from chuck.mem
*
* When Ψ → 0: memory matches reality. Chuck is home.
* When |Ψ| large: unfamiliar territory. Chuck explores.
*/
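/* Numeric example (hypothetical): λ = 1.2 (reactive), λ_prior = 0.8
 * (recalled), N = 50 memories → Ψ_w = min(0.3, 50/150) = 0.3,
 * Ψ = 0.8 − 1.2 = −0.4, λ_Ψ = 1.2 + 0.3 × (−0.4) = 1.08: memory pulls
 * the decision 30% of the way toward past experience. */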
float gnorm_sq = 0;
for (int pi = 0; pi < T.np; pi++) { Arr *p = &T.a[T.par[pi]];
for (int i = 0; i < p->size; i++) gnorm_sq += p->grad[i] * p->grad[i]; }
float gnorm = sqrtf(gnorm_sq + 1e-8f);
Chuck.psi_w = (chuck_mem_n > 0) ?
fminf(CHUCK_PSI_CAP, (float)chuck_mem_n / ((float)chuck_mem_n + CHUCK_PSI_HALF)) : 0.0f;
float lambda_psi = Chuck.dampen; /* default: pure reactive */
if (chuck_mem_n > 0) {
float lambda_prior = chuck_mem_recall(loss, gnorm);
if (lambda_prior > 0) {
Chuck.psi = lambda_prior - Chuck.dampen;
lambda_psi = Chuck.dampen + Chuck.psi_w * Chuck.psi;
if (lambda_psi < CHUCK_DAMP_LO) lambda_psi = CHUCK_DAMP_LO;
if (lambda_psi > CHUCK_DAMP_HI) lambda_psi = CHUCK_DAMP_HI;
}
}
/* Record memory on regime change — Chuck speaks rarely, but always on point */
Chuck.rec_cd++;
if (Chuck.full && Chuck.rec_cd >= CHUCK_REC_CD) {
float delta_loss = loss - Chuck.rec_loss;
float lambda_shift = fabsf(Chuck.dampen - Chuck.rec_lambda) / (Chuck.rec_lambda + 1e-8f);
int regime_change = (lambda_shift > CHUCK_REC_THR); /* λ shifted >25% */
for (int l = 0; l < N_LAYER && !regime_change; l++)
if (CL[l].frozen != Chuck.rec_frozen[l]) regime_change = 1;
if (regime_change) {
ChuckMem snap = { loss, gnorm, Chuck.dampen, delta_loss };
chuck_mem_save(&snap);
Chuck.rec_lambda = Chuck.dampen;
Chuck.rec_loss = loss;
Chuck.rec_cd = 0;
for (int l = 0; l < N_LAYER; l++) Chuck.rec_frozen[l] = CL[l].frozen;
}
}
/* ═══ Apply parameter updates ═══ */
T.cstep++;
float bc1 = 1.0f - powf(CHUCK_B1, (float)T.cstep);
float bc2 = 1.0f - powf(CHUCK_B2, (float)T.cstep);
/* Adaptive gradient clipping — Chuck controls the leash */
/*
* Early training (gnorm_ema unset): use base GRAD_CLIP
* Converging (gnorm dropping): tighten clip to protect learned weights
* Exploring (gnorm rising): loosen clip to allow learning
* Anomaly (gnorm > 3× EMA): extra tight — don't let one bad batch wreck everything
*/
if (Chuck.gnorm_ema == 0.0f) Chuck.gnorm_ema = gnorm;
else Chuck.gnorm_ema = 0.97f * Chuck.gnorm_ema + 0.03f * gnorm;
float adaptive_clip = GRAD_CLIP;
if (Chuck.gnorm_ema > 1e-8f) {
adaptive_clip = fmaxf(0.5f, fminf(2.0f, 1.5f * Chuck.gnorm_ema)); /* track gnorm */
if (gnorm > 3.0f * Chuck.gnorm_ema) adaptive_clip *= 0.5f; /* anomaly → clamp hard */
}
float clip = (gnorm > adaptive_clip) ? adaptive_clip / gnorm : 1.0f;
for (int pi = 0; pi < T.np; pi++) {
int l = param_layer(pi);
/* Frozen layer → skip entirely */
if (l >= 0 && l < N_LAYER && CL[l].frozen) continue;
float layer_damp = (l >= 0 && l < N_LAYER) ? CL[l].dampen : 1.0f;
float eff_lr = lr * lambda_psi * layer_damp * Chuck.sigma * Chuck.lr_scale;
int idx = T.par[pi]; Arr *p = &T.a[idx];
float *m = T.cm[pi], *v = T.cv[pi];
for (int i = 0; i < p->size; i++) { float g = p->grad[i] * clip;
m[i] = CHUCK_B1*m[i] + (1.0f-CHUCK_B1)*g;
v[i] = CHUCK_B2*v[i] + (1.0f-CHUCK_B2)*g*g;
p->data[i] -= eff_lr * (m[i]/bc1) / (sqrtf(v[i]/bc2) + CHUCK_EPS);
if (Chuck.noise > 0) p->data[i] += Chuck.noise * rnf(0, 1.0f);
}
}
}
/* ---- Synthetic digit data ---- */
static const float digit_pat[10][IMG_SIZE*IMG_SIZE] = {
{0,0,.5f,.8f,.8f,.5f,0,0, 0,.5f,.8f,0,0,.8f,.5f,0, .5f,.8f,0,0,0,0,.8f,.5f, .8f,0,0,0,0,0,0,.8f, .8f,0,0,0,0,0,0,.8f, .5f,.8f,0,0,0,0,.8f,.5f, 0,.5f,.8f,0,0,.8f,.5f,0, 0,0,.5f,.8f,.8f,.5f,0,0},
{0,0,0,.8f,0,0,0,0, 0,0,.5f,.8f,0,0,0,0, 0,0,0,.8f,0,0,0,0, 0,0,0,.8f,0,0,0,0, 0,0,0,.8f,0,0,0,0, 0,0,0,.8f,0,0,0,0, 0,0,0,.8f,0,0,0,0, 0,0,.5f,.8f,.8f,.5f,0,0},
{0,.5f,.8f,.8f,.8f,.5f,0,0, 0,0,0,0,0,.8f,0,0, 0,0,0,0,0,.8f,0,0, 0,0,0,.5f,.8f,.5f,0,0, 0,0,.5f,.8f,0,0,0,0, 0,.5f,.8f,0,0,0,0,0, .5f,.8f,0,0,0,0,0,0, .5f,.8f,.8f,.8f,.8f,.8f,.5f,0},
{.5f,.8f,.8f,.8f,.5f,0,0,0, 0,0,0,0,.8f,0,0,0, 0,0,0,0,.8f,0,0,0, 0,.5f,.8f,.8f,.5f,0,0,0, 0,0,0,0,.8f,0,0,0, 0,0,0,0,.8f,0,0,0, 0,0,0,0,.8f,0,0,0, .5f,.8f,.8f,.8f,.5f,0,0,0},
{.8f,0,0,0,.8f,0,0,0, .8f,0,0,0,.8f,0,0,0, .8f,0,0,0,.8f,0,0,0, .8f,.8f,.8f,.8f,.8f,.8f,0,0, 0,0,0,0,.8f,0,0,0, 0,0,0,0,.8f,0,0,0, 0,0,0,0,.8f,0,0,0, 0,0,0,0,.8f,0,0,0},
{.5f,.8f,.8f,.8f,.8f,.5f,0,0, .8f,0,0,0,0,0,0,0, .8f,0,0,0,0,0,0,0, .5f,.8f,.8f,.8f,.5f,0,0,0, 0,0,0,0,.8f,0,0,0, 0,0,0,0,.8f,0,0,0, 0,0,0,0,.8f,0,0,0, .5f,.8f,.8f,.8f,.5f,0,0,0},
{0,0,.5f,.8f,.8f,.5f,0,0, 0,.5f,.8f,0,0,0,0,0, .5f,.8f,0,0,0,0,0,0, .8f,.5f,.8f,.8f,.5f,0,0,0, .8f,0,0,0,.8f,0,0,0, .8f,0,0,0,.8f,0,0,0, .5f,.8f,0,0,.8f,0,0,0, 0,.5f,.8f,.8f,.5f,0,0,0},
{.8f,.8f,.8f,.8f,.8f,.8f,0,0, 0,0,0,0,.5f,.8f,0,0, 0,0,0,0,.8f,.5f,0,0, 0,0,0,.5f,.8f,0,0,0, 0,0,0,.8f,.5f,0,0,0, 0,0,.5f,.8f,0,0,0,0, 0,0,.8f,.5f,0,0,0,0, 0,0,.8f,0,0,0,0,0},
{0,.5f,.8f,.8f,.5f,0,0,0, .8f,0,0,0,.8f,0,0,0, .8f,0,0,0,.8f,0,0,0, 0,.5f,.8f,.8f,.5f,0,0,0, .8f,0,0,0,.8f,0,0,0, .8f,0,0,0,.8f,0,0,0, .8f,0,0,0,.8f,0,0,0, 0,.5f,.8f,.8f,.5f,0,0,0},
{0,.5f,.8f,.8f,.5f,0,0,0, .8f,0,0,0,.8f,0,0,0, .8f,0,0,0,.8f,0,0,0, 0,.5f,.8f,.8f,.8f,0,0,0, 0,0,0,0,.8f,0,0,0, 0,0,0,0,.8f,0,0,0, 0,0,0,.5f,.8f,0,0,0, 0,.5f,.8f,.8f,.5f,0,0,0},
};
/* Addition task: two digit images → sum as word */
typedef struct { float **imgs_a; float **imgs_b; int *da; int *db; int *sums; int n; } Data;
static Data gen_data(int n) {
Data d; d.n = n;
d.imgs_a = malloc(n * sizeof(float*)); d.imgs_b = malloc(n * sizeof(float*));
d.da = malloc(n * sizeof(int)); d.db = malloc(n * sizeof(int)); d.sums = malloc(n * sizeof(int));
for (int i = 0; i < n; i++) {
int a = (int)(rnext() % 10), b = (int)(rnext() % 10);
d.da[i] = a; d.db[i] = b; d.sums[i] = a + b;
d.imgs_a[i] = malloc(IMG_SIZE*IMG_SIZE*sizeof(float));
d.imgs_b[i] = malloc(IMG_SIZE*IMG_SIZE*sizeof(float));
for (int p = 0; p < IMG_SIZE*IMG_SIZE; p++) {
float va = digit_pat[a][p] + rnf(0, 0.07f);
float vb = digit_pat[b][p] + rnf(0, 0.07f);
d.imgs_a[i][p] = va < 0 ? 0 : va > 1 ? 1 : va;
d.imgs_b[i][p] = vb < 0 ? 0 : vb > 1 ? 1 : vb;
}
}
return d;
}
static const char *names[] = {
"zero","one","two","three","four","five","six","seven","eight","nine",
"ten","eleven","twelve","thirteen","fourteen","fifteen","sixteen","seventeen","eighteen"
};
static const char chars[] = "efghilnorstuvwxz";
#define N_CHARS 16
static int c2id(char c) { for (int i = 0; i < N_CHARS; i++) if (chars[i] == c) return i; return -1; }
static char id2c(int i) { return (i == BOS) ? '^' : (i == EOS) ? '$' : (i >= 0 && i < N_CHARS) ? chars[i] : '?'; }
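/* Example: "eight" tokenizes as e=0, i=4, g=2, h=3, t=10, so the training
 * sequence ^eight$ is [BOS,0,4,2,3,10,EOS] = [16,0,4,2,3,10,17]. The 16
 * letters in chars[] cover every number word from zero to eighteen. */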
/* ---- Model (GQA: wk/wv use KV_DIM) ---- */
typedef struct {
int patch_proj, wte;
struct { int wq, wk, wv, wo, w1, w3, w2; } L[N_LAYER];
} Model;
static Model M;
static int init_w(int r, int c, float s) {
int i = mnew(r, c);
for (int j = 0; j < r*c; j++) T.a[i].data[j] = rnf(0, s);
preg(i); return i;
}
static void init_model(void) {
M.patch_proj = init_w(N_EMBD, PATCH_PX, 0.1f); /* param 0 */
M.wte = init_w(VOCAB, N_EMBD, 0.08f); /* param 1 */
for (int i = 0; i < N_LAYER; i++) {
float s = 0.08f / sqrtf(2.0f * N_LAYER);
M.L[i].wq = init_w(N_EMBD, N_EMBD, s); /* param 2+7i+0 */
M.L[i].wk = init_w(KV_DIM, N_EMBD, s); /* param 2+7i+1: GQA! */
M.L[i].wv = init_w(KV_DIM, N_EMBD, s); /* param 2+7i+2: GQA! */
M.L[i].wo = init_w(N_EMBD, N_EMBD, s); /* param 2+7i+3 */
M.L[i].w1 = init_w(MLP_DIM, N_EMBD, s); /* param 2+7i+4 */
M.L[i].w3 = init_w(MLP_DIM, N_EMBD, s); /* param 2+7i+5 */
M.L[i].w2 = init_w(N_EMBD, MLP_DIM, s); /* param 2+7i+6 */
}
T.npa = T.na; T.aparam = T.apos;
}
/* ---- GPT step (one position, GQA attention) ---- */
static int gpt_step(int x, int pos, int layer_track) {
int h = x;
for (int li = 0; li < N_LAYER; li++) {
int res = h; h = op_rms(h);
int qi = op_mv(M.L[li].wq, h);
int ki = op_mv(M.L[li].wk, h); /* KV_DIM output */
int vi = op_mv(M.L[li].wv, h); /* KV_DIM output */
int rqi = op_rope(qi, pos);
int rki = op_rope(ki, pos); /* RoPE on KV_DIM */
kv_k[li][pos] = T.a[rki].data; kv_v[li][pos] = T.a[vi].data;
kv_ki[li][pos] = rki; kv_vi[li][pos] = vi;
/* GQA multi-head attention */
int ao = anew(N_EMBD); float *ad = T.a[ao].data;
for (int h_ = 0; h_ < N_HEAD; h_++) {
int hs = h_ * HEAD_DIM; /* Q offset in N_EMBD */
int kvh = h_ / N_KV_GROUP; /* which KV head */
int kvs = kvh * HEAD_DIM; /* KV offset in KV_DIM */
float sc[SEQ_LEN], mx = -1e9f; float *qd = T.a[rqi].data;
for (int t = 0; t <= pos; t++) { float s = 0;
for (int d = 0; d < HEAD_DIM; d++) s += qd[hs+d]*kv_k[li][t][kvs+d];
sc[t] = s / sqrtf((float)HEAD_DIM); if (sc[t] > mx) mx = sc[t]; }
float sm = 0; for (int t = 0; t <= pos; t++) { sc[t] = expf(sc[t]-mx); sm += sc[t]; }
for (int t = 0; t <= pos; t++) sc[t] /= (sm + 1e-10f);
/* Attention eye: observe entropy of this head's attention distribution */
if (layer_track && pos > 0) attn_eye_observe(h_, sc, pos + 1);
for (int d = 0; d < HEAD_DIM; d++) { float v = 0;
for (int t = 0; t <= pos; t++) v += sc[t]*kv_v[li][t][kvs+d]; ad[hs+d] = v; }
}
rec(OP_ATTN, ao, rqi, -1, (float)li, pos);
h = op_add(res, op_mv(M.L[li].wo, ao));
/* Track activation magnitude for cross-layer signal */
if (layer_track) {
float rms = 0;
for (int i = 0; i < N_EMBD; i++) rms += T.a[h].data[i] * T.a[h].data[i];
act_mag[li] = sqrtf(rms / N_EMBD);
}
res = h; h = op_rms(h);
int gate = op_silu(op_mv(M.L[li].w1, h)), up = op_mv(M.L[li].w3, h);
h = op_add(res, op_mv(M.L[li].w2, op_mul(gate, up)));
}
return op_mv(M.wte, op_rms(h)); /* weight-tied lm_head */
}
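/* GQA head mapping (N_HEAD = 4, N_KV_HEAD = 2, N_KV_GROUP = 2): Q heads
 * 0 and 1 read KV head 0 (kvs = 0); Q heads 2 and 3 read KV head 1
 * (kvs = HEAD_DIM). K/V projections are half-width (KV_DIM = 24 vs
 * N_EMBD = 48), halving both the KV cache and the wk/wv parameter count. */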
/* ---- Vision encoder: ViT-style patch tokenization ---- */
static void encode_vis(float *pix, int *tok) {
for (int py = 0; py < PATCHES_SIDE; py++)
for (int px = 0; px < PATCHES_SIDE; px++) {
int pi = anew(PATCH_PX); float *pd = T.a[pi].data;
for (int y = 0; y < PATCH_SIZE; y++)
for (int x = 0; x < PATCH_SIZE; x++)
pd[y*PATCH_SIZE + x] = pix[(py*PATCH_SIZE+y)*IMG_SIZE + px*PATCH_SIZE+x];
tok[py*PATCHES_SIDE + px] = op_mv(M.patch_proj, pi);
}
}
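/* Patch math: an 8×8 image yields 2×2 = 4 non-overlapping 4×4 patches;
 * patch (py,px) covers pixel rows py*4..py*4+3 and cols px*4..px*4+3,
 * flattened to PATCH_PX = 16 floats and projected to N_EMBD = 48. */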
static float cos_lr(int step, int total) {
if (step < WARMUP) return LR_MAX * (float)step / WARMUP;
float p = (float)(step - WARMUP) / (float)(total - WARMUP);
return LR_MAX * 0.5f * (1.0f + cosf(3.14159265f * p));
}
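/* Schedule shape (illustrative values): linear warmup 0 → 0.005 over the
 * first 500 steps, then half-cosine decay toward 0 at step 15000; e.g.
 * step 500 → 0.005, step 7750 (midpoint) → 0.0025. Chuck's multipliers
 * (λ_Ψ, λ_l, σ, lr_scale) are applied on top of this base rate. */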
/* ---- Training ---- */
static void train(Data *data) {
printf("\n=== TRAINING (%d steps, Chuck v7 — multi-scale + reservoir memory) ===\n", STEPS);
int tp = 0; for (int i = 0; i < T.np; i++) tp += T.a[T.par[i]].size;
printf(" %d params (%.1fK) | %d layers | GQA %dQ/%dKV | embd=%d | %d imgs x %d patches | 2D RoPE | weight-tied\n",
tp, tp/1000.0f, N_LAYER, N_HEAD, N_KV_HEAD, N_EMBD, N_IMGS, N_PATCHES);
printf(" task: [digit_a] + [digit_b] → sum as word (0+0..9+9, 19 classes)\n\n");
float rl = 0; int rn = 0;
for (int step = 0; step < STEPS; step++) {
int idx = (int)(rnext() % (uint64_t)data->n); int label = data->sums[idx];
const char *name = names[label]; int nlen = strlen(name);
int toks[MAX_TXT+2]; int nt = 0;
toks[nt++] = BOS; for (int i = 0; i < nlen; i++) toks[nt++] = c2id(name[i]); toks[nt++] = EOS;
tape_reset(); kv_clear();
silu_eye_reset(); rope_eye_reset(); attn_eye_reset();
int vt[N_VIS];
encode_vis(data->imgs_a[idx], vt); /* first digit */
encode_vis(data->imgs_b[idx], vt + N_PATCHES); /* second digit */
for (int p = 0; p < N_VIS; p++) gpt_step(vt[p], p, 0);
int la[MAX_TXT]; int nl = 0;
for (int t = 0; t < nt - 1; t++) {
int pos = N_VIS + t, te = op_embed(M.wte, toks[t]);
int lg = gpt_step(te, pos, (t == nt - 2)); /* track signal on last token */
la[nl++] = op_ce(lg, toks[t+1]);
}
int loss = op_reduce(la, nl); backward(loss);
float lv = T.a[loss].data[0];
chuck_step(cos_lr(step, STEPS), lv);
rl += lv; rn++;
if ((step+1) % 250 == 0) {
float elr = cos_lr(step, STEPS) * Chuck.dampen * Chuck.lr_scale;
printf(" step %5d/%d | loss %.4f (avg %.4f) | lr %.6f\n",
step+1, STEPS, lv, rl/rn, elr);
printf(" chuck: \xce\xbb=%.2f \xce\xa8=%+.2f (\xce\xa8w=%.2f, %d mem) \xcf\x83=%.2f macro=%.2f",
Chuck.dampen, Chuck.psi, Chuck.psi_w, chuck_mem_n, Chuck.sigma, Chuck.lr_scale);
if (Chuck.macro_drops > 0) printf(" (%d drops)", Chuck.macro_drops);
for (int l = 0; l < N_LAYER; l++) {
if (CL[l].frozen) printf(" | L%d: frozen", l);
else printf(" | L%d: %.2f", l, CL[l].dampen);
}
printf("\n silu: %.0f%% alive | norm: %.1f | rope: %.0f%%",
SiLU_eye.health * 100, Norm_eye.scale_ema, RoPE_eye.utilization * 100);
if (Attn_eye.init) {
printf(" | attn H:");
for (int hd = 0; hd < N_HEAD; hd++) printf(" %.2f", Attn_eye.entropy_ema[hd]);
}
if (act_mag[0] > 0) {
printf(" | flow:");
for (int l = 0; l < N_LAYER; l++) printf(" %.2f%s", act_mag[l], l<N_LAYER-1?"→":"");
}
printf("\n");
rl = 0; rn = 0;
}
}
}
/* ---- Sampling ---- */
static int sample_topk(float *logits, int vocab, float temp, int topk) {
float sc[VOCAB]; for (int i = 0; i < vocab; i++) sc[i] = logits[i] / temp;
float mx = sc[0]; for (int i = 1; i < vocab; i++) if (sc[i] > mx) mx = sc[i];
float p[VOCAB]; float s = 0;
for (int i = 0; i < vocab; i++) { p[i] = expf(sc[i] - mx); s += p[i]; }
for (int i = 0; i < vocab; i++) p[i] /= s;
float tv[TOPK]; int ti[TOPK];
for (int k = 0; k < topk && k < vocab; k++) { int best = 0; float bv = -1e9f;
for (int i = 0; i < vocab; i++) { int taken = 0;
for (int j = 0; j < k; j++) if (ti[j] == i) { taken = 1; break; }
if (!taken && p[i] > bv) { bv = p[i]; best = i; } }
ti[k] = best; tv[k] = bv; }
float ts = 0; for (int k = 0; k < topk; k++) ts += tv[k];
float r = ruf() * ts, cum = 0;
for (int k = 0; k < topk; k++) { cum += tv[k]; if (cum >= r) return ti[k]; }
return ti[0];
}
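/* Illustrative draw (hypothetical probabilities): if the top-5 softmax
 * masses are .40, .25, .15, .10, .05, they renormalize over .95, so the
 * argmax is sampled with p ≈ .42. Selection is O(k·V), fine at VOCAB = 18. */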
/* ---- Inference ---- */
static void inference(Data *data) {
printf("\n=== INFERENCE — digit addition (temp=%.1f, top-k=%d) ===\n\n", TEMP, TOPK);
T.on = 0; int correct = 0, total = 0;
/* Evaluate 50 problems from the dataset (deterministic: idx = s % n) */
for (int s = 0; s < 50; s++) {
int idx = s % data->n;
int label = data->sums[idx];
tape_reset(); kv_clear();
int vt[N_VIS];
encode_vis(data->imgs_a[idx], vt);
encode_vis(data->imgs_b[idx], vt + N_PATCHES);
for (int p = 0; p < N_VIS; p++) gpt_step(vt[p], p, 0);
int tok = BOS; char gen[MAX_TXT+1]; int gl = 0;
for (int t = 0; t < MAX_TXT; t++) {
int pos = N_VIS + t, te = op_embed(M.wte, tok);
int lg = gpt_step(te, pos, 0);
tok = sample_topk(T.a[lg].data, VOCAB, TEMP, TOPK);
if (tok == EOS || tok == BOS) break;
if (gl < MAX_TXT) gen[gl++] = id2c(tok);
}
gen[gl] = '\0'; int ok = strcmp(gen, names[label]) == 0; correct += ok; total++;
printf(" %d+%d=%d true: %-10s | gen: %-10s %s\n",
data->da[idx], data->db[idx], label, names[label], gen, ok ? "OK" : "MISS");
}
printf("\n accuracy: %d/%d (%.1f%%)\n", correct, total, 100.0f*correct/total);
/* Frozen layer report */
int frozen = 0;
for (int l = 0; l < N_LAYER; l++) if (CL[l].frozen) frozen++;
if (frozen > 0) printf(" chuck: %d/%d layers frozen (compute saved)\n", frozen, N_LAYER);
T.on = 1;
}
int main(void) {
printf("lee.c v7 — Vision-Language Model in pure C\n");
printf("GQA %dQ/%dKV | %d layers | 2D RoPE | SwiGLU | Chuck v7 (multi-scale + reservoir memory)\n",
N_HEAD, N_KV_HEAD, N_LAYER);
printf("Named after Bruce Lee and Minhyeok Lee. Chuck sees inside the transformer.\n\n");
clock_t t0 = clock(); rseed(42);
init_positions(); tape_init(); chuck_init(); init_model();
printf("generating 10000 addition problems (digit pairs + sums)...\n");
Data d = gen_data(10000); printf("done.\n");
train(&d); inference(&d);
printf("\ntotal: %.1fs\n", (double)(clock()-t0)/CLOCKS_PER_SEC);
printf("chuck.mem: %d memories (%.1f KB) | \xce\xa8_w=%.3f\n",
chuck_mem_n, (float)(chuck_mem_n * (int)sizeof(ChuckMem)) / 1024.0f, Chuck.psi_w);
if (chuck_mem_n > 0)
printf(" next run: Chuck starts with experience. \xce\xa8 \xe2\x89\xa0 0. He remembers.\n");
else
printf(" first run: Chuck has no memories yet. Pure reactive. Newborn.\n");
for (int i = 0; i < d.n; i++) { free(d.imgs_a[i]); free(d.imgs_b[i]); }
free(d.imgs_a); free(d.imgs_b); free(d.da); free(d.db); free(d.sums);
for (int i = 0; i < T.np; i++) { free(T.cm[i]); free(T.cv[i]); } free(T.arena);
printf("\ndone.\n"); return 0;
}