kind: nla_model
extraction:
  d_model: 1536
  layer: 23
  injection_scale: sqrt_d_model
  mse_scale: sqrt_d_model
tokens:
  injection_token_id: 249568
  injection_left_neighbor_id: 236813
  injection_right_neighbor_id: 954
base_model: google/gemma-4-E2B
n_layers: 35
provenance:
  n_rows: 3200
  n_docs: 800
  min_position: 50
  stage: stage3_build
prompt_templates:
  actor: 'You are a meticulous AI researcher conducting an important investigation
    into activation vectors from a language model. Your overall task is to describe
    the semantic content of that activation vector. We will pass the vector enclosed
    in tags into your context. You must then produce an explanation for the vector,
    enclosed within tags. The explanation consists of 2-3 text snippets describing
    that vector. Here is the vector: {injection_char}'
  critic: 'Summary of the following text: {explanation} '
  role: critic
critic:
  num_hidden_layers: 18
ar_lora:
  r: 64
  alpha: 128
  target_modules:
    - q_proj
    - k_proj
    - v_proj
    - o_proj
ar_head_dim_in: 1536
ar_head_dim_out: 1536
training:
  lr: 5.0e-05
  max_steps: 15
  micro_batch: 1
  grad_accum: 4
eval_provenance:
  results_json: experiments/v8_nla_local/results/round_trip_v0_n50.json
  results_sha256: 2197c86d534e38f56bd8fd47be07cec6c6142a284bdbec3d0a7345c4946e8edd
  results_commit: 5877be84922ac74f9b2897eda92f396d54ed7aff
  eval_script: experiments/v8_nla_local/eval_round_trip.py
  eval_data: experiments/v8_nla_local/data/stage1/rl.parquet
  eval_date: '2026-05-10'
  paired_with: experiments/v8_nla_local/checkpoints/av_v0_continued/final
  n_evaluated: 42
  n_attempted: 50
  round_trip_cos_mean: 0.4378
  round_trip_cos_median: 0.4343
  round_trip_cos_std: 0.0538
  round_trip_mse_mean: 1.1243
  random_baseline_mse: 2.0
  triage_threshold_cos: 0.3
  triage_result: above_threshold
  max_new_tokens: 120
notes: |
  First measured round-trip faithfulness for V8 NLA v0. All 42 evaluated rows
  above the 0.3 triage threshold (worst row 0.313).
  AR truncation: 18 of 35 Gemma 4 E2B layers + Linear(1536,1536) head.
  See notes/SESSION_SUMMARY_2026-05-10_gpu_grant_session_8.md Phase 3.
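# Reproducibility check: a minimal sketch of how the pinned results file can be
# verified against results_sha256, assuming a POSIX shell with GNU coreutils'
# sha256sum, run from the repo root at results_commit (the paths are the ones
# recorded above; this is illustrative, not part of the eval pipeline):
#
#   echo "2197c86d534e38f56bd8fd47be07cec6c6142a284bdbec3d0a7345c4946e8edd  experiments/v8_nla_local/results/round_trip_v0_n50.json" \
#     | sha256sum -c -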