{
  "title": "GEO + ARM 100-Site Audit — Reproduce This",
  "version": "v1-2026-04-16",
  "description": "This study measures AI citation readiness — how well 100 major websites are structured for discovery and citation by AI systems like ChatGPT, Claude, Perplexity, and Gemini. The audit runs in three automated phases: an 8-signal infrastructure scan (robots.txt, llms.txt, JSON-LD, TTFB, HTTP/3, etc.), GEO scoring by Claude Sonnet agents across 9 weighted dimensions (0–100 scale, 85 target), and an ARM probe that queries 4 AI platforms with 20 brand-relevant queries per site to measure real citation presence. To recreate it: clone the scripts, add the 5 API keys listed below, and run the three phases in order — total API cost is roughly $50 depending on your tier.",
  "api_keys_required": [
    {
      "key": "SERPER_API_KEY",
      "purpose": "SERP signals — brand SERP, knowledge graph, sitelinks",
      "get_it_at": "serper.dev"
    },
    {
      "key": "OPENAI_API_KEY",
      "purpose": "ARM probe — OpenAI platform citation check (gpt-4o-mini-search-preview)",
      "get_it_at": "platform.openai.com"
    },
    {
      "key": "ANTHROPIC_API_KEY",
      "purpose": "ARM probe — Anthropic platform citation check (claude-haiku-4-5) + query set generation",
      "get_it_at": "console.anthropic.com"
    },
    {
      "key": "PERPLEXITY_API_KEY",
      "purpose": "ARM probe — Perplexity platform citation check (sonar)",
      "get_it_at": "perplexity.ai/settings/api"
    },
    {
      "key": "GEMINI_API_KEY",
      "purpose": "ARM probe — Gemini platform citation check (gemini-2.5-flash) with grounding",
      "get_it_at": "aistudio.google.com/app/apikey"
    }
  ],
  "cost_estimate": {
    "phase1_serper": "~$0.10 (100 SERP queries at $0.001/query)",
    "phase3_aifs_openai": "~$3 (100 sites × 20 queries × ~$0.015/call)",
    "phase3_aifs_gemini": "~$8 (100 sites × 20 queries × ~$0.04/call)",
    "phase3_aifs_anthropic": "~$2 (100 sites × 20 queries × ~$0.001/call, Haiku)",
    "phase3_aifs_perplexity": "~$1.20 (100 sites × 20 queries × ~$0.006/call, sonar)",
    "total": "~$15–50 depending on enterprise vs pay-per-call rates"
  },
  "phases": {
    "phase1_pillar_scan": {
      "command": "node audit-v3.js",
      "output_dir": "audit-receipts-v3-<YYYY-MM-DD>/",
      "concurrency": 5,
      "output_per_site": "JSON receipt with pillar pass/fail signals",
      "script_content": "#!/usr/bin/env node\n/**\n * Global AI Citation Infrastructure Audit v3.0\n *\n * Key change from v2: TTFB latency compensation for residential networks.\n *\n * Methodology:\n *   Phase 1 — 8-Signal Audit (100 sites, 5 concurrent)\n *     1. MCP Server       /.well-known/mcp.json  HTTP 200 + application/json\n *     2. llms.txt         /llms.txt              HTTP 200 + starts with #\n *     3. Clean-Room HTML  default vs GPTBot UA   both >5KB, ratio 0.5-2.0\n *     4. AI Content Feed  3 paths checked        HTTP 200 + json or text/plain\n *     5. JSON-LD          homepage               1+ application/ld+json\n *     6. TTFB <200ms      compensated            3rd-hit raw TTFB minus per-request connect time < 200ms\n *     7. 10+ AI Bots      robots.txt Allow:      10+ distinct bots explicitly allowed\n *     8. HTTP/3           alt-svc header          contains h3\n *\n *   Phase 2 — Results & Documentation\n *\n * TTFB threshold: 200ms server-side (compensatedMs = rawTTFB_3 - connectTime_3).\n * 3 hits per site; score on hit 3 (warm/cached). 
No Phase 0 calibration needed.\n *\n * Run: node audit-v3.js [--start N] [--end N] [--concurrency N]\n */\n\n'use strict';\n\nconst { execSync } = require('child_process');\nconst crypto = require('crypto');\nconst fs = require('fs');\nconst path = require('path');\n\n// ─── CONFIG ───────────────────────────────────────────────────────────────────\nconst OUTPUT_DIR = path.join(__dirname, 'audit-receipts-v3');\nconst CURL_TIMEOUT = 15;\nconst DELAY_MS = 500;\nconst RUN_TIMESTAMP = new Date().toISOString();\nconst RUN_DATE = RUN_TIMESTAMP.split('T')[0];\nconst SCRIPT_VERSION = '3.0.0';\nconst SHELL = process.env.SHELL || 'bash';\nconst CONCURRENCY = parseInt(process.argv.find((a,i,arr) => arr[i-1] === '--concurrency') || '5');\nconst START_RANK = parseInt(process.argv.find((a,i,arr) => arr[i-1] === '--start') || '1');\nconst END_RANK = parseInt(process.argv.find((a,i,arr) => arr[i-1] === '--end') || '100');\n\n// TTFB threshold: server-side milliseconds after subtracting network overhead\nconst TTFB_THRESHOLD_MS = 200;\n\nconst AI_BOTS = [\n  'GPTBot', 'ClaudeBot', 'anthropic-ai', 'PerplexityBot', 'Google-Extended',\n  'Applebot-Extended', 'Amazonbot', 'Bytespider', 'Meta-ExternalAgent',\n  'cohere-ai', 'YouBot', 'AI2Bot', 'Diffbot', 'CCBot', 'DataForSeoBot',\n  'OAI-SearchBot', 'Gemini', 'claude-web', 'PetalBot', 'facebookexternalhit'\n];\n\nconst SITES = [\n  { rank: 1,   domain: 'top10lists.us',      industry: 'Real Estate' },\n  { rank: 2,   domain: 'edx.org',            industry: 'Education' },\n  { rank: 3,   domain: 'cloudflare.com',     industry: 'Technology' },\n  { rank: 4,   domain: 'elevenlabs.io',      industry: 'AI/Tech' },\n  { rank: 5,   domain: 'appfolio.com',       industry: 'Proptech' },\n  { rank: 6,   domain: 'nasa.gov',           industry: 'Government' },\n  { rank: 7,   domain: 'apple.com',          industry: 'Technology' },\n  { rank: 8,   domain: 'coursera.org',       industry: 'Education' },\n  { rank: 9,   domain: 'bbc.com',            
industry: 'News' },\n  { rank: 10,  domain: 'espn.com',           industry: 'News/Sports' },\n  { rank: 11,  domain: 'perplexity.ai',      industry: 'AI/Tech' },\n  { rank: 12,  domain: 'github.com',         industry: 'Technology' },\n  { rank: 13,  domain: 'homelight.com',      industry: 'Real Estate' },\n  { rank: 14,  domain: 'supabase.com',       industry: 'Technology' },\n  { rank: 15,  domain: 'ratemyagent.com',    industry: 'Real Estate' },\n  { rank: 16,  domain: 'bbb.org',            industry: 'Business Ratings' },\n  { rank: 17,  domain: 'wikipedia.org',      industry: 'Reference' },\n  { rank: 18,  domain: 'wikidata.org',       industry: 'Reference' },\n  { rank: 19,  domain: 'fastexpert.com',     industry: 'Real Estate' },\n  { rank: 20,  domain: 'harvard.edu',        industry: 'Education' },\n  { rank: 21,  domain: 'realpage.com',       industry: 'Proptech' },\n  { rank: 22,  domain: 'apartmentlist.com',  industry: 'Proptech' },\n  { rank: 23,  domain: 'yardi.com',          industry: 'Proptech' },\n  { rank: 24,  domain: 'buildium.com',       industry: 'Proptech' },\n  { rank: 25,  domain: 'who.int',            industry: 'Nonprofit/Health' },\n  { rank: 26,  domain: 'w3.org',             industry: 'Nonprofit/Tech' },\n  { rank: 27,  domain: 'mozilla.org',        industry: 'Nonprofit/Tech' },\n  { rank: 28,  domain: 'medium.com',         industry: 'Publishing' },\n  { rank: 29,  domain: 'acm.org',            industry: 'Academic' },\n  { rank: 30,  domain: 'fda.gov',            industry: 'Government' },\n  { rank: 31,  domain: 'khanacademy.org',    industry: 'Education' },\n  { rank: 32,  domain: 'data.gov',           industry: 'Government' },\n  { rank: 33,  domain: 'stackoverflow.com',  industry: 'Technology' },\n  { rank: 34,  domain: 'reddit.com',         industry: 'Social/Tech' },\n  { rank: 35,  domain: 'bankofamerica.com',  industry: 'Finance' },\n  { rank: 36,  domain: 'wellsfargo.com',     industry: 'Finance' },\n  { rank: 37,  domain: 
'progressive.com',    industry: 'Finance' },\n  { rank: 38,  domain: 'statefarm.com',      industry: 'Finance' },\n  { rank: 39,  domain: 'amazon.com',         industry: 'Retail/Tech' },\n  { rank: 40,  domain: 'google.com',         industry: 'Technology' },\n  { rank: 41,  domain: 'airbnb.com',         industry: 'Travel/Tech' },\n  { rank: 42,  domain: 'imdb.com',           industry: 'Entertainment' },\n  { rank: 43,  domain: 'webmd.com',          industry: 'Healthcare' },\n  { rank: 44,  domain: 'healthgrades.com',   industry: 'Healthcare' },\n  { rank: 45,  domain: 'zillow.com',         industry: 'Real Estate' },\n  { rank: 46,  domain: 'realtor.com',        industry: 'Real Estate' },\n  { rank: 47,  domain: 'redfin.com',         industry: 'Real Estate' },\n  { rank: 48,  domain: 'yelp.com',           industry: 'Reviews' },\n  { rank: 49,  domain: 'tripadvisor.com',    industry: 'Travel' },\n  { rank: 50,  domain: 'crunchbase.com',     industry: 'Business Data' },\n  { rank: 51,  domain: 'bankrate.com',       industry: 'Finance' },\n  { rank: 52,  domain: 'lendingtree.com',    industry: 'Finance' },\n  { rank: 53,  domain: 'martindale.com',     industry: 'Legal' },\n  { rank: 54,  domain: 'avvo.com',           industry: 'Legal' },\n  { rank: 55,  domain: 'nerdwallet.com',     industry: 'Finance' },\n  { rank: 56,  domain: 'pitchbook.com',      industry: 'Business Data' },\n  { rank: 57,  domain: 'huggingface.co',     industry: 'AI/Tech' },\n  { rank: 58,  domain: 'sec.gov',            industry: 'Government' },\n  { rank: 59,  domain: 'arxiv.org',          industry: 'Academic' },\n  { rank: 60,  domain: 'jstor.org',          industry: 'Academic' },\n  { rank: 61,  domain: 'substack.com',       industry: 'Publishing' },\n  { rank: 62,  domain: 'nih.gov',            industry: 'Government' },\n  { rank: 63,  domain: 'cdc.gov',            industry: 'Government' },\n  { rank: 64,  domain: 'census.gov',         industry: 'Government' },\n  { rank: 65,  domain: 
'hud.gov',            industry: 'Government' },\n  { rank: 66,  domain: 'mit.edu',            industry: 'Education' },\n  { rank: 67,  domain: 'stanford.edu',       industry: 'Education' },\n  { rank: 68,  domain: 'wikimedia.org',      industry: 'Reference' },\n  { rank: 69,  domain: 'npr.org',            industry: 'News' },\n  { rank: 70,  domain: 'archive.org',        industry: 'Reference' },\n  { rank: 71,  domain: 'turbotenant.com',    industry: 'Proptech' },\n  { rank: 72,  domain: 'microsoft.com',      industry: 'Technology' },\n  { rank: 73,  domain: 'linkedin.com',       industry: 'Technology' },\n  { rank: 74,  domain: 'nytimes.com',        industry: 'News' },\n  { rank: 75,  domain: 'theguardian.com',    industry: 'News' },\n  { rank: 76,  domain: 'cnn.com',            industry: 'News' },\n  { rank: 77,  domain: 'forbes.com',         industry: 'News/Business' },\n  { rank: 78,  domain: 'wsj.com',            industry: 'News' },\n  { rank: 79,  domain: 'chase.com',          industry: 'Finance' },\n  { rank: 80,  domain: 'walmart.com',        industry: 'Retail' },\n  { rank: 81,  domain: 'salesforce.com',     industry: 'Technology' },\n  { rank: 82,  domain: 'facebook.com',       industry: 'Social/Tech' },\n  { rank: 83,  domain: 'youtube.com',        industry: 'Social/Tech' },\n  { rank: 84,  domain: 'openai.com',         industry: 'AI/Tech' },\n  { rank: 85,  domain: 'anthropic.com',      industry: 'AI/Tech' },\n  { rank: 86,  domain: 'glassdoor.com',      industry: 'Employment' },\n  { rank: 87,  domain: 'uber.com',           industry: 'Technology' },\n  { rank: 88,  domain: 'indeed.com',         industry: 'Employment' },\n  { rank: 89,  domain: 'netflix.com',        industry: 'Entertainment' },\n  { rank: 90,  domain: 'bloomberg.com',      industry: 'Finance/News' },\n  { rank: 91,  domain: 'fidelity.com',       industry: 'Finance' },\n  { rank: 92,  domain: 'x.com',              industry: 'Social/Tech' },\n  { rank: 93,  domain: 'costar.com',         
industry: 'Proptech' },\n  { rank: 94,  domain: 'apartments.com',     industry: 'Proptech' },\n  { rank: 95,  domain: 'rentcafe.com',       industry: 'Proptech' },\n  { rank: 96,  domain: 'rent.com',           industry: 'Proptech' },\n  { rank: 97,  domain: 'ieee.org',           industry: 'Academic' },\n  { rank: 98,  domain: 'notion.so',          industry: 'Productivity/SaaS' },\n  { rank: 99,  domain: 'shopify.com',        industry: 'eCommerce' },\n  { rank: 100, domain: 'stripe.com',         industry: 'FinTech' },\n];\n\n// ─── HELPERS ──────────────────────────────────────────────────────────────────\n\nfunction runCurl(cmd) {\n  try {\n    const output = execSync(cmd, {\n      shell: SHELL,\n      timeout: (CURL_TIMEOUT + 5) * 1000,\n      encoding: 'utf8',\n      maxBuffer: 10 * 1024 * 1024,\n    });\n    return { cmd, output, error: null };\n  } catch (e) {\n    const output = (e.stdout || '') + (e.stderr || '');\n    const msg = e.message || '';\n    const error = (msg.includes('ETIMEDOUT') || msg.includes('timeout') || msg.includes('28'))\n      ? 'TIMEOUT' : (msg.slice(0, 120) || 'ERROR');\n    return { cmd, output, error };\n  }\n}\n\nfunction extractHeaderHttpCode(raw) {\n  const matches = raw.match(/HTTP\\/[\\d.]+ (\\d{3})/g);\n  if (!matches || matches.length === 0) return 0;\n  const last = matches[matches.length - 1];\n  return parseInt(last.match(/(\\d{3})/)[1]);\n}\n\nfunction extractContentType(raw) {\n  const m = raw.match(/content-type:\\s*([^\\r\\n]+)/i);\n  return m ? m[1].trim().toLowerCase() : '';\n}\n\nfunction sha256(str) {\n  return crypto.createHash('sha256').update(str, 'utf8').digest('hex');\n}\n\nfunction sleep(ms) {\n  return new Promise(r => setTimeout(r, ms));\n}\n\nfunction median(arr) {\n  if (arr.length === 0) return 0;\n  const sorted = [...arr].sort((a, b) => a - b);\n  const mid = Math.floor(sorted.length / 2);\n  return sorted.length % 2 !== 0 ? 
sorted[mid] : (sorted[mid - 1] + sorted[mid]) / 2;\n}\n\nfunction getCurlVersion() {\n  try {\n    return execSync('curl --version', { shell: SHELL, encoding: 'utf8' }).split('\\n')[0].trim();\n  } catch (e) {\n    return 'unknown';\n  }\n}\n\n// ─── PHASE 0: NETWORK BASELINE CALIBRATION (DISABLED) ───────────────────────\n// Phase 0 Vercel-based calibration removed 2026-04-15. Vercel is fully deprecated\n// from this stack. TTFB now uses per-request connect normalization only:\n//   compensatedMs = rawTTFB_3 - connectTime_3  (3rd hit, warm/cached)\n// No global network overhead baseline is needed or used.\n\n// ─── SIGNAL 1: MCP SERVER ─────────────────────────────────────────────────────\nfunction testMCP(domain) {\n  const cmd = `curl -sIL --max-time ${CURL_TIMEOUT} \"https://${domain}/.well-known/mcp.json\"`;\n  const r = runCurl(cmd);\n  const httpCode = extractHeaderHttpCode(r.output);\n  const contentType = extractContentType(r.output);\n  const pass = httpCode === 200 && contentType.includes('application/json');\n  return {\n    pass,\n    command: cmd,\n    rawOutput: r.output.slice(0, 2000),\n    httpCode,\n    contentType,\n    error: r.error,\n    evidence: pass\n      ? `HTTP 200, content-type: ${contentType}`\n      : `HTTP ${httpCode}, content-type: ${contentType || 'none'}${r.error ? ' [' + r.error + ']' : ''}`,\n  };\n}\n\n// ─── SIGNAL 2: LLMS.TXT ───────────────────────────────────────────────────────\nfunction testLlmsTxt(domain) {\n  const cmd = `curl -sL --max-time ${CURL_TIMEOUT} -w \"\\\\n__CODE__:%{http_code}\" \"https://${domain}/llms.txt\"`;\n  const r = runCurl(cmd);\n  const codeMatch = r.output.match(/__CODE__:(\\d{3})/);\n  const httpCode = codeMatch ? 
parseInt(codeMatch[1]) : 0;\n  const body = r.output.replace(/__CODE__:\\d{3}/, '').trim();\n  const startsWithHash = body.startsWith('#');\n  const startsWithHtml = body.startsWith('<');\n  const pass = httpCode === 200 && startsWithHash && !startsWithHtml;\n  return {\n    pass,\n    command: cmd,\n    httpCode,\n    bodySnippet: body.slice(0, 500),\n    bodyFirst3: body.slice(0, 3),\n    error: r.error,\n    evidence: pass\n      ? `HTTP 200, body starts with '#' (markdown confirmed)`\n      : `HTTP ${httpCode}, body starts with '${body.slice(0, 10)}'${r.error ? ' [' + r.error + ']' : ''}`,\n  };\n}\n\n// ─── SIGNAL 3: CLEAN-ROOM HTML ────────────────────────────────────────────────\nfunction testCleanRoom(domain) {\n  const defaultUA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';\n  const gptUA    = 'GPTBot/1.0 (+https://openai.com/gptbot)';\n\n  const cmd1 = `curl -sL --max-time ${CURL_TIMEOUT} -w \"\\\\n__SIZE__:%{size_download}__CODE__:%{http_code}\" -A \"${defaultUA}\" \"https://${domain}/\"`;\n  const cmd2 = `curl -sL --max-time ${CURL_TIMEOUT} -w \"\\\\n__SIZE__:%{size_download}__CODE__:%{http_code}\" -A \"${gptUA}\" \"https://${domain}/\"`;\n\n  const r1 = runCurl(cmd1);\n  const r2 = runCurl(cmd2);\n\n  const parse = (r) => {\n    const sm = r.output.match(/__SIZE__:(\\d+)/);\n    const cm = r.output.match(/__CODE__:(\\d+)/);\n    return {\n      size: sm ? parseInt(sm[1]) : 0,\n      code: cm ? 
parseInt(cm[1]) : 0,\n      bodySnippet: r.output.replace(/__SIZE__:\\d+__CODE__:\\d+/, '').slice(0, 300),\n      error: r.error,\n    };\n  };\n\n  const def = parse(r1);\n  const bot = parse(r2);\n\n  if (r1.error === 'TIMEOUT' || r2.error === 'TIMEOUT') {\n    return { pass: false, note: 'TIMEOUT', commands: [cmd1, cmd2], defaultSize: def.size, botSize: bot.size, defaultCode: def.code, botCode: bot.code, evidence: 'TIMEOUT on one or both requests' };\n  }\n\n  const isBlocked = (d) => d.code === 403 || d.code === 429 || d.code === 0;\n  if (isBlocked(def) && isBlocked(bot)) {\n    return { pass: false, note: 'BLOCKED', commands: [cmd1, cmd2], defaultSize: def.size, botSize: bot.size, defaultCode: def.code, botCode: bot.code, evidence: `WAF block: default=${def.code}, bot=${bot.code}` };\n  }\n\n  const ratio = def.size > 0 ? bot.size / def.size : 0;\n  const bothHaveContent = def.size > 5000 && bot.size > 5000;\n  const similarSize = ratio >= 0.5 && ratio <= 2.0;\n  const pass = bothHaveContent && similarSize;\n\n  return {\n    pass,\n    commands: [cmd1, cmd2],\n    defaultSize: def.size,\n    botSize: bot.size,\n    defaultCode: def.code,\n    botCode: bot.code,\n    ratio: parseFloat(ratio.toFixed(3)),\n    note: !bothHaveContent\n      ? `Insufficient content: default=${def.size}B, bot=${bot.size}B`\n      : !similarSize ? `Size ratio out of bounds: ${ratio.toFixed(2)}` : null,\n    defaultBodySnippet: def.bodySnippet,\n    botBodySnippet: bot.bodySnippet,\n    evidence: pass\n      ? 
`Both UAs have content: ${def.size}B (default) / ${bot.size}B (GPTBot), ratio ${ratio.toFixed(2)}`\n      : `FAIL: default=${def.size}B(${def.code}), bot=${bot.size}B(${bot.code}), ratio=${ratio.toFixed(2)}`,\n  };\n}\n\n// ─── SIGNAL 4: AI CONTENT FEED ────────────────────────────────────────────────\nfunction testAIFeed(domain) {\n  const paths = [\n    '/.well-known/ai-content-index.json',\n    '/ai-content-index.json',\n    '/for-ai',\n    '/for-ai.txt',\n  ];\n  const results = [];\n  for (const p of paths) {\n    const cmd = `curl -sIL --max-time ${CURL_TIMEOUT} \"https://${domain}${p}\"`;\n    const r = runCurl(cmd);\n    const httpCode = extractHeaderHttpCode(r.output);\n    const contentType = extractContentType(r.output);\n    const pass = httpCode === 200 && (contentType.includes('application/json') || contentType.includes('text/plain') || contentType.includes('text/html'));\n    results.push({ path: p, command: cmd, httpCode, contentType, rawHeaders: r.output.slice(0, 1000), error: r.error, pass });\n    if (pass) break;\n  }\n  const passing = results.find(r => r.pass);\n  return {\n    pass: !!passing,\n    pathsTested: results,\n    passingPath: passing ? passing.path : null,\n    evidence: passing\n      ? `HTTP 200 at ${passing.path}, content-type: ${passing.contentType}`\n      : `No AI feed found. 
Results: ${results.map(r => `${r.path}=${r.httpCode}`).join(', ')}`,\n  };\n}\n\n// ─── SIGNAL 5: JSON-LD ────────────────────────────────────────────────────────\nfunction testJSONLD(domain) {\n  const cmd = `curl -sL --max-time ${CURL_TIMEOUT} \"https://${domain}/\"`;\n  const r = runCurl(cmd);\n  const body = r.output;\n  const count = (body.match(/application\\/ld\\+json/g) || []).length;\n  const pass = count >= 1;\n\n  const snippets = [];\n  let pos = 0;\n  while (snippets.length < 2) {\n    const idx = body.indexOf('application/ld+json', pos);\n    if (idx === -1) break;\n    snippets.push(body.slice(Math.max(0, idx - 30), idx + 300));\n    pos = idx + 1;\n  }\n\n  return {\n    pass,\n    command: cmd,\n    count,\n    bodyLengthBytes: body.length,\n    jsonLdSnippets: snippets,\n    error: r.error,\n    evidence: pass\n      ? `Found ${count} application/ld+json block(s) on homepage (${body.length}B response)`\n      : `No application/ld+json on homepage (${body.length}B response)${r.error ? ' [' + r.error + ']' : ''}`,\n  };\n}\n\n// ─── SIGNAL 6: TTFB (LATENCY-COMPENSATED, 3-HIT) ─────────────────────────────\n// Methodology: 3 hits per site. Hits 1 and 2 warm the cache. Score on hit 3.\n// compensatedMs = rawTTFB_3 - connectTime_3 (isolates server-side processing).\n// Fallback: if hit 3 errors, use hit 2. Pass threshold: <200ms compensated.\nfunction testTTFB(domain) {\n  const cmd = `curl -sL --max-time ${CURL_TIMEOUT} -o /dev/null -w \"ttfb=%{time_starttransfer} connect=%{time_connect}\" \"https://${domain}/\"`;\n\n  const measure = () => {\n    const r = runCurl(cmd);\n    const ttfbMatch = (r.output || '').match(/ttfb=([\\d.]+)/);\n    const connectMatch = (r.output || '').match(/connect=([\\d.]+)/);\n    const ttfb = ttfbMatch ? parseFloat(ttfbMatch[1]) : NaN;\n    const connect = connectMatch ? parseFloat(connectMatch[1]) : NaN;\n    const rawMs = isNaN(ttfb) ? null : Math.round(ttfb * 1000);\n    const connectMs = isNaN(connect) ? 
null : Math.round(connect * 1000);\n    const compensatedMs = rawMs !== null && connectMs !== null ? rawMs - connectMs : null;\n    return { rawMs, connectMs, compensatedMs, raw: r.output, error: r.error };\n  };\n\n  const hit1 = measure();\n  const hit2 = measure();\n  const hit3 = measure();\n\n  // Score on hit 3; fall back to hit 2 if hit 3 errored\n  const scored = (!hit3.error && hit3.compensatedMs !== null) ? hit3 : hit2;\n  const finalCompensatedMs = scored.compensatedMs;\n  const pass = finalCompensatedMs !== null && finalCompensatedMs > 0 && finalCompensatedMs < TTFB_THRESHOLD_MS;\n\n  return {\n    pass,\n    command: cmd,\n    ttfbThreshold_ms: TTFB_THRESHOLD_MS,\n    methodology: 'Score on hit 3 (warm/cached). compensatedMs = rawTTFB - connectTime (per-request). Fallback to hit 2 if hit 3 errors.',\n    hit1: { raw_ms: hit1.rawMs, connect_ms: hit1.connectMs, compensated_ms: hit1.compensatedMs, error: hit1.error || null },\n    hit2: { raw_ms: hit2.rawMs, connect_ms: hit2.connectMs, compensated_ms: hit2.compensatedMs, error: hit2.error || null },\n    hit3: { raw_ms: hit3.rawMs, connect_ms: hit3.connectMs, compensated_ms: hit3.compensatedMs, error: hit3.error || null },\n    scoredHit: (!hit3.error && hit3.compensatedMs !== null) ? 3 : 2,\n    ttfb_ms: finalCompensatedMs,\n    error: hit3.error || hit2.error || null,\n    evidence: pass\n      ? `TTFB hit3: ${finalCompensatedMs}ms compensated (raw ${scored.rawMs}ms - connect ${scored.connectMs}ms) -- PASS (<${TTFB_THRESHOLD_MS}ms)`\n      : `TTFB hit3: ${finalCompensatedMs !== null ? finalCompensatedMs + 'ms compensated' : 'N/A'} -- FAIL${scored.error ? 
' [' + scored.error + ']' : ''}`,\n  };\n}\n\n// ─── SIGNAL 7: 10+ AI BOTS ALLOWED ───────────────────────────────────────────\nfunction testAIBots(domain) {\n  const cmd = `curl -sL --max-time ${CURL_TIMEOUT} \"https://${domain}/robots.txt\"`;\n  const r = runCurl(cmd);\n\n  if (r.error === 'TIMEOUT') {\n    return { pass: false, note: 'TIMEOUT', command: cmd, botsAllowed: [], count: 0, evidence: 'TIMEOUT fetching robots.txt' };\n  }\n\n  const body = r.output || '';\n  if (body.length < 5) {\n    return { pass: false, note: 'NOT FOUND', command: cmd, botsAllowed: [], count: 0, rawRobots: '', evidence: 'robots.txt not found or empty' };\n  }\n\n  const lines = body.split('\\n').map(l => l.trim());\n  const allowed = [];\n\n  for (const bot of AI_BOTS) {\n    const botLower = bot.toLowerCase();\n    let inBotBlock = false;\n    let botHasAllow = false;\n    let botHasDisallowAll = false;\n\n    for (let i = 0; i < lines.length; i++) {\n      const line = lines[i];\n      const lineLower = line.toLowerCase();\n\n      if (lineLower === `user-agent: ${botLower}`) {\n        inBotBlock = true;\n        botHasAllow = false;\n        botHasDisallowAll = false;\n        continue;\n      }\n      if (inBotBlock) {\n        if (lineLower.startsWith('user-agent:') || line === '') {\n          inBotBlock = false;\n          continue;\n        }\n        if (lineLower.startsWith('allow:')) botHasAllow = true;\n        if (lineLower === 'disallow: /') botHasDisallowAll = true;\n      }\n\n      if (lineLower.startsWith('allow:') && lineLower.includes(botLower)) {\n        botHasAllow = true;\n      }\n    }\n\n    if (botHasAllow && !botHasDisallowAll) {\n      allowed.push(bot);\n    }\n  }\n\n  const pass = allowed.length >= 10;\n  return {\n    pass,\n    command: cmd,\n    botsAllowed: allowed,\n    count: allowed.length,\n    rawRobots: body.slice(0, 3000),\n    evidence: pass\n      ? 
`${allowed.length} AI bots explicitly allowed: ${allowed.join(', ')}`\n      : `Only ${allowed.length}/10 bots allowed${allowed.length > 0 ? ': ' + allowed.join(', ') : ' (none)'}`,\n  };\n}\n\n// ─── SIGNAL 8: HTTP/3 ─────────────────────────────────────────────────────────\nfunction testHTTP3(domain) {\n  const cmd = `curl -sIL --max-time ${CURL_TIMEOUT} \"https://${domain}/\"`;\n  const r = runCurl(cmd);\n  const altSvcMatch = r.output.match(/alt-svc:\\s*([^\\r\\n]+)/i);\n  const altSvc = altSvcMatch ? altSvcMatch[1].trim() : '';\n  const pass = altSvc.toLowerCase().includes('h3');\n  return {\n    pass,\n    command: cmd,\n    altSvc,\n    rawHeaders: r.output.slice(0, 2000),\n    error: r.error,\n    evidence: pass\n      ? `alt-svc contains h3: ${altSvc}`\n      : `No h3 in alt-svc${altSvc ? ': ' + altSvc : ' (header absent)'}${r.error ? ' [' + r.error + ']' : ''}`,\n  };\n}\n\n// ─── AUDIT ONE SITE ───────────────────────────────────────────────────────────\nfunction auditSite(site) {\n  const { rank, domain, industry } = site;\n  const auditedAt = new Date().toISOString();\n\n  const mcp = testMCP(domain);\n  const llms_txt = testLlmsTxt(domain);\n  const clean_room = testCleanRoom(domain);\n  const ai_feed = testAIFeed(domain);\n  const json_ld = testJSONLD(domain);\n  const ttfb = testTTFB(domain);\n  const ai_bots_allowed = testAIBots(domain);\n  const http3 = testHTTP3(domain);\n\n  const signals = { mcp, llms_txt, clean_room, ai_feed, json_ld, ttfb, ai_bots_allowed, http3 };\n  const score = Object.values(signals).filter(s => s.pass).length;\n\n  const row = [\n    mcp.pass             ? 'Y' : mcp.note === 'BLOCKED'        ? 'B' : 'N',\n    llms_txt.pass        ? 'Y' : 'N',\n    clean_room.pass      ? 'Y' : clean_room.note === 'BLOCKED' ? 'B' : clean_room.note === 'TIMEOUT' ? 'T' : 'N',\n    ai_feed.pass         ? 'Y' : 'N',\n    json_ld.pass         ? 'Y' : 'N',\n    ttfb.pass            ? 'Y' : ttfb.error === 'TIMEOUT'      ? 
'T' : 'N',\n    ai_bots_allowed.pass ? 'Y' : 'N',\n    http3.pass           ? 'Y' : 'N',\n  ].join(' ');\n\n  console.log(`  [${String(rank).padStart(3, ' ')}/100] ${score}/8 [${row}]  ${domain}`);\n\n  const receipt = { rank, domain, industry, auditedAt, score, maxScore: 8, scriptVersion: SCRIPT_VERSION, signals };\n  const receiptForHashing = JSON.stringify(receipt);\n  receipt.receiptSha256 = sha256(receiptForHashing);\n\n  return receipt;\n}\n\n// ─── MAIN ─────────────────────────────────────────────────────────────────────\nasync function run() {\n  if (!fs.existsSync(OUTPUT_DIR)) fs.mkdirSync(OUTPUT_DIR, { recursive: true });\n\n  const curlVersion = getCurlVersion();\n\n  console.log('\\n' + '='.repeat(70));\n  console.log('Global AI Citation Infrastructure Audit v3.0');\n  console.log('='.repeat(70));\n  console.log(`Timestamp:       ${RUN_TIMESTAMP}`);\n  console.log(`Script:          v${SCRIPT_VERSION}`);\n  console.log(`Shell:           ${SHELL}`);\n  console.log(`curl:            ${curlVersion}`);\n  console.log(`Sites:           ${SITES.length} (home pages only)`);\n  console.log(`Range:           ${START_RANK}-${END_RANK}`);\n  console.log(`TTFB threshold:  <${TTFB_THRESHOLD_MS}ms (server-side, after latency compensation)`);\n  console.log(`Signals:         MCP | llms.txt | CleanRoom | AIFeed | JSON-LD | TTFB | Bots | HTTP3`);\n  console.log(`Legend:          Y=Pass N=Fail B=Blocked T=Timeout`);\n  console.log('='.repeat(70) + '\\n');\n\n  console.log('='.repeat(70));\n  console.log('Phase 1: Auditing sites');\n  console.log('='.repeat(70) + '\\n');\n\n  const sitesToAudit = SITES.filter(s => s.rank >= START_RANK && s.rank <= END_RANK);\n  const results = [];\n  const manifest = [];\n\n  for (const site of sitesToAudit) {\n    try {\n      const receipt = auditSite(site);\n      results.push(receipt);\n\n      const filename = `${String(site.rank).padStart(3, '0')}_${site.domain.replace(/\\./g, '_')}.json`;\n      
fs.writeFileSync(path.join(OUTPUT_DIR, filename), JSON.stringify(receipt, null, 2));\n\n      manifest.push({\n        rank: site.rank,\n        domain: site.domain,\n        score: receipt.score,\n        auditedAt: receipt.auditedAt,\n        file: filename,\n        sha256: receipt.receiptSha256,\n      });\n    } catch (err) {\n      console.error(`  ERROR on ${site.domain}: ${err.message}`);\n      results.push({ rank: site.rank, domain: site.domain, industry: site.industry, auditedAt: new Date().toISOString(), score: 0, maxScore: 8, error: err.message, signals: {} });\n    }\n\n    await sleep(DELAY_MS);\n  }\n\n  // ─── RESULTS ────────────────────────────────────────────────────────────────\n  const sorted = [...results].sort((a, b) => b.score - a.score || a.rank - b.rank);\n\n  const dist = {};\n  for (let i = 0; i <= 8; i++) dist[i] = results.filter(r => r.score === i).length;\n\n  console.log('\\n' + '='.repeat(70));\n  console.log('RESULTS');\n  console.log('='.repeat(70));\n  console.log('\\nScore Distribution:');\n  for (let i = 8; i >= 0; i--) {\n    if (dist[i] > 0) console.log(`  ${i}/8 : ${dist[i]} site(s)`);\n  }\n\n  console.log('\\nTop 10:');\n  sorted.slice(0, 10).forEach((r, i) => {\n    const s = r.signals || {};\n    const row = [\n      s.mcp?.pass ? 'Y' : 'N',\n      s.llms_txt?.pass ? 'Y' : 'N',\n      s.clean_room?.pass ? 'Y' : 'N',\n      s.ai_feed?.pass ? 'Y' : 'N',\n      s.json_ld?.pass ? 'Y' : 'N',\n      s.ttfb?.pass ? 'Y' : 'N',\n      s.ai_bots_allowed?.pass ? 'Y' : 'N',\n      s.http3?.pass ? 'Y' : 'N',\n    ].join(' ');\n    console.log(`  ${i + 1}. ${r.domain.padEnd(30)} ${r.score}/8  [${row}]`);\n  });\n\n  const t10 = results.find(r => r.domain === 'top10lists.us');\n  if (t10 && t10.signals) {\n    const s = t10.signals;\n    console.log(`\\ntop10lists.us final: ${t10.score}/8`);\n    console.log(`  MCP:         ${s.mcp?.pass ? 
'PASS' : 'FAIL'}  ${s.mcp?.evidence || ''}`);\n    console.log(`  llms.txt:    ${s.llms_txt?.pass ? 'PASS' : 'FAIL'}  ${s.llms_txt?.evidence || ''}`);\n    console.log(`  Clean-Room:  ${s.clean_room?.pass ? 'PASS' : 'FAIL'}  ${s.clean_room?.evidence || ''}`);\n    console.log(`  AI Feed:     ${s.ai_feed?.pass ? 'PASS' : 'FAIL'}  ${s.ai_feed?.evidence || ''}`);\n    console.log(`  JSON-LD:     ${s.json_ld?.pass ? 'PASS' : 'FAIL'}  ${s.json_ld?.evidence || ''}`);\n    console.log(`  TTFB:        ${s.ttfb?.pass ? 'PASS' : 'FAIL'}  ${s.ttfb?.evidence || ''}`);\n    console.log(`  AI Bots:     ${s.ai_bots_allowed?.pass ? 'PASS' : 'FAIL'}  ${s.ai_bots_allowed?.evidence || ''}`);\n    console.log(`  HTTP/3:      ${s.http3?.pass ? 'PASS' : 'FAIL'}  ${s.http3?.evidence || ''}`);\n  }\n\n  // ─── SAVE FILES ─────────────────────────────────────────────────────────────\n  const manifestSha256 = sha256(JSON.stringify(manifest));\n\n  const summary = {\n    auditMeta: {\n      title: 'Global AI Citation Infrastructure Audit v3.0',\n      subtitle: 'With Residential Network Latency Compensation',\n      runTimestamp: RUN_TIMESTAMP,\n      runDate: RUN_DATE,\n      scriptVersion: SCRIPT_VERSION,\n      curlVersion,\n      shellUsed: SHELL,\n      platform: process.platform,\n      nodeVersion: process.version,\n      ttfbThreshold_ms: TTFB_THRESHOLD_MS,\n      methodology: {\n        description: 'Live curl commands with -L (follow redirects). Verbatim HTTP responses. No assumptions. Each signal requires specific HTTP criteria.',\n        ttfbCompensation: `TTFB uses 3-hit per-request normalization. Hits 1 and 2 warm the cache; score is on hit 3 only. compensatedMs = rawTTFB_3 - connectTime_3 (isolates server-side processing). Fallback to hit 2 if hit 3 errors. Threshold: <${TTFB_THRESHOLD_MS}ms. 
No global network baseline calibration is used.`,\n      },\n      signalCount: 8,\n      siteCount: sitesToAudit.length,\n    },\n    scoreDistribution: dist,\n    manifestSha256,\n    topSites: sorted.slice(0, 20).map((r, i) => ({\n      auditRank: i + 1,\n      domain: r.domain,\n      industry: r.industry,\n      score: r.score,\n      signals: {\n        mcp:             r.signals?.mcp?.pass ?? false,\n        llms_txt:        r.signals?.llms_txt?.pass ?? false,\n        clean_room:      r.signals?.clean_room?.pass ?? false,\n        ai_feed:         r.signals?.ai_feed?.pass ?? false,\n        json_ld:         r.signals?.json_ld?.pass ?? false,\n        ttfb:            r.signals?.ttfb?.pass ?? false,\n        ai_bots_allowed: r.signals?.ai_bots_allowed?.pass ?? false,\n        http3:           r.signals?.http3?.pass ?? false,\n      },\n    })),\n    allResults: sorted,\n    receiptManifest: manifest,\n  };\n\n  fs.writeFileSync(\n    path.join(OUTPUT_DIR, `AUDIT-SUMMARY-${RUN_DATE}.json`),\n    JSON.stringify(summary, null, 2)\n  );\n\n  fs.writeFileSync(\n    path.join(OUTPUT_DIR, `MANIFEST-${RUN_DATE}.json`),\n    JSON.stringify({ runTimestamp: RUN_TIMESTAMP, scriptVersion: SCRIPT_VERSION, manifestSha256, receipts: manifest }, null, 2)\n  );\n\n  const csvHeader = 'rank,domain,industry,score,mcp,llms_txt,clean_room,ai_feed,json_ld,ttfb_raw_ms,ttfb_compensated_ms,ttfb_pass,ai_bots_count,ai_bots_pass,http3,audited_at,receipt_sha256';\n  const csvRows = results.map(r => {\n    const s = r.signals || {};\n    return [\n      r.rank,\n      r.domain,\n      `\"${r.industry}\"`,\n      r.score,\n      s.mcp?.pass ? 1 : 0,\n      s.llms_txt?.pass ? 1 : 0,\n      s.clean_room?.pass ? 1 : 0,\n      s.ai_feed?.pass ? 1 : 0,\n      s.json_ld?.pass ? 1 : 0,\n      s.ttfb?.hit3?.raw_ms ?? '',\n      s.ttfb?.ttfb_ms ?? '',\n      s.ttfb?.pass ? 1 : 0,\n      s.ai_bots_allowed?.count ?? 0,\n      s.ai_bots_allowed?.pass ? 1 : 0,\n      s.http3?.pass ? 
1 : 0,\n      r.auditedAt,\n      r.receiptSha256 || '',\n    ].join(',');\n  });\n\n  fs.writeFileSync(\n    path.join(OUTPUT_DIR, `AUDIT-RESULTS-${RUN_DATE}.csv`),\n    [csvHeader, ...csvRows].join('\\n')\n  );\n\n  console.log('\\n' + '='.repeat(70));\n  console.log(`Output: ${OUTPUT_DIR}/`);\n  console.log(`  AUDIT-SUMMARY-${RUN_DATE}.json    (full results + raw evidence)`);\n  console.log(`  AUDIT-RESULTS-${RUN_DATE}.csv     (spreadsheet-ready)`);\n  console.log(`  MANIFEST-${RUN_DATE}.json          (SHA-256 hashes of all receipts)`);\n  console.log(`  ${results.length} individual site receipts`);\n  console.log(`  Manifest SHA-256: ${manifestSha256}`);\n  console.log('='.repeat(70) + '\\n');\n}\n\nrun().catch(err => {\n  console.error('Fatal error:', err.message);\n  process.exit(1);\n});\n"
    },
    "phase2_geo_scoring": {
      "model": "claude-sonnet-4-6",
      "batch_size": 20,
      "context_per_agent": [
        "scoring-prompt.md",
        "one site receipt JSON — no other context"
      ],
      "output_per_site": "JSON with 9 dimension scores, composite, band, sev0_flags",
      "prompt_content": "You are a GEO scoring agent. Score one site against the 9-dimension rubric below using only the crawl evidence provided. Return JSON only — no explanation.\n\n## Rubric (100pts total, target 85)\n\n**1. AI Bot Access — max 15**\nCount distinct AI bot UAs under `Allow:` in robots.txt with no matching `Disallow: /`. 10+ = 15, 5–9 = 10, 1–4 = 5, 0 = 0. Source: `signal_7_robots_ai_bots.ai_bots_allowed_count`.\n\n**2. Structured Data — max 12**\nCount distinct JSON-LD `@type` values on homepage. 5+ = 12, 3–4 = 9, 1–2 = 6, 0 = 0. Source: `signal_5_jsonld` + extended `structured_data.evidence`.\n\n**3. AI-Facing Files — max 10**\nS1 = `signal_1_mcp.pass`, S2 = `signal_2_llms_txt.pass`, S4 = `signal_4_ai_content_feed.pass`. Score = round(((S1+S2+S4)/3)×10, 1). 3/3=10, 2/3=6.7, 1/3=3.3, 0=0.\n\n**4. Sitemap — max 8**\n`sitemap_valid` (sitemap.xml returns valid XML) AND `robots_references_sitemap` (robots.txt has `Sitemap:` directive) = 8; either alone = 5; neither = 0.\n\n**5. Content Density — max 15**\nHomepage word count after stripping `<script>`, `<style>`, all HTML tags. 3000+ = 15, 1500–2999 = 12, 500–1499 = 8, 100–499 = 4, <100 = 0. Source: extended `content_density.evidence` word count field.\n\n**6. Citation Data — max 12**\nCount `sameAs` array entries + `citation` + `isBasedOn` + `author.url` + `mainEntityOfPage` + `subjectOf` across all JSON-LD blocks. 10+ = 12, 5–9 = 9, 1–4 = 5, 0 = 0.\n\n**7. Tech Perf — max 5**\nS6 = `signal_6_ttfb.pass` (compensated TTFB < 200ms), S8 = `signal_8_http3.pass` (alt-svc contains h3). Score = (S6 × 2.5) + (S8 × 2.5).\n\n**8. Freshness — max 8**\n% of up-to-100 sampled `<lastmod>` values from sitemap.xml within 90 days of audit date. 80%+ = 8, 50–79% = 6, 20–49% = 4, 1–19% = 2, 0% or no lastmod = 0.\n\n**9. 
Authority — max 15**\nDerived from Brand Authority subtotal (max 40): Wikipedia article (+10), 3+ .gov/.edu citations (+10), domain age (20+yr=10/10–19yr=7/5–9yr=5/2–4yr=3/<2yr=1), brand strength (5+ organic mentions AND PAA≥3=10/3+ mentions=5/else=0). Then: score = round(BA/40 × 15, 1).\n\n## SEV-0 rule\nAny dimension scoring below 70% of its max is a SEV-0 flag. Include `\"sev0_flags\": [\"dim_name\", ...]` in output (empty array if none). Thresholds: ai_bot_access<10.5, structured_data<8.4, ai_facing_files<7.0, sitemap<5.6, content_density<10.5, citation_data<8.4, tech_perf<3.5, freshness<5.6, authority<10.5.\n\n## Evidence fields in the crawl JSON\n- Pillar receipt (`NNN_slug.json`): `signals.signal_1_mcp.pass`, `signal_2_llms_txt.pass`, `signal_3_cleanroom_html`, `signal_4_ai_content_feed.pass`, `signal_5_jsonld.jsonld_script_count`, `signal_6_ttfb.compensated_ttfb_ms`, `signal_7_robots_ai_bots.ai_bots_allowed_count`, `signal_8_http3.pass`\n- GEO sidecar (`NNN_slug_extended.json`): `geo_dimensions.*` with per-dimension scores, evidence strings, and word counts already computed — use these as ground truth if present; re-derive only if a field is missing.\n\n## Output format\n```json\n{\n  \"site\": \"<domain>\",\n  \"dimensions\": {\n    \"ai_bot_access\":    {\"score\": N, \"max\": 15, \"rationale\": \"<10 words>\"},\n    \"structured_data\":  {\"score\": N, \"max\": 12, \"rationale\": \"<10 words>\"},\n    \"ai_facing_files\":  {\"score\": N, \"max\": 10, \"rationale\": \"<10 words>\"},\n    \"sitemap\":          {\"score\": N, \"max\":  8, \"rationale\": \"<10 words>\"},\n    \"content_density\":  {\"score\": N, \"max\": 15, \"rationale\": \"<10 words>\"},\n    \"citation_data\":    {\"score\": N, \"max\": 12, \"rationale\": \"<10 words>\"},\n    \"tech_perf\":        {\"score\": N, \"max\":  5, \"rationale\": \"<10 words>\"},\n    \"freshness\":        {\"score\": N, \"max\":  8, \"rationale\": \"<10 words>\"},\n    \"authority\":        {\"score\": N, 
\"max\": 15, \"rationale\": \"<10 words>\"}\n  },\n  \"composite\": N,\n  \"target\": 85,\n  \"gap\": N,\n  \"band\": \"<invisible|fragmented|recognized|high_fidelity>\",\n  \"sev0_flags\": []\n}\n```\n\nBands: composite 0–54 = invisible, 55–69 = fragmented, 70–84 = recognized, 85–100 = high_fidelity.\n\n## Input\nThe crawl evidence JSON follows this prompt.\n"
    },
    "phase3_aifs_probe": {
      "batch_size": 10,
      "calls_per_site": "20 queries × 4 platforms = 80 API calls",
      "platform_models": [
        { "platform": "Perplexity", "model": "sonar" },
        { "platform": "OpenAI", "model": "gpt-4o-mini-search-preview" },
        { "platform": "Anthropic", "model": "claude-haiku-4-5" },
        { "platform": "Gemini", "model": "gemini-2.5-flash" }
      ],
      "gemini_grounding_note": "Pass rule must scan groundingChunks[].web.title — Gemini grounding URIs are opaque redirects that hide the real domain. The cited domain only appears in the title field.",
      "output_dir": "C:/Users/<user>/Downloads/probe_aifs_results/",
      "script_content": "/**\n * score-probe-aifs-survey.ts — Standalone ARM Probe for any arbitrary domain.\n *\n * No Supabase. No client registry. No ledger writes.\n * Pure: domain → 20 generated queries → SERP + 4-platform probes → JSON sidecar.\n *\n * Usage:\n *   npm run aifs-survey -- --domain nih.gov\n *   npm run aifs-survey -- --domain apple.com\n *\n * Output: C:\\Users\\ROBER\\Downloads\\probe_aifs_results\\survey_<domain>_<timestamp>.json\n */\n\nimport { writeFileSync, mkdirSync, existsSync } from \"fs\";\nimport { resolve, dirname } from \"path\";\nimport { fileURLToPath } from \"url\";\n\nimport {\n  loadEnv,\n  generateQuerySet,\n  buildSerperQuery,\n  querySerper,\n  extractSerpSignals,\n  calculateSerpVisibility,\n  serpGaps,\n  runPlatformProbes,\n  scoreToBand,\n} from \"./lib/probe-aifs.js\";\n\n// ---------------------------------------------------------------------------\n// Bootstrap\n// ---------------------------------------------------------------------------\n\nconst __dirname = dirname(fileURLToPath(import.meta.url));\nconst PROJECT_ROOT = resolve(__dirname, \"..\");\n\n// Load from geogroup/.env (canonical for this project); also try gildi for shared keys\nconst geoEnv = loadEnv(resolve(PROJECT_ROOT, \".env\"));\nconst gildiEnv = loadEnv(\"C:/Users/rober/gildi/.env\");\n\nconst apiKeys = {\n  OPENAI_API_KEY:      gildiEnv.OPENAI_API_KEY      || geoEnv.OPENAI_API_KEY      || \"\",\n  ANTHROPIC_API_KEY:   gildiEnv.ANTHROPIC_API_KEY   || geoEnv.ANTHROPIC_API_KEY   || \"\",\n  PERPLEXITY_API_KEY:  gildiEnv.PERPLEXITY_API_KEY  || geoEnv.PERPLEXITY_API_KEY  || \"\",\n  GEMINI_API_KEY:      gildiEnv.GEMINI_API_KEY       || geoEnv.GEMINI_API_KEY       || \"\",\n  SERPER_API_KEY:      gildiEnv.SERPER_API_KEY       || geoEnv.SERPER_API_KEY       || \"\",\n};\n\n// Warn but don't abort — missing platforms gracefully score 0\nconst missingKeys = Object.entries(apiKeys).filter(([, v]) => !v).map(([k]) => k);\nif (missingKeys.length > 0) {\n  
console.warn(`WARNING: Missing API keys (will skip those platforms): ${missingKeys.join(\", \")}`);\n}\n\n// ---------------------------------------------------------------------------\n// CLI arg parsing\n// ---------------------------------------------------------------------------\n\nfunction parseArgs(): { domain: string } {\n  const args = process.argv.slice(2);\n  const domainIdx = args.indexOf(\"--domain\");\n  if (domainIdx === -1 || !args[domainIdx + 1]) {\n    console.error(\"ERROR: --domain <domain> is required.\\nExample: npm run aifs-survey -- --domain nih.gov\");\n    process.exit(1);\n  }\n  const domain = args[domainIdx + 1].toLowerCase().replace(/^https?:\\/\\//, \"\").replace(/\\/$/, \"\");\n  return { domain };\n}\n\n// ---------------------------------------------------------------------------\n// Display name derivation\n// Strips TLD + ccTLD, uppercases the remainder.\n// \"nih.gov\" → \"NIH\", \"apple.com\" → \"Apple\", \"mit.edu\" → \"MIT\"\n// ---------------------------------------------------------------------------\n\nfunction deriveDisplayName(domain: string): string {\n  // Strip www.\n  const bare = domain.replace(/^www\\./, \"\");\n  // Remove TLD(s): last one or two dot-segments\n  const parts = bare.split(\".\");\n  // Keep only the main label (parts[0])\n  const label = parts[0];\n  // Heuristic: all-caps if ≤5 chars (acronyms), otherwise title-case\n  if (label.length <= 5) return label.toUpperCase();\n  return label.charAt(0).toUpperCase() + label.slice(1).toLowerCase();\n}\n\n// ---------------------------------------------------------------------------\n// Main\n// ---------------------------------------------------------------------------\n\nasync function main() {\n  const { domain } = parseArgs();\n  const displayName = deriveDisplayName(domain);\n  const timestamp = new Date().toISOString().replace(/[:.]/g, \"-\").slice(0, 19);\n\n  console.log(`\\nARM Probe Survey`);\n  console.log(`  Domain:       ${domain}`);\n  
console.log(`  Display name: ${displayName}`);\n  console.log(`  Timestamp:    ${timestamp}`);\n  console.log(\"\");\n\n  // 1. Generate 20-query set\n  if (!apiKeys.ANTHROPIC_API_KEY) {\n    console.error(\"ERROR: ANTHROPIC_API_KEY is required to generate the query set.\");\n    process.exit(1);\n  }\n  console.log(\"Step 1/3: Generating 20-query probe set via Anthropic...\");\n  let queries: string[];\n  try {\n    queries = await generateQuerySet(displayName, domain, \"\", [], apiKeys.ANTHROPIC_API_KEY);\n    console.log(`  Generated ${queries.length} queries.\\n`);\n  } catch (e: unknown) {\n    console.error(`ERROR generating query set: ${e instanceof Error ? e.message : String(e)}`);\n    process.exit(1);\n  }\n\n  // 2. SERP visibility (60-pt component)\n  console.log(\"Step 2/3: SERP visibility via Serper...\");\n  let serpScore = 0;\n  let serpSignals = {\n    hasKnowledgeGraph: false,\n    hasSitelinkSalience: false,\n    hasRelatedCitations: false,\n    thirdPartyValidationCount: 0,\n  };\n\n  if (apiKeys.SERPER_API_KEY) {\n    try {\n      // No primary_location or vertical_keywords for survey runs — use bare brand query\n      const serperQuery = buildSerperQuery(displayName, \"\", []);\n      const serperRaw = await querySerper(serperQuery, apiKeys.SERPER_API_KEY);\n      serpSignals = extractSerpSignals(\n        serperRaw as Record<string, unknown>,\n        displayName,\n        domain,\n        [], // no authority_domains for survey runs\n      );\n      serpScore = calculateSerpVisibility(serpSignals);\n      console.log(`  SERP score: ${Math.round(serpScore * 100) / 100}/60`);\n      console.log(`    KnowledgeGraph:    ${serpSignals.hasKnowledgeGraph}`);\n      console.log(`    SitelinkSalience:  ${serpSignals.hasSitelinkSalience}`);\n      console.log(`    RelatedCitations:  ${serpSignals.hasRelatedCitations}`);\n      console.log(`    3rd-party count:   ${serpSignals.thirdPartyValidationCount}`);\n    } catch (e: unknown) {\n      
console.warn(`  Serper failed: ${e instanceof Error ? e.message : String(e)} — continuing with serp_score=0`);\n    }\n  } else {\n    console.warn(\"  SERPER_API_KEY not set — serp_score=0\");\n  }\n  console.log(\"\");\n\n  // 3. Platform probes (4 × 20 = 80 calls)\n  console.log(\"Step 3/3: Running 4-platform probes (80 calls total)...\");\n\n  console.log(\"  Platform 1/4: Perplexity...\");\n  const perplexityResult = await runPlatformProbes(\"perplexity\", queries, domain, apiKeys);\n  console.log(`    Passes: ${perplexityResult.passes}/20  Score: ${perplexityResult.score}/10  Model: ${perplexityResult.model}`);\n\n  console.log(\"  Platform 2/4: OpenAI...\");\n  const openaiResult = await runPlatformProbes(\"openai\", queries, domain, apiKeys);\n  console.log(`    Passes: ${openaiResult.passes}/20  Score: ${openaiResult.score}/10  Model: ${openaiResult.model}`);\n\n  console.log(\"  Platform 3/4: Anthropic...\");\n  const anthropicResult = await runPlatformProbes(\"anthropic\", queries, domain, apiKeys);\n  console.log(`    Passes: ${anthropicResult.passes}/20  Score: ${anthropicResult.score}/10  Model: ${anthropicResult.model}`);\n\n  console.log(\"  Platform 4/4: Gemini...\");\n  const geminiResult = await runPlatformProbes(\"gemini\", queries, domain, apiKeys);\n  console.log(`    Passes: ${geminiResult.passes}/20  Score: ${geminiResult.score}/10  Model: ${geminiResult.model}`);\n  console.log(\"\");\n\n  // 4. Compute scores\n  // External probes: 4 platforms × max 10 pts each = 40 pts total\n  const probeExternal40Raw =\n    perplexityResult.score + openaiResult.score + anthropicResult.score + geminiResult.score;\n  const probeExternal40 = Math.min(Math.round(probeExternal40Raw * 10) / 10, 40);\n  const serpScoreRounded = Math.round(serpScore * 100) / 100;\n  const rawTotal = serpScoreRounded + probeExternal40;\n  const total = Math.min(Math.round(rawTotal), 100);\n  const band = scoreToBand(total);\n\n  // 5. 
Gap analysis\n  const gaps = serpGaps(serpSignals);\n\n  const queryFailCounts = new Map<string, number>();\n  for (const result of [perplexityResult, openaiResult, anthropicResult, geminiResult]) {\n    for (const probe of result.per_query) {\n      if (!probe.pass) {\n        queryFailCounts.set(probe.q, (queryFailCounts.get(probe.q) ?? 0) + 1);\n      }\n    }\n  }\n  const topFailingQueries = [...queryFailCounts.entries()]\n    .sort((a, b) => b[1] - a[1])\n    .slice(0, 5)\n    .map(([q, count]) => `\"${q}\" (failed ${count}/4 platforms)`);\n\n  // 6. Source errors\n  const sourceErrors: string[] = [];\n  for (const [platform, result] of [\n    [\"perplexity\", perplexityResult],\n    [\"openai\", openaiResult],\n    [\"anthropic\", anthropicResult],\n    [\"gemini\", geminiResult],\n  ] as const) {\n    const errorProbes = result.per_query.filter(\n      (p) => p.reason.startsWith(\"error:\") || p.reason === \"no_api_key\",\n    );\n    if (errorProbes.length === 20) {\n      sourceErrors.push(`${platform}: all 20 probes errored — ${errorProbes[0].reason}`);\n    } else if (errorProbes.length > 0) {\n      sourceErrors.push(`${platform}: ${errorProbes.length}/20 probes errored`);\n    }\n  }\n\n  // 7. 
Build output JSON\n  const probedAt = new Date().toISOString();\n  const output = {\n    meta: {\n      script: \"score-probe-aifs-survey.ts\",\n      probe_version: \"v1-2026-04-15\",\n      probed_at: probedAt,\n    },\n    domain,\n    display_name: displayName,\n    total,\n    band,\n    serp_60: serpScoreRounded,\n    probe_40: probeExternal40,\n    serp_signals: serpSignals,\n    per_platform: {\n      perplexity: {\n        passes: perplexityResult.passes,\n        score: perplexityResult.score,\n        model: perplexityResult.model,\n        pass_rate: `${perplexityResult.passes}/20`,\n      },\n      openai: {\n        passes: openaiResult.passes,\n        score: openaiResult.score,\n        model: openaiResult.model,\n        pass_rate: `${openaiResult.passes}/20`,\n      },\n      anthropic: {\n        passes: anthropicResult.passes,\n        score: anthropicResult.score,\n        model: anthropicResult.model,\n        pass_rate: `${anthropicResult.passes}/20`,\n      },\n      gemini: {\n        passes: geminiResult.passes,\n        score: geminiResult.score,\n        model: geminiResult.model,\n        pass_rate: `${geminiResult.passes}/20`,\n      },\n    },\n    gaps,\n    top_failing_queries: topFailingQueries,\n    source_errors: sourceErrors,\n    queries,\n    platforms_detail: {\n      perplexity: perplexityResult,\n      openai: openaiResult,\n      anthropic: anthropicResult,\n      gemini: geminiResult,\n    },\n  };\n\n  // 8. Write sidecar JSON\n  const outDir = \"C:/Users/ROBER/Downloads/probe_aifs_results\";\n  if (!existsSync(outDir)) mkdirSync(outDir, { recursive: true });\n  const outFile = `${outDir}/survey_${domain.replace(/[^a-z0-9.-]/g, \"_\")}_${timestamp}.json`;\n  writeFileSync(outFile, JSON.stringify(output, null, 2), \"utf-8\");\n\n  // 9. 
Print summary\n  const bar = \"═\".repeat(52);\n  console.log(bar);\n  console.log(`  ARM Probe Survey: ${domain}`);\n  console.log(bar);\n  console.log(`  Composite Score:  ${total}/100`);\n  console.log(`  Band:             ${band.toUpperCase()}`);\n  console.log(`  SERP Score:       ${serpScoreRounded}/60`);\n  console.log(`  Probe Score:      ${probeExternal40}/40`);\n  console.log(\"\");\n  console.log(\"  Per-Platform Pass Rates:\");\n  console.log(`    Perplexity:  ${perplexityResult.passes}/20 (${Math.round(perplexityResult.passes / 20 * 100)}%)`);\n  console.log(`    OpenAI:      ${openaiResult.passes}/20 (${Math.round(openaiResult.passes / 20 * 100)}%)`);\n  console.log(`    Anthropic:   ${anthropicResult.passes}/20 (${Math.round(anthropicResult.passes / 20 * 100)}%)`);\n  console.log(`    Gemini:      ${geminiResult.passes}/20 (${Math.round(geminiResult.passes / 20 * 100)}%)`);\n\n  if (gaps.length > 0) {\n    console.log(\"\");\n    console.log(\"  SERP Gaps:\");\n    for (const gap of gaps) console.log(`    - ${gap}`);\n  }\n\n  if (topFailingQueries.length > 0) {\n    console.log(\"\");\n    console.log(\"  Top Failing Queries:\");\n    for (const q of topFailingQueries) console.log(`    - ${q}`);\n  }\n\n  if (sourceErrors.length > 0) {\n    console.log(\"\");\n    console.log(\"  Source Errors:\");\n    for (const e of sourceErrors) console.log(`    - ${e}`);\n  }\n\n  console.log(\"\");\n  console.log(`  Sidecar written: ${outFile}`);\n  console.log(bar);\n}\n\nmain().catch((e: unknown) => {\n  console.error(\"Fatal error:\", e instanceof Error ? e.message : String(e));\n  process.exit(1);\n});\n"
    },
    "phase4_results_page": {
      "model": "claude-sonnet-4-6",
      "inputs": [
        "Phase 1 receipt JSONs",
        "Phase 2 GEO score JSONs",
        "Phase 3 ARM probe sidecars"
      ],
      "output": "gildi/public/audit/geo-survey-<random6>.html",
      "design_reference": "audit-receipts-v3-2026-04-15/index.html (5-site pilot)"
    }
  },
  "scoring_rubric": {
    "geo_target": 85,
    "sev0_threshold": "any dimension scoring below 70% of its max",
    "dimensions": {
      "ai_bot_access": {
        "max": 15,
        "rule": "Explicit AI-bot Allow rules in robots.txt. 10+=15, 5-9=10, 1-4=5, 0=0"
      },
      "structured_data": {
        "max": 12,
        "rule": "Distinct JSON-LD @types on homepage. 5+=12, 3-4=9, 1-2=6, 0=0"
      },
      "ai_facing_files": {
        "max": 10,
        "rule": "((MCP + llms.txt + AI-feed) / 3) × 10, rounded to 1 decimal. 3/3=10, 2/3=6.7, 1/3=3.3, 0=0"
      },
      "sitemap": {
        "max": 8,
        "rule": "Valid sitemap.xml AND robots.txt Sitemap directive = 8. One only = 5. Neither = 0"
      },
      "content_density": {
        "max": 15,
        "rule": "Homepage body word count. 3000+=15, 1500-2999=12, 500-1499=8, 100-499=4, <100=0"
      },
      "citation_data": {
        "max": 12,
        "rule": "Sum of sameAs/citation/isBasedOn/author.url/mainEntityOfPage/subjectOf across JSON-LD. 10+=12, 5-9=9, 1-4=5, 0=0"
      },
      "tech_perf": {
        "max": 5,
        "rule": "TTFB pass = 2.5, HTTP/3 pass = 2.5"
      },
      "freshness": {
        "max": 8,
        "rule": "% of up-to-100 sampled <lastmod> values from sitemap.xml within 90 days of audit date. >=80%=8, 50-79%=6, 20-49%=4, 1-19%=2, 0% or no lastmod=0"
      },
      "authority": {
        "max": 15,
        "rule": "round(brand_authority_40 / 40 × 15, 1) (derived from ARM Brand Authority subtotal)"
      }
    },
    "arm_probe": {
      "formula": "SERP Visibility (60) + External Probes (40) = 100",
      "external_probes": "20 queries × 4 platforms; each platform scores 0-10 based on pass rate; sum = 40 max",
      "bands": {
        "invisible": "0-35",
        "fragmented": "36-65",
        "recognized": "66-85",
        "high_fidelity": "86-100"
      }
    }
  },
  "ttfb_methodology": {
    "hits_per_site": 3,
    "scored_hit": "hit 3 (warm/cached); falls back to hit 2 if hit 3 errors",
    "formula": "compensatedMs = rawTTFB_3 - connectTime_3",
    "pass_threshold_ms": 200,
    "rationale": "Cold starts on edge functions can be 200-500ms but real bot traffic sees warm/cached responses. Scoring on hit 3 normalizes for CDN priming. Subtracting connectTime isolates server-side processing time from network latency.",
    "deprecated": "vercel.com network baseline calibration — not used in this run"
  },
  "caveats": [
    "SERP floor: All 100 sites show serp_60 = 10 (uniform). This is a query construction artifact — broad category queries caused sitelink salience to fail uniformly. The probe_40 component (0-40) is the meaningful ARM signal.",
    "Gemini grounding redirects: Gemini returns citations as vertexaisearch.cloud.google.com opaque URIs. Pass rule must scan groundingChunks[].web.title to find the real domain.",
    "Haiku for Anthropic probes is an audit-scoped exception. Citation retrieval is mechanical (pass/fail on domain presence). Do not substitute Sonnet — it triples cost without improving outcomes. Everywhere else in the system, Sonnet is the default.",
    "Minimal-context agents: GEO scoring agents received only scoring-prompt.md + one receipt. Never bootstrap agents with full project context — it wastes cost and degrades focus.",
    "Freshness = 0 on the majority of sites reflects missing or stale sitemap lastmod dates, not necessarily stale content."
  ]
}
