{
  "metric": "Crawl Volume -- 7-day AI-bot crawl events",
  "page": "/methodology/crawl-volume/2026-04-26",
  "frozen_date": "2026-04-26",
  "frozen_at_utc": "2026-04-26T12:22:00Z",
  "publisher": "GEOlocus.ai",
  "site_measured": "Top10Lists.us",
  "window_days": 7,
  "window_start_utc": "2026-04-19T12:22:00Z",
  "window_end_utc": "2026-04-26T12:22:00Z",
  "canonical_source": "Cloudflare edge middleware request logs (bot_crawl_logs.source = 'middleware')",
  "canonical_source_note": "Middleware-derived single-source view of every HTTP request reaching the site. Each request is classified by bot UA against an explicit allowlist; rows are deduplicated to one entry per (bot, page, second) tuple to remove burst-noise. Cloudflare Analytics dashboard counts are NOT canonical — they undercount certain crawler classes (notably ClaudeBot and PerplexityBot variants) and conflate sources when the dashboard composites multiple views.",
  "totals": {
    "total_crawls": 463420,
    "distinct_bot_fleets": 27,
    "self_duplicate_groups": 412,
    "self_duplicate_extra_rows": 979,
    "self_duplicate_share": 0.0021
  },
  "per_bot_top_15": [
    { "rank": 1,  "bot": "GPTBot",             "crawls_7d": 135950 },
    { "rank": 2,  "bot": "Meta-ExternalAgent", "crawls_7d": 82469 },
    { "rank": 3,  "bot": "Googlebot",          "crawls_7d": 72553 },
    { "rank": 4,  "bot": "SEMrushBot",         "crawls_7d": 49316 },
    { "rank": 5,  "bot": "GoogleOther",        "crawls_7d": 30927 },
    { "rank": 6,  "bot": "ClaudeBot",          "crawls_7d": 23012 },
    { "rank": 7,  "bot": "PerplexityBot",      "crawls_7d": 16225 },
    { "rank": 8,  "bot": "AhrefsBot",          "crawls_7d": 10612 },
    { "rank": 9,  "bot": "OAI-SearchBot",      "crawls_7d": 9413 },
    { "rank": 10, "bot": "Bingbot",            "crawls_7d": 6786 },
    { "rank": 11, "bot": "unknown_bot",        "crawls_7d": 6281 },
    { "rank": 12, "bot": "ByteSpider",         "crawls_7d": 4877 },
    { "rank": 13, "bot": "PetalBot",           "crawls_7d": 4411 },
    { "rank": 14, "bot": "TikTokSpider",       "crawls_7d": 2858 },
    { "rank": 15, "bot": "ChatGPT-User",       "crawls_7d": 2384 }
  ],
  "by_provider": [
    {
      "provider": "OpenAI",
      "components": ["GPTBot", "OAI-SearchBot", "ChatGPT-User"],
      "crawls_7d": 147747,
      "share": 0.32
    },
    {
      "provider": "Google",
      "components": ["Googlebot", "GoogleOther"],
      "crawls_7d": 103480,
      "share": 0.22
    },
    {
      "provider": "Meta",
      "components": ["Meta-ExternalAgent"],
      "crawls_7d": 82469,
      "share": 0.18
    },
    {
      "provider": "SEO (SEMrush + Ahrefs)",
      "components": ["SEMrushBot", "AhrefsBot"],
      "crawls_7d": 59928,
      "share": 0.13
    },
    {
      "provider": "Anthropic",
      "components": ["ClaudeBot"],
      "crawls_7d": 23012,
      "share": 0.05
    },
    {
      "provider": "Perplexity",
      "components": ["PerplexityBot"],
      "crawls_7d": 16225,
      "share": 0.04
    },
    {
      "provider": "Other",
      "components": ["Bingbot", "ByteSpider", "PetalBot", "TikTokSpider", "YouBot", "Applebot", "AppleBotExtended", "Amazonbot", "FacebookBot", "FacebookExternalHit", "DataForSeoBot", "unknown_bot"],
      "crawls_7d": 30559,
      "share": 0.06,
      "note": "Catch-all for the 12 long-tail bot fleets outside the top six providers. The 'unknown_bot' bucket within is bots whose UA strings did not match any allowlist entry (6,281 events in this window)."
    }
  ],
  "by_provider_coverage_pct": 1.0,
  "by_provider_coverage_note": "All AI-bot crawl events with non-zero hits in this window are assigned to a provider bucket: 6 named providers + 'Other' catch-all sum to 463,420 (100%). Zero-hit fleets in this window (Claude-User, Claude-Web, DuckAssistBot, Gemini-User) appear in bot_taxonomy_full for completeness but are not assigned to provider buckets here.",
  "zero_hit_fleets_in_window": ["Claude-User", "Claude-Web", "DuckAssistBot", "Gemini-User"],
  "consumer_triggered_ut": {
    "definition": "User-triggered (intent) crawls -- requests originating from a human asking an AI assistant a question. Distinct from background-indexing crawls.",
    "intent_bots": [
      { "bot": "PerplexityBot",  "crawls_7d": 16225 },
      { "bot": "OAI-SearchBot",  "crawls_7d": 9413 },
      { "bot": "ChatGPT-User",   "crawls_7d": 2384 },
      { "bot": "YouBot",         "crawls_7d": 12 }
    ],
    "zero_hit_intent_bots_in_window": ["Claude-User", "Claude-Web", "DuckAssistBot", "Gemini-User"],
    "total_intent_crawls": 28034,
    "total_crawls": 463420,
    "ut_ratio": 0.0605,
    "ut_ratio_pct_string": "6.05%"
  },
  "reconciliation": {
    "middleware_canonical": 463420,
    "cloudflare_analytics_dashboard_same_window": 378168,
    "delta": 85252,
    "delta_pct_of_canonical": 0.184,
    "explanation": "Cloudflare Analytics dashboard for the same 7-day window reports 378,168 -- undercounts the middleware-derived 463,420 because the dashboard misses certain ClaudeBot and PerplexityBot variants (UA strings the dashboard does not classify under the AI-bot category) and conflates sources when compositing multiple views. Middleware single-source is the canonical reference for AI-bot crawl volume on Top10Lists.us."
  },
  "canonical_sql": "SELECT bot, COUNT(DISTINCT (bot, path, date_trunc('second', timestamp))) AS crawls FROM bot_crawl_logs WHERE source = 'middleware' AND timestamp >= '2026-04-19 12:22 UTC' AND timestamp < '2026-04-26 12:22 UTC' GROUP BY bot ORDER BY crawls DESC;",
  "bot_taxonomy_full": [
    "GPTBot",
    "Meta-ExternalAgent",
    "Googlebot",
    "SEMrushBot",
    "GoogleOther",
    "ClaudeBot",
    "PerplexityBot",
    "AhrefsBot",
    "OAI-SearchBot",
    "Bingbot",
    "ByteSpider",
    "PetalBot",
    "TikTokSpider",
    "ChatGPT-User",
    "YouBot",
    "Claude-User",
    "Claude-Web",
    "DuckAssistBot",
    "Gemini-User",
    "Applebot",
    "AppleBotExtended",
    "Amazonbot",
    "FacebookBot",
    "FacebookExternalHit",
    "DataForSeoBot",
    "unknown_bot"
  ],
  "bot_taxonomy_note": "ByteSpider (canonical capitalization) is the ByteDance/TikTok web crawler; some logs surface it as lowercase 'Bytespider'. We normalize to ByteSpider in this artifact -- the 4,877 crawl count in this window is the merged total under the canonical UA pattern. distinct_bot_fleets=27 reflects normalized fleets, not casing variants.",
  "limitations": [
    "Single 7-day window (2026-04-19 to 2026-04-26 UTC); volume varies week-to-week with seasonal AI-engine indexing cycles",
    "UA-based classification only; spoofed UAs not currently filtered (estimated <0.5% of total based on reverse-DNS spot checks)",
    "Self-duplicate dedup is per (bot, path, second) tuple; sub-second bursts that spread across the second boundary may double-count (estimated <0.1%)",
    "Cloudflare Analytics dashboard discrepancy reflects classification gap, not double-counting -- middleware sees more bots, not the same bots more times"
  ]
}
