Skip to content

Commit 17d0867

Browse files
kennethkalmerclaude
andcommitted
Add User-Agent based bot detection for markdown serving
Enhances content negotiation to serve markdown to LLM bots based on User-Agent strings, in addition to Accept header detection. This ensures bots like Claude, ChatGPT, Perplexity, and Google AI get markdown even if they don't send proper Accept headers.

Bot detection:
- Detects several LLM bot User-Agents (Claude, ChatGPT, Perplexity, Google AI, Mistral, Meta, Amazon, ByteDance)
- Conservative list - no generic HTTP libraries to avoid false positives
- Combines with existing Accept header logic using nginx map variables
- Serves markdown if EITHER bot detected OR Accept header requests markdown

Implementation:
- Added $is_llm_bot map for User-Agent pattern matching
- Updated $docs_file_extension map to combine bot + Accept header detection
- Uses map variable concatenation: "${is_llm_bot}${wants_markdown_via_accept}"
- Works seamlessly with existing try_files logic

Testing:
- Added new tests for bot User-Agent detection
- Tests bot override behavior (bot gets markdown even with Accept: text/html)
- Verified browsers still get HTML by default

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent addcdec commit 17d0867

File tree

2 files changed

+87
-15
lines changed

2 files changed

+87
-15
lines changed

bin/assert-content-negotiation.sh

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,18 +22,24 @@ export PORT=${PORT:-3001}
2222
# $3: expected_status - Expected HTTP status code
2323
# $4: expected_format - "html", "markdown", or "any"
2424
# $5: test_name - Human-readable test description
25+
# $6: user_agent - Optional User-Agent string
2526
run_test() {
2627
local path="$1"
2728
local accept_header="$2"
2829
local expected_status="$3"
2930
local expected_format="$4"
3031
local test_name="$5"
32+
local user_agent="${6:-}"
3133

3234
echo "🧪 $test_name"
3335

34-
# Build curl command with optional Accept header
36+
# Build curl command with optional Accept header and User-Agent
3537
local curl_cmd="curl --silent --header \"X-Forwarded-Proto: https\""
3638

39+
if [ -n "$user_agent" ]; then
40+
curl_cmd="$curl_cmd --user-agent \"$user_agent\""
41+
fi
42+
3743
if [ -n "$accept_header" ]; then
3844
curl_cmd="$curl_cmd --header \"Accept: $accept_header\""
3945
fi
@@ -130,8 +136,26 @@ run_test "/docs/nonexistent" "text/markdown" "404" "any" "404 with markdown Acce
130136
run_test "/llms.txt" "" "200" "any" "Non-docs paths unaffected"
131137
echo
132138

139+
# Group 6: Bot Detection (User-Agent)
140+
echo "Group 6: Bot Detection (User-Agent)"
141+
echo "------------------------------------"
142+
run_test "/docs/channels" "" "200" "markdown" "Claude-User bot gets markdown" "Claude-User/1.0"
143+
run_test "/docs/channels" "" "200" "markdown" "ClaudeBot gets markdown" "Mozilla/5.0 (compatible; ClaudeBot/1.0)"
144+
run_test "/docs/channels" "" "200" "markdown" "ChatGPT-User bot gets markdown" "ChatGPT-User"
145+
run_test "/docs/channels" "" "200" "markdown" "GPTBot gets markdown" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0)"
146+
run_test "/docs/channels" "" "200" "markdown" "PerplexityBot gets markdown" "PerplexityBot"
147+
run_test "/docs/channels" "" "200" "html" "Regular browser gets HTML" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"
148+
echo
149+
150+
# Group 7: Combined Bot + Accept Header
151+
echo "Group 7: Combined Bot + Accept Header"
152+
echo "--------------------------------------"
153+
run_test "/docs/channels" "text/html" "200" "markdown" "Bot overrides Accept: text/html" "Claude-User/1.0"
154+
run_test "/docs/channels" "text/markdown" "200" "markdown" "Bot + markdown Accept both work" "GPTBot/1.0"
155+
echo
156+
133157
echo "================================"
134-
echo "✅ All 16 tests passed!"
158+
# NOTE(review): this commit adds 8 run_test calls (6 in Group 6, 2 in Group 7)
# to a summary that previously reported 16, which would make 24, not 23.
# Confirm the real total, or derive it from a counter instead of hard-coding it.
echo "✅ All 23 tests passed!"
135159
echo "================================"
136160

137161
# Exit explicitly with success

config/nginx.conf.erb

Lines changed: 61 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -64,25 +64,73 @@ http {
6464

6565
##
6666
# CONTENT NEGOTIATION FOR MARKDOWN
67-
# Maps Accept header to file extension preference
67+
# Serves markdown to LLM bots and clients that request it via Accept header
6868

69-
map $http_accept $docs_file_extension {
70-
default ".html";
69+
# Detect LLM bots by User-Agent
70+
map $http_user_agent $is_llm_bot {
71+
default 0;
72+
73+
# Anthropic / Claude
74+
"~*Claude-User" 1;
75+
"~*ClaudeBot" 1;
76+
"~*anthropic-ai" 1;
77+
78+
# OpenAI / ChatGPT
79+
"~*ChatGPT-User" 1;
80+
"~*GPTBot" 1;
81+
82+
# Perplexity
83+
"~*PerplexityBot" 1;
84+
"~*Perplexity-User" 1;
85+
86+
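# Google AI
# NOTE(review): "~*Gemini" is an unanchored, case-insensitive substring match and
# may hit User-Agents that are not bots; "Google-Extended" is documented as a
# robots.txt control token, so it may never appear in a request User-Agent — confirm.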
87+
"~*Google-Extended" 1;
88+
"~*GoogleOther" 1;
89+
"~*Gemini" 1;
90+
91+
# Mistral AI
92+
"~*MistralAI-User" 1;
93+
94+
# Meta / Facebook
95+
"~*Meta-ExternalAgent" 1;
96+
97+
# Amazon
98+
"~*Amazonbot" 1;
99+
100+
# ByteDance / TikTok
101+
"~*Bytespider" 1;
102+
}
103+
104+
# Detect markdown request via Accept header
105+
map $http_accept $wants_markdown_via_accept {
106+
default 0;
71107

72108
# Exact markdown MIME types
73-
"text/markdown" ".md";
74-
"application/markdown" ".md";
75-
"text/plain" ".md";
109+
"text/markdown" 1;
110+
"application/markdown" 1;
111+
"text/plain" 1;
112+
113+
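# Accept headers that START with text/html get HTML. This entry must stay above
# the substring regexes below: nginx tests regex entries in order of appearance,
# so browser defaults like "text/html, text/markdown" resolve to HTML here.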
114+
"~*^text/html" 0;
76115

77-
# IMPORTANT: Check start-of-string patterns FIRST (before wildcard patterns)
78-
# Explicit HTML request gets HTML (handles browser defaults like "text/html, text/markdown")
79-
"~*^text/html" ".html";
116+
# Accept header contains markdown types
117+
"~*text/markdown" 1;
118+
"~*application/markdown" 1;
80119

81-
# Handle multiple Accept values - prefer markdown if explicitly requested
82-
"~*text/markdown" ".md";
83-
"~*application/markdown" ".md";
120+
# A bare "*/*" Accept header (exact string match) gets HTML; same result as default
121+
"*/*" 0;
122+
}
123+
124+
# Serve markdown if bot detected OR markdown requested via Accept header
125+
# Combines: ${is_llm_bot}${wants_markdown_via_accept} → "00", "01", "10", or "11"
126+
map "${is_llm_bot}${wants_markdown_via_accept}" $docs_file_extension {
127+
default ".html";
84128

85-
"*/*" ".html";
129+
# If either variable is 1, serve markdown
130+
"10" ".md"; # Bot detected, no markdown Accept
131+
"01" ".md"; # No bot, markdown Accept
132+
"11" ".md"; # Both
133+
"00" ".html"; # Neither
86134
}
87135

88136
# Translate extension to file path

0 commit comments

Comments
 (0)