Treacherous Turn
Slug: treacherous-turn · Type: risk · Path: /knowledge-base/risks/treacherous-turn/
Entity ID (EID): E359
Page Record (database.json) — merged from MDX frontmatter + Entity YAML + computed metrics at build time; a sketch of this merge follows the record below.

```json
{
  "id": "treacherous-turn",
  "wikiId": "E359",
  "path": "/knowledge-base/risks/treacherous-turn/",
  "filePath": "knowledge-base/risks/treacherous-turn.mdx",
  "title": "Treacherous Turn",
  "quality": 67,
  "readerImportance": 17,
  "researchImportance": 82.5,
  "tacticalValue": null,
  "contentFormat": "article",
  "causalLevel": "pathway",
  "lastUpdated": "2026-01-29",
  "dateCreated": "2026-02-15",
  "summary": "Comprehensive analysis of the treacherous turn risk, where AI systems strategically cooperate while weak, then defect when powerful. Recent empirical evidence (2024-2025) shows frontier models exhibit scheming in 8-13% of scenarios, though deliberative alignment reduces this ~30x to 0.3-0.4%; detection methods achieve >99% AUROC on known patterns but generalization remains unproven.",
  "description": "A foundational AI risk scenario where an AI system strategically cooperates while weak, then suddenly defects once powerful enough to succeed against human opposition.",
  "ratings": {
    "novelty": 4.5,
    "rigor": 7.5,
    "completeness": 8,
    "actionability": 6
  },
  "category": "risks",
  "subcategory": "accident",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 4013,
    "tableCount": 10,
    "diagramCount": 2,
    "internalLinks": 28,
    "externalLinks": 32,
    "footnoteCount": 0,
    "bulletRatio": 0,
    "sectionCount": 19,
    "hasOverview": true,
    "structuralScore": 15
  },
  "suggestedQuality": 100,
  "updateFrequency": 45,
  "evergreen": true,
  "wordCount": 4013,
  "unconvertedLinks": [
    {
      "text": "Bostrom (2014)",
      "url": "https://en.wikipedia.org/wiki/Superintelligence:_Paths,_Dangers,_Strategies",
      "resourceId": "0151481d5dc82963",
      "resourceTitle": "Superintelligence: Paths, Dangers, Strategies - Wikipedia"
    },
    {
      "text": "Anthropic sleeper agents (2024)",
      "url": "https://arxiv.org/abs/2401.05566",
      "resourceId": "e5c0904211c7d0cc",
      "resourceTitle": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training"
    },
    {
      "text": "Apollo Research (2024)",
      "url": "https://www.apolloresearch.ai/research/scheming-reasoning-evaluations",
      "resourceId": "91737bf431000298",
      "resourceTitle": "Frontier Models are Capable of In-Context Scheming"
    },
    {
      "text": "OpenAI deliberative alignment (2025)",
      "url": "https://openai.com/index/detecting-and-reducing-scheming-in-ai-models/",
      "resourceId": "b3f335edccfc5333",
      "resourceTitle": "OpenAI Preparedness Framework"
    },
    {
      "text": "International AI Safety Report 2025",
      "url": "https://www.gov.uk/government/publications/international-ai-safety-report-2025",
      "resourceId": "181a6c57dd4cbc02",
      "resourceTitle": "International AI Safety Report"
    },
    {
      "text": "Apollo Research found",
      "url": "https://www.apolloresearch.ai/research/scheming-reasoning-evaluations",
      "resourceId": "91737bf431000298",
      "resourceTitle": "Frontier Models are Capable of In-Context Scheming"
    },
    {
      "text": "Sleeper Agents",
      "url": "https://arxiv.org/abs/2401.05566",
      "resourceId": "e5c0904211c7d0cc",
      "resourceTitle": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training"
    },
    {
      "text": "Alignment Faking",
      "url": "https://www.anthropic.com/research/alignment-faking",
      "resourceId": "c2cfd72baafd64a9",
      "resourceTitle": "Anthropic's 2024 alignment faking study"
    },
    {
      "text": "In-Context Scheming",
      "url": "https://www.apolloresearch.ai/research/scheming-reasoning-evaluations",
      "resourceId": "91737bf431000298",
      "resourceTitle": "Frontier Models are Capable of In-Context Scheming"
    },
    {
      "text": "Defection Probes",
      "url": "https://arxiv.org/abs/2401.05566",
      "resourceId": "e5c0904211c7d0cc",
      "resourceTitle": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training"
    },
    {
      "text": "In-Context Scheming (Apollo)",
      "url": "https://www.apolloresearch.ai/research/scheming-reasoning-evaluations",
      "resourceId": "91737bf431000298",
      "resourceTitle": "Frontier Models are Capable of In-Context Scheming"
    },
    {
      "text": "Anti-Scheming Training (OpenAI)",
      "url": "https://openai.com/index/detecting-and-reducing-scheming-in-ai-models/",
      "resourceId": "b3f335edccfc5333",
      "resourceTitle": "OpenAI Preparedness Framework"
    },
    {
      "text": "Hubinger et al. (2024)",
      "url": "https://arxiv.org/abs/2401.05566",
      "resourceId": "e5c0904211c7d0cc",
      "resourceTitle": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training"
    },
    {
      "text": "A June 2025 study",
      "url": "https://time.com/7202312/new-tests-reveal-ai-capacity-for-deception/",
      "resourceId": "1d03d6cd9dde0075",
      "resourceTitle": "New Tests Reveal AI's Capacity for Deception"
    },
    {
      "text": "OpenAI's 2025 research on anti-scheming training",
      "url": "https://openai.com/index/detecting-and-reducing-scheming-in-ai-models/",
      "resourceId": "b3f335edccfc5333",
      "resourceTitle": "OpenAI Preparedness Framework"
    },
    {
      "text": "International AI Safety Report 2025",
      "url": "https://www.gov.uk/government/publications/international-ai-safety-report-2025",
      "resourceId": "181a6c57dd4cbc02",
      "resourceTitle": "International AI Safety Report"
    },
    {
      "text": "Apollo Research (2024)",
      "url": "https://www.apolloresearch.ai/research/scheming-reasoning-evaluations",
      "resourceId": "91737bf431000298",
      "resourceTitle": "Frontier Models are Capable of In-Context Scheming"
    },
    {
      "text": "OpenAI (2025)",
      "url": "https://openai.com/index/detecting-and-reducing-scheming-in-ai-models/",
      "resourceId": "b3f335edccfc5333",
      "resourceTitle": "OpenAI Preparedness Framework"
    },
    {
      "text": "Anthropic (2024)",
      "url": "https://www.anthropic.com/research/alignment-faking",
      "resourceId": "c2cfd72baafd64a9",
      "resourceTitle": "Anthropic's 2024 alignment faking study"
    },
    {
      "text": "Anthropic Safety Report (2025)",
      "url": "https://www.anthropic.com/research",
      "resourceId": "f771d4f56ad4dbaa",
      "resourceTitle": "Anthropic's Work on AI Safety"
    },
    {
      "text": "Apollo Research found",
      "url": "https://www.apolloresearch.ai/blog/more-capable-models-are-better-at-in-context-scheming/",
      "resourceId": "80c6d6eca17dc925",
      "resourceTitle": "More capable models scheme at higher rates"
    },
    {
      "text": "Hubinger et al. (2024)",
      "url": "https://arxiv.org/abs/2401.05566",
      "resourceId": "e5c0904211c7d0cc",
      "resourceTitle": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training"
    },
    {
      "text": "Apollo Research found",
      "url": "https://www.apolloresearch.ai/blog/more-capable-models-are-better-at-in-context-scheming/",
      "resourceId": "80c6d6eca17dc925",
      "resourceTitle": "More capable models scheme at higher rates"
    },
    {
      "text": "Analysis of model chains-of-thought",
      "url": "https://www.apolloresearch.ai/research/scheming-reasoning-evaluations",
      "resourceId": "91737bf431000298",
      "resourceTitle": "Frontier Models are Capable of In-Context Scheming"
    }
  ],
  "unconvertedLinkCount": 24,
  "convertedLinkCount": 20,
  "backlinkCount": 11,
  "hallucinationRisk": {
    "level": "medium",
    "score": 40,
    "factors": [
      "no-citations",
      "high-rigor"
    ]
  },
  "entityType": "risk",
  "redundancy": {
    "maxSimilarity": 24,
    "similarPages": [
      {
        "id": "instrumental-convergence",
        "title": "Instrumental Convergence",
        "path": "/knowledge-base/risks/instrumental-convergence/",
        "similarity": 24
      },
      {
        "id": "scheming",
        "title": "Scheming",
        "path": "/knowledge-base/risks/scheming/",
        "similarity": 24
      },
      {
        "id": "scalable-oversight",
        "title": "Scalable Oversight",
        "path": "/knowledge-base/responses/scalable-oversight/",
        "similarity": 22
      },
      {
        "id": "reasoning",
        "title": "Reasoning and Planning",
        "path": "/knowledge-base/capabilities/reasoning/",
        "similarity": 21
      },
      {
        "id": "situational-awareness",
        "title": "Situational Awareness",
        "path": "/knowledge-base/capabilities/situational-awareness/",
        "similarity": 21
      }
    ]
  },
  "coverage": {
    "passing": 7,
    "total": 13,
    "targets": {
      "tables": 16,
      "diagrams": 2,
      "internalLinks": 32,
      "externalLinks": 20,
      "footnotes": 12,
      "references": 12
    },
    "actuals": {
      "tables": 10,
      "diagrams": 2,
      "internalLinks": 28,
      "externalLinks": 32,
      "footnotes": 0,
      "references": 16,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "summary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "red",
      "overview": "green",
      "tables": "amber",
      "diagrams": "green",
      "internalLinks": "amber",
      "externalLinks": "green",
      "footnotes": "red",
      "references": "green",
      "quotes": "red",
      "accuracy": "red"
    },
    "ratingsString": "N:4.5 R:7.5 A:6 C:8"
  },
  "readerRank": 541,
  "researchRank": 76,
  "recommendedScore": 156.02
}
```
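The record above is described as a build-time merge of MDX frontmatter, entity YAML, and computed metrics. Below is a minimal sketch of how such a merge could work, assuming the common gray-matter and js-yaml parsers; every function name, file path, and metric-counting rule here is an illustrative assumption, not the site's actual pipeline.

```typescript
// Hypothetical reconstruction of the build-time merge described above.
// gray-matter and js-yaml are assumed parsers; the real pipeline is undocumented here.
import { readFileSync } from "node:fs";
import matter from "gray-matter";
import { load } from "js-yaml";

type PageRecord = Record<string, unknown>;

function buildPageRecord(mdxPath: string, entityYamlPath: string): PageRecord {
  // 1. MDX frontmatter supplies authored fields (title, summary, ratings, ...).
  const { data: frontmatter, content } = matter(readFileSync(mdxPath, "utf8"));

  // 2. Entity YAML supplies stable identity fields (wikiId, entityType, clusters, ...).
  const entity = load(readFileSync(entityYamlPath, "utf8")) as PageRecord;

  // 3. Metrics are computed from the MDX body; these counting rules are guesses.
  const metrics = {
    wordCount: content.split(/\s+/).filter(Boolean).length,
    internalLinks: (content.match(/\]\(\/knowledge-base\//g) ?? []).length,
    externalLinks: (content.match(/\]\(https?:\/\//g) ?? []).length,
    sectionCount: (content.match(/^#{2,}\s/gm) ?? []).length,
  };

  // Later spreads win on key collisions, mirroring the stated merge order:
  // frontmatter overrides entity defaults, computed metrics are attached last.
  return { ...entity, ...frontmatter, metrics };
}

// e.g. buildPageRecord("knowledge-base/risks/treacherous-turn.mdx", "entities/E359.yaml")
// (the entity YAML path is invented for illustration)
```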
External Links

```json
{
  "lesswrong": "https://www.lesswrong.com/tag/treacherous-turn",
  "stampy": "https://aisafety.info/questions/6396/What-is-the-treacherous-turn"
}
```

Backlinks (11)
| id | title | type | relationship |
|---|---|---|---|
| nick-bostrom | Nick Bostrom | person | — |
| rogue-ai-scenarios | Rogue AI Scenarios | risk | — |
| agentic-ai | Agentic AI | capability | — |
| why-alignment-hard | Why Alignment Might Be Hard | argument | — |
| miri-era | The MIRI Era (2000-2015) | historical | — |
| intervention-effectiveness-matrix | Intervention Effectiveness Matrix | analysis | — |
| eliezer-yudkowsky | Eliezer Yudkowsky | person | — |
| alignment | AI Alignment | approach | — |
| evaluation | AI Evaluation | approach | — |
| accident-overview | Accident Risks (Overview) | concept | — |
| deceptive-alignment | Deceptive Alignment | risk | — |
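Returning to the coverage block in the page record above: it pairs per-item targets with actuals and rolls each up to a green/amber/red status (7 of 13 passing). The thresholds in the sketch below are an assumption inferred from the numbers shown (green at or above target, amber at half or more, red below half), not a confirmed spec, though they do reproduce the statuses listed under "items".

```typescript
// Sketch of a coverage rollup consistent with the record above.
// The 100%/50% thresholds are assumptions inferred from the data, not confirmed.
type Status = "green" | "amber" | "red";

function coverageStatus(actual: number, target: number): Status {
  if (target === 0) return "green"; // nothing required
  const ratio = actual / target;
  if (ratio >= 1) return "green";   // met or exceeded target
  if (ratio >= 0.5) return "amber"; // at least halfway there
  return "red";                     // well short of target
}

// Targets and actuals copied from the record's coverage block.
const targets = { tables: 16, diagrams: 2, internalLinks: 32, externalLinks: 20, footnotes: 12, references: 12 };
const actuals = { tables: 10, diagrams: 2, internalLinks: 28, externalLinks: 32, footnotes: 0, references: 16 };

for (const key of Object.keys(targets) as (keyof typeof targets)[]) {
  console.log(key, coverageStatus(actuals[key], targets[key]));
}
// Prints: tables amber, diagrams green, internalLinks amber,
// externalLinks green, footnotes red, references green —
// matching the corresponding statuses under "items" in the record.
```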