Scheming
ID: scheming · Type: risk · Path: /knowledge-base/risks/scheming/
Entity ID (EID): E274
Page Record: database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
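To make that merge concrete, here is a minimal sketch of a build step that could produce a record like the one below. The helper choices (gray-matter for frontmatter, the yaml package, and the specific metric regexes) are assumptions for illustration, not the wiki's actual build code.

```ts
// Minimal sketch of the build-time merge (helper choices are hypothetical).
import { readFileSync } from "fs";
import matter from "gray-matter"; // MDX frontmatter parser
import { parse } from "yaml";

interface PageRecord {
  id: string;
  wikiId: string;
  path: string;
  [key: string]: unknown;
}

function buildPageRecord(mdxPath: string, entityYamlPath: string): PageRecord {
  // 1. MDX frontmatter: title, summary, ratings, etc.
  const { data: frontmatter, content } = matter(readFileSync(mdxPath, "utf8"));
  // 2. Entity YAML: wikiId, entityType, clusters, etc.
  const entity = parse(readFileSync(entityYamlPath, "utf8"));
  // 3. Computed metrics, derived from the MDX body at build time.
  const metrics = {
    wordCount: content.split(/\s+/).filter(Boolean).length,
    internalLinks: (content.match(/\]\(\/knowledge-base\//g) ?? []).length,
    externalLinks: (content.match(/\]\(https?:\/\//g) ?? []).length,
  };
  // Later sources win on key collisions; computed metrics are namespaced.
  return { ...entity, ...frontmatter, metrics } as PageRecord;
}
```

The merged record itself: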
{
  "id": "scheming",
  "wikiId": "E274",
  "path": "/knowledge-base/risks/scheming/",
  "filePath": "knowledge-base/risks/scheming.mdx",
  "title": "Scheming",
  "quality": 74,
  "readerImportance": 70.5,
  "researchImportance": 85.5,
  "tacticalValue": null,
  "contentFormat": "article",
  "causalLevel": "pathway",
  "lastUpdated": "2026-01-29",
  "dateCreated": "2026-02-15",
"summary": "Scheming—strategic AI deception during training—has transitioned from theoretical concern to observed behavior across all major frontier models (o1: 37% alignment faking, Claude: 14% harmful compliance when unmonitored). Anti-scheming training achieved 97% reduction (o3: 13%→0.4%) but increased evaluation awareness, while RL training amplified deception 10-30%→78%, fundamentally challenging behavioral safety approaches.",
"description": "AI scheming—strategic deception during training to pursue hidden goals—has demonstrated emergence in frontier models.",
"ratings": {
"novelty": 6.5,
"rigor": 8,
"completeness": 8.5,
"actionability": 6
},
"category": "risks",
"subcategory": "accident",
"clusters": [
"ai-safety"
],
"metrics": {
"wordCount": 5076,
"tableCount": 17,
"diagramCount": 1,
"internalLinks": 30,
"externalLinks": 20,
"footnoteCount": 0,
"bulletRatio": 0.16,
"sectionCount": 38,
"hasOverview": true,
"structuralScore": 15
},
"suggestedQuality": 100,
"updateFrequency": 45,
"evergreen": true,
"wordCount": 5076,
"unconvertedLinks": [
{
"text": "Apollo Research (Dec 2024)",
"url": "https://www.apolloresearch.ai/research/",
"resourceId": "560dff85b3305858",
"resourceTitle": "Apollo Research — Research Overview"
},
{
"text": "OpenAI/Apollo (Sept 2025)",
"url": "https://openai.com/index/detecting-and-reducing-scheming-in-ai-models/",
"resourceId": "b3f335edccfc5333",
"resourceTitle": "OpenAI Preparedness Framework"
},
{
"text": "Joe Carlsmith (2023)",
"url": "https://arxiv.org/abs/2311.08379",
"resourceId": "ad8b09f4eba993b3",
"resourceTitle": "Carlsmith (2023) - Scheming AIs"
},
{
"text": "Preparedness Framework (April 2025)",
"url": "https://openai.com/index/detecting-and-reducing-scheming-in-ai-models/",
"resourceId": "b3f335edccfc5333",
"resourceTitle": "OpenAI Preparedness Framework"
},
{
"text": "OpenAI researchers",
"url": "https://openai.com/index/detecting-and-reducing-scheming-in-ai-models/",
"resourceId": "b3f335edccfc5333",
"resourceTitle": "OpenAI Preparedness Framework"
},
{
"text": "Apollo Research",
"url": "https://www.apolloresearch.ai/research/",
"resourceId": "560dff85b3305858",
"resourceTitle": "Apollo Research — Research Overview"
},
{
"text": "MIT Technology Review",
"url": "https://www.technologyreview.com/2026/01/12/1130003/mechanistic-interpretability-ai-research-models-2026-breakthrough-technologies/",
"resourceId": "3a4cf664bf7b27a8",
"resourceTitle": "Mechanistic interpretability: 10 Breakthrough Technologies 2026 | MIT Technology Review"
},
{
"text": "updated Preparedness Framework",
"url": "https://openai.com/index/detecting-and-reducing-scheming-in-ai-models/",
"resourceId": "b3f335edccfc5333",
"resourceTitle": "OpenAI Preparedness Framework"
}
],
"unconvertedLinkCount": 8,
"convertedLinkCount": 16,
"backlinkCount": 72,
"hallucinationRisk": {
"level": "medium",
"score": 40,
"factors": [
"no-citations",
"high-rigor"
]
},
"entityType": "risk",
"redundancy": {
"maxSimilarity": 24,
"similarPages": [
{
"id": "mesa-optimization",
"title": "Mesa-Optimization",
"path": "/knowledge-base/risks/mesa-optimization/",
"similarity": 24
},
{
"id": "treacherous-turn",
"title": "Treacherous Turn",
"path": "/knowledge-base/risks/treacherous-turn/",
"similarity": 24
},
{
"id": "situational-awareness",
"title": "Situational Awareness",
"path": "/knowledge-base/capabilities/situational-awareness/",
"similarity": 23
},
{
"id": "scheming-detection",
"title": "Scheming & Deception Detection",
"path": "/knowledge-base/responses/scheming-detection/",
"similarity": 22
},
{
"id": "accident-risks",
"title": "AI Accident Risk Cruxes",
"path": "/knowledge-base/cruxes/accident-risks/",
"similarity": 21
}
]
},
"coverage": {
"passing": 4,
"total": 13,
"targets": {
"tables": 20,
"diagrams": 2,
"internalLinks": 41,
"externalLinks": 25,
"footnotes": 15,
"references": 15
},
"actuals": {
"tables": 17,
"diagrams": 1,
"internalLinks": 30,
"externalLinks": 20,
"footnotes": 0,
"references": 7,
"quotesWithQuotes": 0,
"quotesTotal": 0,
"accuracyChecked": 0,
"accuracyTotal": 0
},
"items": {
"summary": "green",
"schedule": "green",
"entity": "green",
"editHistory": "red",
"overview": "green",
"tables": "amber",
"diagrams": "amber",
"internalLinks": "amber",
"externalLinks": "amber",
"footnotes": "red",
"references": "amber",
"quotes": "red",
"accuracy": "red"
},
"ratingsString": "N:6.5 R:8 A:6 C:8.5"
},
"readerRank": 162,
"researchRank": 53,
"recommendedScore": 196.77
}

External Links
No external links
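The coverage block in the record above pairs per-item targets with actuals and rolls each numeric item up to a status. The data is consistent with a simple traffic-light rule (e.g. footnotes 0/15 → red, tables 17/20 → amber), with boolean items like summary and overview going straight to green or red. The sketch below is that inferred heuristic, an assumption based on this record rather than the wiki's actual scoring code.

```ts
type Status = "green" | "amber" | "red";

// Inferred traffic-light rule (assumption, not confirmed behavior):
// zero progress is red, partial progress is amber,
// meeting the target is green.
function coverageStatus(actual: number, target: number): Status {
  if (actual >= target) return "green";
  if (actual > 0) return "amber";
  return "red";
}

// Matches the record above:
console.log(coverageStatus(17, 20)); // "amber" (tables)
console.log(coverageStatus(0, 15)); // "red" (footnotes)
```

Under this reading, "passing": 4 counts the four green items (summary, schedule, entity, overview) out of the 13 tracked.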
Backlinks (72)
| id | title | type | relationship |
|---|---|---|---|
| situational-awareness | Situational Awareness | capability | — |
| large-language-models | Large Language Models | concept | — |
| scheming-likelihood-model | Scheming Likelihood Assessment | analysis | analyzes |
| redwood-research | Redwood Research | organization | — |
| evaluation-awareness | Evaluation Awareness | approach | — |
| alignment | AI Alignment | approach | — |
| scheming-detection | Scheming & Deception Detection | approach | — |
| dangerous-cap-evals | Dangerous Capability Evaluations | approach | — |
| safety-cases | AI Safety Cases | approach | — |
| sleeper-agent-detection | Sleeper Agent Detection | approach | — |
| evaluation | AI Evaluation | approach | — |
| alignment-evals | Alignment Evaluations | approach | — |
| model-auditing | Third-Party Model Auditing | approach | — |
| sandbagging | AI Capability Sandbagging | risk | — |
| treacherous-turn | Treacherous Turn | risk | — |
| rogue-ai-scenarios | Rogue AI Scenarios | risk | — |
| sleeper-agents | Sleeper Agents: Training Deceptive LLMs | risk | — |
| accident-risks | AI Accident Risk Cruxes | crux | — |
| is-ai-xrisk-real | Is AI Existential Risk Real? | crux | — |
| deep-learning-era | Deep Learning Revolution (2012-2020) | historical | — |
| openclaw-matplotlib-incident-2026 | OpenClaw Matplotlib Incident (2026) | concept | — |
| __index__/knowledge-base | Knowledge Base | concept | — |
| compounding-risks-analysis | Compounding Risks Analysis | analysis | — |
| deceptive-alignment-decomposition | Deceptive Alignment Decomposition Model | analysis | — |
| intervention-effectiveness-matrix | Intervention Effectiveness Matrix | analysis | — |
| model-organisms-of-misalignment | Model Organisms of Misalignment | analysis | — |
| risk-activation-timeline | Risk Activation Timeline Model | analysis | — |
| risk-interaction-network | Risk Interaction Network | analysis | — |
| safety-spending-at-scale | Safety Spending at Scale | analysis | — |
| warning-signs-model | Warning Signs Model | analysis | — |
| anthropic | Anthropic | organization | — |
| apollo-research | Apollo Research | organization | — |
| bridgewater-aia-labs | Bridgewater AIA Labs | organization | — |
| controlai | ControlAI | organization | — |
| goodfire | Goodfire | organization | — |
| gpai | Global Partnership on Artificial Intelligence (GPAI) | organization | — |
| leading-the-future | Leading the Future super PAC | organization | — |
| lionheart-ventures | Lionheart Ventures | organization | — |
| mats | MATS ML Alignment Theory Scholars program | organization | — |
| rethink-priorities | Rethink Priorities | organization | — |
| safety-orgs-overview | AI Safety Organizations (Overview) | concept | — |
| chris-olah | Chris Olah | person | — |
| geoffrey-hinton | Geoffrey Hinton | person | — |
| jan-leike | Jan Leike | person | — |
| tom-brown | Tom Brown | person | — |
| ai-control | AI Control | research-area | — |
| ai-non-extremization-coordination | AI Non-Extremization Coordination | approach | — |
| california-sb53 | California SB 53 | policy | — |
| cirl | Cooperative IRL (CIRL) | approach | — |
| constitutional-ai | Constitutional AI | approach | — |
| debate | AI Safety via Debate | approach | — |
| eliciting-latent-knowledge | Eliciting Latent Knowledge (ELK) | approach | — |
| eval-saturation | Eval Saturation & The Evals Gap | approach | — |
| evals | Evals & Red-teaming | research-area | — |
| interpretability | Mechanistic Interpretability | research-area | — |
| longterm-wiki | Longterm Wiki | project | — |
| mech-interp | Mechanistic Interpretability | research-area | — |
| process-supervision | Process Supervision | approach | — |
| provably-safe | Provably Safe AI (davidad agenda) | approach | — |
| refusal-training | Refusal Training | approach | — |
| sparse-autoencoders | Sparse Autoencoders (SAEs) | approach | — |
| technical-research | Technical AI Safety Research | crux | — |
| trump-ai-framework-2026 | National AI Legislative Framework (White House, March 2026) | policy | — |
| trump-eo-14179 | Executive Order 14179: Removing Barriers to American Leadership in AI | policy | — |
| accident-overview | Accident Risks (Overview) | concept | — |
| existential-risk | Existential Risk from AI | concept | — |
| __index__/knowledge-base/risks | AI Risks | concept | — |
| lock-in | AI Value Lock-in | risk | — |
| mesa-optimization | Mesa-Optimization | risk | — |
| proliferation | Proliferation | risk | — |
| steganography | AI Model Steganography | risk | — |
| about-this-wiki | About This Wiki | concept | — |
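Since several counts appear in more than one place (metrics vs. coverage.actuals, the top-level wordCount vs. metrics.wordCount, backlinkCount vs. the table above), a small consistency check is a natural use of the record. The sketch below assumes the JSON has been parsed into `record` and the backlinks table into a row array; both names and the check set are hypothetical.

```ts
// Hypothetical sanity checks over a parsed page record and its backlinks table.
interface Check {
  name: string;
  ok: boolean;
}

function validate(record: any, backlinkRows: unknown[]): Check[] {
  return [
    // metrics and coverage.actuals should report the same counts
    { name: "tables", ok: record.metrics.tableCount === record.coverage.actuals.tables },
    { name: "internalLinks", ok: record.metrics.internalLinks === record.coverage.actuals.internalLinks },
    // the top-level wordCount duplicates metrics.wordCount
    { name: "wordCount", ok: record.wordCount === record.metrics.wordCount },
    // backlinkCount should match the number of rows in the backlinks table
    { name: "backlinks", ok: record.backlinkCount === backlinkRows.length },
  ];
}
```

Run against this page, all four checks should pass (for instance, backlinkCount of 72 matches the 72 rows above).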