Mesa-Optimization

ID: mesa-optimization
Type: risk
Path: /knowledge-base/risks/mesa-optimization/
Entity ID (EID): E197

Page Record
database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time

```json
{
  "id": "mesa-optimization",
  "wikiId": "E197",
  "path": "/knowledge-base/risks/mesa-optimization/",
  "filePath": "knowledge-base/risks/mesa-optimization.mdx",
  "title": "Mesa-Optimization",
  "quality": 63,
  "readerImportance": 18.5,
  "researchImportance": 85,
  "tacticalValue": null,
  "contentFormat": "article",
  "causalLevel": "pathway",
  "lastUpdated": "2026-01-29",
  "dateCreated": "2026-02-15",
  "summary": "Mesa-optimization—where AI systems develop internal optimizers with different objectives than training goals—shows concerning empirical evidence: Claude exhibited alignment faking in 12-78% of monitored cases (2024), and deliberative alignment reduced scheming by 30× but couldn't eliminate it. Current detection methods achieve >99% AUROC on known deceptive behaviors, but adversarial robustness remains untested, with expert probability estimates for advanced AI mesa-optimization ranging 20-70%.",
  "description": "The risk that AI systems may develop internal optimizers with objectives different from their training objectives.",
  "ratings": {
    "novelty": 4.5,
    "rigor": 6.8,
    "completeness": 7.5,
    "actionability": 5.2
  },
  "category": "risks",
  "subcategory": "accident",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 4335,
    "tableCount": 12,
    "diagramCount": 1,
    "internalLinks": 40,
    "externalLinks": 14,
    "footnoteCount": 0,
    "bulletRatio": 0.08,
    "sectionCount": 31,
    "hasOverview": true,
    "structuralScore": 15
  },
  "suggestedQuality": 100,
  "updateFrequency": 45,
  "evergreen": true,
  "wordCount": 4335,
  "unconvertedLinks": [
    {
      "text": "Future of Life Institute's 2025 AI Safety Index",
      "url": "https://futureoflife.org/ai-safety-index-summer-2025/",
      "resourceId": "df46edd6fa2078d1",
      "resourceTitle": "FLI AI Safety Index Summer 2025"
    },
    {
      "text": "Frontier Models Scheming",
      "url": "https://www.apolloresearch.ai/research/",
      "resourceId": "560dff85b3305858",
      "resourceTitle": "Apollo Research — Research Overview"
    },
    {
      "text": "Deliberative Alignment",
      "url": "https://openai.com/index/detecting-and-reducing-scheming-in-ai-models/",
      "resourceId": "b3f335edccfc5333",
      "resourceTitle": "OpenAI Preparedness Framework"
    },
    {
      "text": "Palisade Chess Study",
      "url": "https://en.wikipedia.org/wiki/AI_alignment",
      "resourceId": "c799d5e1347e4372",
      "resourceTitle": "AI Alignment - Wikipedia"
    },
    {
      "text": "Apollo Research",
      "url": "https://www.apolloresearch.ai/research/",
      "resourceId": "560dff85b3305858",
      "resourceTitle": "Apollo Research — Research Overview"
    },
    {
      "text": "Palisade Research",
      "url": "https://en.wikipedia.org/wiki/AI_alignment",
      "resourceId": "c799d5e1347e4372",
      "resourceTitle": "AI Alignment - Wikipedia"
    },
    {
      "text": "OpenAI partners with Apollo",
      "url": "https://openai.com/index/detecting-and-reducing-scheming-in-ai-models/",
      "resourceId": "b3f335edccfc5333",
      "resourceTitle": "OpenAI Preparedness Framework"
    },
    {
      "text": "Future of Life AI Safety Index",
      "url": "https://futureoflife.org/ai-safety-index-summer-2025/",
      "resourceId": "df46edd6fa2078d1",
      "resourceTitle": "FLI AI Safety Index Summer 2025"
    }
  ],
  "unconvertedLinkCount": 8,
  "convertedLinkCount": 25,
  "backlinkCount": 32,
  "hallucinationRisk": {
    "level": "medium",
    "score": 55,
    "factors": [
      "no-citations"
    ]
  },
  "entityType": "risk",
  "redundancy": {
    "maxSimilarity": 24,
    "similarPages": [
      {
        "id": "scheming",
        "title": "Scheming",
        "path": "/knowledge-base/risks/scheming/",
        "similarity": 24
      },
      {
        "id": "goal-misgeneralization",
        "title": "Goal Misgeneralization",
        "path": "/knowledge-base/risks/goal-misgeneralization/",
        "similarity": 23
      },
      {
        "id": "sharp-left-turn",
        "title": "Sharp Left Turn",
        "path": "/knowledge-base/risks/sharp-left-turn/",
        "similarity": 22
      },
      {
        "id": "sleeper-agent-detection",
        "title": "Sleeper Agent Detection",
        "path": "/knowledge-base/responses/sleeper-agent-detection/",
        "similarity": 21
      },
      {
        "id": "treacherous-turn",
        "title": "Treacherous Turn",
        "path": "/knowledge-base/risks/treacherous-turn/",
        "similarity": 21
      }
    ]
  },
  "coverage": {
    "passing": 6,
    "total": 13,
    "targets": {
      "tables": 17,
      "diagrams": 2,
      "internalLinks": 35,
      "externalLinks": 22,
      "footnotes": 13,
      "references": 13
    },
    "actuals": {
      "tables": 12,
      "diagrams": 1,
      "internalLinks": 40,
      "externalLinks": 14,
      "footnotes": 0,
      "references": 13,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "summary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "red",
      "overview": "green",
      "tables": "amber",
      "diagrams": "amber",
      "internalLinks": "green",
      "externalLinks": "amber",
      "footnotes": "red",
      "references": "green",
      "quotes": "red",
      "accuracy": "red"
    },
    "ratingsString": "N:4.5 R:6.8 A:5.2 C:7.5"
  },
  "readerRank": 531,
  "researchRank": 58,
  "recommendedScore": 148.77
}
```
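
As the header notes, this record is not authored directly: it is assembled at build time from the MDX frontmatter, the Entity YAML, and metrics computed from the rendered page. A minimal TypeScript sketch of that merge is below; the loader names (`loadFrontmatter`, `loadEntityYaml`, `computeMetrics`) are hypothetical, and only the output shape is taken from the record above.

```typescript
type Json = Record<string, unknown>;

// Hypothetical loaders: the real build pipeline is not shown in this record.
declare function loadFrontmatter(mdxPath: string): Json; // title, summary, ratings, ...
declare function loadEntityYaml(wikiId: string): Json;   // entityType, category, ...
declare function computeMetrics(mdxPath: string): Json;  // wordCount, tableCount, ...

// Assumed merge order: later spreads win on key collisions, so computed
// metrics reflect the current build rather than stale frontmatter values.
function buildPageRecord(mdxPath: string, wikiId: string): Json {
  const metrics = computeMetrics(mdxPath);
  return {
    ...loadEntityYaml(wikiId),
    ...loadFrontmatter(mdxPath),
    metrics,
    wordCount: metrics.wordCount, // duplicated at the top level, as in the record
  };
}
```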

External Links

```json
{
  "wikipedia": "https://en.wikipedia.org/wiki/AI_alignment#Mesa-optimization",
  "lesswrong": "https://www.lesswrong.com/tag/mesa-optimization",
  "stampy": "https://aisafety.info/questions/8V5k/What-is-mesa-optimization",
  "alignmentForum": "https://www.alignmentforum.org/tag/mesa-optimization"
}
```
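
One pattern worth noting in the record's `unconvertedLinks`: identical URLs always carry identical `resourceId` values (both Apollo Research entries map to `560dff85b3305858`, both FLI entries to `df46edd6fa2078d1`). The 16-hex-character IDs look like truncated digests of the resource URL; the sketch below assumes that derivation, which the record itself does not confirm.

```typescript
import { createHash } from "node:crypto";

// Assumption: resourceId is the first 16 hex chars of a SHA-256 digest of the
// canonical URL. The record only shows that equal URLs get equal IDs; the
// actual hash function and URL canonicalization are not documented here.
function resourceId(url: string): string {
  return createHash("sha256").update(url).digest("hex").slice(0, 16);
}

const a = resourceId("https://www.apolloresearch.ai/research/");
const b = resourceId("https://www.apolloresearch.ai/research/");
console.log(a === b); // true: duplicate URLs collapse to one ID, as observed above
```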

Backlinks (32)

| id | title | type | relationship |
|---|---|---|---|
| accident-risks | AI Accident Risk Cruxes | crux | — |
| deceptive-alignment-decomposition | Deceptive Alignment Decomposition Model | analysis | related |
| mesa-optimization-analysis | Mesa-Optimization Risk Analysis | analysis | analyzes |
| deceptive-alignment | Deceptive Alignment | risk | — |
| goal-misgeneralization | Goal Misgeneralization | risk | — |
| scheming | Scheming | risk | — |
| sharp-left-turn | Sharp Left Turn | risk | — |
| __index__/knowledge-base/cruxes | Key Cruxes | concept | — |
| why-alignment-hard | Why Alignment Might Be Hard | argument | — |
| deep-learning-era | Deep Learning Revolution (2012-2020) | historical | — |
| early-warnings | Early Warnings (1950s-2000) | historical | — |
| __index__/knowledge-base | Knowledge Base | concept | — |
| compounding-risks-analysis | Compounding Risks Analysis | analysis | — |
| goal-misgeneralization-probability | Goal Misgeneralization Probability Model | analysis | — |
| instrumental-convergence-framework | Instrumental Convergence Framework | analysis | — |
| risk-interaction-network | Risk Interaction Network | analysis | — |
| scheming-likelihood-model | Scheming Likelihood Assessment | analysis | — |
| eliezer-yudkowsky-predictions | Eliezer Yudkowsky: Track Record | concept | — |
| evan-hubinger | Evan Hubinger | person | — |
| robin-hanson | Robin Hanson | person | — |
| toby-ord | Toby Ord | person | — |
| agent-foundations | Agent Foundations | approach | — |
| ai-control | AI Control | research-area | — |
| alignment-evals | Alignment Evaluations | approach | — |
| alignment | AI Alignment | approach | — |
| interpretability | Mechanistic Interpretability | research-area | — |
| mech-interp | Mechanistic Interpretability | research-area | — |
| scheming-detection | Scheming & Deception Detection | approach | — |
| sparse-autoencoders | Sparse Autoencoders (SAEs) | approach | — |
| accident-overview | Accident Risks (Overview) | concept | — |
| __index__/knowledge-base/risks | AI Risks | concept | — |
| steganography | AI Model Steganography | risk | — |
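
Finally, the `coverage` block pairs per-item targets with actuals and a green/amber/red status, and `passing: 6` equals the number of green items among the 13 checks. The numeric statuses shown are consistent with a simple ratio rule (green at or above target, amber at half or more of target, red below half). The thresholds below are an assumption inferred from this single record, not documented behavior.

```typescript
type Status = "green" | "amber" | "red";

// Assumed thresholds, inferred from the record above: they reproduce all six
// numeric coverage items here, but nothing in this page record confirms them.
function coverageStatus(actual: number, target: number): Status {
  if (actual >= target) return "green";
  if (actual >= target / 2) return "amber";
  return "red";
}

console.log(coverageStatus(12, 17)); // "amber" (tables: 12 of 17)
console.log(coverageStatus(1, 2));   // "amber" (diagrams: 1 of 2)
console.log(coverageStatus(0, 13));  // "red"   (footnotes: 0 of 13)
console.log(coverageStatus(40, 35)); // "green" (internalLinks: 40 vs. 35)
```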