Skip to content
Longterm Wiki

Deceptive Alignment

deceptive-alignment · risk · Path: /knowledge-base/risks/deceptive-alignment/
E93 · Entity ID (EID)
← Back to page · 114 backlinks · Quality: 75 · Updated: 2026-01-28
Page Record: database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
  "id": "deceptive-alignment",
  "wikiId": "E93",
  "path": "/knowledge-base/risks/deceptive-alignment/",
  "filePath": "knowledge-base/risks/deceptive-alignment.mdx",
  "title": "Deceptive Alignment",
  "quality": 75,
  "readerImportance": 18.5,
  "researchImportance": 91,
  "tacticalValue": null,
  "contentFormat": "article",
  "causalLevel": "pathway",
  "lastUpdated": "2026-01-28",
  "dateCreated": "2026-02-15",
  "summary": "Comprehensive analysis of deceptive alignment risk where AI systems appear aligned during training but pursue different goals when deployed. Expert probability estimates range 5-90%, with key empirical evidence from Anthropic's 2024 Sleeper Agents study showing backdoored behaviors persist through safety training, and growing situational awareness in GPT-4-class models.",
  "description": "Risk that AI systems appear aligned during training but pursue different goals when deployed, with expert probability estimates ranging 5-90% and growing empirical evidence from studies like Anthropic's Sleeper Agents research",
  "ratings": {
    "novelty": 4.5,
    "rigor": 6.5,
    "completeness": 7,
    "actionability": 6
  },
  "category": "risks",
  "subcategory": "accident",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 2023,
    "tableCount": 16,
    "diagramCount": 1,
    "internalLinks": 47,
    "externalLinks": 10,
    "footnoteCount": 0,
    "bulletRatio": 0.13,
    "sectionCount": 31,
    "hasOverview": true,
    "structuralScore": 15
  },
  "suggestedQuality": 100,
  "updateFrequency": 45,
  "evergreen": true,
  "wordCount": 2023,
  "unconvertedLinks": [
    {
      "text": "Risks from Learned Optimization",
      "url": "https://arxiv.org/abs/1906.01820",
      "resourceId": "c4858d4ef280d8e6",
      "resourceTitle": "Risks from Learned Optimization"
    },
    {
      "text": "Anthropic's Sleeper Agents study",
      "url": "https://arxiv.org/abs/2401.05566",
      "resourceId": "e5c0904211c7d0cc",
      "resourceTitle": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training"
    },
    {
      "text": "emerging self-awareness",
      "url": "https://arxiv.org/abs/2401.05566",
      "resourceId": "e5c0904211c7d0cc",
      "resourceTitle": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training"
    },
    {
      "text": "Defection probes",
      "url": "https://www.anthropic.com/research/probes-catch-sleeper-agents",
      "resourceId": "72c1254d07071bf7",
      "resourceTitle": "Anthropic's follow-up research on defection probes"
    },
    {
      "text": "o3 scheming from 13% to 0.4%",
      "url": "https://openai.com/index/detecting-and-reducing-scheming-in-ai-models/",
      "resourceId": "b3f335edccfc5333",
      "resourceTitle": "OpenAI Preparedness Framework"
    },
    {
      "text": "Risks from Learned Optimization",
      "url": "https://arxiv.org/abs/1906.01820",
      "resourceId": "c4858d4ef280d8e6",
      "resourceTitle": "Risks from Learned Optimization"
    },
    {
      "text": "Simple Probes Can Catch Sleeper Agents",
      "url": "https://www.anthropic.com/research/probes-catch-sleeper-agents",
      "resourceId": "72c1254d07071bf7",
      "resourceTitle": "Anthropic's follow-up research on defection probes"
    },
    {
      "text": "Detecting and Reducing Scheming",
      "url": "https://openai.com/index/detecting-and-reducing-scheming-in-ai-models/",
      "resourceId": "b3f335edccfc5333",
      "resourceTitle": "OpenAI Preparedness Framework"
    }
  ],
  "unconvertedLinkCount": 8,
  "convertedLinkCount": 19,
  "backlinkCount": 114,
  "hallucinationRisk": {
    "level": "medium",
    "score": 55,
    "factors": [
      "no-citations"
    ]
  },
  "entityType": "risk",
  "redundancy": {
    "maxSimilarity": 17,
    "similarPages": [
      {
        "id": "corrigibility-failure-pathways",
        "title": "Corrigibility Failure Pathways",
        "path": "/knowledge-base/models/corrigibility-failure-pathways/",
        "similarity": 17
      },
      {
        "id": "deceptive-alignment-decomposition",
        "title": "Deceptive Alignment Decomposition Model",
        "path": "/knowledge-base/models/deceptive-alignment-decomposition/",
        "similarity": 17
      },
      {
        "id": "scheming-detection",
        "title": "Scheming & Deception Detection",
        "path": "/knowledge-base/responses/scheming-detection/",
        "similarity": 17
      },
      {
        "id": "sleeper-agent-detection",
        "title": "Sleeper Agent Detection",
        "path": "/knowledge-base/responses/sleeper-agent-detection/",
        "similarity": 17
      },
      {
        "id": "mesa-optimization-analysis",
        "title": "Mesa-Optimization Risk Analysis",
        "path": "/knowledge-base/models/mesa-optimization-analysis/",
        "similarity": 16
      }
    ]
  },
  "coverage": {
    "passing": 9,
    "total": 13,
    "targets": {
      "tables": 8,
      "diagrams": 1,
      "internalLinks": 16,
      "externalLinks": 10,
      "footnotes": 6,
      "references": 6
    },
    "actuals": {
      "tables": 16,
      "diagrams": 1,
      "internalLinks": 47,
      "externalLinks": 10,
      "footnotes": 0,
      "references": 14,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "summary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "red",
      "overview": "green",
      "tables": "green",
      "diagrams": "green",
      "internalLinks": "green",
      "externalLinks": "green",
      "footnotes": "red",
      "references": "green",
      "quotes": "red",
      "accuracy": "red"
    },
    "ratingsString": "N:4.5 R:6.5 A:6 C:7"
  },
  "readerRank": 530,
  "researchRank": 28,
  "recommendedScore": 172.48
}
External Links
{
  "lesswrong": "https://www.lesswrong.com/tag/deceptive-alignment",
  "stampy": "https://aisafety.info/questions/6170/What-is-deceptive-alignment",
  "alignmentForum": "https://www.alignmentforum.org/tag/deceptive-alignment"
}
Backlinks (114)
| id | title | type | relationship |
| --- | --- | --- | --- |
| persuasion | Persuasion and Social Manipulation | capability | |
| situational-awareness | Situational Awareness | capability | |
| technical-research | Technical AI Safety Research | crux | |
| accident-risks | AI Accident Risk Cruxes | crux | |
| large-language-models | Large Language Models | concept | |
| model-organisms-of-misalignment | Model Organisms of Misalignment | analysis | |
| deceptive-alignment-decomposition | Deceptive Alignment Decomposition Model | analysis | analyzes |
| mesa-optimization-analysis | Mesa-Optimization Risk Analysis | analysis | related |
| scheming-likelihood-model | Scheming Likelihood Assessment | analysis | related |
| anthropic | Anthropic | organization | addresses |
| openai | OpenAI | organization | addresses |
| apollo-research | Apollo Research | organization | |
| arc | Alignment Research Center (ARC) | organization | |
| eliezer-yudkowsky | Eliezer Yudkowsky | person | |
| ai-control | AI Control | research-area | addresses |
| evaluation-awareness | Evaluation Awareness | approach | |
| alignment | AI Alignment | approach | |
| scheming-detection | Scheming & Deception Detection | approach | |
| sleeper-agent-detection | Sleeper Agent Detection | approach | |
| evaluation | AI Evaluation | approach | |
| alignment-evals | Alignment Evaluations | approach | |
| weak-to-strong | Weak-to-Strong Generalization | approach | |
| refusal-training | Refusal Training | approach | |
| sparse-autoencoders | Sparse Autoencoders (SAEs) | approach | |
| eliciting-latent-knowledge | Eliciting Latent Knowledge (ELK) | approach | |
| debate | AI Safety via Debate | approach | |
| formal-verification | Formal Verification (AI Safety) | approach | |
| goal-misgeneralization | Goal Misgeneralization | risk | |
| mesa-optimization | Mesa-Optimization | risk | |
| scheming | Scheming | risk | |
| rogue-ai-scenarios | Rogue AI Scenarios | risk | |
| sleeper-agents | Sleeper Agents: Training Deceptive LLMs | risk | |
| language-models | Large Language Models | capability | |
| long-horizon | Long-Horizon Autonomous Tasks | capability | |
| __index__/knowledge-base/cruxes | Key Cruxes | concept | |
| is-ai-xrisk-real | Is AI Existential Risk Real? | crux | |
| why-alignment-hard | Why Alignment Might Be Hard | argument | |
| deep-learning-era | Deep Learning Revolution (2012-2020) | historical | |
| miri-era | The MIRI Era (2000-2015) | historical | |
| __index__/knowledge-base | Knowledge Base | concept | |
| alignment-robustness-trajectory | Alignment Robustness Trajectory | analysis | |
| capability-alignment-race | Capability-Alignment Race Model | analysis | |
| compounding-risks-analysis | Compounding Risks Analysis | analysis | |
| defense-in-depth-model | Defense in Depth Model | analysis | |
| goal-misgeneralization-probability | Goal Misgeneralization Probability Model | analysis | |
| instrumental-convergence-framework | Instrumental Convergence Framework | analysis | |
| intervention-effectiveness-matrix | Intervention Effectiveness Matrix | analysis | |
| power-seeking-conditions | Power-Seeking Emergence Conditions Model | analysis | |
| racing-dynamics-impact | Racing Dynamics Impact Model | analysis | |
| reward-hacking-taxonomy | Reward Hacking Taxonomy and Severity Model | analysis | |
| risk-cascade-pathways | Risk Cascade Pathways | analysis | |
| risk-interaction-matrix | Risk Interaction Matrix Model | analysis | |
| risk-interaction-network | Risk Interaction Network | analysis | |
| safety-spending-at-scale | Safety Spending at Scale | analysis | |
| technical-pathways | Technical Pathway Decomposition | analysis | |
| far-ai | FAR AI | organization | |
| frontier-model-forum | Frontier Model Forum | organization | |
| redwood-research | Redwood Research | organization | |
| ajeya-cotra | Ajeya Cotra | person | |
| chris-olah | Chris Olah | person | |
| dario-amodei | Dario Amodei | person | |
| eliezer-yudkowsky-predictions | Eliezer Yudkowsky: Track Record | concept | |
| evan-hubinger | Evan Hubinger | person | |
| geoffrey-hinton | Geoffrey Hinton | person | |
| jan-leike | Jan Leike | person | |
| leopold-aschenbrenner | Leopold Aschenbrenner | person | |
| neel-nanda | Neel Nanda | person | |
| paul-christiano | Paul Christiano | person | |
| robin-hanson | Robin Hanson | person | |
| stuart-russell | Stuart Russell | person | |
| tom-brown | Tom Brown | person | |
| yoshua-bengio | Yoshua Bengio | person | |
| adversarial-training | Adversarial Training | approach | |
| agent-foundations | Agent Foundations | approach | |
| ai-non-extremization-coordination | AI Non-Extremization Coordination | approach | |
| california-sb53 | California SB 53 | policy | |
| cirl | Cooperative IRL (CIRL) | approach | |
| cooperative-ai | Cooperative AI | approach | |
| corporate | Corporate AI Safety Responses | approach | |
| epistemic-virtue-evals | Epistemic Virtue Evals | approach | |
| evals | Evals & Red-teaming | research-area | |
| goal-misgeneralization-research | Goal Misgeneralization Research | approach | |
| interpretability | Mechanistic Interpretability | research-area | |
| lab-culture | AI Lab Safety Culture | approach | |
| longterm-wiki | Longterm Wiki | project | |
| mech-interp | Mechanistic Interpretability | research-area | |
| process-supervision | Process Supervision | approach | |
| provably-safe | Provably Safe AI (davidad agenda) | approach | |
| red-teaming | Red Teaming | research-area | |
| representation-engineering | Representation Engineering | approach | |
| reward-modeling | Reward Modeling | approach | |
| rlhf | RLHF / Constitutional AI | research-area | |
| scalable-oversight | Scalable Oversight | research-area | |
| seoul-declaration | Seoul AI Safety Summit Declaration | policy | |
| state-capacity-ai-governance | State Capacity and AI Governance | concept | |
| thresholds | Compute Thresholds | concept | |
| trump-ai-framework-2026 | National AI Legislative Framework (White House, March 2026) | policy | |
| trump-eo-14179 | Executive Order 14179: Removing Barriers to American Leadership in AI | policy | |
| wikipedia-and-ai | Wikipedia and AI Content | concept | |
| accident-overview | Accident Risks (Overview) | concept | |
| epistemic-systemic-risk | Epistemic Systemic Risk | risk | |
| existential-risk | Existential Risk from AI | concept | |
| __index__/knowledge-base/risks | AI Risks | concept | |
| power-seeking | Power-Seeking AI | risk | |
| steganography | AI Model Steganography | risk | |
| sycophancy | Sycophancy | risk | |
| treacherous-turn | Treacherous Turn | risk | |
| doomer | AI Doomer Worldview | concept | |
| optimistic | Optimistic Alignment Worldview | concept | |
| about-this-wiki | About This Wiki | concept | |
| knowledge-base | Knowledge Base Style Guide | concept | |
| longtermwiki-value-proposition | LongtermWiki Value Proposition | concept | |
| risk-style-guide | Risk Pages Style Guide | concept | |
| table-candidates | Table Candidates | concept | |
Longterm Wiki