Alignment Robustness
alignment-robustness (E20)
Path: /ai-transition-model/alignment-robustness/
Page Metadata
{
"id": "alignment-robustness",
"numericId": null,
"path": "/ai-transition-model/alignment-robustness/",
"filePath": "ai-transition-model/alignment-robustness.mdx",
"title": "Alignment Robustness",
"quality": null,
"importance": null,
"contentFormat": "article",
"tractability": null,
"neglectedness": null,
"uncertainty": null,
"causalLevel": null,
"lastUpdated": null,
"llmSummary": "This page contains only a React component import with no actual content rendered in the provided text. Cannot assess importance or quality without the actual substantive content.",
"structuredSummary": null,
"description": null,
"ratings": {
"novelty": 0,
"rigor": 0,
"actionability": 0,
"completeness": 0
},
"category": "ai-transition-model",
"subcategory": "factors-misalignment-potential",
"clusters": [
"ai-safety"
],
"metrics": {
"wordCount": 0,
"tableCount": 0,
"diagramCount": 0,
"internalLinks": 0,
"externalLinks": 0,
"footnoteCount": 0,
"bulletRatio": 0,
"sectionCount": 0,
"hasOverview": false,
"structuralScore": 2
},
"suggestedQuality": 13,
"updateFrequency": null,
"evergreen": true,
"wordCount": 0,
"unconvertedLinks": [],
"unconvertedLinkCount": 0,
"convertedLinkCount": 0,
"backlinkCount": 12,
"redundancy": {
"maxSimilarity": 0,
"similarPages": []
}
}
Entity Data
{
"id": "alignment-robustness",
"type": "ai-transition-model-parameter",
"title": "Alignment Robustness",
"description": "How reliably AI systems pursue intended goals across contexts, distribution shifts, and adversarial conditions. Measures the stability of alignment under real-world deployment.",
"tags": [
"safety",
"technical",
"alignment"
],
"relatedEntries": [
{
"id": "reward-hacking",
"type": "risk",
"relationship": "decreases"
},
{
"id": "mesa-optimization",
"type": "risk",
"relationship": "decreases"
},
{
"id": "goal-misgeneralization",
"type": "risk",
"relationship": "decreases"
},
{
"id": "deceptive-alignment",
"type": "risk",
"relationship": "decreases"
},
{
"id": "sycophancy",
"type": "risk",
"relationship": "decreases"
},
{
"id": "interpretability",
"type": "approach",
"relationship": "supports"
},
{
"id": "evals",
"type": "approach",
"relationship": "supports"
},
{
"id": "ai-control",
"type": "approach",
"relationship": "supports"
},
{
"id": "interpretability-coverage",
"type": "ai-transition-model-parameter",
"relationship": "related"
},
{
"id": "safety-capability-gap",
"type": "ai-transition-model-parameter",
"relationship": "related"
},
{
"id": "human-oversight-quality",
"type": "ai-transition-model-parameter",
"relationship": "related"
},
{
"id": "alignment-progress",
"type": "ai-transition-model-metric",
"relationship": "measured-by"
},
{
"id": "deceptive-alignment-decomposition",
"type": "model",
"relationship": "analyzed-by"
},
{
"id": "corrigibility-failure-pathways",
"type": "model",
"relationship": "analyzed-by"
},
{
"id": "safety-capability-tradeoff",
"type": "model",
"relationship": "analyzed-by"
},
{
"id": "racing-dynamics-model",
"type": "model",
"relationship": "analyzed-by"
}
],
"sources": [],
"lastUpdated": "2025-12",
"customFields": [
{
"label": "Direction",
"value": "Higher is better"
},
{
"label": "Current Trend",
"value": "Declining relative to capability (1-2% reward hacking in frontier models)"
},
{
"label": "Key Measurement",
"value": "Behavioral reliability under distribution shift, reward hacking rates"
}
]
}
Canonical Facts (0)
No facts for this entity
External Links
{
"lesswrong": "https://www.lesswrong.com/tag/ai-alignment"
}
Backlinks (12)
| id | title | type | relationship |
|---|---|---|---|
| misalignment-potential | Misalignment Potential | ai-transition-model-factor | composed-of |
| alignment-progress | Alignment Progress | ai-transition-model-metric | measures |
| safety-capability-gap | Safety-Capability Gap | ai-transition-model-parameter | related |
| existential-catastrophe | Existential Catastrophe | ai-transition-model-scenario | mitigates |
| ai-takeover | AI Takeover | ai-transition-model-scenario | mitigated-by |
| deceptive-alignment-decomposition | Deceptive Alignment Decomposition Model | model | models |
| carlsmith-six-premises | Carlsmith's Six-Premise Argument | model | models |
| corrigibility-failure-pathways | Corrigibility Failure Pathways | model | models |
| safety-capability-tradeoff | Safety-Capability Tradeoff Model | model | affects |
| alignment-robustness-trajectory | Alignment Robustness Trajectory Model | model | models |
| interpretability | Interpretability | safety-agenda | increases |
| scalable-oversight | Scalable Oversight | safety-agenda | supports |
Frontmatter
{
"title": "Alignment Robustness",
"sidebar": {
"order": 8
},
"importance": 0,
"quality": 0,
"llmSummary": "This page contains only a React component import with no actual content rendered in the provided text. Cannot assess importance or quality without the actual substantive content.",
"ratings": {
"novelty": 0,
"rigor": 0,
"actionability": 0,
"completeness": 0
},
"clusters": [
"ai-safety"
],
"subcategory": "factors-misalignment-potential"
}
Raw MDX Source
---
title: Alignment Robustness
sidebar:
  order: 8
importance: 0
quality: 0
llmSummary: This page contains only a React component import with no actual content rendered in the provided text. Cannot assess importance or quality without the actual substantive content.
ratings:
  novelty: 0
  rigor: 0
  actionability: 0
  completeness: 0
clusters:
  - ai-safety
subcategory: factors-misalignment-potential
---
import {TransitionModelContent} from '@components/wiki/TransitionModelContent';
<TransitionModelContent entityId="E303" />