
Alignment Robustness

alignment-robustness (E20)
Path: /ai-transition-model/alignment-robustness/
Page Metadata
{
  "id": "alignment-robustness",
  "numericId": null,
  "path": "/ai-transition-model/alignment-robustness/",
  "filePath": "ai-transition-model/alignment-robustness.mdx",
  "title": "Alignment Robustness",
  "quality": null,
  "importance": null,
  "contentFormat": "article",
  "tractability": null,
  "neglectedness": null,
  "uncertainty": null,
  "causalLevel": null,
  "lastUpdated": null,
  "llmSummary": "This page contains only a React component import with no actual content rendered in the provided text. Cannot assess importance or quality without the actual substantive content.",
  "structuredSummary": null,
  "description": null,
  "ratings": {
    "novelty": 0,
    "rigor": 0,
    "actionability": 0,
    "completeness": 0
  },
  "category": "ai-transition-model",
  "subcategory": "factors-misalignment-potential",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 0,
    "tableCount": 0,
    "diagramCount": 0,
    "internalLinks": 0,
    "externalLinks": 0,
    "footnoteCount": 0,
    "bulletRatio": 0,
    "sectionCount": 0,
    "hasOverview": false,
    "structuralScore": 2
  },
  "suggestedQuality": 13,
  "updateFrequency": null,
  "evergreen": true,
  "wordCount": 0,
  "unconvertedLinks": [],
  "unconvertedLinkCount": 0,
  "convertedLinkCount": 0,
  "backlinkCount": 12,
  "redundancy": {
    "maxSimilarity": 0,
    "similarPages": []
  }
}
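
For readers scripting against these dumps, here is a minimal TypeScript sketch of the metadata record's shape, inferred from the JSON above; the type names (`PageMetadata`, `PageMetrics`) are assumptions, not identifiers from the wiki's codebase:

// Hypothetical types inferred from the Page Metadata JSON above (not from the wiki's source).
interface PageMetrics {
  wordCount: number;
  tableCount: number;
  diagramCount: number;
  internalLinks: number;
  externalLinks: number;
  footnoteCount: number;
  bulletRatio: number;    // fraction of content lines that are bullets, 0..1
  sectionCount: number;
  hasOverview: boolean;
  structuralScore: number;
}

interface PageMetadata {
  id: string;
  numericId: number | null;
  path: string;
  filePath: string;
  title: string;
  quality: number | null;       // null in this record; suggestedQuality holds an estimate
  importance: number | null;
  contentFormat: string;        // e.g. "article"
  llmSummary: string;
  ratings: { novelty: number; rigor: number; actionability: number; completeness: number };
  category: string;
  subcategory: string;
  clusters: string[];
  metrics: PageMetrics;
  suggestedQuality: number;
  evergreen: boolean;
  backlinkCount: number;
  redundancy: { maxSimilarity: number; similarPages: string[] };
  // ...remaining nullable scalars (tractability, neglectedness, uncertainty, etc.) omitted
}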
Entity Data
{
  "id": "alignment-robustness",
  "type": "ai-transition-model-parameter",
  "title": "Alignment Robustness",
  "description": "How reliably AI systems pursue intended goals across contexts, distribution shifts, and adversarial conditions. Measures the stability of alignment under real-world deployment.",
  "tags": [
    "safety",
    "technical",
    "alignment"
  ],
  "relatedEntries": [
    {
      "id": "reward-hacking",
      "type": "risk",
      "relationship": "decreases"
    },
    {
      "id": "mesa-optimization",
      "type": "risk",
      "relationship": "decreases"
    },
    {
      "id": "goal-misgeneralization",
      "type": "risk",
      "relationship": "decreases"
    },
    {
      "id": "deceptive-alignment",
      "type": "risk",
      "relationship": "decreases"
    },
    {
      "id": "sycophancy",
      "type": "risk",
      "relationship": "decreases"
    },
    {
      "id": "interpretability",
      "type": "approach",
      "relationship": "supports"
    },
    {
      "id": "evals",
      "type": "approach",
      "relationship": "supports"
    },
    {
      "id": "ai-control",
      "type": "approach",
      "relationship": "supports"
    },
    {
      "id": "interpretability-coverage",
      "type": "ai-transition-model-parameter",
      "relationship": "related"
    },
    {
      "id": "safety-capability-gap",
      "type": "ai-transition-model-parameter",
      "relationship": "related"
    },
    {
      "id": "human-oversight-quality",
      "type": "ai-transition-model-parameter",
      "relationship": "related"
    },
    {
      "id": "alignment-progress",
      "type": "ai-transition-model-metric",
      "relationship": "measured-by"
    },
    {
      "id": "deceptive-alignment-decomposition",
      "type": "model",
      "relationship": "analyzed-by"
    },
    {
      "id": "corrigibility-failure-pathways",
      "type": "model",
      "relationship": "analyzed-by"
    },
    {
      "id": "safety-capability-tradeoff",
      "type": "model",
      "relationship": "analyzed-by"
    },
    {
      "id": "racing-dynamics-model",
      "type": "model",
      "relationship": "analyzed-by"
    }
  ],
  "sources": [],
  "lastUpdated": "2025-12",
  "customFields": [
    {
      "label": "Direction",
      "value": "Higher is better"
    },
    {
      "label": "Current Trend",
      "value": "Declining relative to capability (1-2% reward hacking in frontier models)"
    },
    {
      "label": "Key Measurement",
      "value": "Behavioral reliability under distribution shift, reward hacking rates"
    }
  ]
}
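
Similarly, a minimal TypeScript sketch of the entity record, with the relationship vocabulary restricted to the values that actually appear above; the type names are assumptions:

// Hypothetical types inferred from the Entity Data JSON above.
type Relationship = "decreases" | "supports" | "related" | "measured-by" | "analyzed-by";

interface RelatedEntry {
  id: string;
  type: "risk" | "approach" | "model" | "ai-transition-model-parameter" | "ai-transition-model-metric";
  relationship: Relationship;   // read as: this parameter <relationship> the related entry
}

interface CustomField { label: string; value: string }

interface TransitionModelEntity {
  id: string;
  type: string;                 // "ai-transition-model-parameter" here
  title: string;
  description: string;
  tags: string[];
  relatedEntries: RelatedEntry[];
  sources: string[];
  lastUpdated: string;          // "YYYY-MM", e.g. "2025-12"
  customFields: CustomField[];
}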
Canonical Facts (0)

No facts for this entity

External Links
{
  "lesswrong": "https://www.lesswrong.com/tag/ai-alignment"
}
Backlinks (12)
| id | title | type | relationship |
| --- | --- | --- | --- |
| misalignment-potential | Misalignment Potential | ai-transition-model-factor | composed-of |
| alignment-progress | Alignment Progress | ai-transition-model-metric | measures |
| safety-capability-gap | Safety-Capability Gap | ai-transition-model-parameter | related |
| existential-catastrophe | Existential Catastrophe | ai-transition-model-scenario | mitigates |
| ai-takeover | AI Takeover | ai-transition-model-scenario | mitigated-by |
| deceptive-alignment-decomposition | Deceptive Alignment Decomposition Model | model | models |
| carlsmith-six-premises | Carlsmith's Six-Premise Argument | model | models |
| corrigibility-failure-pathways | Corrigibility Failure Pathways | model | models |
| safety-capability-tradeoff | Safety-Capability Tradeoff Model | model | affects |
| alignment-robustness-trajectory | Alignment Robustness Trajectory Model | model | models |
| interpretability | Interpretability | safety-agenda | increases |
| scalable-oversight | Scalable Oversight | safety-agenda | supports |
Frontmatter
{
  "title": "Alignment Robustness",
  "sidebar": {
    "order": 8
  },
  "importance": 0,
  "quality": 0,
  "llmSummary": "This page contains only a React component import with no actual content rendered in the provided text. Cannot assess importance or quality without the actual substantive content.",
  "ratings": {
    "novelty": 0,
    "rigor": 0,
    "actionability": 0,
    "completeness": 0
  },
  "clusters": [
    "ai-safety"
  ],
  "subcategory": "factors-misalignment-potential"
}
Raw MDX Source
---
title: Alignment Robustness
sidebar:
  order: 8
importance: 0
quality: 0
llmSummary: This page contains only a React component import with no actual content rendered in the provided text. Cannot assess importance or quality without the actual substantive content.
ratings:
  novelty: 0
  rigor: 0
  actionability: 0
  completeness: 0
clusters:
  - ai-safety
subcategory: factors-misalignment-potential
---
import {TransitionModelContent} from '@components/wiki/TransitionModelContent';

<TransitionModelContent entityId="E303" />
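
The page body is nothing but this component call, which matches the llmSummary above. As a purely hypothetical sketch of how such a component could resolve `entityId` to the entity record and render it (the real implementation is not shown here; `getEntity` is an assumed helper):

import React from 'react';

// Hypothetical data shapes and lookup; none of this is taken from the wiki's actual code.
interface CustomField { label: string; value: string }
interface Entity { title: string; description: string; customFields: CustomField[] }

declare function getEntity(entityId: string): Entity | undefined; // assumed data-layer helper

export function TransitionModelContent({ entityId }: { entityId: string }) {
  const entity = getEntity(entityId);
  if (!entity) return <p>Unknown entity: {entityId}</p>;
  return (
    <section>
      <h2>{entity.title}</h2>
      <p>{entity.description}</p>
      <dl>
        {entity.customFields.map((f) => (
          <React.Fragment key={f.label}>
            <dt>{f.label}</dt>
            <dd>{f.value}</dd>
          </React.Fragment>
        ))}
      </dl>
    </section>
  );
}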