Skip to content
Longterm Wiki

Capability Unlearning / Removal

capability-unlearning · approach · Path: /knowledge-base/responses/capability-unlearning/
Entity ID (EID): E453
← Back to page · 1 backlink · Quality: 65 · Updated: 2026-01-28
Page Record (database.json) — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
  "id": "capability-unlearning",
  "wikiId": "E453",
  "path": "/knowledge-base/responses/capability-unlearning/",
  "filePath": "knowledge-base/responses/capability-unlearning.mdx",
  "title": "Capability Unlearning / Removal",
  "quality": 65,
  "readerImportance": 66,
  "researchImportance": 71.5,
  "tacticalValue": null,
  "contentFormat": "article",
  "causalLevel": null,
  "lastUpdated": "2026-01-28",
  "dateCreated": "2026-02-15",
  "summary": "Capability unlearning removes dangerous capabilities (e.g., bioweapon synthesis) from AI models through gradient-based methods, representation engineering, and fine-tuning, achieving 60-80% reduction on WMDP benchmarks with combined approaches. However, verification is impossible, capabilities are recoverable through fine-tuning, and knowledge entanglement limits what can be safely removed, making this a defense-in-depth layer rather than complete solution.",
  "description": "Methods to remove specific dangerous capabilities from trained AI models, directly addressing misuse risks by eliminating harmful knowledge, though current techniques face challenges around verification, capability recovery, and general performance degradation.",
  "ratings": {
    "novelty": 4.5,
    "rigor": 5,
    "completeness": 6.5,
    "actionability": 6
  },
  "category": "responses",
  "subcategory": "alignment-training",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 1652,
    "tableCount": 20,
    "diagramCount": 1,
    "internalLinks": 3,
    "externalLinks": 20,
    "footnoteCount": 0,
    "bulletRatio": 0.04,
    "sectionCount": 27,
    "hasOverview": true,
    "structuralScore": 14
  },
  "suggestedQuality": 93,
  "updateFrequency": 45,
  "evergreen": true,
  "wordCount": 1652,
  "unconvertedLinks": [
    {
      "text": "WMDP (Weapons of Mass Destruction Proxy)",
      "url": "https://arxiv.org/abs/2403.03218",
      "resourceId": "kb-59b27799c5de97c1",
      "resourceTitle": "[2403.03218] The WMDP Benchmark: Measuring and Reducing Malicious Use With Unlearning"
    },
    {
      "text": "RMU (Representation Misdirection for Unlearning)",
      "url": "https://arxiv.org/abs/2403.03218",
      "resourceId": "kb-59b27799c5de97c1",
      "resourceTitle": "[2403.03218] The WMDP Benchmark: Measuring and Reducing Malicious Use With Unlearning"
    },
    {
      "text": "Weapons of Mass Destruction Proxy (WMDP) benchmark",
      "url": "https://arxiv.org/abs/2403.03218",
      "resourceId": "kb-59b27799c5de97c1",
      "resourceTitle": "[2403.03218] The WMDP Benchmark: Measuring and Reducing Malicious Use With Unlearning"
    },
    {
      "text": "publicly available",
      "url": "https://www.wmdp.ai/",
      "resourceId": "cfa49cff8bb3ac32",
      "resourceTitle": "Weapons of Mass Destruction Proxy Benchmark (WMDP)"
    },
    {
      "text": "WMDP paper",
      "url": "https://arxiv.org/abs/2403.03218",
      "resourceId": "kb-59b27799c5de97c1",
      "resourceTitle": "[2403.03218] The WMDP Benchmark: Measuring and Reducing Malicious Use With Unlearning"
    },
    {
      "text": "WMDP Benchmark",
      "url": "https://arxiv.org/abs/2403.03218",
      "resourceId": "kb-59b27799c5de97c1",
      "resourceTitle": "[2403.03218] The WMDP Benchmark: Measuring and Reducing Malicious Use With Unlearning"
    },
    {
      "text": "Center for AI Safety",
      "url": "https://safe.ai",
      "resourceId": "a306e0b63bdedbd5",
      "resourceTitle": "Center for AI Safety (CAIS) – Homepage"
    }
  ],
  "unconvertedLinkCount": 7,
  "convertedLinkCount": 0,
  "backlinkCount": 1,
  "hallucinationRisk": {
    "level": "medium",
    "score": 45,
    "factors": [
      "no-citations",
      "conceptual-content"
    ]
  },
  "entityType": "approach",
  "redundancy": {
    "maxSimilarity": 11,
    "similarPages": [
      {
        "id": "circuit-breakers",
        "title": "Circuit Breakers / Inference Interventions",
        "path": "/knowledge-base/responses/circuit-breakers/",
        "similarity": 11
      },
      {
        "id": "debate",
        "title": "AI Safety via Debate",
        "path": "/knowledge-base/responses/debate/",
        "similarity": 11
      },
      {
        "id": "eliciting-latent-knowledge",
        "title": "Eliciting Latent Knowledge (ELK)",
        "path": "/knowledge-base/responses/eliciting-latent-knowledge/",
        "similarity": 11
      },
      {
        "id": "goal-misgeneralization-research",
        "title": "Goal Misgeneralization Research",
        "path": "/knowledge-base/responses/goal-misgeneralization-research/",
        "similarity": 11
      },
      {
        "id": "provably-safe",
        "title": "Provably Safe AI (davidad agenda)",
        "path": "/knowledge-base/responses/provably-safe/",
        "similarity": 11
      }
    ]
  },
  "changeHistory": [
    {
      "date": "2026-02-18",
      "branch": "claude/review-pr-216-P4Fcu",
      "title": "Fix audit report findings from PR #216",
      "summary": "Reviewed PR #216 (comprehensive wiki audit report) and implemented fixes for the major issues it identified: fixed 181 path-style EntityLink IDs across 33 files, converted 164 broken EntityLinks (referencing non-existent entities) to plain text across 38 files, fixed a temporal inconsistency in anthropic.mdx, and added missing description fields to 53 ai-transition-model pages."
    }
  ],
  "coverage": {
    "passing": 8,
    "total": 13,
    "targets": {
      "tables": 7,
      "diagrams": 1,
      "internalLinks": 13,
      "externalLinks": 8,
      "footnotes": 5,
      "references": 5
    },
    "actuals": {
      "tables": 20,
      "diagrams": 1,
      "internalLinks": 3,
      "externalLinks": 20,
      "footnotes": 0,
      "references": 3,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "summary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "green",
      "overview": "green",
      "tables": "green",
      "diagrams": "green",
      "internalLinks": "amber",
      "externalLinks": "green",
      "footnotes": "red",
      "references": "amber",
      "quotes": "red",
      "accuracy": "red"
    },
    "editHistoryCount": 1,
    "ratingsString": "N:4.5 R:5 A:6 C:6.5"
  },
  "readerRank": 186,
  "researchRank": 137,
  "recommendedScore": 176.14
}
External Links
{
  "lesswrong": "https://www.lesswrong.com/tag/machine-unlearning"
}
Backlinks (1)
id · title · type · relationship
alignment-training-overview · Training Methods (Overview) · concept
Longterm Wiki