AI Safety Solution Cruxes
solutions (E393)
Path: /knowledge-base/cruxes/solutions/
Page Metadata
{
"id": "solutions",
"numericId": null,
"path": "/knowledge-base/cruxes/solutions/",
"filePath": "knowledge-base/cruxes/solutions.mdx",
"title": "AI Safety Solution Cruxes",
"quality": 71,
"importance": 82,
"contentFormat": "article",
"tractability": null,
"neglectedness": null,
"uncertainty": null,
"causalLevel": null,
"lastUpdated": "2025-12-28",
"llmSummary": "Comprehensive analysis of key uncertainties determining optimal AI safety resource allocation across technical verification (25-40% believe AI detection can match generation), coordination mechanisms (65-80% believe labs require external enforcement), and epistemic infrastructure (70% expect chronic underfunding). Synthesizes 2024-2025 evidence showing technical alignment effectiveness at 35-50%, RSPs weakening with Anthropic dropping from 2.2 to 1.9 grade, and international coordination prospects at 15-30% for comprehensive cooperation but 35-50% for narrow risk-specific coordination.",
"structuredSummary": null,
"description": "Key uncertainties that determine which technical, coordination, and epistemic solutions to prioritize for AI safety and governance. Maps decision-relevant uncertainties across verification scaling, international cooperation, and infrastructure funding with specific probability estimates and strategic implications.",
"ratings": {
"novelty": 6.5,
"rigor": 6.8,
"actionability": 7.2,
"completeness": 7.5
},
"category": "cruxes",
"subcategory": null,
"clusters": [
"ai-safety",
"governance"
],
"metrics": {
"wordCount": 3604,
"tableCount": 22,
"diagramCount": 1,
"internalLinks": 83,
"externalLinks": 2,
"footnoteCount": 0,
"bulletRatio": 0.04,
"sectionCount": 35,
"hasOverview": true,
"structuralScore": 12
},
"suggestedQuality": 80,
"updateFrequency": 45,
"evergreen": true,
"wordCount": 3604,
"unconvertedLinks": [],
"unconvertedLinkCount": 0,
"convertedLinkCount": 62,
"backlinkCount": 2,
"redundancy": {
"maxSimilarity": 20,
"similarPages": [
{
"id": "international-regimes",
"title": "International Compute Regimes",
"path": "/knowledge-base/responses/international-regimes/",
"similarity": 20
},
{
"id": "international-summits",
"title": "International AI Safety Summits",
"path": "/knowledge-base/responses/international-summits/",
"similarity": 20
},
{
"id": "responsible-scaling-policies",
"title": "Responsible Scaling Policies",
"path": "/knowledge-base/responses/responsible-scaling-policies/",
"similarity": 20
},
{
"id": "voluntary-commitments",
"title": "Voluntary Industry Commitments",
"path": "/knowledge-base/responses/voluntary-commitments/",
"similarity": 20
},
{
"id": "structural-risks",
"title": "AI Structural Risk Cruxes",
"path": "/knowledge-base/cruxes/structural-risks/",
"similarity": 19
}
]
}
}
Entity Data
{
"id": "solutions",
"type": "crux",
"title": "AI Safety Solution Cruxes",
"description": "Key uncertainties that determine which technical, coordination, and epistemic solutions to prioritize for AI safety and governance, mapping decision-relevant uncertainties across verification scaling, international cooperation, and infrastructure funding with specific probability estimates.",
"tags": [
"verification",
"coordination",
"epistemic-infrastructure",
"responsible-scaling",
"international-cooperation",
"solution-prioritization"
],
"relatedEntries": [
{
"id": "interpretability",
"type": "concept"
},
{
"id": "responsible-scaling-policies",
"type": "policy"
},
{
"id": "international-coordination",
"type": "concept"
},
{
"id": "epistemic-infrastructure",
"type": "concept"
},
{
"id": "tmc-ai-governance",
"type": "concept"
}
],
"sources": [],
"lastUpdated": "2026-02",
"customFields": []
}
Canonical Facts (0)
No facts for this entity
External Links
No external links
Backlinks (2)
| id | title | type | relationship |
|---|---|---|---|
| misuse-risks | AI Misuse Risk Cruxes | crux | — |
| epistemic-risks | AI Epistemic Cruxes | crux | — |
Frontmatter
{
"title": "AI Safety Solution Cruxes",
"description": "Key uncertainties that determine which technical, coordination, and epistemic solutions to prioritize for AI safety and governance. Maps decision-relevant uncertainties across verification scaling, international cooperation, and infrastructure funding with specific probability estimates and strategic implications.",
"sidebar": {
"order": 5
},
"quality": 71,
"llmSummary": "Comprehensive analysis of key uncertainties determining optimal AI safety resource allocation across technical verification (25-40% believe AI detection can match generation), coordination mechanisms (65-80% believe labs require external enforcement), and epistemic infrastructure (70% expect chronic underfunding). Synthesizes 2024-2025 evidence showing technical alignment effectiveness at 35-50%, RSPs weakening with Anthropic dropping from 2.2 to 1.9 grade, and international coordination prospects at 15-30% for comprehensive cooperation but 35-50% for narrow risk-specific coordination.",
"lastEdited": "2025-12-28",
"importance": 82.5,
"update_frequency": 45,
"ratings": {
"novelty": 6.5,
"rigor": 6.8,
"actionability": 7.2,
"completeness": 7.5
},
"clusters": [
"ai-safety",
"governance"
]
}
Raw MDX Source
---
title: "AI Safety Solution Cruxes"
description: "Key uncertainties that determine which technical, coordination, and epistemic solutions to prioritize for AI safety and governance. Maps decision-relevant uncertainties across verification scaling, international cooperation, and infrastructure funding with specific probability estimates and strategic implications."
sidebar:
order: 5
quality: 71
llmSummary: "Comprehensive analysis of key uncertainties determining optimal AI safety resource allocation across technical verification (25-40% believe AI detection can match generation), coordination mechanisms (65-80% believe labs require external enforcement), and epistemic infrastructure (70% expect chronic underfunding). Synthesizes 2024-2025 evidence showing technical alignment effectiveness at 35-50%, RSPs weakening with Anthropic dropping from 2.2 to 1.9 grade, and international coordination prospects at 15-30% for comprehensive cooperation but 35-50% for narrow risk-specific coordination."
lastEdited: "2025-12-28"
importance: 82.5
update_frequency: 45
ratings:
novelty: 6.5
rigor: 6.8
actionability: 7.2
completeness: 7.5
clusters: ["ai-safety", "governance"]
---
import {Crux, CruxList, R, Mermaid, EntityLink, DataExternalLinks} from '@components/wiki';
## Key Links
<DataExternalLinks pageId="solutions" />
## Overview
Solution cruxes are the key uncertainties that determine which interventions we should prioritize in AI safety and governance. Unlike risk cruxes that focus on the nature and magnitude of threats, solution cruxes examine the tractability and effectiveness of different approaches to addressing those threats. Your position on these cruxes should fundamentally shape what you work on, fund, or advocate for.
The landscape of AI safety solutions spans three critical domains: technical approaches that use AI systems themselves to verify and authenticate content; coordination mechanisms that align incentives across labs, nations, and institutions; and infrastructure investments that create sustainable epistemic institutions. Within each domain, fundamental uncertainties about feasibility, cost-effectiveness, and adoption timelines create genuine disagreements among experts about optimal resource allocation.
These disagreements have enormous practical implications. Whether AI-based verification can keep pace with AI-based generation determines if we should invest billions in detection infrastructure or pivot to provenance-based approaches. Whether frontier AI labs can coordinate without regulatory compulsion shapes the balance between industry engagement and government intervention. Whether credible commitment mechanisms can be designed determines if international <EntityLink id="E608">AI governance</EntityLink> is achievable or if we should prepare for an uncoordinated race.
## Risk Assessment
| Risk Category | Severity | Likelihood | Timeline | Trend |
|---------------|----------|------------|----------|-------|
| Verification-generation arms race | High | 70% | 2-3 years | Accelerating |
| Coordination failure under pressure | Critical | 60% | 1-2 years | Worsening |
| <EntityLink id="E122">Epistemic infrastructure</EntityLink> collapse | High | 40% | 3-5 years | Stable |
| International governance breakdown | Critical | 55% | 2-4 years | Worsening |
## Solution Effectiveness Overview
The <R id="97185b28d68545b4">2025 AI Safety Index</R> from the <EntityLink id="E528">Future of Life Institute</EntityLink> and the <R id="b163447fdc804872">International AI Safety Report 2025</R>---compiled by 96 AI experts representing 30 countries---provide sobering assessments of current solution effectiveness. Despite growing investment, core challenges including alignment, control, interpretability, and robustness remain unresolved, with system complexity growing year by year. The following table summarizes effectiveness estimates across major solution categories based on 2024-2025 assessments.
| Solution Category | Estimated Effectiveness | Investment Level (2024) | Maturity | Key Gaps |
|-------------------|------------------------|------------------------|----------|----------|
| Technical alignment research | Moderate (35-50%) | \$500M-1B | Early research | Scalability, verification |
| <EntityLink id="E174">Interpretability</EntityLink> | Promising (40-55%) | \$100-200M | Active research | Superposition, automation |
| <EntityLink id="E252">Responsible Scaling Policies</EntityLink> | Limited (25-35%) | N/A (policy) | Deployed but weak | Vague thresholds, compliance |
| Third-party evaluations (<R id="45370a5153534152">METR</R>) | Moderate (45-55%) | \$10-20M | Operational | Coverage, standardization |
| Compute governance | Theoretical (20-30%) | \$5-10M | Early research | Verification mechanisms |
| <EntityLink id="E171">International coordination</EntityLink> | Very limited (15-25%) | \$50-100M | Nascent | US-China competition |
According to <R id="7ae6b3be2d2043c1">Anthropic's recommended research directions</R>, the main reason current AI systems do not pose catastrophic risks is that they lack many of the capabilities necessary for causing catastrophic harm---not because alignment solutions have been proven effective. This distinction is crucial for understanding the urgency of solution development.
### Solution Prioritization Framework
The following diagram illustrates the decision tree for prioritizing AI safety solutions based on key crux resolutions:
<Mermaid chart={`
flowchart TD
A[Solution Prioritization] --> B{Can verification<br/>match generation?}
B -->|Yes 25-40%| C[Invest in AI detection<br/>R&D infrastructure]
B -->|No 60-75%| D{Provenance adoption<br/>feasible?}
D -->|Yes 40-55%| E[Focus on C2PA<br/>content provenance]
D -->|No 45-60%| F[Institutional &<br/>incentive solutions]
A --> G{Lab coordination<br/>without regulation?}
G -->|Yes 20-35%| H[Support voluntary<br/>RSPs & commitments]
G -->|No 65-80%| I{Regulatory enforcement<br/>achievable?}
I -->|Yes 40-50%| J[Focus on governance<br/>& auditing]
I -->|No 50-60%| K[Technical solutions<br/>& prepare for race]
A --> L{International<br/>coordination possible?}
L -->|Comprehensive 15-30%| M[Invest in<br/>treaty mechanisms]
L -->|Narrow only 35-50%| N[Focus on specific<br/>risks: bio, nuclear]
L -->|No 25-35%| O[Domestic & allied<br/>coordination only]
style C fill:#90EE90
style E fill:#90EE90
style J fill:#90EE90
style M fill:#90EE90
style F fill:#FFD700
style K fill:#FFD700
style O fill:#FFD700
`} />
## Technical Solution Cruxes
The technical domain centers on whether AI systems can be effectively turned against themselves—using artificial intelligence to verify, detect, and authenticate AI-generated content. This question of offense-defense balance has profound implications for billions of dollars in research investment and infrastructure development.
### Current Technical Landscape
| Approach | Investment Level | Success Rate | Commercial Deployment | Key Players |
|----------|-----------------|--------------|---------------------|-------------|
| AI Detection | \$100M+ annually | 85-95% (academic) | Limited | <R id="04d39e8bd5d50dd5">OpenAI</R>, <R id="8f7f1a6ed1b856a8">Originality.ai</R> |
| Content Provenance | \$50M+ annually | N/A (adoption metric) | Early stage | <R id="499307d3fc7d6c07">Adobe</R>, <R id="dd1c59d8d7c26f28">Microsoft</R> |
| Watermarking | \$25M+ annually | Variable | Pilot programs | <R id="fc492fd338071abd">Google DeepMind</R> |
| Verification Systems | \$75M+ annually | Context-dependent | Research phase | <R id="7671d8111f8b8247">DARPA</R> |
<Crux
id="ai-verification-scaling"
question="Can AI-based verification scale to match AI-based generation?"
domain="Technical Solutions"
description="Whether AI systems designed for verification (fact-checking, detection, authentication) can keep pace with AI systems designed for generation."
importance="critical"
resolvability="years"
currentState="Generation currently ahead; some verification progress"
positions={[
{
view: "Verification can match generation with investment",
probability: "25-40%",
holders: ["Some AI researchers", "Verification startups"],
implications: "Invest heavily in AI verification R&D; build verification infrastructure"
},
{
view: "Verification will lag but remain useful",
probability: "35-45%",
implications: "Verification as one tool among many; combine with other approaches"
},
{
view: "Verification is fundamentally disadvantaged",
probability: "20-30%",
holders: ["Some security researchers"],
implications: "Shift focus to provenance, incentives, institutional solutions"
}
]}
wouldUpdateOn={[
"Breakthrough in generalizable detection",
"Real-world deployment data on AI verification performance",
"Theoretical analysis of offense-defense balance",
"Economic analysis of verification costs vs generation costs"
]}
relatedCruxes={["provenance-vs-detection"]}
relevantResearch={[
{ title: "DARPA SemaFor", url: "https://www.darpa.mil/program/semantic-forensics" }
]}
/>
The current evidence presents a mixed picture. <R id="7671d8111f8b8247">DARPA's SemaFor program</R>, launched in 2021 with \$26 million in funding, has demonstrated some success in semantic forensics for manipulated media, but primarily on specific types of synthetic content rather than the broad spectrum of AI-generated material now emerging. Meanwhile, commercial detection tools like <R id="2a656ac18fe6b4d6">GPTZero</R> report accuracy rates of 85-95% on academic writing, but these drop significantly when generators are specifically designed to evade detection.
The fundamental challenge lies in the asymmetric nature of the problem. Content generators need only produce plausible outputs, while detectors must distinguish between authentic and synthetic content across all possible generation techniques. This asymmetry may prove insurmountable, particularly as generation models become more sophisticated and numerous through <EntityLink id="E660">capabilities scaling</EntityLink>.
However, optimists point to potential advantages for verification systems: they can be specialized for detection tasks, leverage multiple modalities simultaneously, and benefit from centralized training on comprehensive datasets of known synthetic content. The emergence of foundation models specifically designed for verification, such as those being developed at <R id="f771d4f56ad4dbaa">Anthropic</R> and <R id="e9aaa7b5e18f9f41">OpenAI</R>, suggests this approach may have untapped potential.
<Crux
id="provenance-vs-detection"
question="Should we prioritize content provenance or detection?"
domain="Technical Solutions"
description="Whether resources should go to proving what's authentic (provenance) vs detecting what's fake (detection)."
importance="high"
resolvability="years"
currentState="Both being pursued; provenance gaining momentum"
positions={[
{
view: "Provenance is the right long-term bet",
probability: "40-55%",
holders: ["C2PA coalition", "Adobe", "Microsoft"],
implications: "Focus resources on provenance adoption; detection as stopgap"
},
{
view: "Need both; portfolio approach",
probability: "30-40%",
implications: "Invest in both; different use cases; don't pick one"
},
{
view: "Detection is more practical near-term",
probability: "15-25%",
implications: "Focus on detection; provenance too slow to adopt"
}
]}
wouldUpdateOn={[
"C2PA adoption metrics",
"Detection accuracy trends",
"User behavior research on credential checking",
"Cost comparison of approaches"
]}
relatedCruxes={["ai-verification-scaling"]}
relevantResearch={[
{ title: "C2PA", url: "https://c2pa.org/" },
{ title: "Detection research", url: "https://arxiv.org/abs/2004.11138" }
]}
/>
The <R id="ff89bed1f7960ab2">Coalition for Content Provenance and Authenticity (C2PA)</R>, backed by Adobe, Microsoft, Intel, and BBC, has gained significant momentum since 2021, with over 50 member organizations and initial implementations in Adobe Creative Cloud and Microsoft products. The provenance approach embeds cryptographic metadata proving content's origin and modification history, creating an "immune system" for authentic content rather than trying to identify synthetic material.
### Provenance vs Detection Comparison
| Factor | Provenance | Detection |
|--------|-----------|-----------|
| **Accuracy** | 100% for supported content | 85-95% (declining) |
| **Coverage** | Only new, participating content | All content types |
| **Adoption Rate** | <1% user verification | Universal deployment |
| **Cost** | High infrastructure | Moderate computational |
| **Adversarial Robustness** | High (cryptographic) | Low (adversarial ML) |
| **Legacy Content** | No coverage | Full coverage |
However, provenance faces substantial adoption challenges. Early data from C2PA implementations shows less than 1% of users actively check provenance credentials, and the system requires widespread adoption across platforms and devices to be effective. The approach also cannot address legacy content or situations where authentic content is captured without provenance systems. Detection remains necessary for the vast majority of existing content and will likely be required for years even if provenance adoption succeeds.
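To make the provenance mechanism concrete, the sketch below shows the general signed-manifest idea in Python: hash the content, record its origin, and sign the claim so that later edits become detectable. It is illustrative only; the actual C2PA specification uses asymmetric signatures, certificate chains, and a richer manifest format, and the key and issuer names here are hypothetical.

```python
import hashlib
import hmac
import json

# Illustrative only: real provenance standards (e.g. C2PA) use asymmetric
# signatures and certificate chains, not a shared HMAC key.
SIGNING_KEY = b"issuer-secret-key"  # hypothetical issuer key

def make_manifest(content: bytes, issuer: str) -> dict:
    """Attach a signed manifest recording the content's hash and origin."""
    claim = {
        "issuer": issuer,
        "content_sha256": hashlib.sha256(content).hexdigest(),
    }
    payload = json.dumps(claim, sort_keys=True).encode()
    claim["signature"] = hmac.new(SIGNING_KEY, payload, hashlib.sha256).hexdigest()
    return claim

def verify_manifest(content: bytes, manifest: dict) -> bool:
    """Check that the content matches the manifest and the signature is valid."""
    claim = {k: v for k, v in manifest.items() if k != "signature"}
    payload = json.dumps(claim, sort_keys=True).encode()
    expected = hmac.new(SIGNING_KEY, payload, hashlib.sha256).hexdigest()
    signature_ok = hmac.compare_digest(expected, manifest["signature"])
    content_ok = hashlib.sha256(content).hexdigest() == manifest["content_sha256"]
    return signature_ok and content_ok

photo = b"raw image bytes from a camera capture"
manifest = make_manifest(photo, issuer="Example Camera Co.")
print(verify_manifest(photo, manifest))               # True: content unchanged
print(verify_manifest(photo + b" edited", manifest))  # False: content altered after signing
```

The verification step only proves something about content that carries a manifest, which is exactly why the approach cannot cover legacy material and depends on broad adoption.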
<Crux
id="watermark-robustness"
question="Can AI watermarks be made robust against removal?"
domain="Technical Solutions"
description="Whether watermarks embedded in AI-generated content can resist adversarial removal attempts."
importance="high"
resolvability="years"
currentState="Current watermarks removable with effort; research ongoing"
positions={[
{
view: "Robust watermarks are achievable",
probability: "20-35%",
holders: ["Google DeepMind (SynthID)"],
implications: "Invest in watermark R&D; mandate watermarking"
},
{
view: "Watermarks can deter casual removal but not determined actors",
probability: "40-50%",
implications: "Watermarks as one signal; don't rely on alone; combine with other methods"
},
{
view: "Watermark removal will always be possible",
probability: "20-30%",
implications: "Watermarking has limited value; focus on other solutions"
}
]}
wouldUpdateOn={[
"Adversarial testing of production watermarks",
"Theoretical bounds on watermark robustness",
"Real-world watermark survival data"
]}
relatedCruxes={["provenance-vs-detection"]}
relevantResearch={[
{ title: "SynthID", url: "https://deepmind.google/technologies/synthid/" }
]}
/>
<R id="fc492fd338071abd">Google DeepMind's SynthID</R>, launched in August 2023, represents the most advanced publicly available watermarking system, using statistical patterns imperceptible to humans but detectable by specialized algorithms. However, academic research consistently demonstrates that current watermarking approaches can be defeated through various attack vectors including adversarial perturbations, model fine-tuning, and regeneration techniques.
Research by <R id="01f2211a18a3aa5a">UC Berkeley</R> and <R id="51df12a0a334621c">University of Maryland</R> has shown that sophisticated attackers can remove watermarks with success rates exceeding 90% while preserving content quality. The theoretical foundations suggest fundamental limits to watermark robustness---any watermark that preserves content quality enough to be usable can potentially be removed by sufficiently sophisticated adversaries.
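The statistical principle behind token-level text watermarks can be illustrated with a toy detector: if generation was biased toward a keyed "green" subset of tokens, anyone holding the key can test whether green tokens appear more often than chance. This is a generic sketch of the approach described in the academic watermarking literature, not SynthID's actual algorithm; the key, green fraction, and detection threshold are assumptions.

```python
import hashlib
import math

GREEN_FRACTION = 0.5  # assumed fraction of the vocabulary keyed as "green"

def is_green(token: str, key: str = "watermark-key") -> bool:
    """Deterministically assign each token to the green list via a keyed hash."""
    digest = hashlib.sha256((key + token).encode()).digest()
    return digest[0] < 256 * GREEN_FRACTION

def watermark_z_score(tokens: list[str]) -> float:
    """z-score of the observed green-token count vs. the unwatermarked expectation."""
    n = len(tokens)
    greens = sum(is_green(t) for t in tokens)
    expected = GREEN_FRACTION * n
    std = math.sqrt(n * GREEN_FRACTION * (1 - GREEN_FRACTION))
    return (greens - expected) / std

# A watermarking sampler would bias generation toward green tokens;
# a detector flags text whose z-score exceeds a threshold (e.g. z > 4).
text = "the quick brown fox jumps over the lazy dog".split()
print(round(watermark_z_score(text), 2))
```

Removal attacks such as paraphrasing or regeneration work precisely by scrambling which tokens appear, pulling the z-score back toward zero, which is why the robustness limits described above are hard to engineer away.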
### Technical Alignment Research Progress (2024-2025)
Recent advances in <R id="b1d6e7501debf627">mechanistic interpretability</R> have demonstrated promising safety applications. Using attribution graphs, Anthropic researchers directly examined Claude 3.5 Haiku's internal reasoning processes, revealing hidden mechanisms beyond what the model displays in its chain-of-thought. As of March 2025, circuit tracing allows researchers to observe model reasoning, uncovering a shared conceptual space where reasoning happens before being translated into language.
| Alignment Approach | 2024-2025 Progress | Effectiveness Estimate | Key Challenges |
|-------------------|-------------------|----------------------|----------------|
| Deliberative alignment | Extended thinking in Claude 3.7, o1-preview | 40-55% risk reduction | Latency, energy costs |
| Layered safety interventions | OpenAI redundancy approach | 30-45% risk reduction | Coordination complexity |
| Sparse autoencoders (SAEs) | Scaled to Claude 3 Sonnet | 35-50% interpretability gain | Superposition, polysemanticity |
| Circuit tracing | Direct observation of reasoning | Research phase | Automation, scaling |
| Adversarial techniques (debate) | Prover-verifier games | 25-40% oversight improvement | Equilibrium identification |
The <R id="36fb43e4e059f0c9">shallow review of technical AI safety (2025)</R> notes that increasing reasoning depth can raise latency and energy consumption, posing challenges for real-time applications. Scaling alignment mechanisms to future, larger models or eventual AGI systems remains an open research question, with complexity growing exponentially with model size and task diversity.
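For readers unfamiliar with sparse autoencoders, the toy numpy sketch below shows the basic forward pass and the reconstruction-plus-L1 objective they are trained on; the dimensions, initialization, and penalty weight are illustrative and not those used in any lab's production SAEs.

```python
import numpy as np

rng = np.random.default_rng(0)
d_model, d_features, l1_coeff = 64, 512, 1e-3  # illustrative sizes only

# Randomly initialised SAE parameters (a real SAE is trained on model activations).
W_enc = rng.normal(0, 0.02, (d_model, d_features))
b_enc = np.zeros(d_features)
W_dec = rng.normal(0, 0.02, (d_features, d_model))
b_dec = np.zeros(d_model)

def sae_forward(acts: np.ndarray):
    """Encode activations into a larger set of sparse features, then reconstruct."""
    features = np.maximum(acts @ W_enc + b_enc, 0.0)  # ReLU keeps features non-negative
    reconstruction = features @ W_dec + b_dec
    return features, reconstruction

def sae_loss(acts: np.ndarray) -> float:
    """Reconstruction error plus an L1 penalty that pushes features toward sparsity."""
    features, reconstruction = sae_forward(acts)
    mse = np.mean((acts - reconstruction) ** 2)
    l1 = l1_coeff * np.mean(np.abs(features).sum(axis=-1))
    return float(mse + l1)

batch = rng.normal(size=(8, d_model))  # stand-in for residual-stream activations
print(sae_loss(batch))
```

The interpretability payoff comes from the trained features being individually meaningful; the superposition and polysemanticity challenges in the table above are about how often that hope actually holds.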
## Coordination Solution Cruxes
Coordination cruxes address whether different actors—from AI labs to nation-states—can align their behavior around safety measures without sacrificing competitive advantages or national interests. These questions determine the feasibility of governance approaches ranging from industry self-regulation to international treaties.
### Current Coordination Landscape
| Mechanism | Participants | Binding Nature | Track Record | Key Challenges |
|-----------|-------------|----------------|--------------|----------------|
| <EntityLink id="E252">RSPs</EntityLink> | 4 major labs | Voluntary | Mixed compliance | Vague standards, competitive pressure |
| <R id="fdf68a8f30f57dee">AI Safety Institute</R> networks | 8+ countries | Non-binding | Early stage | Limited authority, funding |
| Export controls | US + allies | Legal | Partially effective | Circumvention, coordination gaps |
| Voluntary commitments | Major labs | Self-enforced | Poor | No external verification |
<Crux
id="lab-coordination"
question="Can frontier AI labs meaningfully coordinate on safety?"
domain="Coordination"
description="Whether labs competing for AI supremacy can coordinate on safety measures without regulatory compulsion."
importance="critical"
resolvability="years"
currentState="Some voluntary commitments (RSPs); no binding enforcement; competitive pressures strong"
positions={[
{
view: "Voluntary coordination can work",
probability: "20-35%",
holders: ["Some lab leadership"],
implications: "Support lab coordination efforts; build trust; industry self-regulation"
},
{
view: "Coordination requires external enforcement",
probability: "40-50%",
holders: ["Most governance researchers"],
implications: "Focus on regulation; auditing; legal liability; government role essential"
},
{
view: "Neither voluntary nor regulatory coordination will work",
probability: "15-25%",
implications: "Focus on technical solutions; prepare for uncoordinated development"
}
]}
wouldUpdateOn={[
"Labs defecting from voluntary commitments",
"Successful regulatory enforcement",
"Evidence of coordination changing lab behavior"
]}
relatedCruxes={["international-coordination"]}
relevantResearch={[
{ title: "RSP analysis", url: "https://www.anthropic.com/rsp" },
{ title: "GovAI", url: "https://www.governance.ai/" }
]}
/>
The emergence of <EntityLink id="E252">Responsible Scaling Policies (RSPs)</EntityLink> in 2023-2024, adopted by <EntityLink id="E22">Anthropic</EntityLink>, <EntityLink id="E218">OpenAI</EntityLink>, and <EntityLink id="E98">Google DeepMind</EntityLink>, represents the most significant attempt at voluntary lab coordination to date. These policies outline safety evaluations and deployment standards that labs commit to follow as their models become more capable.
However, early implementation has revealed significant limitations: evaluation standards remain vague, triggering thresholds are subjective, and competitive pressures create incentives to interpret requirements leniently. Analysis by <EntityLink id="E201">METR</EntityLink> and <EntityLink id="E25">ARC Evaluations</EntityLink> shows substantial variations in how labs implement similar commitments.
### Third-Party Evaluation Effectiveness
<R id="45370a5153534152">METR</R> (formerly ARC Evals) has emerged as the leading third-party evaluator of frontier AI systems, conducting pre-deployment evaluations of GPT-4, Claude 2, and Claude 3.5 Sonnet. Their April 2025 evaluation of OpenAI's o3 and o4-mini found these models displayed higher autonomous capabilities than other public models tested, with o3 appearing somewhat prone to "reward hacking." METR's evaluation of Claude 3.7 Sonnet found impressive AI R&D capabilities on RE-Bench, though no significant evidence for dangerous autonomous capabilities.
| Evaluation Organization | Models Evaluated (2024-2025) | Key Findings | Limitations |
|------------------------|------------------------------|--------------|-------------|
| <R id="45370a5153534152">METR</R> | GPT-4, Claude 2/3.5/3.7, o3/o4-mini | Autonomous capability increases; reward hacking in o3 | Limited to cooperative labs |
| <R id="fdf68a8f30f57dee">UK AI Safety Institute</R> | Pre-deployment evals for major labs | Advanced AI evaluation frameworks | Resource constraints |
| Internal lab evaluations | All frontier models | Proprietary capabilities assessments | Conflict of interest |
METR proposes measuring AI performance in terms of the length of tasks AI agents can complete, showing this metric has been exponentially increasing over the past 6 years with a doubling time of around 7 months. Extrapolating this trend predicts that within five years, AI agents may independently complete a large fraction of software tasks that currently take humans days or weeks.
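The extrapolation follows directly from the stated doubling time; the short sketch below reproduces the arithmetic, with a one-hour starting task length assumed purely for illustration rather than taken from METR's data.

```python
# Project autonomous task length forward under a fixed ~7-month doubling time.
DOUBLING_TIME_MONTHS = 7      # reported METR trend
current_task_hours = 1.0      # assumed illustrative starting point

for years in (1, 3, 5):
    doublings = years * 12 / DOUBLING_TIME_MONTHS
    factor = 2 ** doublings
    print(f"{years} years: x{factor:.0f} -> ~{current_task_hours * factor:.0f} hours")
```

At that rate a one-hour task horizon grows to several hundred hours within five years, on the order of weeks of human work, consistent with the projection above.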
### RSP Compliance Analysis (2024-2025)
Anthropic's <R id="c6766d463560b923">October 2024 RSP update</R> introduced more flexible approaches but drew criticism from external analysts. According to <R id="a5e4c7b49f5d3e1b">SaferAI</R>, Anthropic's grade dropped from 2.2 to 1.9, placing them alongside OpenAI and DeepMind in the "weak" category. The primary issue lies in the shift away from precisely defined capability thresholds and mitigation measures. Anthropic acknowledged falling short in some areas, including completing evaluations 3 days late, though these instances posed minimal safety risk.
| RSP Element | Anthropic | OpenAI | Google DeepMind |
|-------------|-----------|--------|-----------------|
| Capability thresholds | ASL levels (loosened) | Preparedness framework | Frontier Safety Framework |
| Evaluation frequency | 6 months (extended from 3) | Ongoing | Pre-deployment |
| Third-party review | Annual procedural | Limited | Limited |
| Public transparency | Partial | Limited | Limited |
| Binding enforcement | Self-enforced | Self-enforced | Self-enforced |
### Historical Coordination Precedents
| Industry | Coordination Success | Key Factors | AI Relevance |
|----------|---------------------|-------------|--------------|
| Nuclear weapons | Partial (NPT, arms control) | Mutual destruction, verification | High stakes, but clearer parameters |
| Pharmaceuticals | Mixed (safety standards vs. pricing) | Regulatory oversight, liability | Similar R&D competition |
| Semiconductors | Successful (SEMATECH) | Government support, shared costs | Technical collaboration model |
| Social media | Poor (content moderation) | Light regulation, network effects | Platform competition dynamics |
Historical precedent suggests mixed prospects for voluntary coordination in high-stakes competitive environments. The semiconductor industry's successful pre-competitive R&D coordination through SEMATECH offers some optimism, but it occurred under different competitive dynamics and with explicit government support. The pharmaceutical industry's mixed record—with some successful self-regulation but also notable failures requiring regulatory intervention—may be more analogous to AI development.
<Crux
id="international-coordination"
question="Can US-China coordination on AI governance succeed?"
domain="Coordination"
description="Whether the major AI powers can coordinate despite geopolitical competition."
importance="critical"
resolvability="years"
currentState="Very limited; competition dominant; some backchannel communication"
positions={[
{
view: "Meaningful coordination is possible",
probability: "15-30%",
implications: "Invest heavily in Track II diplomacy; find areas of shared interest"
},
{
view: "Narrow coordination on specific risks possible",
probability: "35-50%",
implications: "Focus on achievable goals (bioweapons, nuclear); don't expect comprehensive regime"
},
{
view: "Great power competition precludes coordination",
probability: "25-35%",
implications: "Focus on domestic/allied coordination; defensive measures; prepare for competition"
}
]}
wouldUpdateOn={[
"US-China AI discussions outcomes",
"Coordination on specific risks (bio, nuclear)",
"Changes in geopolitical relationship",
"Success/failure of UK/Korea AI summits on coordination"
]}
relatedCruxes={["lab-coordination"]}
relevantResearch={[
{ title: "RAND on AI and great power competition", url: "https://www.rand.org/" }
]}
/>
Current US-China AI relations are characterized by strategic competition rather than cooperation. Export controls on semiconductors, restrictions on Chinese AI companies, and national security framings dominate the policy landscape. The <R id="9ae4c87175cc63c0">CHIPS Act</R> and export restrictions target Chinese AI development directly, while China's response includes increased domestic investment and alternative supply chains.
However, some limited dialogue continues through academic conferences, multilateral forums like the G20, and informal diplomatic channels. The <EntityLink id="E364">UK AI Safety Institute</EntityLink> and <EntityLink id="E279">Seoul Declaration</EntityLink> provide potential multilateral venues for engagement.
### International Coordination Prospects by Risk Area
| Risk Category | US-China Cooperation Likelihood | Key Barriers | Potential Mechanisms |
|---------------|--------------------------------|--------------|---------------------|
| AI-enabled bioweapons | 60-70% | Technical verification | Joint research restrictions |
| Nuclear command systems | 50-60% | Classification concerns | Backchannel protocols |
| Autonomous weapons | 30-40% | Military applications | Geneva Convention framework |
| Economic competition | 10-20% | Zero-sum framing | Very limited prospects |
The most promising path may involve narrow cooperation on specific risks where interests clearly align, such as preventing AI-enabled bioweapons or nuclear command-and-control accidents. The precedent of nuclear arms control offers both hope and caution—the US and Soviet Union managed meaningful arms control despite existential competition, but nuclear weapons had clearer technical parameters than AI risks.
<Crux
id="commitment-credibility"
question="Can credible AI governance commitments be designed?"
domain="Coordination"
description="Whether commitment mechanisms (RSPs, treaties, escrow) can be designed that actors can't easily defect from."
importance="high"
resolvability="years"
currentState="Few tested mechanisms; mostly voluntary; enforcement unclear"
positions={[
{
view: "Credible commitments are designable",
probability: "30-45%",
implications: "Invest in mechanism design; compute governance; verification technology"
},
{
view: "Partial credibility achievable for some commitments",
probability: "35-45%",
implications: "Focus on verifiable commitments; accept limits on what can be bound"
},
{
view: "Actors will defect from any commitment when stakes are high enough",
probability: "20-30%",
implications: "Don't rely on commitments; focus on incentive alignment and technical solutions"
}
]}
wouldUpdateOn={[
"Track record of RSPs and similar commitments",
"Progress on compute governance/monitoring",
"Examples of commitment enforcement",
"Game-theoretic analysis of commitment mechanisms"
]}
relatedCruxes={["lab-coordination"]}
relevantResearch={[
{ title: "Compute governance", url: "https://arxiv.org/abs/2402.08797" }
]}
/>
The emerging field of compute governance offers the most promising avenue for credible commitment mechanisms. Unlike software or model parameters, computational resources are physical and potentially observable. Research by <EntityLink id="E153">GovAI</EntityLink> has outlined monitoring systems that could track large-scale training runs, creating verifiable bounds on certain types of AI development.
However, the feasibility of comprehensive compute monitoring remains unclear. Cloud computing, distributed training, and algorithm efficiency improvements create multiple pathways for evading monitoring systems. International variation in monitoring capabilities and willingness could create safe havens for actors seeking to avoid commitments.
### Compute Governance Verification Mechanisms
<R id="482b71342542a659">GovAI research on compute governance</R> identifies three primary mechanisms for using compute as a governance lever: tracking/monitoring compute to gain visibility into AI development; subsidizing or limiting access to shape resource allocation; and building "guardrails" into hardware to enforce rules. The AI governance platform market is projected to grow from \$227 million in 2024 to \$4.83 billion by 2034, driven by generative AI adoption and regulations like the EU AI Act.
| Verification Mechanism | Feasibility | Current Status | Key Barriers |
|----------------------|-------------|----------------|--------------|
| Training run reporting | High | Partial implementation | Voluntary compliance |
| Chip-hour tracking | Medium | Compute providers use for billing | International coordination |
| Flexible Hardware-Enabled Guarantees (FlexHEG) | Low-Medium | Research phase | Technical complexity |
| Workload classification (zero-knowledge) | Low | Theoretical | Privacy concerns, adversarial evasion |
| Data center monitoring | Medium | Limited | Jurisdiction gaps |
According to the <R id="510c42bfa643b8de">Institute for Law & AI</R>, meaningful enforcement requires regulators to be aware of or able to verify the amount of compute being used. A regulatory threshold will be ineffective if regulators have no way of knowing whether a threshold has been reached. Research on <R id="d6ad3bb2bd9d729b">verification for international AI governance</R> proposes mechanisms to verify that data centers are not conducting large AI training runs exceeding agreed-upon thresholds.
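To illustrate why verifiable compute accounting matters, the sketch below uses the common 6 × parameters × training-tokens approximation for dense-transformer training FLOP and compares hypothetical reported runs against a reporting trigger; the 10^26 FLOP figure mirrors the threshold in the 2023 US Executive Order, and the run sizes are made up.

```python
REPORTING_THRESHOLD_FLOP = 1e26   # mirrors the 2023 US Executive Order reporting trigger

def training_flop(n_params: float, n_tokens: float) -> float:
    """Standard 6*N*D approximation for dense-transformer training compute."""
    return 6 * n_params * n_tokens

# Hypothetical training runs a compute provider might report.
runs = {
    "frontier-scale run": training_flop(n_params=1e12, n_tokens=2e13),  # ~1.2e26 FLOP
    "mid-scale run": training_flop(n_params=7e10, n_tokens=2e12),       # ~8.4e23 FLOP
}
for name, flop in runs.items():
    status = "above threshold (report)" if flop >= REPORTING_THRESHOLD_FLOP else "below threshold"
    print(f"{name}: {flop:.1e} FLOP, {status}")
```

The check itself is trivial; the governance problem is that regulators currently have no independent way to confirm the parameter and token counts being reported.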
### International Governance Coordination Status
The <R id="e11a50f25b1a20df">UN High-Level Advisory Body on AI</R> submitted seven recommendations in August 2024: a twice-yearly intergovernmental dialogue; an independent international scientific panel; an AI standards exchange; a capacity development network; a global fund for AI; a global AI data framework; and a dedicated AI office within the UN Secretariat. However, <R id="3277a685c8b28fe0">academic analysis</R> concludes that a governance deficit remains due to the inadequacy of existing initiatives, gaps in the landscape, and difficulties reaching agreement on more appropriate mechanisms.
| Governance Initiative | Participants | Binding Status | Effectiveness Assessment |
|----------------------|--------------|----------------|-------------------------|
| <EntityLink id="E173">AI Safety Summits</EntityLink> | 28+ countries | Non-binding | Limited (pageantry vs progress) |
| EU AI Act | EU members | Binding | Moderate (implementation pending) |
| <EntityLink id="E366">US Executive Order</EntityLink> | US federal | Executive (rescindable) | Limited (political uncertainty) |
| UN HLAB recommendations | UN members | Non-binding | Minimal (no implementation) |
| Bilateral US-China dialogues | US, China | Ad hoc | Very limited (competition dominant) |
## Collective Intelligence and Infrastructure Cruxes
The final domain addresses whether we can build sustainable systems for truth, knowledge, and collective decision-making that can withstand both market pressures and technological disruption. These questions determine the viability of epistemic institutions as a foundation for AI governance.
### Current Epistemic Infrastructure
| Platform/System | Annual Budget | User Base | Accuracy Rate | Sustainability Model |
|-----------------|---------------|-----------|---------------|---------------------|
| Wikipedia | \$150M | 1.7B monthly | 90%+ (citations) | Donations |
| Fact-checking orgs | \$50M total | 100M+ reach | 85-95% | Mixed funding |
| Academic peer review | \$5B+ (estimated) | Research community | Variable | Institution-funded |
| <EntityLink id="E228">Prediction markets</EntityLink> | \$100M+ volume | <1M active | 75-85% | Commercial |
<Crux
id="forecasting-ai-combo"
question="Can AI + human forecasting substantially outperform either alone?"
domain="Collective Intelligence"
description="Whether combining AI forecasting with human judgment produces significantly better predictions than either approach separately."
importance="high"
resolvability="soon"
currentState="Early experiments promising; limited systematic comparison"
positions={[
{
view: "Combination is significantly better",
probability: "35-50%",
holders: ["Metaculus (testing)"],
implications: "Invest in hybrid forecasting systems; deploy widely"
},
{
view: "Benefits are modest and context-dependent",
probability: "35-45%",
implications: "Use combination where marginal gain justifies cost; domain-specific"
},
{
view: "One will dominate (AI or human); combination adds noise",
probability: "15-25%",
implications: "Figure out which is better for which questions; don't force combination"
}
]}
wouldUpdateOn={[
"Systematic comparison studies",
"Metaculus AI forecasting results",
"Domain-specific performance data"
]}
relatedCruxes={["human-ai-complementarity"]}
relevantResearch={[
{ title: "Metaculus AI", url: "https://www.metaculus.com/project/ai-forecasting/" },
{ title: "Superforecasting", url: "https://goodjudgment.com/" }
]}
/>
<R id="d99a6d0fb1edc2db">Metaculus</R> has been conducting systematic experiments with <EntityLink id="E9">AI forecasting</EntityLink> since 2023, with early results suggesting that AI systems can match or exceed human forecasters on certain types of questions, particularly those involving quantitative trends or pattern recognition from large datasets. However, humans continue to outperform on questions requiring contextual judgment, novel reasoning, or understanding of political and social dynamics.
### AI vs Human Forecasting Performance
| Question Type | AI Performance | Human Performance | Combination Performance |
|---------------|----------------|-------------------|------------------------|
| Quantitative trends | 85-90% accuracy | 75-80% accuracy | 88-93% accuracy |
| Geopolitical events | 60-70% accuracy | 75-85% accuracy | 78-88% accuracy |
| Scientific breakthroughs | 70-75% accuracy | 80-85% accuracy | 83-88% accuracy |
| Economic indicators | 80-85% accuracy | 70-75% accuracy | 83-87% accuracy |
The combination approaches show promise but remain under-tested. Initial experiments suggest that human forecasters can improve their performance by consulting AI predictions, while AI systems benefit from human-provided context and reasoning. However, the optimal architectures for human-AI collaboration remain unclear, and the cost-effectiveness compared to scaling either approach independently has not been established.
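A common way to test the combination hypothesis is to pool AI and human probability forecasts, for example by averaging in log-odds space, and then compare Brier scores on resolved questions. The sketch below uses made-up forecasts rather than Metaculus data, and the equal weighting is an assumption.

```python
import math

def pool_log_odds(p_ai: float, p_human: float, w_ai: float = 0.5) -> float:
    """Combine two probability forecasts by weighted averaging in log-odds space."""
    logit = lambda p: math.log(p / (1 - p))
    combined = w_ai * logit(p_ai) + (1 - w_ai) * logit(p_human)
    return 1 / (1 + math.exp(-combined))

def brier(forecasts: list[float], outcomes: list[int]) -> float:
    """Mean squared error between probabilistic forecasts and 0/1 outcomes (lower is better)."""
    return sum((f - o) ** 2 for f, o in zip(forecasts, outcomes)) / len(forecasts)

# Made-up forecasts on four resolved binary questions.
ai_probs = [0.80, 0.30, 0.65, 0.10]
human_probs = [0.60, 0.20, 0.85, 0.25]
outcomes = [1, 0, 1, 0]

combined = [pool_log_odds(a, h) for a, h in zip(ai_probs, human_probs)]
print("AI Brier:      ", round(brier(ai_probs, outcomes), 3))
print("Human Brier:   ", round(brier(human_probs, outcomes), 3))
print("Combined Brier:", round(brier(combined, outcomes), 3))
```

Whether such pooling reliably beats the better individual forecaster, and at what cost, is exactly what the systematic comparison studies listed above would need to establish.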
<Crux
id="epistemic-public-good"
question="Can epistemic infrastructure be funded as a public good?"
domain="Infrastructure"
description="Whether verification, fact-checking, and knowledge infrastructure can achieve sustainable funding without commercial incentives."
importance="high"
resolvability="years"
currentState="Underfunded; dependent on philanthropy and some government support"
positions={[
{
view: "Public/philanthropic funding can scale",
probability: "25-40%",
implications: "Advocate for government funding; build philanthropic case; create public institutions"
},
{
view: "Hybrid models needed (public + private)",
probability: "35-45%",
implications: "Design business models that align profit with truth; public-private partnerships"
},
{
view: "Will remain underfunded relative to commercial content",
probability: "25-35%",
implications: "Focus resources on highest-leverage applications; accept limits"
}
]}
wouldUpdateOn={[
"Government investment in epistemic infrastructure",
"Successful commercial models for verification",
"Philanthropic commitment levels",
"Platform willingness to pay for verification"
]}
relatedCruxes={["platform-incentives"]}
/>
Current epistemic infrastructure suffers from chronic underfunding relative to content generation systems. Fact-checking organizations operate on annual budgets of millions while misinformation spreads through platforms with budgets in the billions. Wikipedia, one of the most successful epistemic public goods, operates on approximately \$150 million annually while supporting roughly 1.7 billion monthly users—a funding ratio of roughly \$0.09 per monthly active user.
### Funding Landscape for Epistemic Infrastructure
| Source | Annual Contribution | Sustainability | Scalability |
|--------|-------------------|----------------|------------|
| Government | \$200M+ (EU DSA, others) | Political dependent | High potential |
| Philanthropy | \$100M+ (Omidyar, others) | Mission-driven | Medium potential |
| Platform fees | \$50M+ (voluntary) | Unreliable | Low potential |
| Commercial models | \$25M+ (fact-check APIs) | Market-dependent | High potential |
Government funding varies dramatically by jurisdiction. The EU's <R id="23e41eec572c9b30">Digital Services Act</R> includes provisions for funding fact-checking and verification systems, while the US has been more reluctant to fund what could be perceived as content moderation. Philanthropic support, led by foundations like <R id="19bac4f67b51576e">Omidyar Network</R> and <R id="54d74a3da6c73239">Craig Newmark Philanthropies</R>, has provided crucial early-stage funding but may be insufficient for the scale required.
## Current State and Trajectory
### Near-term Developments (1-2 years)
The immediate trajectory will be shaped by several ongoing developments:
- **Commercial verification systems** from major tech companies will provide real-world performance data
- **Regulatory frameworks** in the EU and potentially other jurisdictions will test enforcement mechanisms
- **International coordination** through AI Safety Institutes and summits will reveal cooperation possibilities
- **Lab RSP implementation** will demonstrate voluntary coordination track record
### Medium-term Projections (2-5 years)
| Domain | Most Likely Outcome | Probability | Strategic Implications |
|--------|-------------------|-------------|----------------------|
| Technical verification | Modest success, arms race dynamics | 60% | Continued R&D investment, no single solution |
| Lab coordination | External oversight required | 65% | Regulatory frameworks necessary |
| International governance | Narrow cooperation only | 55% | Focus on specific risks, not comprehensive regime |
| Epistemic infrastructure | Chronically underfunded | 70% | Accept limited scale, prioritize high-leverage applications |
The resolution of these solution cruxes will fundamentally shape AI safety strategy over the next decade. If technical verification approaches prove viable, we may see an arms race between generation and detection systems. If coordination mechanisms succeed, we could see the emergence of global AI governance institutions. If they fail, we may face an uncoordinated race with significant safety risks.
## Key Research Priorities
The highest-priority uncertainties requiring systematic research include:
### Technical Verification Research
- **Systematic adversarial testing** of verification systems across attack scenarios
- **Economic analysis** comparing costs of verification vs generation at scale
- **Theoretical bounds** on detection performance under optimal adversarial conditions
- **User behavior studies** on provenance checking and verification adoption
### Coordination Mechanism Analysis
- **Game-theoretic modeling** of commitment mechanisms under competitive pressure
- **Historical analysis** of coordination successes and failures in high-stakes domains
- **Empirical tracking** of RSP implementation and compliance across labs
- **Regulatory effectiveness** studies comparing different governance approaches
### Epistemic Infrastructure Design
- **Hybrid system architecture** for combining AI and human judgment optimally
- **Funding model innovation** for sustainable epistemic public goods
- **Platform integration** studies for verification system adoption
- **Cross-platform coordination** mechanisms for epistemic infrastructure
## Key Uncertainties and Strategic Dependencies
These cruxes are interconnected in complex ways that create strategic dependencies:
- **Technical feasibility affects coordination incentives**: If verification systems work well, labs may be more willing to adopt them voluntarily
- **Coordination success affects infrastructure funding**: Successful international cooperation could unlock government investment in epistemic public goods
- **Infrastructure sustainability affects technical development**: Reliable funding enables long-term R&D programs for verification systems
- **International dynamics affect all domains**: US-China competition shapes both technical development and coordination possibilities
Understanding these dependencies will be crucial for developing comprehensive solution strategies that account for the interconnected nature of technical, coordination, and infrastructure challenges.
---
## Sources & Resources
### Technical Research Organizations
| Organization | Focus Area | Key Publications |
|-------------|-----------|------------------|
| <R id="1adec5eb6a75f559">DARPA</R> | Semantic forensics, verification | <R id="7671d8111f8b8247">SemaFor program</R> |
| <R id="ff89bed1f7960ab2">C2PA</R> | Content provenance standards | <R id="9b09e69e5f2a9f78">Technical specification</R> |
| <R id="0ef9b0fe0f3c92b4">Google DeepMind</R> | Watermarking, detection | <R id="fc492fd338071abd">SynthID research</R> |
### Governance and Coordination Research
| Organization | Focus Area | Key Resources |
|-------------|-----------|---------------|
| <R id="f35c467b353f990f">GovAI</R> | AI governance, coordination | <R id="26494a9f05b9db4d">Compute governance research</R> |
| <R id="0a17f30e99091ebf">RAND Corporation</R> | Strategic analysis | <R id="cf5fd74e8db11565">AI competition studies</R> |
| <R id="58f6946af0177ca5">CNAS</R> | Security, international relations | <R id="fbd5f171b9a891f3">AI security reports</R> |
### Epistemic Infrastructure Organizations
| Organization | Focus Area | Key Resources |
|-------------|-----------|---------------|
| <R id="d99a6d0fb1edc2db">Metaculus</R> | Forecasting, prediction | <R id="10ca22c5e88ffee9">AI forecasting project</R> |
| <R id="664518d11aec3317">Good Judgment</R> | Superforecasting | Crowd forecasting methodology |
### Safety Research and Evaluation
| Organization | Focus Area | Key Resources |
|-------------|-----------|---------------|
| <R id="45370a5153534152">METR</R> | Third-party AI evaluations | Autonomous capability assessments |
| <R id="5a651b8ed18ffeb1">Anthropic Alignment</R> | Technical alignment research | <R id="7ae6b3be2d2043c1">Research directions 2025</R> |
| <R id="fdf68a8f30f57dee">UK AI Safety Institute</R> | Government evaluations | <R id="533b576199ec323d">Evaluation approach</R> |
### Key 2024-2025 Reports
| Report | Organization | Focus |
|--------|-------------|-------|
| <R id="97185b28d68545b4">2025 AI Safety Index</R> | Future of Life Institute | Industry safety practices |
| <R id="b163447fdc804872">International AI Safety Report 2025</R> | 96 AI experts, 30 countries | Global safety assessment |
| <R id="36fb43e4e059f0c9">Shallow Review of Technical AI Safety 2025</R> | Alignment Forum | Research progress review |
| <R id="b1d6e7501debf627">Mechanistic Interpretability Review</R> | TMLR | Interpretability research survey |
| <R id="482b71342542a659">Computing Power and AI Governance</R> | GovAI | Compute governance mechanisms |
| <R id="3277a685c8b28fe0">Global AI Governance Analysis</R> | International Affairs | Governance deficit assessment |