Get your first hallucination regression report in under 10 minutes. The only change to your existing code is adding logprobs=True and top_logprobs=5 to your completion call.
# Core
pip install llmguard
# With Claude/Gemini proxy support
# (quoted so shells such as zsh do not treat the brackets as a glob pattern)
pip install "llmguard[proxy]"
export LLMGUARD_API_KEY="lg_sk_your_key_here"
import llmguard
from openai import OpenAI

client = OpenAI()
guard = llmguard.Client(api_key="lg_sk_...")

# Your existing call with two added arguments
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": prompt}],
    logprobs=True,   # ← added
    top_logprobs=5,  # ← added
)

result = guard.evaluate(
    response=response,
    checkpoint_id="v15",
    baseline_id="v14",
)

print(result.risk_score)      # 0.847
print(result.output_class)    # "confident_hallucination"
print(result.flagged_ranges)  # [[12, 18], [34, 41]]

curl -X POST https://api.llmguard.io/v1/evaluate \
  -H "Authorization: Bearer lg_sk_..." \
  -H "Content-Type: application/json" \
  -d '{
    "feature_matrix": [[0.91, 0.04, 2.41, 0.82, 0.012], ...],
    "model_family": "gpt-4o",
    "checkpoint_id": "v15",
    "baseline_id": "v14",
    "domain": "legal"
  }'
All API requests must include your API key as a Bearer token in the Authorization header.
Authorization: Bearer lg_sk_your_api_key_here
API keys are generated in your Settings → API Keys page. Never expose your API key in client-side code or public repositories.
The simplest integration path. Add logprobs=True and top_logprobs=5 to any ChatCompletion call during evaluation runs.
import llmguard
from openai import OpenAI
client = OpenAI()
# NOTE(review): no api_key is passed here; presumably the SDK falls back to
# the LLMGUARD_API_KEY environment variable -- confirm against the SDK docs.
guard = llmguard.Client()

# Batch evaluation example: candidate checkpoint "v15" is compared against
# baseline "v14" on the "legal_qa_1000" prompt suite.
batch = guard.create_batch(
    checkpoint_id="v15",
    baseline_id="v14",
    prompt_suite="legal_qa_1000",
)


def my_model(prompt: str):
    """Send one user prompt to gpt-4o and return the raw completion response.

    The call requests log-probabilities (``logprobs=True``) with the five
    most likely alternatives per token (``top_logprobs=5``).

    Args:
        prompt: Text sent as the single user message.

    Returns:
        The object returned by ``client.chat.completions.create``.
    """
    return client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        logprobs=True,
        top_logprobs=5,
    )


# The batch is given the function above as its model callable.
report = batch.run(model_fn=my_model)
print(f"Delta: {report.hallucination_delta}")
print(f"Recommendation: {report.recommendation}")
from vllm import LLM, SamplingParams
import llmguard
llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
# logprobs=5 asks vLLM to return the top-5 log-probabilities per generated token.
params = SamplingParams(temperature=0.7, logprobs=5)
guard = llmguard.Client()

# NOTE: `prompts` is not defined in this snippet; supply your own list of
# evaluation prompts before running it.
outputs = llm.generate(prompts, params)
for output in outputs:
    # Score each generated output of the candidate checkpoint against the baseline.
    result = guard.evaluate_vllm(
        output=output,
        checkpoint_id="llama3-ft-v3",
        baseline_id="llama3-ft-v2",
    )
    print(result.risk_score, result.output_class)
import anthropic, llmguard
client = anthropic.Anthropic()
guard = llmguard.Client()

# NOTE: `prompt` is not defined in this snippet; supply your own prompt text.
response = client.messages.create(
    model="claude-opus-4-5",
    max_tokens=1024,
    messages=[{"role": "user", "content": prompt}],
)

# The SDK detects the missing logprobs and routes the evaluation to the
# proxy automatically.
result = guard.evaluate(
    response=response,
    checkpoint_id="claude-ft-v2",
    baseline_id="claude-ft-v1",
    proxy="llama3-8b",
)
print(result.metadata.evaluation_method)  # "proxy"
Score a single token sequence against the TTM hallucination model. The feature matrix must be pre-computed by the SDK on the client side.
{
"risk_score": 0.847,
"output_class": "confident_hallucination",
"class_probabilities": {
"confident_correct": 0.041,
"confident_hallucination": 0.847,
"uncertain_hallucination": 0.062,
"genuine_uncertainty": 0.031,
"creative_generation": 0.019
},
"flagged_token_ranges": [[12, 18], [34, 41]],
"inference_latency_ms": 4.2,
"calibration_version": "2025-07-01",
"evaluation_method": "native"
}
{
"batch_id": "batch_abc123",
"status": "created",
"estimated_duration_minutes": 12,
"prompt_count": 1000
}
{
"batch_id": "batch_abc123",
"status": "complete",
"hallucination_delta": 0.127,
"confidence_interval_95": [0.089, 0.165],
"recommendation": "block_deployment",
"domain_breakdown": {
"legal": {"baseline": 0.312, "candidate": 0.441, "delta": 0.129},
"reasoning": {"baseline": 0.231, "candidate": 0.287, "delta": 0.056}
},
"confident_hallucination_stats": {
"baseline_rate": 0.031,
"candidate_rate": 0.089,
"delta": 0.058
},
"generated_at": "2025-07-15T14:32:11Z",
"evaluation_duration_seconds": 487
}
| Exception | Cause | Handling |
|---|---|---|
| LLMGuardAuthError | Invalid or expired API key | Re-generate key in Settings |
| LLMGuardRateLimitError | Rate limit exceeded | Respect retry_after_seconds attribute |
| LLMGuardNoLogprobsError | Response has no logprob data | Use evaluate_with_proxy(), or pass proxy=... to evaluate() as in the Claude example |
| LLMGuardCalibrationError | Unknown model family | Specify supported model_family |
| Signal | Formula | Range | High Value Means |
|---|---|---|---|
| top1_prob | softmax(L)[argmax] | [0, 1] | Model strongly prefers one token |
| top2_prob | softmax(L)[2nd] | [0, 0.5] | Model has a strong second choice |
| logit_gap | L[1st] − L[2nd] | [0, ∞) | High relative confidence in top choice |
| entropy | −Σ p·log₂(p) | [0, log₂(V)] | Uncertainty spread across many tokens |
| temporal_variance | Var(P₁[t−N:t]) | [0, 0.25] | Confidence fluctuating over recent tokens |