# llm_with_tracing.py
# Purpose: A production-ready LLM call wrapper with full observability.
# Every call is traced in Langfuse: input, output, tokens, cost, latency.
#
# Prerequisites:
# pip install langfuse anthropic python-dotenv
#
# Setup:
# 1. Create a free account at https://cloud.langfuse.com
# 2. Get your keys from Settings > API Keys
# 3. Create a .env file with the variables below
#
# Run:
# python llm_with_tracing.py
import os
import time
from dotenv import load_dotenv
import anthropic
from langfuse import Langfuse
# Load environment variables from .env file
load_dotenv()
# Required environment variables in your .env:
# LANGFUSE_PUBLIC_KEY=pk-lf-…
# LANGFUSE_SECRET_KEY=sk-lf-…
# LANGFUSE_HOST=https://cloud.langfuse.com (or your self-hosted URL)
# ANTHROPIC_API_KEY=sk-ant-…
# Initialize clients
langfuse_client = Langfuse() # Reads keys automatically from environment
anthropic_client = anthropic.Anthropic() # Reads ANTHROPIC_API_KEY from environment
# ── Configuration ─────────────────────────────────────────────────────────────
# Store your prompt here, not inline in the API call.
# This makes it versionable and testable independently.
SYSTEM_PROMPT = “”“You are a helpful customer support assistant.
Answer questions clearly and concisely.
If you do not know something, say so directly — do not guess.”“”
MODEL = “claude-sonnet-4-20250514”
# Anthropic’s pricing as of mid-2026 (update when pricing changes)
# Used to calculate cost per call for cost tracking
COST_PER_INPUT_TOKEN = 3.00 / 1_000_000 # $3.00 per million input tokens
COST_PER_OUTPUT_TOKEN = 15.00 / 1_000_000 # $15.00 per million output tokens
def call_llm_with_tracing(
user_message: str,
session_id: str = “default-session”,
user_id: str = “anonymous”
) -> str:
“”“
Make a traced LLM call. Every call creates a Langfuse trace with:
– Full input and output
– Token usage (input, output, total)
– Calculated cost in USD
– Latency in milliseconds
– Model used and session context
Parameters:
user_message : The message from the user
session_id : Groups related calls into one conversation in Langfuse
user_id : Associates the call with a specific user for analytics
Returns:
The LLM response as a string
““”
# Create a top-level trace for this user interaction
# The trace appears in the Langfuse dashboard as one unit of work
trace = langfuse_client.trace(
name=“customer-support-call”,
session_id=session_id,
user_id=user_id,
input={“user_message”: user_message, “system_prompt”: SYSTEM_PROMPT}
)
# Create a generation span inside the trace
# This captures model-specific details: model name, tokens, cost
generation = trace.generation(
name=“claude-completion”,
model=MODEL,
input={
“system”: SYSTEM_PROMPT,
“messages”: [{“role”: “user”, “content”: user_message}]
}
)
start_time = time.time()
try:
# Make the API call
response = anthropic_client.messages.create(
model=MODEL,
max_tokens=1024,
system=SYSTEM_PROMPT,
messages=[{“role”: “user”, “content”: user_message}]
)
latency_ms = int((time.time() – start_time) * 1000)
# Extract the response text
response_text = response.content[0].text
# Extract token usage from the response
input_tokens = response.usage.input_tokens
output_tokens = response.usage.output_tokens
total_tokens = input_tokens + output_tokens
# Calculate cost for this call
cost_usd = (
input_tokens * COST_PER_INPUT_TOKEN +
output_tokens * COST_PER_OUTPUT_TOKEN
)
# Update the generation span with results
# This data populates the Langfuse cost and token dashboards
generation.end(
output=response_text,
usage={
“input”: input_tokens,
“output”: output_tokens,
“total”: total_tokens,
“unit”: “TOKENS”
},
metadata={
“latency_ms”: latency_ms,
“cost_usd”: round(cost_usd, 6),
“model”: MODEL
}
)
# Update the trace with the final output
trace.update(
output={“response”: response_text},
metadata={“total_cost_usd”: round(cost_usd, 6)}
)
# Print a summary to stdout for local visibility
print(f“\n{‘─’ * 60}”)
print(f“User: {user_message}”)
print(f“Claude: {response_text}”)
print(f“Tokens: {input_tokens} in / {output_tokens} out / {total_tokens} total”)
print(f“Cost: ${cost_usd:.6f}”)
print(f“Latency: {latency_ms}ms”)
print(f“Trace: {langfuse_client.base_url}/trace/{trace.id}”)
print(f“{‘─’ * 60}\n”)
return response_text
except Exception as e:
# Record the error in the trace so it shows up in Langfuse
generation.end(
output=None,
metadata={“error”: str(e), “latency_ms”: int((time.time() – start_time) * 1000)}
)
trace.update(output={“error”: str(e)})
# Always flush before raising — ensures the error trace is sent
langfuse_client.flush()
raise
finally:
# Flush sends all buffered events to Langfuse
# In a long-running service, Langfuse flushes automatically.
# In a script, you must flush manually before the process exits.
langfuse_client.flush()
# ── Run a demonstration ────────────────────────────────────────────────────────
if __name__ == “__main__”:
# Simulate two turns of a customer support conversation
test_messages = [
“What is your return policy for electronics?”,
“Can I return an item I bought 45 days ago?”
]
session = “demo-session-001”
for i, message in enumerate(test_messages):
print(f“\nCall {i + 1}/{len(test_messages)}”)
try:
call_llm_with_tracing(
user_message=message,
session_id=session,
user_id=“test-user-42”
)
except Exception as e:
print(f“Error on call {i + 1}: {e}”)
