The Roadmap for Mastering LLMOps in 2026

# llm_with_tracing.py

# Purpose: A production-ready LLM call wrapper with full observability.

# Every call is traced in Langfuse: input, output, tokens, cost, latency.

# Prerequisites:

# pip install langfuse anthropic python-dotenv

# Setup:

# 1. Create a free account at https://cloud.langfuse.com

# 2. Get your keys from Settings > API Keys

# 3. Create a .env file with the variables below

# Run:

# python llm_with_tracing.py

import os

import time

from dotenv import load_dotenv

import anthropic

from langfuse import Langfuse

# Load environment variables from .env file

load_dotenv()

# Required environment variables in your .env:

# LANGFUSE_PUBLIC_KEY=pk-lf-…

# LANGFUSE_SECRET_KEY=sk-lf-…

# LANGFUSE_HOST=https://cloud.langfuse.com (or your self-hosted URL)

# ANTHROPIC_API_KEY=sk-ant-…

# Initialize clients

langfuse_client = Langfuse() # Reads keys automatically from environment

anthropic_client = anthropic.Anthropic() # Reads ANTHROPIC_API_KEY from environment

# ── Configuration ─────────────────────────────────────────────────────────────

# Store your prompt here, not inline in the API call.

# This makes it versionable and testable independently.

SYSTEM_PROMPT = “”“You are a helpful customer support assistant.

Answer questions clearly and concisely.

If you do not know something, say so directly — do not guess.”“”

MODEL = “claude-sonnet-4-20250514”

# Anthropic’s pricing as of mid-2026 (update when pricing changes)

# Used to calculate cost per call for cost tracking

COST_PER_INPUT_TOKEN = 3.00 / 1_000_000 # $3.00 per million input tokens

COST_PER_OUTPUT_TOKEN = 15.00 / 1_000_000 # $15.00 per million output tokens

def call_llm_with_tracing(

user_message: str,

session_id: str = “default-session”,

user_id: str = “anonymous”

) -> str:

“”“

Make a traced LLM call. Every call creates a Langfuse trace with:

– Full input and output

– Token usage (input, output, total)

– Calculated cost in USD

– Latency in milliseconds

– Model used and session context

Parameters:

user_message : The message from the user

session_id : Groups related calls into one conversation in Langfuse

user_id : Associates the call with a specific user for analytics

Returns:

The LLM response as a string

““”

# Create a top-level trace for this user interaction

# The trace appears in the Langfuse dashboard as one unit of work

trace = langfuse_client.trace(

name=“customer-support-call”,

session_id=session_id,

user_id=user_id,

input={“user_message”: user_message, “system_prompt”: SYSTEM_PROMPT}

)

# Create a generation span inside the trace

# This captures model-specific details: model name, tokens, cost

generation = trace.generation(

name=“claude-completion”,

model=MODEL,

input={

“system”: SYSTEM_PROMPT,

“messages”: [{“role”: “user”, “content”: user_message}]

}

)

start_time = time.time()

try:

# Make the API call

response = anthropic_client.messages.create(

model=MODEL,

max_tokens=1024,

system=SYSTEM_PROMPT,

messages=[{“role”: “user”, “content”: user_message}]

)

latency_ms = int((time.time() – start_time) * 1000)

# Extract the response text

response_text = response.content[0].text

# Extract token usage from the response

input_tokens = response.usage.input_tokens

output_tokens = response.usage.output_tokens

total_tokens = input_tokens + output_tokens

# Calculate cost for this call

cost_usd = (

input_tokens * COST_PER_INPUT_TOKEN +

output_tokens * COST_PER_OUTPUT_TOKEN

)

# Update the generation span with results

# This data populates the Langfuse cost and token dashboards

generation.end(

output=response_text,

usage={

“input”: input_tokens,

“output”: output_tokens,

“total”: total_tokens,

“unit”: “TOKENS”

metadata={

“latency_ms”: latency_ms,

“cost_usd”: round(cost_usd, 6),

“model”: MODEL

}

)

# Update the trace with the final output

trace.update(

output={“response”: response_text},

metadata={“total_cost_usd”: round(cost_usd, 6)}

)

# Print a summary to stdout for local visibility

print(f“\n{‘─’ * 60}”)

print(f“User: {user_message}”)

print(f“Claude: {response_text}”)

print(f“Tokens: {input_tokens} in / {output_tokens} out / {total_tokens} total”)

print(f“Cost: ${cost_usd:.6f}”)

print(f“Latency: {latency_ms}ms”)

print(f“Trace: {langfuse_client.base_url}/trace/{trace.id}”)

print(f“{‘─’ * 60}\n”)

return response_text

except Exception as e:

# Record the error in the trace so it shows up in Langfuse

generation.end(

output=None,

metadata={“error”: str(e), “latency_ms”: int((time.time() – start_time) * 1000)}

)

trace.update(output={“error”: str(e)})

# Always flush before raising — ensures the error trace is sent

langfuse_client.flush()

raise

finally:

# Flush sends all buffered events to Langfuse

# In a long-running service, Langfuse flushes automatically.

# In a script, you must flush manually before the process exits.

langfuse_client.flush()

# ── Run a demonstration ────────────────────────────────────────────────────────

if __name__ == “__main__”:

# Simulate two turns of a customer support conversation

test_messages = [

“What is your return policy for electronics?”,

“Can I return an item I bought 45 days ago?”

]

session = “demo-session-001”

for i, message in enumerate(test_messages):

print(f“\nCall {i + 1}/{len(test_messages)}”)

try:

call_llm_with_tracing(

user_message=message,

session_id=session,

user_id=“test-user-42”

)

except Exception as e:

print(f“Error on call {i + 1}: {e}”)

Source link

The Roadmap for Mastering LLMOps in 2026

Related Posts