The auriko Python package provides an OpenAI-compatible client for the Auriko API.
Full SDK Reference Complete API reference with all types, parameters, and examples
Installation
Requires Python 3.10 or later.
Get started
from auriko import Client
client = Client() # reads AURIKO_API_KEY from environment
response = client.chat.completions.create(
model = "gpt-5.4" ,
messages = [{ "role" : "user" , "content" : "Hello!" }]
)
print (response.choices[ 0 ].message.content)
API Key
import os
# Option 1: Auto-detect from AURIKO_API_KEY env var (recommended)
client = Client()
# Option 2: Pass explicitly
client = Client( api_key = os.environ[ "AURIKO_API_KEY" ])
Base URL
# Default: https://api.auriko.ai/v1
# Override for self-hosted or proxy setups:
client = Client( base_url = "https://your-proxy.example.com/v1" )
Timeout
client = Client( timeout = 60.0 ) # seconds
Retries
client = Client( max_retries = 3 ) # default is 2
Create chat completions
Basic request
Send a chat completion request:
response = client.chat.completions.create(
model = "gpt-4o" ,
messages = [
{ "role" : "system" , "content" : "You are a helpful assistant." },
{ "role" : "user" , "content" : "What is 2+2?" }
]
)
print (response.choices[ 0 ].message.content)
With routing options
response = client.chat.completions.create(
model = "gpt-5.4" ,
messages = [{ "role" : "user" , "content" : "Hello!" }],
routing = {
"optimize" : "cost" ,
"max_ttft_ms" : 200 ,
}
)
# Access routing metadata
print ( f "Provider: { response.routing_metadata.provider } " )
if response.routing_metadata.cost:
print ( f "Cost: $ { response.routing_metadata.cost.billable_cost_usd :.6f} " )
You can also pass a RoutingOptions object for IDE autocomplete and validation:
from auriko.route_types import RoutingOptions, Optimize
response = client.chat.completions.create(
model = "gpt-5.4" ,
messages = [{ "role" : "user" , "content" : "Hello!" }],
routing = RoutingOptions( optimize = Optimize. COST , max_ttft_ms = 200 )
)
All routing fields:
| Field | Type | Description |
|---|---|---|
| optimize | Optimize | Strategy: "cost", "speed", "ttft", "throughput", "balanced", "cheapest" |
| weights | dict[str, float] | Custom scoring weights: cost, ttft, throughput, reliability. Overrides preset. |
| max_cost_per_1m | float | Max cost per 1M tokens |
| max_ttft_ms | int | Max time to first token (ms) |
| min_throughput_tps | float | Min tokens per second |
| min_success_rate | float | Min provider success rate (0.0–1.0) |
| providers | list[str] | Allowlist of providers |
| exclude_providers | list[str] | Blocklist of providers |
| prefer | str | Preferred provider (soft preference) |
| mode | Mode | "pool" (default) or "fallback" |
| allow_fallbacks | bool | Enable fallback on failure |
| max_fallback_attempts | int | Max fallback retries |
| data_policy | DataPolicy | "none", "no_training", "zdr" |
| only_byok | bool | Only use BYOK providers |
| only_platform | bool | Only use platform providers |
See Advanced Routing for detailed strategy guides.
Multi-model routing
Route a request across multiple models. The router picks the best option based on your routing strategy:
response = client.chat.completions.create(
models = [ "gpt-4o" , "claude-sonnet-4-20250514" , "gemini-2.5-flash" ],
messages = [{ "role" : "user" , "content" : "Explain quantum computing briefly." }],
routing = { "optimize" : "cost" }
)
print ( f "Model used: { response.model } " )
print ( f "Provider: { response.routing_metadata.provider } " )
print (response.choices[ 0 ].message.content)
model and models are mutually exclusive. Specify exactly one. Passing both raises InvalidRequestError.
Extended thinking
Enable extended reasoning for complex tasks using the extensions parameter:
response = client.chat.completions.create(
model = "claude-sonnet-4-20250514" ,
messages = [{ "role" : "user" , "content" : "Solve step by step: what is 23! / 20!?" }],
extensions = { "thinking" : { "enabled" : True , "budget_tokens" : 10000 }}
)
# Access the reasoning output (if the model returns it)
if response.choices[ 0 ].message.reasoning_content:
print ( f "Reasoning: { response.choices[ 0 ].message.reasoning_content } " )
print ( f "Answer: { response.choices[ 0 ].message.content } " )
You can also pass provider-specific parameters through extensions:
response = client.chat.completions.create(
model = "gpt-4o" ,
messages = [{ "role" : "user" , "content" : "Hello!" }],
extensions = { "openai" : { "logit_bias" : { "1234" : - 100 }}}
)
See Extensions and Thinking for provider details and streaming thinking output.
Attach request metadata
Attach metadata to requests for tracking and analytics:
response = client.chat.completions.create(
model = "gpt-4o" ,
messages = [{ "role" : "user" , "content" : "Hello!" }],
auriko_metadata = { "session_id" : "abc-123" , "user_tier" : "premium" }
)
The Auriko dashboard logs and displays your metadata.
Stream responses
stream = client.chat.completions.create(
model = "gpt-4o" ,
messages = [{ "role" : "user" , "content" : "Count to 10" }],
stream = True
)
for chunk in stream:
if chunk.choices and chunk.choices[ 0 ].delta.content:
print (chunk.choices[ 0 ].delta.content, end = "" , flush = True )
After consuming all chunks, access stream-level metadata:
print ( f " \n Provider: { stream.routing_metadata.provider } " )
print ( f "Tokens: { stream.usage.total_tokens } " )
print ( f "Request ID: { stream.response_headers.request_id } " )
Use a context manager for automatic cleanup:
with client.chat.completions.create(
model = "gpt-4o" ,
messages = [{ "role" : "user" , "content" : "Count to 10" }],
stream = True
) as stream:
for chunk in stream:
if chunk.choices and chunk.choices[ 0 ].delta.content:
print (chunk.choices[ 0 ].delta.content, end = "" , flush = True )
# stream is automatically closed
Or close manually with stream.close().
Routing metadata, usage, and response headers are available only after consuming all chunks.
See Streaming Guide for full patterns including tool call streaming.
Call tools
Define tool schemas and pass them with the request:
tools = [
{
"type" : "function" ,
"function" : {
"name" : "get_weather" ,
"description" : "Get weather for a city" ,
"parameters" : {
"type" : "object" ,
"properties" : {
"city" : { "type" : "string" }
},
"required" : [ "city" ]
}
}
}
]
response = client.chat.completions.create(
model = "gpt-4o" ,
messages = [{ "role" : "user" , "content" : "What's the weather in Paris?" }],
tools = tools
)
if response.choices[ 0 ].message.tool_calls:
tool_call = response.choices[ 0 ].message.tool_calls[ 0 ]
print ( f "Function: { tool_call.function.name } " )
print ( f "Arguments: { tool_call.function.arguments } " )
See Tool Calling Guide for multi-turn tool conversations.
Read response headers
Every response and error includes a response_headers object with typed accessors:
response = client.chat.completions.create(
model = "gpt-4o" ,
messages = [{ "role" : "user" , "content" : "Hello!" }]
)
response.response_headers.request_id # str | None
response.response_headers.rate_limit_remaining # int | None
response.response_headers.rate_limit_limit # int | None
response.response_headers.rate_limit_reset # str | None
response.response_headers.credits_balance_microdollars # int | None
response.response_headers.provider_used # str | None
response.response_headers.routing_strategy # str | None
response.response_headers.get( "x-custom-header" ) # generic lookup
| Property | Header | Type |
|---|---|---|
| request_id | x-request-id | str or None |
| rate_limit_remaining | x-ratelimit-remaining-requests | int or None |
| rate_limit_limit | x-ratelimit-limit-requests | int or None |
| rate_limit_reset | x-ratelimit-reset-requests | str or None |
| credits_balance_microdollars | x-credits-balance-microdollars | int or None |
| provider_used | x-provider-used | str or None |
| routing_strategy | x-routing-strategy | str or None |
Error objects also carry response_headers. Use e.response_headers.request_id when filing support tickets to correlate with server logs.
See the Python SDK Reference for the complete ResponseHeaders API.
Read token usage
The Usage object on every response carries optional detail breakdowns:
response = client.chat.completions.create(
model = "gpt-4o" ,
messages = [{ "role" : "user" , "content" : "Hello!" }]
)
usage = response.usage
# Prompt token breakdown
if usage.prompt_tokens_details:
print ( f "Cached: { usage.prompt_tokens_details.cached_tokens } " )
print ( f "Text: { usage.prompt_tokens_details.text_tokens } " )
print ( f "Image: { usage.prompt_tokens_details.image_tokens } " )
print ( f "Audio: { usage.prompt_tokens_details.audio_tokens } " )
# Completion token breakdown
if usage.completion_tokens_details:
print ( f "Reasoning: { usage.completion_tokens_details.reasoning_tokens } " )
print ( f "Text: { usage.completion_tokens_details.text_tokens } " )
| Field | Sub-fields | Type |
|---|---|---|
| prompt_tokens_details | cached_tokens, text_tokens, image_tokens, audio_tokens | Optional[int] each |
| completion_tokens_details | reasoning_tokens, text_tokens, image_tokens, audio_tokens | Optional[int] each |
Availability depends on the provider. completion_tokens_details.reasoning_tokens is present for OpenAI o-series, DeepSeek, xAI, and Google Gemini. It’s None for providers that don’t report reasoning token counts (Anthropic, Moonshot, Fireworks).
See Check reasoning token availability for the full breakdown.
Handle errors
Catch typed exceptions:
from auriko import (
Client,
AurikoAPIError,
AuthenticationError,
RateLimitError,
BudgetExceededError,
ModelNotFoundError,
ProviderError,
# Also available: InvalidRequestError, InsufficientCreditsError,
# InternalError, ProviderAuthError, ServiceUnavailableError
)
client = Client()
try :
response = client.chat.completions.create(
model = "gpt-4o" ,
messages = [{ "role" : "user" , "content" : "Hello!" }]
)
except AuthenticationError as e:
print ( f "Check your API key: { e } " )
except RateLimitError as e:
print ( f "Rate limited: { e } " )
except BudgetExceededError as e:
print ( f "Budget exceeded: { e } " )
except ModelNotFoundError as e:
print ( f "Model not found: { e } " )
except ProviderError as e:
print ( f "Provider error: { e } " )
except AurikoAPIError as e:
print ( f "API error ( { e.status_code } ): { e } " )
See Error Handling Guide for retry patterns and map_openai_error().
Use management APIs
Query workspace, budget, and model information:
# Identity (discover your workspace)
identity = client.me.get()
print ( f "Workspace: { identity.workspace_id } " )
# Workspaces
workspaces = client.workspaces.list()
workspace = client.workspaces.get( "ws-123" )
# Budgets
budgets = client.budgets.list( "ws-123" )
budget = client.budgets.get( "ws-123" , "budget-456" )
# Models
registry = client.models.list_registry()
directory = client.models.list_directory()
providers = client.models.list_providers()
Model listing choices
| Method | Returns | Use when |
|---|---|---|
| list_registry() | Flat list: id, family, display_name | You need a quick model ID lookup |
| list_directory() | Rich detail: provider entries, context windows, capabilities, pricing tiers | You need to compare providers or check capabilities |
| list_providers() | Provider catalog: display name, description, data policy | You need to see available providers |
See the Python SDK Reference for the complete API.
Use async client
Use the async client for non-blocking requests:
from auriko import AsyncClient
async def main ():
client = AsyncClient()
response = await client.chat.completions.create(
model = "gpt-4o" ,
messages = [{ "role" : "user" , "content" : "Hello!" }]
)
print (response.choices[ 0 ].message.content)
import asyncio
asyncio.run(main())
Async streaming
Stream responses asynchronously:
from auriko import AsyncClient
async def stream_response ():
client = AsyncClient()
stream = await client.chat.completions.create(
model = "gpt-4o" ,
messages = [{ "role" : "user" , "content" : "Count to 10" }],
stream = True
)
async for chunk in stream:
if chunk.choices and chunk.choices[ 0 ].delta.content:
print (chunk.choices[ 0 ].delta.content, end = "" , flush = True )
Async context manager
Use async with for automatic connection cleanup:
from auriko import AsyncClient
async def main ():
async with AsyncClient() as client:
response = await client.chat.completions.create(
model = "gpt-4o" ,
messages = [{ "role" : "user" , "content" : "Hello!" }]
)
print (response.choices[ 0 ].message.content)
# client.close() called automatically
Or close explicitly: await client.close()
Use context managers
Use a context manager for automatic cleanup:
with Client() as client:
response = client.chat.completions.create(
model = "gpt-4o" ,
messages = [{ "role" : "user" , "content" : "Hello!" }]
)
print (response.choices[ 0 ].message.content)
SDK scope
The Auriko SDK covers: inference (chat completions with routing), read-only management (workspaces, budgets, identity), and model discovery. For full platform operations (workspace creation, budget management, API key rotation), use the REST API directly.
Use type hints
The SDK provides typed responses, errors, and routing configuration. Use your IDE’s autocomplete for the best experience:
from auriko import Client
from auriko.models.chat import ChatCompletion, ChatCompletionChunk
client = Client()
response: ChatCompletion = client.chat.completions.create(
model = "gpt-4o" ,
messages = [{ "role" : "user" , "content" : "Hello!" }]
)