Disaggregated Serving¶
Source https://github.com/vllm-project/vllm/tree/main/examples/online_serving/disaggregated_serving.
Disagg Proxy Multiturn¶
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Disaggregated Prefill/Decode Proxy with Bidirectional KV Transfer
This proxy sits between clients and a vLLM Prefill/Decode (P/D) deployment,
routing multi-turn chat requests so that each turn reuses KV cache blocks
from the previous turn's Decode node via bidirectional KV transfer.
Architecture:
    Client ──► Proxy ──► Prefill (P) ──► Decode (D)
                 │            │              │
                 │ kv_transfer_params flow:  │
                 │ D finish ──► proxy caches │
                 │ next turn ──► proxy sends │
                 │ cached D blocks to P ──►  │
                 │ P reads D blocks (bidir)  │
                 │ P sends its blocks to D   │
Per-request flow:
1. Client sends chat/completions request to proxy.
2. Proxy looks up cached D block info from the previous turn
(keyed by conversation_id).
3. If cache hit, proxy attaches D's block info to the request
so P can read D's KV blocks instead of recomputing.
4. Proxy sends request to P (max_tokens=1, non-streaming).
5. P returns kv_transfer_params with its own block info.
6. Proxy forwards request + P's block info to D (streaming).
7. D streams the response. The final chunk includes D's
kv_transfer_params, which the proxy caches for the next turn.
8. Proxy returns D's response to the client.
Conversation isolation:
Each request must include a ``conversation_id`` field (top-level in
the JSON body) to scope the KV cache across turns. Without it, the
proxy cannot link turns and falls back to no-cache behavior.
Usage:
python disagg_proxy_multiturn.py \\
--host 0.0.0.0 --port 8000 \\
--prefiller-host 10.0.0.1 --prefiller-port 8100 \\
--decoder-host 10.0.0.2 --decoder-port 8200
Dependencies:
pip install fastapi uvicorn httpx
"""
from __future__ import annotations
import argparse
import itertools
import json
import logging
import os
import time
import uuid
from contextlib import asynccontextmanager
from dataclasses import dataclass, field
from typing import Any
import httpx
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, StreamingResponse
# Logging
# Configure root logging once at import time: INFO level, timestamped records
# that include the logger name.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)

# Module-wide logger used by every helper in this proxy.
logger = logging.getLogger("disagg_proxy")
# Data structures
@dataclass
class CachedKVEntry:
    """A single cached set of KV transfer parameters taken from D's response.

    The creation time is recorded so the owning cache can decide whether the
    entry has outlived its TTL.
    """

    # The kv_transfer_params dict as captured for one finished turn.
    kv_transfer_params: dict[str, Any]
    # Wall-clock creation time (``time.time()``), filled in when not supplied.
    timestamp: float = field(default_factory=time.time)
class ConversationKVCache:
    """Per-conversation KV block cache.

    Each conversation is identified by a ``conversation_id`` supplied by
    the client. After D finishes a turn, its ``kv_transfer_params`` are
    stored here. On the next turn, the proxy retrieves them so P can
    read D's blocks via bidirectional KV transfer.

    Entries are single-use (``get`` removes them) and expire after
    ``ttl_seconds``.
    """

    def __init__(self, ttl_seconds: float = 600.0) -> None:
        """Create an empty cache whose entries expire after ``ttl_seconds``."""
        self._store: dict[str, CachedKVEntry] = {}
        self._ttl = ttl_seconds

    @staticmethod
    def _count_blocks(kv_params: dict[str, Any]) -> int:
        """Return a block count for log output; never raises on odd shapes.

        ``remote_block_ids`` is handled both as a list of per-group lists
        (the first group's length is reported) and as a flat list of block
        ids (the list's own length is reported). Anything else counts as 0.
        """
        block_ids = kv_params.get("remote_block_ids")
        if not isinstance(block_ids, (list, tuple)) or not block_ids:
            return 0
        first = block_ids[0]
        if isinstance(first, (list, tuple)):
            return len(first)
        return len(block_ids)

    def get(self, conversation_id: str) -> dict[str, Any] | None:
        """Retrieve and consume cached KV params for a conversation.

        Returns a shallow *copy* of the kv_transfer_params dict, or None when
        there is no entry or the entry is older than the TTL.
        The entry is removed after retrieval (single-use), even when stale.
        """
        entry = self._store.pop(conversation_id, None)
        if entry is None:
            return None
        age = time.time() - entry.timestamp
        if age > self._ttl:
            logger.info(
                "conv=%s: stale cache entry (age=%.1fs > ttl=%.1fs), discarding",
                conversation_id,
                age,
                self._ttl,
            )
            return None
        logger.info(
            "conv=%s: cache HIT (age=%.1fs)",
            conversation_id,
            age,
        )
        return dict(entry.kv_transfer_params)

    def put(self, conversation_id: str, kv_params: dict[str, Any]) -> None:
        """Store D's kv_transfer_params for a conversation.

        Any previous entry for the same conversation is replaced and the
        timestamp is reset to the current time.
        """
        self._store[conversation_id] = CachedKVEntry(
            kv_transfer_params=dict(kv_params),  # defensive copy
        )
        # The block count is informational only, so it is computed by a helper
        # that tolerates any shape of ``remote_block_ids``.
        logger.info(
            "conv=%s: cached D blocks (remote_request_id=%s, blocks=%d)",
            conversation_id,
            kv_params.get("remote_request_id", "?"),
            self._count_blocks(kv_params),
        )

    def evict_stale(self) -> int:
        """Remove entries older than TTL. Returns count of evicted entries."""
        now = time.time()
        # Collect keys first so the dict is not mutated while iterating it.
        stale = [
            cid
            for cid, entry in self._store.items()
            if now - entry.timestamp > self._ttl
        ]
        for cid in stale:
            del self._store[cid]
        return len(stale)

    @property
    def size(self) -> int:
        """Number of entries currently held (including not-yet-evicted stale ones)."""
        return len(self._store)
# Global state
# Process-wide conversation cache shared by all request handlers.
# The TTL must stay below VLLM_NIXL_ABORT_REQUEST_TIMEOUT (480s).
kv_cache = ConversationKVCache(ttl_seconds=450.0)
# Service client helpers
@dataclass
class ServiceClient:
    """Bundle an ``httpx.AsyncClient`` with the identity of the P or D instance it targets."""

    # HTTP client used for every request to this instance.
    client: httpx.AsyncClient
    # Host name or address of the instance.
    host: str
    # TCP port of the instance.
    port: int
    # Index of the instance within its (prefill or decode) list.
    id: int
def _make_headers(request_id: str) -> dict[str, str]:
    """Return the headers sent with every upstream request.

    ``X-Request-Id`` is always present. A bearer ``Authorization`` header is
    added only when the ``OPENAI_API_KEY`` environment variable is non-empty.
    """
    token = os.environ.get("OPENAI_API_KEY")
    if not token:
        return {"X-Request-Id": request_id}
    return {
        "X-Request-Id": request_id,
        "Authorization": f"Bearer {token}",
    }
async def _send_to_prefill(
    client: ServiceClient,
    endpoint: str,
    req_data: dict[str, Any],
    request_id: str,
) -> dict[str, Any]:
    """POST a one-token, non-streaming copy of the request to a prefill instance.

    The caller's ``req_data`` is not modified. In the copy, ``stream`` is
    forced to False, ``max_tokens`` to 1, and ``max_completion_tokens``,
    ``min_tokens`` and ``stream_options`` are dropped.

    Returns the parsed JSON body of P's response. Raises
    ``httpx.HTTPStatusError`` on a non-success status.
    """
    # Fields that conflict with a single-token, non-streaming request.
    dropped_keys = ("max_completion_tokens", "min_tokens", "stream_options")
    prefill_payload = {
        key: value for key, value in req_data.items() if key not in dropped_keys
    }
    prefill_payload["stream"] = False
    prefill_payload["max_tokens"] = 1

    upstream_headers = _make_headers(request_id)
    response = await client.client.post(
        endpoint,
        json=prefill_payload,
        headers=upstream_headers,
    )
    response.raise_for_status()
    return response.json()
def _choice_text(choice: dict[str, Any]) -> str:
    """Return the text carried by one streamed choice.

    Covers both the completions shape (``text``) and the chat shape
    (``delta.content``). Missing or ``null`` fields contribute an empty string.
    """
    delta = choice.get("delta") or {}
    return (choice.get("text") or "") + (delta.get("content") or "")


async def _stream_from_decode(
    client: ServiceClient,
    endpoint: str,
    req_data: dict[str, Any],
    request_id: str,
    conversation_id: str,
) -> tuple[str, str | None, dict[str, Any] | None, str, str | None, int | None]:
    """Stream response from D, capturing text and kv_transfer_params.

    The request is always sent with ``stream=True`` (on a copy of
    ``req_data``) and the SSE ``data:`` lines are consumed until ``[DONE]``.

    Returns ``(collected_text, finish_reason, kv_params, response_id,
    model_name, created)``. ``response_id`` falls back to ``request_id`` when
    no chunk carried an ``id``.

    Side effect: any ``kv_transfer_params`` found in a chunk is tagged with
    ``remote_host = client.host`` and, when ``conversation_id`` is non-empty,
    stored in the conversation cache.

    Raises ``httpx.HTTPStatusError`` on a non-success status from D.
    """
    payload = req_data.copy()
    payload["stream"] = True

    text_parts: list[str] = []
    finish_reason: str | None = None
    response_id: str | None = None
    model_name: str | None = None
    created: int | None = None
    captured_kv: dict[str, Any] | None = None

    async with client.client.stream(
        "POST",
        endpoint,
        json=payload,
        headers=_make_headers(request_id),
    ) as resp:
        resp.raise_for_status()
        async for line in resp.aiter_lines():
            if not line or not line.startswith("data: "):
                continue
            if line == "data: [DONE]":
                break
            try:
                chunk = json.loads(line[6:])  # strip the "data: " prefix
            except json.JSONDecodeError:
                continue
            # Valid JSON that is not an object cannot be a completion chunk.
            if not isinstance(chunk, dict):
                continue

            # Response metadata is taken from the first parsed chunk only.
            if response_id is None:
                response_id = chunk.get("id")
                model_name = chunk.get("model")
                created = chunk.get("created")

            for choice in chunk.get("choices") or []:
                text_parts.append(_choice_text(choice))
                if choice.get("finish_reason"):
                    finish_reason = choice["finish_reason"]

            kv_params = chunk.get("kv_transfer_params")
            if kv_params:
                kv_params["remote_host"] = client.host
                captured_kv = kv_params
                if conversation_id:
                    kv_cache.put(conversation_id, kv_params)

    return (
        "".join(text_parts),
        finish_reason,
        captured_kv,
        response_id or request_id,
        model_name,
        created,
    )
async def _stream_from_decode_sse(
    client: ServiceClient,
    endpoint: str,
    req_data: dict[str, Any],
    request_id: str,
    conversation_id: str,
):
    """Yield SSE chunks from D to the client, capturing kv_transfer_params.

    Every line received from D is re-emitted with a trailing newline (blank
    lines become a bare ``"\\n"``), so the client sees D's stream unchanged.
    When ``conversation_id`` is non-empty, each ``data:`` line is also parsed
    and any ``kv_transfer_params`` it carries is tagged with
    ``remote_host = client.host`` and stored in the conversation cache.

    Raises ``httpx.HTTPStatusError`` on a non-success status from D.
    """
    payload = req_data.copy()
    payload["stream"] = True
    async with client.client.stream(
        "POST",
        endpoint,
        json=payload,
        headers=_make_headers(request_id),
    ) as resp:
        resp.raise_for_status()
        async for line in resp.aiter_lines():
            if not line:
                yield "\n"
                continue
            # Parsing is only needed when there is a conversation to cache under.
            if (
                conversation_id
                and line.startswith("data: ")
                and line != "data: [DONE]"
            ):
                try:
                    chunk = json.loads(line[6:])  # strip the "data: " prefix
                except json.JSONDecodeError:
                    chunk = None
                # KV capture is best-effort: an unexpected chunk shape must not
                # break the stream being relayed to the client.
                kv_params = (
                    chunk.get("kv_transfer_params")
                    if isinstance(chunk, dict)
                    else None
                )
                if kv_params and isinstance(kv_params, dict):
                    kv_params["remote_host"] = client.host
                    kv_cache.put(conversation_id, kv_params)
            yield line + "\n"
# FastAPI application
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Initialize HTTP clients for P and D instances and close them on shutdown.

    Reads the ``(host, port)`` pairs from ``global_args.prefiller_instances``
    and ``global_args.decoder_instances``, stores the resulting
    ``ServiceClient`` lists and their round-robin index iterators on
    ``app.state``, and closes every client when the application stops,
    including when shutdown is triggered by an exception.
    """

    def _connect(instances) -> list[ServiceClient]:
        # One client per instance; timeout=None disables httpx timeouts.
        return [
            ServiceClient(
                client=httpx.AsyncClient(
                    timeout=None,
                    base_url=f"http://{host}:{port}/v1",
                ),
                host=host,
                port=port,
                id=idx,
            )
            for idx, (host, port) in enumerate(instances)
        ]

    app.state.prefill_clients = _connect(global_args.prefiller_instances)
    app.state.decode_clients = _connect(global_args.decoder_instances)
    # Endless index iterators used for round-robin instance selection.
    app.state.prefill_iter = itertools.cycle(range(len(app.state.prefill_clients)))
    app.state.decode_iter = itertools.cycle(range(len(app.state.decode_clients)))
    logger.info(
        "Ready: %d prefill, %d decode instances",
        len(app.state.prefill_clients),
        len(app.state.decode_clients),
    )
    try:
        yield
    finally:
        # Runs on both normal shutdown and error so connections are not leaked.
        for sc in app.state.prefill_clients + app.state.decode_clients:
            await sc.client.aclose()
# The ASGI application; client setup and teardown are delegated to `lifespan`.
app = FastAPI(
    lifespan=lifespan,
    title="Disaggregated P/D Proxy (Multi-turn)",
)
def _next_client(app_state, role: str) -> ServiceClient:
    """Pick the next instance for ``role`` in round-robin order.

    ``role == "prefill"`` selects from the prefill instances; any other value
    selects from the decode instances. Advances the matching index iterator
    stored on ``app_state``.
    """
    if role != "prefill":
        pool, cursor = app_state.decode_clients, app_state.decode_iter
    else:
        pool, cursor = app_state.prefill_clients, app_state.prefill_iter
    return pool[next(cursor)]
# Request handler
def _build_response_body(
    is_chat: bool,
    resp_id: str,
    created: int,
    model: str,
    text: str,
    finish_reason: str | None,
) -> dict[str, Any]:
    """Assemble the non-streaming OpenAI-style response dict for one choice."""
    if is_chat:
        object_type = "chat.completion"
        choice = {
            "index": 0,
            "message": {"role": "assistant", "content": text},
            "finish_reason": finish_reason,
        }
    else:
        object_type = "text_completion"
        choice = {
            "index": 0,
            "text": text,
            "logprobs": None,
            "finish_reason": finish_reason,
        }
    return {
        "id": resp_id,
        "object": object_type,
        "created": created,
        "model": model,
        "choices": [choice],
        "usage": None,
    }


async def _handle_request(api_path: str, request: Request):
    """Core request handler for both /v1/chat/completions and /v1/completions.

    Flow:
      1. Pop ``conversation_id`` from the body and look up cached D block
         info for it (the lookup consumes the entry).
      2. Send the request to a prefill instance with ``kv_transfer_params``
         set to either the cached D info or an empty "no remote blocks" form.
      3. Replace ``kv_transfer_params`` with the ones P returned (tagged with
         P's host) and forward the request to a decode instance.

    Returns a ``StreamingResponse`` relaying D's SSE stream when the client
    asked for ``stream``; otherwise a ``JSONResponse`` built from the text
    collected from D. The chat or completion shape is chosen by whether the
    body contains ``messages``.

    Errors from upstream calls (e.g. ``httpx.HTTPStatusError``) propagate.
    """
    req_data = await request.json()
    request_id = str(uuid.uuid4())
    # conversation_id is consumed by the proxy and not forwarded upstream.
    conversation_id: str = req_data.pop("conversation_id", "")
    client_wants_stream = req_data.get("stream", False)
    if not conversation_id:
        logger.warning(
            "[%s] No conversation_id provided — KV cache reuse disabled "
            "for this request. Add a 'conversation_id' field to enable "
            "cross-turn KV sharing.",
            request_id,
        )

    # Step 1: Look up cached D blocks from the previous turn
    cached_kv = kv_cache.get(conversation_id) if conversation_id else None
    if cached_kv:
        # Tell P to read D's blocks (bidirectional transfer)
        cached_kv["do_remote_decode"] = True
        cached_kv["do_remote_prefill"] = False
        req_data["kv_transfer_params"] = cached_kv
        logger.info(
            "[%s] conv=%s: sending D's cached blocks to P (remote_request_id=%s)",
            request_id,
            conversation_id,
            cached_kv.get("remote_request_id"),
        )
    else:
        # No cached blocks — P recomputes from scratch
        req_data["kv_transfer_params"] = {
            "do_remote_decode": True,
            "do_remote_prefill": False,
            "remote_engine_id": None,
            "remote_block_ids": None,
            "remote_host": None,
            "remote_port": None,
        }
        logger.info("[%s] conv=%s: cache MISS", request_id, conversation_id)

    # Step 2: Send to Prefill node (non-streaming, max_tokens=1)
    prefill_client = _next_client(request.app.state, "prefill")
    # Monotonic timer: elapsed time must not be skewed by wall-clock changes.
    started = time.perf_counter()
    prefill_resp = await _send_to_prefill(
        prefill_client,
        api_path,
        req_data,
        request_id,
    )
    logger.info(
        "[%s] Prefill done in %.0fms",
        request_id,
        (time.perf_counter() - started) * 1000,
    )

    # Attach P's kv_transfer_params for D to read P's blocks
    # NOTE(review): when P returns no kv_transfer_params, the params prepared
    # for P above are forwarded to D unchanged — confirm this is intended.
    p_kv_params = prefill_resp.get("kv_transfer_params", {})
    if p_kv_params:
        p_kv_params["remote_host"] = prefill_client.host
        req_data["kv_transfer_params"] = p_kv_params

    # Step 3: Stream from Decode node, capturing kv_transfer_params
    decode_client = _next_client(request.app.state, "decode")
    if client_wants_stream:
        return StreamingResponse(
            _stream_from_decode_sse(
                decode_client,
                api_path,
                req_data,
                request_id,
                conversation_id,
            ),
            media_type="text/event-stream",
        )

    text, finish_reason, _, resp_id, model, created = await _stream_from_decode(
        decode_client,
        api_path,
        req_data,
        request_id,
        conversation_id,
    )

    # Build OpenAI-compatible response
    body = _build_response_body(
        "messages" in req_data,
        resp_id,
        created or int(time.time()),
        model or req_data.get("model", ""),
        text,
        finish_reason,
    )
    return JSONResponse(content=body)
# Routes
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Route chat-completion requests through the shared P/D handler."""
    response = await _handle_request("/chat/completions", request)
    return response
@app.post("/v1/completions")
async def completions(request: Request):
    """Route text-completion requests through the shared P/D handler."""
    response = await _handle_request("/completions", request)
    return response
@app.get("/health")
async def health():
    """Report liveness and cache statistics, evicting stale cache entries first.

    ``cached_conversations`` is the cache size measured after the eviction
    performed by this call; ``evicted_stale`` is how many entries it removed.
    """
    evicted_count = kv_cache.evict_stale()
    report = {"status": "ok"}
    report["cached_conversations"] = kv_cache.size
    report["evicted_stale"] = evicted_count
    return report
# CLI
def parse_args() -> argparse.Namespace:
    """Parse the proxy's command-line options from ``sys.argv``.

    Besides the declared options, the returned namespace carries
    ``prefiller_instances`` and ``decoder_instances``: lists of
    ``(host, port)`` tuples paired positionally. Exits with a usage error
    when a host list and its port list differ in length.
    """
    parser = argparse.ArgumentParser(
        description="Disaggregated P/D proxy with bidirectional KV transfer",
    )
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--port", type=int, default=8000)

    # Singular and plural spellings are aliases for the same list-valued option.
    parser.add_argument(
        "--prefiller-host",
        "--prefiller-hosts",
        dest="prefiller_hosts",
        nargs="+",
        default=["localhost"],
    )
    parser.add_argument(
        "--prefiller-port",
        "--prefiller-ports",
        dest="prefiller_ports",
        type=int,
        nargs="+",
        default=[8100],
    )
    parser.add_argument(
        "--decoder-host",
        "--decoder-hosts",
        dest="decoder_hosts",
        nargs="+",
        default=["localhost"],
    )
    parser.add_argument(
        "--decoder-port",
        "--decoder-ports",
        dest="decoder_ports",
        type=int,
        nargs="+",
        default=[8200],
    )

    parsed = parser.parse_args()

    # Hosts and ports are paired by position, so the lists must line up.
    length_checks = (
        (
            parsed.prefiller_hosts,
            parsed.prefiller_ports,
            "Number of prefiller hosts must match ports",
        ),
        (
            parsed.decoder_hosts,
            parsed.decoder_ports,
            "Number of decoder hosts must match ports",
        ),
    )
    for hosts, ports, message in length_checks:
        if len(hosts) != len(ports):
            parser.error(message)

    parsed.prefiller_instances = list(
        zip(parsed.prefiller_hosts, parsed.prefiller_ports)
    )
    parsed.decoder_instances = list(zip(parsed.decoder_hosts, parsed.decoder_ports))
    return parsed
if __name__ == "__main__":
    # Script entry point. `global_args` is bound at module scope here (no
    # `global` statement is needed at top level) and is read by `lifespan`
    # when the application starts.
    global_args = parse_args()

    # Imported after argument parsing, and only when run as a script.
    import uvicorn

    uvicorn.run(app, host=global_args.host, port=global_args.port)