Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 84b667ca7a | |||
| 29657106fc |
@@ -11,7 +11,7 @@ Features:
|
|||||||
- ✅ Easy to config: single yaml file
|
- ✅ Easy to config: single yaml file
|
||||||
- ✅ On-demand model switching
|
- ✅ On-demand model switching
|
||||||
- ✅ Full control over server settings per model
|
- ✅ Full control over server settings per model
|
||||||
- ✅ OpenAI API support (`v1/completions` and `v1/chat/completions`)
|
- ✅ OpenAI API support (`v1/completions`, `v1/chat/completions`, `v1/embeddings` and `v1/rerank`)
|
||||||
- ✅ Multiple GPU support
|
- ✅ Multiple GPU support
|
||||||
- ✅ Run multiple models at once with `profiles`
|
- ✅ Run multiple models at once with `profiles`
|
||||||
- ✅ Remote log monitoring at `/log`
|
- ✅ Remote log monitoring at `/log`
|
||||||
@@ -39,6 +39,9 @@ llama-swap's configuration is purposefully simple.
|
|||||||
# Default (and minimum) is 15 seconds
|
# Default (and minimum) is 15 seconds
|
||||||
healthCheckTimeout: 60
|
healthCheckTimeout: 60
|
||||||
|
|
||||||
|
# Write HTTP logs (useful for troubleshooting), defaults to false
|
||||||
|
logRequests: true
|
||||||
|
|
||||||
# define valid model values and the upstream server start
|
# define valid model values and the upstream server start
|
||||||
models:
|
models:
|
||||||
"llama":
|
"llama":
|
||||||
@@ -92,7 +95,7 @@ profiles:
|
|||||||
- "llama"
|
- "llama"
|
||||||
```
|
```
|
||||||
|
|
||||||
**Guides and examples**
|
**Advanced examples**
|
||||||
|
|
||||||
- [config.example.yaml](config.example.yaml) includes an example for supporting `v1/embeddings` and `v1/rerank` endpoints
|
- [config.example.yaml](config.example.yaml) includes an example for supporting `v1/embeddings` and `v1/rerank` endpoints
|
||||||
- [Speculative Decoding](examples/speculative-decoding/README.md) - using a small draft model can increase inference speeds by 20% to 40%. This example includes configurations for Qwen2.5-Coder-32B (2.5x increase) and Llama-3.1-70B (1.4x increase) in the best cases.
|
- [Speculative Decoding](examples/speculative-decoding/README.md) - using a small draft model can increase inference speeds by 20% to 40%. This example includes configurations for Qwen2.5-Coder-32B (2.5x increase) and Llama-3.1-70B (1.4x increase) in the best cases.
|
||||||
|
|||||||
@@ -2,6 +2,9 @@
|
|||||||
# Default (and minimum): 15 seconds
|
# Default (and minimum): 15 seconds
|
||||||
healthCheckTimeout: 15
|
healthCheckTimeout: 15
|
||||||
|
|
||||||
|
# Log HTTP requests (helpful for troubleshooting), defaults to false
|
||||||
|
logRequests: true
|
||||||
|
|
||||||
models:
|
models:
|
||||||
"llama":
|
"llama":
|
||||||
cmd: >
|
cmd: >
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ func (m *ModelConfig) SanitizedCommand() ([]string, error) {
|
|||||||
|
|
||||||
type Config struct {
|
type Config struct {
|
||||||
HealthCheckTimeout int `yaml:"healthCheckTimeout"`
|
HealthCheckTimeout int `yaml:"healthCheckTimeout"`
|
||||||
|
LogRequests bool `yaml:"logRequests"`
|
||||||
Models map[string]ModelConfig `yaml:"models"`
|
Models map[string]ModelConfig `yaml:"models"`
|
||||||
Profiles map[string][]string `yaml:"profiles"`
|
Profiles map[string][]string `yaml:"profiles"`
|
||||||
|
|
||||||
|
|||||||
+50
-7
@@ -46,6 +46,38 @@ func New(config *Config) *ProxyManager {
|
|||||||
ginEngine: gin.New(),
|
ginEngine: gin.New(),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if config.LogRequests {
|
||||||
|
pm.ginEngine.Use(func(c *gin.Context) {
|
||||||
|
// Start timer
|
||||||
|
start := time.Now()
|
||||||
|
|
||||||
|
// Process request
|
||||||
|
c.Next()
|
||||||
|
|
||||||
|
// Stop timer
|
||||||
|
duration := time.Since(start)
|
||||||
|
|
||||||
|
// Log request details
|
||||||
|
clientIP := c.ClientIP()
|
||||||
|
method := c.Request.Method
|
||||||
|
path := c.Request.URL.Path
|
||||||
|
statusCode := c.Writer.Status()
|
||||||
|
bodySize := c.Writer.Size()
|
||||||
|
|
||||||
|
fmt.Fprintf(pm.logMonitor, "[llama-swap] %s [%s] \"%s %s %s\" %d %d \"%s\" %v\n",
|
||||||
|
clientIP,
|
||||||
|
time.Now().Format("2006-01-02 15:04:05"),
|
||||||
|
method,
|
||||||
|
path,
|
||||||
|
c.Request.Proto,
|
||||||
|
statusCode,
|
||||||
|
bodySize,
|
||||||
|
c.Request.UserAgent(),
|
||||||
|
duration,
|
||||||
|
)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
// Set up routes using the Gin engine
|
// Set up routes using the Gin engine
|
||||||
pm.ginEngine.POST("/v1/chat/completions", pm.proxyOAIHandler)
|
pm.ginEngine.POST("/v1/chat/completions", pm.proxyOAIHandler)
|
||||||
// Support legacy /v1/completions api, see issue #12
|
// Support legacy /v1/completions api, see issue #12
|
||||||
@@ -127,7 +159,7 @@ func (pm *ProxyManager) listModelsHandler(c *gin.Context) {
|
|||||||
|
|
||||||
// Encode the data as JSON and write it to the response writer
|
// Encode the data as JSON and write it to the response writer
|
||||||
if err := json.NewEncoder(c.Writer).Encode(map[string]interface{}{"data": data}); err != nil {
|
if err := json.NewEncoder(c.Writer).Encode(map[string]interface{}{"data": data}); err != nil {
|
||||||
c.AbortWithError(http.StatusInternalServerError, fmt.Errorf("error encoding JSON"))
|
pm.sendErrorResponse(c, http.StatusInternalServerError, fmt.Sprintf("error encoding JSON %s", err.Error()))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -197,12 +229,12 @@ func (pm *ProxyManager) proxyToUpstream(c *gin.Context) {
|
|||||||
requestedModel := c.Param("model_id")
|
requestedModel := c.Param("model_id")
|
||||||
|
|
||||||
if requestedModel == "" {
|
if requestedModel == "" {
|
||||||
c.AbortWithError(http.StatusBadRequest, fmt.Errorf("model id required in path"))
|
pm.sendErrorResponse(c, http.StatusBadRequest, "model id required in path")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
if process, err := pm.swapModel(requestedModel); err != nil {
|
if process, err := pm.swapModel(requestedModel); err != nil {
|
||||||
c.AbortWithError(http.StatusNotFound, fmt.Errorf("unable to swap to model, %s", err.Error()))
|
pm.sendErrorResponse(c, http.StatusNotFound, fmt.Sprintf("unable to swap to model, %s", err.Error()))
|
||||||
} else {
|
} else {
|
||||||
// rewrite the path
|
// rewrite the path
|
||||||
c.Request.URL.Path = c.Param("upstreamPath")
|
c.Request.URL.Path = c.Param("upstreamPath")
|
||||||
@@ -238,22 +270,23 @@ func (pm *ProxyManager) upstreamIndex(c *gin.Context) {
|
|||||||
func (pm *ProxyManager) proxyOAIHandler(c *gin.Context) {
|
func (pm *ProxyManager) proxyOAIHandler(c *gin.Context) {
|
||||||
bodyBytes, err := io.ReadAll(c.Request.Body)
|
bodyBytes, err := io.ReadAll(c.Request.Body)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
c.AbortWithError(http.StatusBadRequest, fmt.Errorf("invalid JSON"))
|
pm.sendErrorResponse(c, http.StatusBadRequest, "could not ready request body")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
var requestBody map[string]interface{}
|
var requestBody map[string]interface{}
|
||||||
if err := json.Unmarshal(bodyBytes, &requestBody); err != nil {
|
if err := json.Unmarshal(bodyBytes, &requestBody); err != nil {
|
||||||
c.AbortWithError(http.StatusBadRequest, fmt.Errorf("invalid JSON"))
|
pm.sendErrorResponse(c, http.StatusBadRequest, fmt.Sprintf("invalid JSON: %s", err.Error()))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
model, ok := requestBody["model"].(string)
|
model, ok := requestBody["model"].(string)
|
||||||
if !ok {
|
if !ok {
|
||||||
c.AbortWithError(http.StatusBadRequest, fmt.Errorf("missing or invalid 'model' key"))
|
pm.sendErrorResponse(c, http.StatusBadRequest, "missing or invalid 'model' key")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
if process, err := pm.swapModel(model); err != nil {
|
if process, err := pm.swapModel(model); err != nil {
|
||||||
c.AbortWithError(http.StatusNotFound, fmt.Errorf("unable to swap to model, %s", err.Error()))
|
pm.sendErrorResponse(c, http.StatusNotFound, fmt.Sprintf("unable to swap to model, %s", err.Error()))
|
||||||
return
|
return
|
||||||
} else {
|
} else {
|
||||||
c.Request.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
|
c.Request.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
|
||||||
@@ -266,6 +299,16 @@ func (pm *ProxyManager) proxyOAIHandler(c *gin.Context) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (pm *ProxyManager) sendErrorResponse(c *gin.Context, statusCode int, message string) {
|
||||||
|
acceptHeader := c.GetHeader("Accept")
|
||||||
|
|
||||||
|
if strings.Contains(acceptHeader, "application/json") {
|
||||||
|
c.JSON(statusCode, gin.H{"error": message})
|
||||||
|
} else {
|
||||||
|
c.String(statusCode, message)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func ProcessKeyName(groupName, modelName string) string {
|
func ProcessKeyName(groupName, modelName string) string {
|
||||||
return groupName + PROFILE_SPLIT_CHAR + modelName
|
return groupName + PROFILE_SPLIT_CHAR + modelName
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user