---
# KServe InferenceService serving a Hugging Face model on one GPU.
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: my-llm-service
  annotations:
    # Knative autoscaling bounds for the predictor revision:
    # keep at least 2 replicas warm, never scale past 20.
    autoscaling.knative.dev/min-scale: "2"
    autoscaling.knative.dev/max-scale: "20"
spec:
  predictor:
    model:
      modelFormat:
        name: huggingface
      # NOTE(review): Meta-Llama-3 is a gated HF repo — presumably a
      # Hugging Face token secret must be configured; verify in the cluster.
      storageUri: "hf://meta-llama/Meta-Llama-3-8B-Instruct"
      resources:
        limits:
          nvidia.com/gpu: "1"
        requests:
          nvidia.com/gpu: "1"
---
# Istio VirtualService adding retry behavior in front of the LLM service.
apiVersion: networking.istio.io/v1beta1
kind: VirtualService
metadata:
  name: llm-virtualservice
spec:
  hosts:
    - my-llm-service
  http:
    - retries:
        attempts: 3
        # NOTE(review): 5s per try may be too tight for LLM generation
        # latency — confirm against observed p99 before relying on it.
        perTryTimeout: 5s
        retryOn: gateway-error,connect-failure,retriable-4xx
      route:
        - destination:
            host: my-llm-service
---
apiVersion: networking.istio.io/v1beta1
# Istio DestinationRule: connection-pool limits plus outlier detection
# (circuit breaking) for traffic to the LLM service.
kind: DestinationRule
metadata:
  name: llm-circuit-breaker
spec:
  host: my-llm-service
  trafficPolicy:
    connectionPool:
      http:
        # Queue at most 100 pending HTTP/1.1 requests before rejecting.
        http1MaxPendingRequests: 100
        # Recycle connections after 10 requests to spread load.
        maxRequestsPerConnection: 10
    outlierDetection:
      # Eject an endpoint for 30s after 5 consecutive 5xx responses,
      # re-evaluating every 10s.
      consecutive5xxErrors: 5
      interval: 10s
      baseEjectionTime: 30s
---
apiVersion: kyverno.io/v1
# Kyverno policy: reject inference Pods that omit an explicit GPU limit.
kind: ClusterPolicy
metadata:
  name: require-gpu-limits-for-inference
spec:
  # Enforce = block non-compliant resources (vs. Audit = report only).
  validationFailureAction: Enforce
  rules:
    - name: check-gpu-limits
      match:
        any:
          - resources:
              kinds:
                - Pod
              selector:
                matchLabels:
                  app.kubernetes.io/component: inference
      validate:
        message: "Inference pods must explicitly set nvidia.com/gpu in resources.limits"
        pattern:
          spec:
            containers:
              # "?*" = any non-empty value; every container in the Pod
              # must declare a nvidia.com/gpu limit.
              - resources:
                  limits:
                    nvidia.com/gpu: "?*"
---
apiVersion: monitoring.coreos.com/v1
# Prometheus-operator ServiceMonitor scraping the InferenceService metrics.
kind: ServiceMonitor
metadata:
  name: kserve-inference-monitor
  namespace: monitoring
  labels:
    release: prometheus  # must match the selector in your Prometheus CR
spec:
  namespaceSelector:
    matchNames:
      - default  # namespace where the InferenceService lives
  selector:
    matchLabels:
      serving.kserve.io/inferenceservice: my-llm-service
  endpoints:
    # NOTE(review): assumes the Service exposes a port named "http"
    # serving /metrics — confirm against the generated Service.
    - port: http
      path: /metrics
      interval: 15s