diff --git a/README.md b/README.md index 60a82b0b1..0a5fe3b50 100644 --- a/README.md +++ b/README.md @@ -72,11 +72,30 @@ workspace-phi-3-5-mini Standard_NC12s_v3 True True Next, one can find the inference service's cluster ip and use a temporal `curl` pod to test the service endpoint in the cluster. ```sh +# find service endpoint $ kubectl get svc workspace-phi-3-5-mini NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE workspace-phi-3-5-mini ClusterIP 80/TCP,29500/TCP 10m - -export CLUSTERIP=$(kubectl get svc workspace-phi-3-5-mini -o jsonpath="{.spec.clusterIPs[0]}") +$ export CLUSTERIP=$(kubectl get svc workspace-phi-3-5-mini -o jsonpath="{.spec.clusterIPs[0]}") + +# find available models +$ kubectl run -it --rm --restart=Never curl --image=curlimages/curl -- curl -s http://$CLUSTERIP/v1/models | jq +{ + "object": "list", + "data": [ + { + "id": "phi-3.5-mini-instruct", + "object": "model", + "created": 1733370094, + "owned_by": "vllm", + "root": "/workspace/vllm/weights", + "parent": null, + "max_model_len": 16384 + } + ] +} + +# make an inference call using the model id (phi-3.5-mini-instruct) from previous step $ kubectl run -it --rm --restart=Never curl --image=curlimages/curl -- curl -X POST http://$CLUSTERIP/v1/completions \ -H "Content-Type: application/json" \ -d '{