# KFServing 0.2.2 installation manifest.
#
# Documents, in order:
#   1. Namespace                 kfserving-system
#   2. CustomResourceDefinition  inferenceservices.serving.kubeflow.org
#   3. ClusterRole               kfserving-proxy-role
#   4. ClusterRole               manager-role
#   5. ClusterRoleBinding        kfserving-proxy-rolebinding
#   6. ClusterRoleBinding        manager-rolebinding
#   7. ConfigMap                 inferenceservice-config
#   8. Secret                    kfserving-webhook-server-secret
#   9. Service                   kfserving-controller-manager-metrics-service
#  10. Service                   kfserving-controller-manager-service
#  11. StatefulSet               kfserving-controller-manager

# Namespace that holds every namespaced object in this manifest.
apiVersion: v1
kind: Namespace
metadata:
  labels:
    control-plane: kfserving-controller-manager
    controller-tools.k8s.io: "1.0"
    istio-injection: disabled
  name: kfserving-system
---
# InferenceService custom resource (group serving.kubeflow.org, v1alpha2).
# The schema describes two endpoint specs, `default` (required) and `canary`,
# each with the same explainer / predictor / transformer structure.
# NOTE(review): apiextensions.k8s.io/v1beta1 is a deprecated CRD API; confirm
# the target cluster version still serves it before applying.
apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition
metadata:
  creationTimestamp: null
  labels:
    controller-tools.k8s.io: "1.0"
  name: inferenceservices.serving.kubeflow.org
spec:
  additionalPrinterColumns:
  - JSONPath: .status.url
    name: URL
    type: string
  - JSONPath: .status.conditions[?(@.type=='Ready')].status
    name: Ready
    type: string
  - JSONPath: .status.traffic
    name: Default Traffic
    type: integer
  - JSONPath: .status.canaryTraffic
    name: Canary Traffic
    type: integer
  - JSONPath: .metadata.creationTimestamp
    name: Age
    type: date
  group: serving.kubeflow.org
  names:
    kind: InferenceService
    plural: inferenceservices
    shortNames:
    - inferenceservice
  scope: Namespaced
  validation:
    openAPIV3Schema:
      properties:
        apiVersion:
          description: 'APIVersion defines the versioned schema of this representation
            of an object. Servers should convert recognized schemas to the latest
            internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources'
          type: string
        kind:
          description: 'Kind is a string value representing the REST resource this
            object represents. Servers may infer this from the endpoint the client
            submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds'
          type: string
        metadata:
          type: object
        spec:
          properties:
            canary:
              description: Canary defines an alternate endpoints to route a percentage
                of traffic.
              properties:
                explainer:
                  description: Explainer defines the model explanation service spec,
                    explainer service calls to predictor or transformer if it is
                    specified.
                  properties:
                    alibi:
                      description: Spec for alibi explainer
                      properties:
                        config:
                          description: Inline custom parameter settings for explainer
                          type: object
                        resources:
                          description: Defaults to requests and limits of 1CPU, 2Gb
                            MEM.
                          type: object
                        runtimeVersion:
                          description: Defaults to latest Alibi Version
                          type: string
                        storageUri:
                          description: The location of a trained explanation model
                          type: string
                        type:
                          description: The type of Alibi explainer
                          type: string
                      required:
                      - type
                      type: object
                    custom:
                      description: Spec for a custom explainer
                      properties:
                        container:
                          type: object
                      required:
                      - container
                      type: object
                    logger:
                      description: Activate request/response logging
                      properties:
                        mode:
                          description: What payloads to log
                          type: string
                        url:
                          description: URL to send request logging CloudEvents
                          type: string
                      type: object
                    maxReplicas:
                      description: This is the up bound for autoscaler to scale to
                      format: int64
                      type: integer
                    minReplicas:
                      description: Minimum number of replicas, pods won't scale down
                        to 0 in case of no traffic
                      format: int64
                      type: integer
                    parallelism:
                      description: Parallelism, how many requests can be processed
                        concurrently
                      format: int64
                      type: integer
                    serviceAccountName:
                      description: ServiceAccountName is the name of the ServiceAccount
                        to use to run the service
                      type: string
                  type: object
                predictor:
                  description: Predictor defines the model serving spec +required
                  properties:
                    custom:
                      description: Spec for a custom predictor
                      properties:
                        container:
                          type: object
                      required:
                      - container
                      type: object
                    logger:
                      description: Activate request/response logging
                      properties:
                        mode:
                          description: What payloads to log
                          type: string
                        url:
                          description: URL to send request logging CloudEvents
                          type: string
                      type: object
                    maxReplicas:
                      description: This is the up bound for autoscaler to scale to
                      format: int64
                      type: integer
                    minReplicas:
                      description: Minimum number of replicas, pods won't scale down
                        to 0 in case of no traffic
                      format: int64
                      type: integer
                    onnx:
                      description: Spec for ONNX runtime
                        (https://github.com/microsoft/onnxruntime)
                      properties:
                        resources:
                          description: Defaults to requests and limits of 1CPU, 2Gb
                            MEM.
                          type: object
                        runtimeVersion:
                          description: Allowed runtime versions are specified in the
                            inferenceservice config map
                          type: string
                        storageUri:
                          description: The location of the trained model
                          type: string
                      required:
                      - storageUri
                      type: object
                    parallelism:
                      description: Parallelism, how many requests can be processed
                        concurrently
                      format: int64
                      type: integer
                    pytorch:
                      description: Spec for PyTorch predictor
                      properties:
                        modelClassName:
                          description: Defaults PyTorch model class name to 'PyTorchModel'
                          type: string
                        resources:
                          description: Defaults to requests and limits of 1CPU, 2Gb
                            MEM.
                          type: object
                        runtimeVersion:
                          description: Allowed runtime versions are specified in the
                            inferenceservice config map
                          type: string
                        storageUri:
                          description: The location of the trained model
                          type: string
                      required:
                      - storageUri
                      type: object
                    serviceAccountName:
                      description: ServiceAccountName is the name of the ServiceAccount
                        to use to run the service
                      type: string
                    sklearn:
                      description: Spec for SKLearn predictor
                      properties:
                        resources:
                          description: Defaults to requests and limits of 1CPU, 2Gb
                            MEM.
                          type: object
                        runtimeVersion:
                          description: Allowed runtime versions are specified in the
                            inferenceservice config map
                          type: string
                        storageUri:
                          description: The location of the trained model
                          type: string
                      required:
                      - storageUri
                      type: object
                    tensorflow:
                      description: Spec for Tensorflow Serving
                        (https://github.com/tensorflow/serving)
                      properties:
                        resources:
                          description: Defaults to requests and limits of 1CPU, 2Gb
                            MEM.
                          type: object
                        runtimeVersion:
                          description: Allowed runtime versions are specified in the
                            inferenceservice config map.
                          type: string
                        storageUri:
                          description: The location of the trained model
                          type: string
                      required:
                      - storageUri
                      type: object
                    tensorrt:
                      description: Spec for TensorRT Inference Server
                        (https://github.com/NVIDIA/triton-inference-server)
                      properties:
                        resources:
                          description: Defaults to requests and limits of 1CPU, 2Gb
                            MEM.
                          type: object
                        runtimeVersion:
                          description: Allowed runtime versions are specified in the
                            inferenceservice config map
                          type: string
                        storageUri:
                          description: The location of the trained model
                          type: string
                      required:
                      - storageUri
                      type: object
                    xgboost:
                      description: Spec for XGBoost predictor
                      properties:
                        nthread:
                          description: Number of thread to be used by XGBoost
                          format: int64
                          type: integer
                        resources:
                          description: Defaults to requests and limits of 1CPU, 2Gb
                            MEM.
                          type: object
                        runtimeVersion:
                          description: Allowed runtime versions are specified in the
                            inferenceservice config map
                          type: string
                        storageUri:
                          description: The location of the trained model
                          type: string
                      required:
                      - storageUri
                      type: object
                  type: object
                transformer:
                  description: Transformer defines the pre/post processing before
                    and after the predictor call, transformer service calls to predictor
                    service.
                  properties:
                    custom:
                      description: Spec for a custom transformer
                      properties:
                        container:
                          type: object
                      required:
                      - container
                      type: object
                    logger:
                      description: Activate request/response logging
                      properties:
                        mode:
                          description: What payloads to log
                          type: string
                        url:
                          description: URL to send request logging CloudEvents
                          type: string
                      type: object
                    maxReplicas:
                      description: This is the up bound for autoscaler to scale to
                      format: int64
                      type: integer
                    minReplicas:
                      description: Minimum number of replicas, pods won't scale down
                        to 0 in case of no traffic
                      format: int64
                      type: integer
                    parallelism:
                      description: Parallelism, how many requests can be processed
                        concurrently
                      format: int64
                      type: integer
                    serviceAccountName:
                      description: ServiceAccountName is the name of the ServiceAccount
                        to use to run the service
                      type: string
                  type: object
              required:
              - predictor
              type: object
            canaryTrafficPercent:
              description: CanaryTrafficPercent defines the percentage of traffic
                going to canary InferenceService endpoints
              format: int64
              type: integer
            default:
              description: Default defines default InferenceService endpoints +required
              properties:
                explainer:
                  description: Explainer defines the model explanation service spec,
                    explainer service calls to predictor or transformer if it is
                    specified.
                  properties:
                    alibi:
                      description: Spec for alibi explainer
                      properties:
                        config:
                          description: Inline custom parameter settings for explainer
                          type: object
                        resources:
                          description: Defaults to requests and limits of 1CPU, 2Gb
                            MEM.
                          type: object
                        runtimeVersion:
                          description: Defaults to latest Alibi Version
                          type: string
                        storageUri:
                          description: The location of a trained explanation model
                          type: string
                        type:
                          description: The type of Alibi explainer
                          type: string
                      required:
                      - type
                      type: object
                    custom:
                      description: Spec for a custom explainer
                      properties:
                        container:
                          type: object
                      required:
                      - container
                      type: object
                    logger:
                      description: Activate request/response logging
                      properties:
                        mode:
                          description: What payloads to log
                          type: string
                        url:
                          description: URL to send request logging CloudEvents
                          type: string
                      type: object
                    maxReplicas:
                      description: This is the up bound for autoscaler to scale to
                      format: int64
                      type: integer
                    minReplicas:
                      description: Minimum number of replicas, pods won't scale down
                        to 0 in case of no traffic
                      format: int64
                      type: integer
                    parallelism:
                      description: Parallelism, how many requests can be processed
                        concurrently
                      format: int64
                      type: integer
                    serviceAccountName:
                      description: ServiceAccountName is the name of the ServiceAccount
                        to use to run the service
                      type: string
                  type: object
                predictor:
                  description: Predictor defines the model serving spec +required
                  properties:
                    custom:
                      description: Spec for a custom predictor
                      properties:
                        container:
                          type: object
                      required:
                      - container
                      type: object
                    logger:
                      description: Activate request/response logging
                      properties:
                        mode:
                          description: What payloads to log
                          type: string
                        url:
                          description: URL to send request logging CloudEvents
                          type: string
                      type: object
                    maxReplicas:
                      description: This is the up bound for autoscaler to scale to
                      format: int64
                      type: integer
                    minReplicas:
                      description: Minimum number of replicas, pods won't scale down
                        to 0 in case of no traffic
                      format: int64
                      type: integer
                    onnx:
                      description: Spec for ONNX runtime
                        (https://github.com/microsoft/onnxruntime)
                      properties:
                        resources:
                          description: Defaults to requests and limits of 1CPU, 2Gb
                            MEM.
                          type: object
                        runtimeVersion:
                          description: Allowed runtime versions are specified in the
                            inferenceservice config map
                          type: string
                        storageUri:
                          description: The location of the trained model
                          type: string
                      required:
                      - storageUri
                      type: object
                    parallelism:
                      description: Parallelism, how many requests can be processed
                        concurrently
                      format: int64
                      type: integer
                    pytorch:
                      description: Spec for PyTorch predictor
                      properties:
                        modelClassName:
                          description: Defaults PyTorch model class name to 'PyTorchModel'
                          type: string
                        resources:
                          description: Defaults to requests and limits of 1CPU, 2Gb
                            MEM.
                          type: object
                        runtimeVersion:
                          description: Allowed runtime versions are specified in the
                            inferenceservice config map
                          type: string
                        storageUri:
                          description: The location of the trained model
                          type: string
                      required:
                      - storageUri
                      type: object
                    serviceAccountName:
                      description: ServiceAccountName is the name of the ServiceAccount
                        to use to run the service
                      type: string
                    sklearn:
                      description: Spec for SKLearn predictor
                      properties:
                        resources:
                          description: Defaults to requests and limits of 1CPU, 2Gb
                            MEM.
                          type: object
                        runtimeVersion:
                          description: Allowed runtime versions are specified in the
                            inferenceservice config map
                          type: string
                        storageUri:
                          description: The location of the trained model
                          type: string
                      required:
                      - storageUri
                      type: object
                    tensorflow:
                      description: Spec for Tensorflow Serving
                        (https://github.com/tensorflow/serving)
                      properties:
                        resources:
                          description: Defaults to requests and limits of 1CPU, 2Gb
                            MEM.
                          type: object
                        runtimeVersion:
                          description: Allowed runtime versions are specified in the
                            inferenceservice config map.
                          type: string
                        storageUri:
                          description: The location of the trained model
                          type: string
                      required:
                      - storageUri
                      type: object
                    tensorrt:
                      description: Spec for TensorRT Inference Server
                        (https://github.com/NVIDIA/triton-inference-server)
                      properties:
                        resources:
                          description: Defaults to requests and limits of 1CPU, 2Gb
                            MEM.
                          type: object
                        runtimeVersion:
                          description: Allowed runtime versions are specified in the
                            inferenceservice config map
                          type: string
                        storageUri:
                          description: The location of the trained model
                          type: string
                      required:
                      - storageUri
                      type: object
                    xgboost:
                      description: Spec for XGBoost predictor
                      properties:
                        nthread:
                          description: Number of thread to be used by XGBoost
                          format: int64
                          type: integer
                        resources:
                          description: Defaults to requests and limits of 1CPU, 2Gb
                            MEM.
                          type: object
                        runtimeVersion:
                          description: Allowed runtime versions are specified in the
                            inferenceservice config map
                          type: string
                        storageUri:
                          description: The location of the trained model
                          type: string
                      required:
                      - storageUri
                      type: object
                  type: object
                transformer:
                  description: Transformer defines the pre/post processing before
                    and after the predictor call, transformer service calls to predictor
                    service.
                  properties:
                    custom:
                      description: Spec for a custom transformer
                      properties:
                        container:
                          type: object
                      required:
                      - container
                      type: object
                    logger:
                      description: Activate request/response logging
                      properties:
                        mode:
                          description: What payloads to log
                          type: string
                        url:
                          description: URL to send request logging CloudEvents
                          type: string
                      type: object
                    maxReplicas:
                      description: This is the up bound for autoscaler to scale to
                      format: int64
                      type: integer
                    minReplicas:
                      description: Minimum number of replicas, pods won't scale down
                        to 0 in case of no traffic
                      format: int64
                      type: integer
                    parallelism:
                      description: Parallelism, how many requests can be processed
                        concurrently
                      format: int64
                      type: integer
                    serviceAccountName:
                      description: ServiceAccountName is the name of the ServiceAccount
                        to use to run the service
                      type: string
                  type: object
              required:
              - predictor
              type: object
          required:
          - default
          type: object
        status:
          properties:
            canary:
              description: Statuses for the canary endpoints of the InferenceService
              type: object
            canaryTraffic:
              description: Traffic percentage that goes to canary services
              format: int64
              type: integer
            conditions:
              description: Conditions the latest available observations of a resource's
                current state. +patchMergeKey=type +patchStrategy=merge
              items:
                properties:
                  lastTransitionTime:
                    description: LastTransitionTime is the last time the condition
                      transitioned from one status to another. We use VolatileTime
                      in place of metav1.Time to exclude this from creating equality.Semantic
                      differences (all other things held constant).
                    type: string
                  message:
                    description: A human readable message indicating details about
                      the transition.
                    type: string
                  reason:
                    description: The reason for the condition's last transition.
                    type: string
                  severity:
                    description: Severity with which to treat failures of this type
                      of condition. When this is not specified, it defaults to Error.
                    type: string
                  status:
                    description: Status of the condition, one of True, False, Unknown.
                      +required
                    type: string
                  type:
                    description: Type of condition. +required
                    type: string
                required:
                - type
                - status
                type: object
              type: array
            default:
              description: Statuses for the default endpoints of the InferenceService
              type: object
            observedGeneration:
              description: ObservedGeneration is the 'Generation' of the Service that
                was last processed by the controller.
              format: int64
              type: integer
            traffic:
              description: Traffic percentage that goes to default services
              format: int64
              type: integer
            url:
              description: URL of the InferenceService
              type: string
          type: object
  version: v1alpha2
status:
  acceptedNames:
    kind: ""
    plural: ""
  conditions: []
  storedVersions: []
---
# Grants `create` on TokenReviews and SubjectAccessReviews. Bound below to the
# `default` ServiceAccount of kfserving-system, which the kube-rbac-proxy
# sidecar pod runs under (the StatefulSet sets no serviceAccountName).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: kfserving-proxy-role
rules:
- apiGroups:
  - authentication.k8s.io
  resources:
  - tokenreviews
  verbs:
  - create
- apiGroups:
  - authorization.k8s.io
  resources:
  - subjectaccessreviews
  verbs:
  - create
---
# Permissions for the controller manager: Knative Services, Istio
# VirtualServices, InferenceServices (plus their status subresources),
# webhook configurations and selected core resources.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  creationTimestamp: null
  name: manager-role
rules:
- apiGroups:
  - serving.knative.dev
  resources:
  - services
  verbs:
  - get
  - list
  - watch
  - create
  - update
  - patch
  - delete
- apiGroups:
  - serving.knative.dev
  resources:
  - services/status
  verbs:
  - get
  - update
  - patch
- apiGroups:
  - networking.istio.io
  resources:
  - virtualservices
  verbs:
  - get
  - list
  - watch
  - create
  - update
  - patch
  - delete
- apiGroups:
  - networking.istio.io
  resources:
  - virtualservices/status
  verbs:
  - get
  - update
  - patch
- apiGroups:
  - serving.kubeflow.org
  resources:
  - inferenceservices
  verbs:
  - get
  - list
  - watch
  - create
  - update
  - patch
  - delete
- apiGroups:
  - serving.kubeflow.org
  resources:
  - inferenceservices/status
  verbs:
  - get
  - update
  - patch
- apiGroups:
  - ""
  resources:
  - serviceaccounts
  verbs:
  - get
  - list
  - watch
# NOTE(review): this read-only rule on secrets is a subset of the full-access
# secrets rule further down; it is kept so the granted permissions are unchanged.
- apiGroups:
  - ""
  resources:
  - secrets
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - configmaps
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - admissionregistration.k8s.io
  resources:
  - mutatingwebhookconfigurations
  - validatingwebhookconfigurations
  verbs:
  - get
  - list
  - watch
  - create
  - update
  - patch
  - delete
- apiGroups:
  - ""
  resources:
  - secrets
  verbs:
  - get
  - list
  - watch
  - create
  - update
  - patch
  - delete
- apiGroups:
  - ""
  resources:
  - services
  verbs:
  - get
  - list
  - watch
  - create
  - update
  - patch
  - delete
- apiGroups:
  - ""
  resources:
  - namespaces
  verbs:
  - get
  - list
  - watch
---
# Binds kfserving-proxy-role to the default ServiceAccount in kfserving-system.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: kfserving-proxy-rolebinding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kfserving-proxy-role
subjects:
- kind: ServiceAccount
  name: default
  namespace: kfserving-system
---
# Binds manager-role to the default ServiceAccount in kfserving-system.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  creationTimestamp: null
  name: manager-rolebinding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: manager-role
subjects:
- kind: ServiceAccount
  name: default
  namespace: kfserving-system
---
# Controller configuration. Every value is a JSON document stored as a literal
# block scalar (`|-`, no trailing newline): credential key names, explainer and
# predictor images with their default/allowed versions, ingress gateway names,
# and image/resource settings for the logger and storage initializer.
apiVersion: v1
data:
  credentials: |-
    {
        "gcs": {
            "gcsCredentialFileName": "gcloud-application-credentials.json"
        },
        "s3": {
            "s3AccessKeyIDName": "awsAccessKeyID",
            "s3SecretAccessKeyName": "awsSecretAccessKey"
        }
    }
  explainers: |-
    {
        "alibi": {
            "image" : "gcr.io/kfserving/alibi-explainer",
            "defaultImageVersion": "0.2.2",
            "allowedImageVersions": [
                "0.2.2"
            ]
        }
    }
  ingress: |-
    {
        "ingressGateway" : "knative-ingress-gateway.knative-serving",
        "ingressService" : "istio-ingressgateway.istio-system.svc.cluster.local"
    }
  logger: |-
    {
        "image" : "gcr.io/kfserving/logger:0.2.2",
        "memoryRequest": "100Mi",
        "memoryLimit": "1Gi",
        "cpuRequest": "100m",
        "cpuLimit": "1"
    }
  predictors: |-
    {
        "tensorflow": {
            "image": "tensorflow/serving",
            "defaultImageVersion": "1.14.0",
            "defaultGpuImageVersion": "1.14.0-gpu",
            "allowedImageVersions": [
                "1.11.0",
                "1.11.0-gpu",
                "1.12.0",
                "1.12.0-gpu",
                "1.13.0",
                "1.13.0-gpu",
                "1.14.0",
                "1.14.0-gpu"
            ]
        },
        "onnx": {
            "image": "mcr.microsoft.com/onnxruntime/server",
            "defaultImageVersion": "v0.5.1",
            "allowedImageVersions": [
                "v0.5.1"
            ]
        },
        "sklearn": {
            "image": "gcr.io/kfserving/sklearnserver",
            "defaultImageVersion": "0.2.2",
            "allowedImageVersions": [
                "0.2.2"
            ]
        },
        "xgboost": {
            "image": "gcr.io/kfserving/xgbserver",
            "defaultImageVersion": "0.2.2",
            "allowedImageVersions": [
                "0.2.2"
            ]
        },
        "pytorch": {
            "image": "gcr.io/kfserving/pytorchserver",
            "defaultImageVersion": "0.2.2",
            "allowedImageVersions": [
                "0.2.2"
            ]
        },
        "tensorrt": {
            "image": "nvcr.io/nvidia/tensorrtserver",
            "defaultImageVersion": "19.05-py3",
            "allowedImageVersions": [
                "19.05-py3"
            ]
        }
    }
  storageInitializer: |-
    {
        "image" : "gcr.io/kfserving/storage-initializer:0.2.2",
        "memoryRequest": "100Mi",
        "memoryLimit": "1Gi",
        "cpuRequest": "100m",
        "cpuLimit": "1"
    }
  transformers: |-
    {
    }
kind: ConfigMap
metadata:
  name: inferenceservice-config
  namespace: kfserving-system
---
# Declared without data. The manager container receives this name through
# SECRET_NAME and mounts the secret read-only at /tmp/cert.
# NOTE(review): presumably populated with the webhook serving certificate by
# the controller at start-up (manager-role allows secret writes) - confirm.
apiVersion: v1
kind: Secret
metadata:
  name: kfserving-webhook-server-secret
  namespace: kfserving-system
---
# Exposes the kube-rbac-proxy sidecar's `https` port (8443), with Prometheus
# scrape annotations.
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/port: "8443"
    prometheus.io/scheme: https
    prometheus.io/scrape: "true"
  labels:
    control-plane: controller-manager
    controller-tools.k8s.io: "1.0"
  name: kfserving-controller-manager-metrics-service
  namespace: kfserving-system
spec:
  ports:
  - name: https
    port: 8443
    targetPort: https
  selector:
    # Must match the pod-template labels of the kfserving-controller-manager
    # StatefulSet (control-plane: kfserving-controller-manager). The previous
    # value, `controller-manager`, matched no pod defined in this manifest, so
    # the Service had no endpoints.
    control-plane: kfserving-controller-manager
    controller-tools.k8s.io: "1.0"
---
# Service in front of the controller-manager pods.
# NOTE(review): the port has no targetPort, so it targets pod port 443, yet the
# containers in this manifest only declare 8443 and 9876 - confirm intent.
apiVersion: v1
kind: Service
metadata:
  labels:
    control-plane: kfserving-controller-manager
    controller-tools.k8s.io: "1.0"
  name: kfserving-controller-manager-service
  namespace: kfserving-system
spec:
  ports:
  - port: 443
  selector:
    control-plane: kfserving-controller-manager
    controller-tools.k8s.io: "1.0"
---
# Controller manager workload: a kube-rbac-proxy sidecar serving 8443 and
# forwarding to the manager's metrics endpoint on 127.0.0.1:8080, plus the
# manager container itself (webhook server on 9876).
apiVersion: apps/v1
kind: StatefulSet
metadata:
  labels:
    control-plane: kfserving-controller-manager
    controller-tools.k8s.io: "1.0"
  name: kfserving-controller-manager
  namespace: kfserving-system
spec:
  selector:
    matchLabels:
      control-plane: kfserving-controller-manager
      controller-tools.k8s.io: "1.0"
  # NOTE(review): no Service named `controller-manager-service` is defined in
  # this manifest (the one above is `kfserving-controller-manager-service`).
  # Left unchanged because serviceName cannot be updated on an existing
  # StatefulSet; correct it only together with a planned re-create.
  serviceName: controller-manager-service
  template:
    metadata:
      labels:
        control-plane: kfserving-controller-manager
        controller-tools.k8s.io: "1.0"
    spec:
      containers:
      - args:
        - --secure-listen-address=0.0.0.0:8443
        - --upstream=http://127.0.0.1:8080/
        - --logtostderr=true
        # NOTE(review): verbosity 10 is very chatty - confirm it is wanted
        # outside of debugging.
        - --v=10
        image: gcr.io/kubebuilder/kube-rbac-proxy:v0.4.0
        name: kube-rbac-proxy
        ports:
        - containerPort: 8443
          name: https
      - args:
        # Metrics bind to loopback only; they are reached through the
        # kube-rbac-proxy sidecar's --upstream above.
        - --metrics-addr=127.0.0.1:8080
        command:
        - /manager
        env:
        - name: POD_NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        - name: SECRET_NAME
          value: kfserving-webhook-server-secret
        image: gcr.io/kfserving/kfserving-controller:0.2.2
        imagePullPolicy: Always
        name: manager
        ports:
        - containerPort: 9876
          name: webhook-server
          protocol: TCP
        resources:
          limits:
            cpu: 100m
            memory: 300Mi
          requests:
            cpu: 100m
            memory: 200Mi
        volumeMounts:
        - mountPath: /tmp/cert
          name: cert
          readOnly: true
      terminationGracePeriodSeconds: 10
      volumes:
      - name: cert
        secret:
          # Decimal 420 == octal 0644.
          defaultMode: 420
          secretName: kfserving-webhook-server-secret