{
  "description": "InferenceService is the Schema for the inferenceservices API",
  "properties": {
    "apiVersion": {
      "description": "APIVersion defines the versioned schema of this representation of an object.\nServers should convert recognized schemas to the latest internal value, and\nmay reject unrecognized values.\nMore info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources",
      "type": [
        "string",
        "null"
      ]
    },
    "kind": {
      "description": "Kind is a string value representing the REST resource this object represents.\nServers may infer this from the endpoint the client submits requests to.\nCannot be updated.\nIn CamelCase.\nMore info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds",
      "type": [
        "string",
        "null"
      ]
    },
    "metadata": {
      "type": [
        "object",
        "null"
      ]
    },
    "spec": {
      "additionalProperties": false,
      "description": "spec defines the desired state of InferenceService",
      "properties": {
        "args": {
          "description": "Args overrides the container arguments entirely.\nOnly used when Runtime is \"generic\". For llamacpp, use ExtraArgs instead.",
          "items": {
            "type": "string"
          },
          "type": [
            "array",
            "null"
          ]
        },
        "autoscaling": {
          "additionalProperties": false,
          "description": "Autoscaling configures horizontal pod autoscaling for the inference service.\nWhen set, the controller creates and manages an HPA resource targeting the\ninference Deployment. Requires Prometheus Adapter for custom metrics.\nMutually exclusive with manual replica management: when autoscaling is enabled,\nthe Replicas field serves as the initial replica count only.",
          "properties": {
            "maxReplicas": {
              "description": "MaxReplicas is the upper limit for the number of replicas.",
              "format": "int32",
              "maximum": 100,
              "minimum": 1,
              "type": "integer"
            },
            "metrics": {
              "description": "Metrics defines the scaling metrics and target values.\nIf empty, defaults to llamacpp:requests_processing with target average value of 2.",
              "items": {
                "additionalProperties": false,
                "description": "MetricSpec defines a single metric for HPA scaling.",
                "properties": {
                  "name": {
                    "description": "Name is the metric name (e.g., llamacpp:requests_processing).",
                    "type": "string"
                  },
                  "targetAverageUtilization": {
                    "description": "TargetAverageUtilization is the target utilization percentage for Resource-type metrics.",
                    "format": "int32",
                    "type": [
                      "integer",
                      "null"
                    ]
                  },
                  "targetAverageValue": {
                    "description": "TargetAverageValue is the target per-pod average for Pods-type metrics.",
                    "type": [
                      "string",
                      "null"
                    ]
                  },
                  "type": {
                    "description": "Type is the metric source type.",
                    "enum": [
                      "Pods",
                      "Resource"
                    ],
                    "type": "string"
                  }
                },
                "required": [
                  "name",
                  "type"
                ],
                "type": "object"
              },
              "type": [
                "array",
                "null"
              ]
            },
            "minReplicas": {
              "default": 1,
              "description": "MinReplicas is the lower limit for the number of replicas.",
              "format": "int32",
              "maximum": 10,
              "minimum": 1,
              "type": [
                "integer",
                "null"
              ]
            }
          },
          "required": [
            "maxReplicas"
          ],
          "type": [
            "object",
            "null"
          ]
        },
        "batchSize": {
          "description": "BatchSize sets the token batch size for prompt processing.\nLarger values improve throughput but use more memory.\nMaps to llama.cpp --batch-size flag.",
          "format": "int32",
          "maximum": 16384,
          "minimum": 1,
          "type": [
            "integer",
            "null"
          ]
        },
        "cacheTypeCustomK": {
          "description": "CacheTypeCustomK sets a custom KV cache type for keys that is not in the\nstandard enum. Used for llama.cpp forks with additional cache formats such\nas TurboQuant (turbo3, turbo4, tbqp3, etc.). Maps to llama.cpp\n--cache-type-k. The runtime binary must understand the value or llama-server\nwill fail to start; LLMKube does not validate the string.\nTakes precedence over CacheTypeK when both are set.",
          "type": [
            "string",
            "null"
          ]
        },
        "cacheTypeCustomV": {
          "description": "CacheTypeCustomV sets a custom KV cache type for values that is not in the\nstandard enum. See CacheTypeCustomK for usage notes. Takes precedence over\nCacheTypeV when both are set.",
          "type": [
            "string",
            "null"
          ]
        },
        "cacheTypeK": {
          "description": "CacheTypeK sets the KV cache quantization type for keys.\nSupported values depend on the llama.cpp build version.\nMaps to llama.cpp --cache-type-k flag. Default: f16 (llama.cpp default).\nFor custom build types not in the enum (e.g. TurboQuant turbo3, tbqp3), use\nCacheTypeCustomK instead.",
          "enum": [
            "f16",
            "f32",
            "q8_0",
            "q4_0",
            "q4_1",
            "q5_0",
            "q5_1",
            "iq4_nl"
          ],
          "type": [
            "string",
            "null"
          ]
        },
        "cacheTypeV": {
          "description": "CacheTypeV sets the KV cache quantization type for values.\nMaps to llama.cpp --cache-type-v flag. Default: f16 (llama.cpp default).\nFor custom build types not in the enum (e.g. TurboQuant turbo3, tbqp3), use\nCacheTypeCustomV instead.",
          "enum": [
            "f16",
            "f32",
            "q8_0",
            "q4_0",
            "q4_1",
            "q5_0",
            "q5_1",
            "iq4_nl"
          ],
          "type": [
            "string",
            "null"
          ]
        },
        "command": {
          "description": "Command overrides the container entrypoint.\nOnly used when Runtime is \"generic\" or for advanced customization.",
          "items": {
            "type": "string"
          },
          "type": [
            "array",
            "null"
          ]
        },
        "containerPort": {
          "description": "ContainerPort overrides the primary container port.\nEach runtime has its own default (llamacpp: 8080).",
          "format": "int32",
          "maximum": 65535,
          "minimum": 1,
          "type": [
            "integer",
            "null"
          ]
        },
        "contextSize": {
          "description": "ContextSize sets the context window size for the llama.cpp server (-c flag).\nLarger values allow processing longer inputs but require more memory.\nIf not specified, llama.cpp uses its default (typically 512 or 2048).\nThe upper bound covers Qwen 3.6 at 1M-via-YaRN with margin and accommodates\nnear-future hybrid-attention model architectures. KV cache memory is the\nuser's responsibility to size via spec.resources.memory or hostMemory.",
          "format": "int32",
          "maximum": 2097152,
          "minimum": 128,
          "type": [
            "integer",
            "null"
          ]
        },
        "endpoint": {
          "additionalProperties": false,
          "description": "Endpoint defines the service endpoint configuration",
          "properties": {
            "path": {
              "default": "/v1/chat/completions",
              "description": "Path is the HTTP path for the inference endpoint",
              "type": [
                "string",
                "null"
              ]
            },
            "port": {
              "default": 8080,
              "description": "Port is the service port",
              "format": "int32",
              "maximum": 65535,
              "minimum": 1,
              "type": [
                "integer",
                "null"
              ]
            },
            "type": {
              "default": "ClusterIP",
              "description": "Type is the Kubernetes service type (ClusterIP, NodePort, LoadBalancer)",
              "enum": [
                "ClusterIP",
                "NodePort",
                "LoadBalancer"
              ],
              "type": [
                "string",
                "null"
              ]
            }
          },
          "type": [
            "object",
            "null"
          ]
        },
        "env": {
          "description": "Env adds environment variables to the inference container.\nUseful for HF_TOKEN, custom runtime config, etc.",
          "items": {
            "additionalProperties": false,
            "description": "EnvVar represents an environment variable present in a Container.",
            "properties": {
              "name": {
                "description": "Name of the environment variable.\nMay consist of any printable ASCII characters except '='.",
                "type": "string"
              },
              "value": {
                "description": "Variable references $(VAR_NAME) are expanded\nusing the previously defined environment variables in the container and\nany service environment variables. If a variable cannot be resolved,\nthe reference in the input string will be unchanged. Double $$ are reduced\nto a single $, which allows for escaping the $(VAR_NAME) syntax: i.e.\n\"$$(VAR_NAME)\" will produce the string literal \"$(VAR_NAME)\".\nEscaped references will never be expanded, regardless of whether the variable\nexists or not.\nDefaults to \"\".",
                "type": [
                  "string",
                  "null"
                ]
              },
              "valueFrom": {
                "additionalProperties": false,
                "description": "Source for the environment variable's value. Cannot be used if value is not empty.",
                "properties": {
                  "configMapKeyRef": {
                    "additionalProperties": false,
                    "description": "Selects a key of a ConfigMap.",
                    "properties": {
                      "key": {
                        "description": "The key to select.",
                        "type": "string"
                      },
                      "name": {
                        "default": "",
                        "description": "Name of the referent.\nThis field is effectively required, but due to backwards compatibility is\nallowed to be empty. Instances of this type with an empty value here are\nalmost certainly wrong.\nMore info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names",
                        "type": [
                          "string",
                          "null"
                        ]
                      },
                      "optional": {
                        "description": "Specify whether the ConfigMap or its key must be defined",
                        "type": [
                          "boolean",
                          "null"
                        ]
                      }
                    },
                    "required": [
                      "key"
                    ],
                    "type": [
                      "object",
                      "null"
                    ],
                    "x-kubernetes-map-type": "atomic"
                  },
                  "fieldRef": {
                    "additionalProperties": false,
                    "description": "Selects a field of the pod: supports metadata.name, metadata.namespace, `metadata.labels['\u003cKEY\u003e']`, `metadata.annotations['\u003cKEY\u003e']`,\nspec.nodeName, spec.serviceAccountName, status.hostIP, status.podIP, status.podIPs.",
                    "properties": {
                      "apiVersion": {
                        "description": "Version of the schema the FieldPath is written in terms of, defaults to \"v1\".",
                        "type": [
                          "string",
                          "null"
                        ]
                      },
                      "fieldPath": {
                        "description": "Path of the field to select in the specified API version.",
                        "type": "string"
                      }
                    },
                    "required": [
                      "fieldPath"
                    ],
                    "type": [
                      "object",
                      "null"
                    ],
                    "x-kubernetes-map-type": "atomic"
                  },
                  "fileKeyRef": {
                    "additionalProperties": false,
                    "description": "FileKeyRef selects a key of the env file.\nRequires the EnvFiles feature gate to be enabled.",
                    "properties": {
                      "key": {
                        "description": "The key within the env file. An invalid key will prevent the pod from starting.\nThe keys defined within a source may consist of any printable ASCII characters except '='.\nDuring Alpha stage of the EnvFiles feature gate, the key size is limited to 128 characters.",
                        "type": "string"
                      },
                      "optional": {
                        "default": false,
                        "description": "Specify whether the file or its key must be defined. If the file or key\ndoes not exist, then the env var is not published.\nIf optional is set to true and the specified key does not exist,\nthe environment variable will not be set in the Pod's containers.\n\nIf optional is set to false and the specified key does not exist,\nan error will be returned during Pod creation.",
                        "type": [
                          "boolean",
                          "null"
                        ]
                      },
                      "path": {
                        "description": "The path within the volume from which to select the file.\nMust be relative and may not contain the '..' path or start with '..'.",
                        "type": "string"
                      },
                      "volumeName": {
                        "description": "The name of the volume mount containing the env file.",
                        "type": "string"
                      }
                    },
                    "required": [
                      "key",
                      "path",
                      "volumeName"
                    ],
                    "type": [
                      "object",
                      "null"
                    ],
                    "x-kubernetes-map-type": "atomic"
                  },
                  "resourceFieldRef": {
                    "additionalProperties": false,
                    "description": "Selects a resource of the container: only resource limits and requests\n(limits.cpu, limits.memory, limits.ephemeral-storage, requests.cpu, requests.memory and requests.ephemeral-storage) are currently supported.",
                    "properties": {
                      "containerName": {
                        "description": "Container name: required for volumes, optional for env vars",
                        "type": [
                          "string",
                          "null"
                        ]
                      },
                      "divisor": {
                        "description": "Specifies the output format of the exposed resources, defaults to \"1\"",
                        "oneOf": [
                          {
                            "pattern": "^(\\+|-)?(([0-9]+(\\.[0-9]*)?)|(\\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\\+|-)?(([0-9]+(\\.[0-9]*)?)|(\\.[0-9]+))))?$",
                            "type": "string"
                          },
                          {
                            "type": "integer"
                          },
                          {
                            "type": "null"
                          }
                        ],
                        "x-kubernetes-int-or-string": true
                      },
                      "resource": {
                        "description": "Required: resource to select",
                        "type": "string"
                      }
                    },
                    "required": [
                      "resource"
                    ],
                    "type": [
                      "object",
                      "null"
                    ],
                    "x-kubernetes-map-type": "atomic"
                  },
                  "secretKeyRef": {
                    "additionalProperties": false,
                    "description": "Selects a key of a secret in the pod's namespace",
                    "properties": {
                      "key": {
                        "description": "The key of the secret to select from.  Must be a valid secret key.",
                        "type": "string"
                      },
                      "name": {
                        "default": "",
                        "description": "Name of the referent.\nThis field is effectively required, but due to backwards compatibility is\nallowed to be empty. Instances of this type with an empty value here are\nalmost certainly wrong.\nMore info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names",
                        "type": [
                          "string",
                          "null"
                        ]
                      },
                      "optional": {
                        "description": "Specify whether the Secret or its key must be defined",
                        "type": [
                          "boolean",
                          "null"
                        ]
                      }
                    },
                    "required": [
                      "key"
                    ],
                    "type": [
                      "object",
                      "null"
                    ],
                    "x-kubernetes-map-type": "atomic"
                  }
                },
                "type": [
                  "object",
                  "null"
                ]
              }
            },
            "required": [
              "name"
            ],
            "type": "object"
          },
          "type": [
            "array",
            "null"
          ]
        },
        "evictionProtection": {
          "description": "EvictionProtection marks this service as ineligible for memory-pressure\neviction by the metal-agent watchdog. Use this for production workloads\nthat should never be silently stopped under memory pressure, even when\nthey are the lowest-priority option. The agent's per-process pickEvictionTarget\nexcludes protected processes from the eviction-candidate set; the\nMemoryPressure status condition is still patched on protected services\nfor operator visibility.\n\nHas no effect when --eviction-enabled is unset on the metal-agent or\nfor non-llama-server runtimes (oMLX, Ollama). Defaults to false.",
          "type": [
            "boolean",
            "null"
          ]
        },
        "extraArgs": {
          "description": "ExtraArgs provides additional command-line arguments passed directly to the\nruntime process. Use for flags not yet supported as typed CRD fields.\nArguments are appended after all other configured flags.\nSupported by the \"llamacpp\" and \"vllm\" runtimes. Ignored by others.\nExample: [\"--seed\", \"42\", \"--log-disable\"]",
          "items": {
            "type": "string"
          },
          "type": [
            "array",
            "null"
          ]
        },
        "flashAttention": {
          "description": "FlashAttention enables flash attention for faster prompt processing and\nreduced KV cache memory. Maps to llama.cpp --flash-attn flag.\n\nOn NVIDIA GPUs requires Ampere or newer (compute capability 8.0+).\nOn Apple Silicon (Metal agent path) the default is true when this field\nis unset, because the wired-collector + flash-attn combination prevents\nthe ~25% decode degradation observed at long context on Qwen-class\nmodels running on M-series chips.",
          "type": [
            "boolean",
            "null"
          ]
        },
        "image": {
          "description": "Image is the container image for the inference runtime.\nFor llamacpp runtime, defaults to ghcr.io/ggml-org/llama.cpp:server.\nFor generic runtime, this field is required.",
          "type": [
            "string",
            "null"
          ]
        },
        "imagePullSecrets": {
          "description": "ImagePullSecrets for pulling container images from private registries.",
          "items": {
            "additionalProperties": false,
            "description": "LocalObjectReference contains enough information to let you locate the\nreferenced object inside the same namespace.",
            "properties": {
              "name": {
                "default": "",
                "description": "Name of the referent.\nThis field is effectively required, but due to backwards compatibility is\nallowed to be empty. Instances of this type with an empty value here are\nalmost certainly wrong.\nMore info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names",
                "type": [
                  "string",
                  "null"
                ]
              }
            },
            "type": "object",
            "x-kubernetes-map-type": "atomic"
          },
          "type": [
            "array",
            "null"
          ]
        },
        "jinja": {
          "description": "Jinja enables Jinja2 chat template rendering for tool/function calling support.\nRequired when using the OpenAI-compatible API with tools. Maps to llama.cpp --jinja flag.",
          "type": [
            "boolean",
            "null"
          ]
        },
        "metadataOverrides": {
          "description": "MetadataOverrides overrides GGUF metadata key-value pairs at model load time.\nEach entry is passed as a separate --override-kv flag. Format: key=type:value\n(e.g., \"qwen35moe.context_length=int:1048576\" to extend context window, or\n\"tokenizer.chat_template.thinking=bool:false\" to tweak tokenizer behavior).\nMaps to llama.cpp --override-kv flag (one flag per entry).",
          "items": {
            "type": "string"
          },
          "type": [
            "array",
            "null"
          ]
        },
        "modelRef": {
          "description": "ModelRef references the Model CR that contains the model to serve",
          "type": "string"
        },
        "moeCPULayers": {
          "description": "MoeCPULayers sets the number of MoE layers to offload to CPU.\nWhen set, only the specified number of MoE layers run on CPU rather than all.\nMaps to llama.cpp --n-cpu-moe flag.",
          "format": "int32",
          "minimum": 0,
          "type": [
            "integer",
            "null"
          ]
        },
        "moeCPUOffload": {
          "description": "MoeCPUOffload offloads all MoE expert layers to CPU for reduced VRAM usage.\nEnables running large MoE models (e.g., Qwen3-30B, Mixtral) on VRAM-constrained\nhardware by keeping attention layers on GPU while expert weights use system RAM.\nMaps to llama.cpp --cpu-moe flag. Requires sufficient system RAM via resources.memory.",
          "type": [
            "boolean",
            "null"
          ]
        },
        "noKvOffload": {
          "description": "NoKvOffload keeps the KV cache in system RAM instead of VRAM.\nUseful for extended context windows when VRAM is constrained by model weights.\nMaps to llama.cpp --no-kv-offload flag. Requires sufficient system RAM via resources.memory.",
          "type": [
            "boolean",
            "null"
          ]
        },
        "noWarmup": {
          "description": "NoWarmup skips the llama.cpp startup warmup inference pass.\nReduces pod ready time at the cost of slightly higher first-request latency.\nUseful for scale-to-zero and quick redeployment patterns.\nMaps to llama.cpp --no-warmup flag.",
          "type": [
            "boolean",
            "null"
          ]
        },
        "nodeSelector": {
          "additionalProperties": {
            "type": "string"
          },
          "description": "NodeSelector for pod placement (e.g., specific node pools)",
          "type": [
            "object",
            "null"
          ]
        },
        "parallelSlots": {
          "description": "ParallelSlots sets the number of concurrent request slots for the llama.cpp\nserver (--parallel flag). Each slot processes one request independently;\nhigher values use more KV cache memory. If not specified, the operator\nomits --parallel and llama.cpp picks an auto value (currently 4).",
          "format": "int32",
          "maximum": 64,
          "minimum": 1,
          "type": [
            "integer",
            "null"
          ]
        },
        "personaPlexConfig": {
          "additionalProperties": false,
          "description": "PersonaPlexConfig holds configuration for the PersonaPlex (Moshi) runtime.\nOnly used when Runtime is \"personaplex\".",
          "properties": {
            "cpuOffload": {
              "description": "CPUOffload enables model weight offloading to system RAM when GPU VRAM is insufficient.\nRequires the accelerate package in the container image.",
              "type": [
                "boolean",
                "null"
              ]
            },
            "hfTokenSecretRef": {
              "additionalProperties": false,
              "description": "HFTokenSecretRef references a Secret containing the HuggingFace token for model download.\nThe Secret key must be \"HF_TOKEN\".",
              "properties": {
                "key": {
                  "description": "The key of the secret to select from.  Must be a valid secret key.",
                  "type": "string"
                },
                "name": {
                  "default": "",
                  "description": "Name of the referent.\nThis field is effectively required, but due to backwards compatibility is\nallowed to be empty. Instances of this type with an empty value here are\nalmost certainly wrong.\nMore info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names",
                  "type": [
                    "string",
                    "null"
                  ]
                },
                "optional": {
                  "description": "Specify whether the Secret or its key must be defined",
                  "type": [
                    "boolean",
                    "null"
                  ]
                }
              },
              "required": [
                "key"
              ],
              "type": [
                "object",
                "null"
              ],
              "x-kubernetes-map-type": "atomic"
            },
            "quantize4Bit": {
              "description": "Quantize4Bit enables NF4 4-bit quantization for reduced VRAM usage (~9.6 GB vs ~14 GB).\nRequires the bitsandbytes package in the container image.",
              "type": [
                "boolean",
                "null"
              ]
            }
          },
          "type": [
            "object",
            "null"
          ]
        },
        "podAnnotations": {
          "additionalProperties": {
            "type": "string"
          },
          "description": "PodAnnotations are merged into the inference Pod's metadata.annotations.\nUse this to tag Pods for downstream tooling (cost attribution, service\nmesh routing, custom admission controllers) without those tools needing\nto know about LLMKube's CRD schema. Pure passthrough; the operator\nitself does not set any annotations on inference Pods today.",
          "type": [
            "object",
            "null"
          ]
        },
        "podLabels": {
          "additionalProperties": {
            "type": "string"
          },
          "description": "PodLabels are merged into the inference Pod's metadata.labels alongside\nthe operator-managed labels (`app`, `inference.llmkube.dev/model`,\n`inference.llmkube.dev/service`). Operator-managed keys take precedence\non collision so the Deployment selector stays in sync with the Pods it\nowns. The Deployment selector itself uses only the operator-managed\nlabels and is immutable, so changing PodLabels later is safe.",
          "type": [
            "object",
            "null"
          ]
        },
        "podSecurityContext": {
          "additionalProperties": false,
          "description": "PodSecurityContext defines pod-level security attributes for inference pods.\nUse this to set fsGroup for volume permissions (required on OpenShift).",
          "properties": {
            "appArmorProfile": {
              "additionalProperties": false,
              "description": "appArmorProfile is the AppArmor options to use by the containers in this pod.\nNote that this field cannot be set when spec.os.name is windows.",
              "properties": {
                "localhostProfile": {
                  "description": "localhostProfile indicates a profile loaded on the node that should be used.\nThe profile must be preconfigured on the node to work.\nMust match the loaded name of the profile.\nMust be set if and only if type is \"Localhost\".",
                  "type": [
                    "string",
                    "null"
                  ]
                },
                "type": {
                  "description": "type indicates which kind of AppArmor profile will be applied.\nValid options are:\n  Localhost - a profile pre-loaded on the node.\n  RuntimeDefault - the container runtime's default profile.\n  Unconfined - no AppArmor enforcement.",
                  "type": "string"
                }
              },
              "required": [
                "type"
              ],
              "type": [
                "object",
                "null"
              ]
            },
            "fsGroup": {
              "description": "A special supplemental group that applies to all containers in a pod.\nSome volume types allow the Kubelet to change the ownership of that volume\nto be owned by the pod:\n\n1. The owning GID will be the FSGroup\n2. The setgid bit is set (new files created in the volume will be owned by FSGroup)\n3. The permission bits are OR'd with rw-rw----\n\nIf unset, the Kubelet will not modify the ownership and permissions of any volume.\nNote that this field cannot be set when spec.os.name is windows.",
              "format": "int64",
              "type": [
                "integer",
                "null"
              ]
            },
            "fsGroupChangePolicy": {
              "description": "fsGroupChangePolicy defines behavior of changing ownership and permission of the volume\nbefore being exposed inside Pod. This field will only apply to\nvolume types which support fsGroup based ownership (and permissions).\nIt will have no effect on ephemeral volume types such as: secret, configmaps\nand emptydir.\nValid values are \"OnRootMismatch\" and \"Always\". If not specified, \"Always\" is used.\nNote that this field cannot be set when spec.os.name is windows.",
              "type": [
                "string",
                "null"
              ]
            },
            "runAsGroup": {
              "description": "The GID to run the entrypoint of the container process.\nUses runtime default if unset.\nMay also be set in SecurityContext.  If set in both SecurityContext and\nPodSecurityContext, the value specified in SecurityContext takes precedence\nfor that container.\nNote that this field cannot be set when spec.os.name is windows.",
              "format": "int64",
              "type": [
                "integer",
                "null"
              ]
            },
            "runAsNonRoot": {
              "description": "Indicates that the container must run as a non-root user.\nIf true, the Kubelet will validate the image at runtime to ensure that it\ndoes not run as UID 0 (root) and fail to start the container if it does.\nIf unset or false, no such validation will be performed.\nMay also be set in SecurityContext.  If set in both SecurityContext and\nPodSecurityContext, the value specified in SecurityContext takes precedence.",
              "type": [
                "boolean",
                "null"
              ]
            },
            "runAsUser": {
              "description": "The UID to run the entrypoint of the container process.\nDefaults to user specified in image metadata if unspecified.\nMay also be set in SecurityContext.  If set in both SecurityContext and\nPodSecurityContext, the value specified in SecurityContext takes precedence\nfor that container.\nNote that this field cannot be set when spec.os.name is windows.",
              "format": "int64",
              "type": [
                "integer",
                "null"
              ]
            },
            "seLinuxChangePolicy": {
              "description": "seLinuxChangePolicy defines how the container's SELinux label is applied to all volumes used by the Pod.\nIt has no effect on nodes that do not support SELinux or to volumes does not support SELinux.\nValid values are \"MountOption\" and \"Recursive\".\n\n\"Recursive\" means relabeling of all files on all Pod volumes by the container runtime.\nThis may be slow for large volumes, but allows mixing privileged and unprivileged Pods sharing the same volume on the same node.\n\n\"MountOption\" mounts all eligible Pod volumes with `-o context` mount option.\nThis requires all Pods that share the same volume to use the same SELinux label.\nIt is not possible to share the same volume among privileged and unprivileged Pods.\nEligible volumes are in-tree FibreChannel and iSCSI volumes, and all CSI volumes\nwhose CSI driver announces SELinux support by setting spec.seLinuxMount: true in their\nCSIDriver instance. Other volumes are always re-labelled recursively.\n\"MountOption\" value is allowed only when SELinuxMount feature gate is enabled.\n\nIf not specified and SELinuxMount feature gate is enabled, \"MountOption\" is used.\nIf not specified and SELinuxMount feature gate is disabled, \"MountOption\" is used for ReadWriteOncePod volumes\nand \"Recursive\" for all other volumes.\n\nThis field affects only Pods that have SELinux label set, either in PodSecurityContext or in SecurityContext of all containers.\n\nAll Pods that use the same volume should use the same seLinuxChangePolicy, otherwise some pods can get stuck in ContainerCreating state.\nNote that this field cannot be set when spec.os.name is windows.",
              "type": [
                "string",
                "null"
              ]
            },
            "seLinuxOptions": {
              "additionalProperties": false,
              "description": "The SELinux context to be applied to all containers.\nIf unspecified, the container runtime will allocate a random SELinux context for each\ncontainer.  May also be set in SecurityContext.  If set in\nboth SecurityContext and PodSecurityContext, the value specified in SecurityContext\ntakes precedence for that container.\nNote that this field cannot be set when spec.os.name is windows.",
              "properties": {
                "level": {
                  "description": "Level is SELinux level label that applies to the container.",
                  "type": [
                    "string",
                    "null"
                  ]
                },
                "role": {
                  "description": "Role is a SELinux role label that applies to the container.",
                  "type": [
                    "string",
                    "null"
                  ]
                },
                "type": {
                  "description": "Type is a SELinux type label that applies to the container.",
                  "type": [
                    "string",
                    "null"
                  ]
                },
                "user": {
                  "description": "User is a SELinux user label that applies to the container.",
                  "type": [
                    "string",
                    "null"
                  ]
                }
              },
              "type": [
                "object",
                "null"
              ]
            },
            "seccompProfile": {
              "additionalProperties": false,
              "description": "The seccomp options to use by the containers in this pod.\nNote that this field cannot be set when spec.os.name is windows.",
              "properties": {
                "localhostProfile": {
                  "description": "localhostProfile indicates a profile defined in a file on the node should be used.\nThe profile must be preconfigured on the node to work.\nMust be a descending path, relative to the kubelet's configured seccomp profile location.\nMust be set if type is \"Localhost\". Must NOT be set for any other type.",
                  "type": [
                    "string",
                    "null"
                  ]
                },
                "type": {
                  "description": "type indicates which kind of seccomp profile will be applied.\nValid options are:\n\nLocalhost - a profile defined in a file on the node should be used.\nRuntimeDefault - the container runtime default profile should be used.\nUnconfined - no profile should be applied.",
                  "type": "string"
                }
              },
              "required": [
                "type"
              ],
              "type": [
                "object",
                "null"
              ]
            },
            "supplementalGroups": {
              "description": "A list of groups applied to the first process run in each container, in\naddition to the container's primary GID and fsGroup (if specified).  If\nthe SupplementalGroupsPolicy feature is enabled, the\nsupplementalGroupsPolicy field determines whether these are in addition\nto or instead of any group memberships defined in the container image.\nIf unspecified, no additional groups are added, though group memberships\ndefined in the container image may still be used, depending on the\nsupplementalGroupsPolicy field.\nNote that this field cannot be set when spec.os.name is windows.",
              "items": {
                "format": "int64",
                "type": "integer"
              },
              "type": [
                "array",
                "null"
              ],
              "x-kubernetes-list-type": "atomic"
            },
            "supplementalGroupsPolicy": {
              "description": "Defines how supplemental groups of the first container processes are calculated.\nValid values are \"Merge\" and \"Strict\". If not specified, \"Merge\" is used.\n(Alpha) Using the field requires the SupplementalGroupsPolicy feature gate to be enabled\nand the container runtime must implement support for this feature.\nNote that this field cannot be set when spec.os.name is windows.",
              "type": [
                "string",
                "null"
              ]
            },
            "sysctls": {
              "description": "Sysctls hold a list of namespaced sysctls used for the pod. Pods with unsupported\nsysctls (by the container runtime) might fail to launch.\nNote that this field cannot be set when spec.os.name is windows.",
              "items": {
                "additionalProperties": false,
                "description": "Sysctl defines a kernel parameter to be set",
                "properties": {
                  "name": {
                    "description": "Name of a property to set",
                    "type": "string"
                  },
                  "value": {
                    "description": "Value of a property to set",
                    "type": "string"
                  }
                },
                "required": [
                  "name",
                  "value"
                ],
                "type": "object"
              },
              "type": [
                "array",
                "null"
              ],
              "x-kubernetes-list-type": "atomic"
            },
            "windowsOptions": {
              "additionalProperties": false,
              "description": "The Windows specific settings applied to all containers.\nIf unspecified, the options within a container's SecurityContext will be used.\nIf set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.\nNote that this field cannot be set when spec.os.name is linux.",
              "properties": {
                "gmsaCredentialSpec": {
                  "description": "GMSACredentialSpec is where the GMSA admission webhook\n(https://github.com/kubernetes-sigs/windows-gmsa) inlines the contents of the\nGMSA credential spec named by the GMSACredentialSpecName field.",
                  "type": [
                    "string",
                    "null"
                  ]
                },
                "gmsaCredentialSpecName": {
                  "description": "GMSACredentialSpecName is the name of the GMSA credential spec to use.",
                  "type": [
                    "string",
                    "null"
                  ]
                },
                "hostProcess": {
                  "description": "HostProcess determines if a container should be run as a 'Host Process' container.\nAll of a Pod's containers must have the same effective HostProcess value\n(it is not allowed to have a mix of HostProcess containers and non-HostProcess containers).\nIn addition, if HostProcess is true then HostNetwork must also be set to true.",
                  "type": [
                    "boolean",
                    "null"
                  ]
                },
                "runAsUserName": {
                  "description": "The UserName in Windows to run the entrypoint of the container process.\nDefaults to the user specified in image metadata if unspecified.\nMay also be set in PodSecurityContext. If set in both SecurityContext and\nPodSecurityContext, the value specified in SecurityContext takes precedence.",
                  "type": [
                    "string",
                    "null"
                  ]
                }
              },
              "type": [
                "object",
                "null"
              ]
            }
          },
          "type": [
            "object",
            "null"
          ]
        },
        "priority": {
          "default": "normal",
          "description": "Priority determines scheduling priority for GPU allocation.\nHigher priority services can preempt lower priority ones when GPUs are scarce.",
          "enum": [
            "critical",
            "high",
            "normal",
            "low",
            "batch"
          ],
          "type": [
            "string",
            "null"
          ]
        },
        "priorityClassName": {
          "description": "PriorityClassName allows specifying a custom Kubernetes PriorityClass.\nTakes precedence over the Priority field if set.",
          "type": [
            "string",
            "null"
          ]
        },
        "probeOverrides": {
          "additionalProperties": false,
          "description": "ProbeOverrides allows replacing the auto-generated health probes.\nUseful for runtimes with non-HTTP health endpoints (e.g., TCP, WebSocket).",
          "properties": {
            "liveness": {
              "additionalProperties": false,
              "description": "Liveness overrides the liveness probe.",
              "properties": {
                "exec": {
                  "additionalProperties": false,
                  "description": "Exec specifies a command to execute in the container.",
                  "properties": {
                    "command": {
                      "description": "Command is the command line to execute inside the container, the working directory for the\ncommand  is root ('/') in the container's filesystem. The command is simply exec'd, it is\nnot run inside a shell, so traditional shell instructions ('|', etc) won't work. To use\na shell, you need to explicitly call out to that shell.\nExit status of 0 is treated as live/healthy and non-zero is unhealthy.",
                      "items": {
                        "type": "string"
                      },
                      "type": [
                        "array",
                        "null"
                      ],
                      "x-kubernetes-list-type": "atomic"
                    }
                  },
                  "type": [
                    "object",
                    "null"
                  ]
                },
                "failureThreshold": {
                  "description": "Minimum consecutive failures for the probe to be considered failed after having succeeded.\nDefaults to 3. Minimum value is 1.",
                  "format": "int32",
                  "type": [
                    "integer",
                    "null"
                  ]
                },
                "grpc": {
                  "additionalProperties": false,
                  "description": "GRPC specifies a GRPC HealthCheckRequest.",
                  "properties": {
                    "port": {
                      "description": "Port number of the gRPC service. Number must be in the range 1 to 65535.",
                      "format": "int32",
                      "type": "integer"
                    },
                    "service": {
                      "default": "",
                      "description": "Service is the name of the service to place in the gRPC HealthCheckRequest\n(see https://github.com/grpc/grpc/blob/master/doc/health-checking.md).\n\nIf this is not specified, the default behavior is defined by gRPC.",
                      "type": [
                        "string",
                        "null"
                      ]
                    }
                  },
                  "required": [
                    "port"
                  ],
                  "type": [
                    "object",
                    "null"
                  ]
                },
                "httpGet": {
                  "additionalProperties": false,
                  "description": "HTTPGet specifies an HTTP GET request to perform.",
                  "properties": {
                    "host": {
                      "description": "Host name to connect to, defaults to the pod IP. You probably want to set\n\"Host\" in httpHeaders instead.",
                      "type": [
                        "string",
                        "null"
                      ]
                    },
                    "httpHeaders": {
                      "description": "Custom headers to set in the request. HTTP allows repeated headers.",
                      "items": {
                        "additionalProperties": false,
                        "description": "HTTPHeader describes a custom header to be used in HTTP probes",
                        "properties": {
                          "name": {
                            "description": "The header field name.\nThis will be canonicalized upon output, so case-variant names will be understood as the same header.",
                            "type": "string"
                          },
                          "value": {
                            "description": "The header field value",
                            "type": "string"
                          }
                        },
                        "required": [
                          "name",
                          "value"
                        ],
                        "type": "object"
                      },
                      "type": [
                        "array",
                        "null"
                      ],
                      "x-kubernetes-list-type": "atomic"
                    },
                    "path": {
                      "description": "Path to access on the HTTP server.",
                      "type": [
                        "string",
                        "null"
                      ]
                    },
                    "port": {
                      "description": "Name or number of the port to access on the container.\nNumber must be in the range 1 to 65535.\nName must be an IANA_SVC_NAME.",
                      "oneOf": [
                        {
                          "type": "string"
                        },
                        {
                          "type": "integer"
                        }
                      ],
                      "x-kubernetes-int-or-string": true
                    },
                    "scheme": {
                      "description": "Scheme to use for connecting to the host.\nDefaults to HTTP.",
                      "type": [
                        "string",
                        "null"
                      ]
                    }
                  },
                  "required": [
                    "port"
                  ],
                  "type": [
                    "object",
                    "null"
                  ]
                },
                "initialDelaySeconds": {
                  "description": "Number of seconds after the container has started before liveness probes are initiated.\nMore info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes",
                  "format": "int32",
                  "type": [
                    "integer",
                    "null"
                  ]
                },
                "periodSeconds": {
                  "description": "How often (in seconds) to perform the probe.\nDefault to 10 seconds. Minimum value is 1.",
                  "format": "int32",
                  "type": [
                    "integer",
                    "null"
                  ]
                },
                "successThreshold": {
                  "description": "Minimum consecutive successes for the probe to be considered successful after having failed.\nDefaults to 1. Must be 1 for liveness and startup. Minimum value is 1.",
                  "format": "int32",
                  "type": [
                    "integer",
                    "null"
                  ]
                },
                "tcpSocket": {
                  "additionalProperties": false,
                  "description": "TCPSocket specifies a connection to a TCP port.",
                  "properties": {
                    "host": {
                      "description": "Optional: Host name to connect to, defaults to the pod IP.",
                      "type": [
                        "string",
                        "null"
                      ]
                    },
                    "port": {
                      "description": "Number or name of the port to access on the container.\nNumber must be in the range 1 to 65535.\nName must be an IANA_SVC_NAME.",
                      "oneOf": [
                        {
                          "type": "string"
                        },
                        {
                          "type": "integer"
                        }
                      ],
                      "x-kubernetes-int-or-string": true
                    }
                  },
                  "required": [
                    "port"
                  ],
                  "type": [
                    "object",
                    "null"
                  ]
                },
                "terminationGracePeriodSeconds": {
                  "description": "Optional duration in seconds the pod needs to terminate gracefully upon probe failure.\nThe grace period is the duration in seconds after the processes running in the pod are sent\na termination signal and the time when the processes are forcibly halted with a kill signal.\nSet this value longer than the expected cleanup time for your process.\nIf this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this\nvalue overrides the value provided by the pod spec.\nValue must be non-negative integer. The value zero indicates stop immediately via\nthe kill signal (no opportunity to shut down).\nThis is a beta field and requires enabling ProbeTerminationGracePeriod feature gate.\nMinimum value is 1. spec.terminationGracePeriodSeconds is used if unset.",
                  "format": "int64",
                  "type": [
                    "integer",
                    "null"
                  ]
                },
                "timeoutSeconds": {
                  "description": "Number of seconds after which the probe times out.\nDefaults to 1 second. Minimum value is 1.\nMore info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes",
                  "format": "int32",
                  "type": [
                    "integer",
                    "null"
                  ]
                }
              },
              "type": [
                "object",
                "null"
              ]
            },
            "readiness": {
              "additionalProperties": false,
              "description": "Readiness overrides the readiness probe.",
              "properties": {
                "exec": {
                  "additionalProperties": false,
                  "description": "Exec specifies a command to execute in the container.",
                  "properties": {
                    "command": {
                      "description": "Command is the command line to execute inside the container, the working directory for the\ncommand  is root ('/') in the container's filesystem. The command is simply exec'd, it is\nnot run inside a shell, so traditional shell instructions ('|', etc) won't work. To use\na shell, you need to explicitly call out to that shell.\nExit status of 0 is treated as live/healthy and non-zero is unhealthy.",
                      "items": {
                        "type": "string"
                      },
                      "type": [
                        "array",
                        "null"
                      ],
                      "x-kubernetes-list-type": "atomic"
                    }
                  },
                  "type": [
                    "object",
                    "null"
                  ]
                },
                "failureThreshold": {
                  "description": "Minimum consecutive failures for the probe to be considered failed after having succeeded.\nDefaults to 3. Minimum value is 1.",
                  "format": "int32",
                  "type": [
                    "integer",
                    "null"
                  ]
                },
                "grpc": {
                  "additionalProperties": false,
                  "description": "GRPC specifies a GRPC HealthCheckRequest.",
                  "properties": {
                    "port": {
                      "description": "Port number of the gRPC service. Number must be in the range 1 to 65535.",
                      "format": "int32",
                      "type": "integer"
                    },
                    "service": {
                      "default": "",
                      "description": "Service is the name of the service to place in the gRPC HealthCheckRequest\n(see https://github.com/grpc/grpc/blob/master/doc/health-checking.md).\n\nIf this is not specified, the default behavior is defined by gRPC.",
                      "type": [
                        "string",
                        "null"
                      ]
                    }
                  },
                  "required": [
                    "port"
                  ],
                  "type": [
                    "object",
                    "null"
                  ]
                },
                "httpGet": {
                  "additionalProperties": false,
                  "description": "HTTPGet specifies an HTTP GET request to perform.",
                  "properties": {
                    "host": {
                      "description": "Host name to connect to, defaults to the pod IP. You probably want to set\n\"Host\" in httpHeaders instead.",
                      "type": [
                        "string",
                        "null"
                      ]
                    },
                    "httpHeaders": {
                      "description": "Custom headers to set in the request. HTTP allows repeated headers.",
                      "items": {
                        "additionalProperties": false,
                        "description": "HTTPHeader describes a custom header to be used in HTTP probes",
                        "properties": {
                          "name": {
                            "description": "The header field name.\nThis will be canonicalized upon output, so case-variant names will be understood as the same header.",
                            "type": "string"
                          },
                          "value": {
                            "description": "The header field value",
                            "type": "string"
                          }
                        },
                        "required": [
                          "name",
                          "value"
                        ],
                        "type": "object"
                      },
                      "type": [
                        "array",
                        "null"
                      ],
                      "x-kubernetes-list-type": "atomic"
                    },
                    "path": {
                      "description": "Path to access on the HTTP server.",
                      "type": [
                        "string",
                        "null"
                      ]
                    },
                    "port": {
                      "description": "Name or number of the port to access on the container.\nNumber must be in the range 1 to 65535.\nName must be an IANA_SVC_NAME.",
                      "oneOf": [
                        {
                          "type": "string"
                        },
                        {
                          "type": "integer"
                        }
                      ],
                      "x-kubernetes-int-or-string": true
                    },
                    "scheme": {
                      "description": "Scheme to use for connecting to the host.\nDefaults to HTTP.",
                      "type": [
                        "string",
                        "null"
                      ]
                    }
                  },
                  "required": [
                    "port"
                  ],
                  "type": [
                    "object",
                    "null"
                  ]
                },
                "initialDelaySeconds": {
                  "description": "Number of seconds after the container has started before liveness probes are initiated.\nMore info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes",
                  "format": "int32",
                  "type": [
                    "integer",
                    "null"
                  ]
                },
                "periodSeconds": {
                  "description": "How often (in seconds) to perform the probe.\nDefault to 10 seconds. Minimum value is 1.",
                  "format": "int32",
                  "type": [
                    "integer",
                    "null"
                  ]
                },
                "successThreshold": {
                  "description": "Minimum consecutive successes for the probe to be considered successful after having failed.\nDefaults to 1. Must be 1 for liveness and startup. Minimum value is 1.",
                  "format": "int32",
                  "type": [
                    "integer",
                    "null"
                  ]
                },
                "tcpSocket": {
                  "additionalProperties": false,
                  "description": "TCPSocket specifies a connection to a TCP port.",
                  "properties": {
                    "host": {
                      "description": "Optional: Host name to connect to, defaults to the pod IP.",
                      "type": [
                        "string",
                        "null"
                      ]
                    },
                    "port": {
                      "description": "Number or name of the port to access on the container.\nNumber must be in the range 1 to 65535.\nName must be an IANA_SVC_NAME.",
                      "oneOf": [
                        {
                          "type": "string"
                        },
                        {
                          "type": "integer"
                        }
                      ],
                      "x-kubernetes-int-or-string": true
                    }
                  },
                  "required": [
                    "port"
                  ],
                  "type": [
                    "object",
                    "null"
                  ]
                },
                "terminationGracePeriodSeconds": {
                  "description": "Optional duration in seconds the pod needs to terminate gracefully upon probe failure.\nThe grace period is the duration in seconds after the processes running in the pod are sent\na termination signal and the time when the processes are forcibly halted with a kill signal.\nSet this value longer than the expected cleanup time for your process.\nIf this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this\nvalue overrides the value provided by the pod spec.\nValue must be non-negative integer. The value zero indicates stop immediately via\nthe kill signal (no opportunity to shut down).\nThis is a beta field and requires enabling ProbeTerminationGracePeriod feature gate.\nMinimum value is 1. spec.terminationGracePeriodSeconds is used if unset.",
                  "format": "int64",
                  "type": [
                    "integer",
                    "null"
                  ]
                },
                "timeoutSeconds": {
                  "description": "Number of seconds after which the probe times out.\nDefaults to 1 second. Minimum value is 1.\nMore info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes",
                  "format": "int32",
                  "type": [
                    "integer",
                    "null"
                  ]
                }
              },
              "type": [
                "object",
                "null"
              ]
            },
            "startup": {
              "additionalProperties": false,
              "description": "Startup overrides the startup probe.",
              "properties": {
                "exec": {
                  "additionalProperties": false,
                  "description": "Exec specifies a command to execute in the container.",
                  "properties": {
                    "command": {
                      "description": "Command is the command line to execute inside the container, the working directory for the\ncommand  is root ('/') in the container's filesystem. The command is simply exec'd, it is\nnot run inside a shell, so traditional shell instructions ('|', etc) won't work. To use\na shell, you need to explicitly call out to that shell.\nExit status of 0 is treated as live/healthy and non-zero is unhealthy.",
                      "items": {
                        "type": "string"
                      },
                      "type": [
                        "array",
                        "null"
                      ],
                      "x-kubernetes-list-type": "atomic"
                    }
                  },
                  "type": [
                    "object",
                    "null"
                  ]
                },
                "failureThreshold": {
                  "description": "Minimum consecutive failures for the probe to be considered failed after having succeeded.\nDefaults to 3. Minimum value is 1.",
                  "format": "int32",
                  "type": [
                    "integer",
                    "null"
                  ]
                },
                "grpc": {
                  "additionalProperties": false,
                  "description": "GRPC specifies a GRPC HealthCheckRequest.",
                  "properties": {
                    "port": {
                      "description": "Port number of the gRPC service. Number must be in the range 1 to 65535.",
                      "format": "int32",
                      "type": "integer"
                    },
                    "service": {
                      "default": "",
                      "description": "Service is the name of the service to place in the gRPC HealthCheckRequest\n(see https://github.com/grpc/grpc/blob/master/doc/health-checking.md).\n\nIf this is not specified, the default behavior is defined by gRPC.",
                      "type": [
                        "string",
                        "null"
                      ]
                    }
                  },
                  "required": [
                    "port"
                  ],
                  "type": [
                    "object",
                    "null"
                  ]
                },
                "httpGet": {
                  "additionalProperties": false,
                  "description": "HTTPGet specifies an HTTP GET request to perform.",
                  "properties": {
                    "host": {
                      "description": "Host name to connect to, defaults to the pod IP. You probably want to set\n\"Host\" in httpHeaders instead.",
                      "type": [
                        "string",
                        "null"
                      ]
                    },
                    "httpHeaders": {
                      "description": "Custom headers to set in the request. HTTP allows repeated headers.",
                      "items": {
                        "additionalProperties": false,
                        "description": "HTTPHeader describes a custom header to be used in HTTP probes",
                        "properties": {
                          "name": {
                            "description": "The header field name.\nThis will be canonicalized upon output, so case-variant names will be understood as the same header.",
                            "type": "string"
                          },
                          "value": {
                            "description": "The header field value",
                            "type": "string"
                          }
                        },
                        "required": [
                          "name",
                          "value"
                        ],
                        "type": "object"
                      },
                      "type": [
                        "array",
                        "null"
                      ],
                      "x-kubernetes-list-type": "atomic"
                    },
                    "path": {
                      "description": "Path to access on the HTTP server.",
                      "type": [
                        "string",
                        "null"
                      ]
                    },
                    "port": {
                      "description": "Name or number of the port to access on the container.\nNumber must be in the range 1 to 65535.\nName must be an IANA_SVC_NAME.",
                      "oneOf": [
                        {
                          "type": "string"
                        },
                        {
                          "type": "integer"
                        }
                      ],
                      "x-kubernetes-int-or-string": true
                    },
                    "scheme": {
                      "description": "Scheme to use for connecting to the host.\nDefaults to HTTP.",
                      "type": [
                        "string",
                        "null"
                      ]
                    }
                  },
                  "required": [
                    "port"
                  ],
                  "type": [
                    "object",
                    "null"
                  ]
                },
                "initialDelaySeconds": {
                  "description": "Number of seconds after the container has started before liveness probes are initiated.\nMore info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes",
                  "format": "int32",
                  "type": [
                    "integer",
                    "null"
                  ]
                },
                "periodSeconds": {
                  "description": "How often (in seconds) to perform the probe.\nDefault to 10 seconds. Minimum value is 1.",
                  "format": "int32",
                  "type": [
                    "integer",
                    "null"
                  ]
                },
                "successThreshold": {
                  "description": "Minimum consecutive successes for the probe to be considered successful after having failed.\nDefaults to 1. Must be 1 for liveness and startup. Minimum value is 1.",
                  "format": "int32",
                  "type": [
                    "integer",
                    "null"
                  ]
                },
                "tcpSocket": {
                  "additionalProperties": false,
                  "description": "TCPSocket specifies a connection to a TCP port.",
                  "properties": {
                    "host": {
                      "description": "Optional: Host name to connect to, defaults to the pod IP.",
                      "type": [
                        "string",
                        "null"
                      ]
                    },
                    "port": {
                      "description": "Number or name of the port to access on the container.\nNumber must be in the range 1 to 65535.\nName must be an IANA_SVC_NAME.",
                      "oneOf": [
                        {
                          "type": "string"
                        },
                        {
                          "type": "integer"
                        }
                      ],
                      "x-kubernetes-int-or-string": true
                    }
                  },
                  "required": [
                    "port"
                  ],
                  "type": [
                    "object",
                    "null"
                  ]
                },
                "terminationGracePeriodSeconds": {
                  "description": "Optional duration in seconds the pod needs to terminate gracefully upon probe failure.\nThe grace period is the duration in seconds after the processes running in the pod are sent\na termination signal and the time when the processes are forcibly halted with a kill signal.\nSet this value longer than the expected cleanup time for your process.\nIf this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this\nvalue overrides the value provided by the pod spec.\nValue must be non-negative integer. The value zero indicates stop immediately via\nthe kill signal (no opportunity to shut down).\nThis is a beta field and requires enabling ProbeTerminationGracePeriod feature gate.\nMinimum value is 1. spec.terminationGracePeriodSeconds is used if unset.",
                  "format": "int64",
                  "type": [
                    "integer",
                    "null"
                  ]
                },
                "timeoutSeconds": {
                  "description": "Number of seconds after which the probe times out.\nDefaults to 1 second. Minimum value is 1.\nMore info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes",
                  "format": "int32",
                  "type": [
                    "integer",
                    "null"
                  ]
                }
              },
              "type": [
                "object",
                "null"
              ]
            }
          },
          "type": [
            "object",
            "null"
          ]
        },
        "reasoningBudget": {
          "description": "ReasoningBudget caps the number of reasoning tokens the model is allowed to\nemit per response. Zero disables visible thinking output entirely; the model\nstill reasons internally but does not emit thinking tokens. Critical for\nproduction agentic workloads on thinking models (Qwen 3.6, GLM-5) where\nrunaway reasoning can burn compute.\nMaps to llama.cpp --reasoning-budget flag.",
          "format": "int32",
          "minimum": 0,
          "type": [
            "integer",
            "null"
          ]
        },
        "reasoningBudgetMessage": {
          "description": "ReasoningBudgetMessage is injected when the reasoning budget is exhausted,\nforcing the model to conclude. Ignored unless ReasoningBudget is also set.\nMaps to llama.cpp --reasoning-budget-message flag.",
          "type": [
            "string",
            "null"
          ]
        },
        "replicas": {
          "default": 1,
          "description": "Replicas is the desired number of inference pods",
          "format": "int32",
          "maximum": 10,
          "minimum": 0,
          "type": [
            "integer",
            "null"
          ]
        },
        "resources": {
          "additionalProperties": false,
          "description": "Resources defines compute resources for inference pods",
          "properties": {
            "cpu": {
              "description": "CPU requests (e.g., \"2\" or \"2000m\")",
              "type": [
                "string",
                "null"
              ]
            },
            "gpu": {
              "description": "GPU count required per pod\nFor multi-GPU inference, each pod gets this many GPUs\nNote: Multi-GPU sharding config comes from Model CRD",
              "format": "int32",
              "maximum": 8,
              "minimum": 0,
              "type": [
                "integer",
                "null"
              ]
            },
            "gpuMemory": {
              "description": "GPUMemory specifies GPU memory limit per pod (e.g., \"16Gi\")\nUsed for scheduling and validation",
              "type": [
                "string",
                "null"
              ]
            },
            "hostMemory": {
              "description": "HostMemory specifies the system RAM required for hybrid GPU/CPU offloading (e.g., \"64Gi\").\nUsed when MoE expert weights or KV cache are offloaded to CPU via moeCPUOffload or noKvOffload.\nTranslated to pod resources.requests.memory, taking precedence over Memory when set.\nWithout this, the K8s scheduler has no visibility into the pod's actual RAM consumption,\nwhich can lead to OOM kills after model load.",
              "type": [
                "string",
                "null"
              ]
            },
            "memory": {
              "description": "Memory requests (e.g., \"4Gi\")",
              "type": [
                "string",
                "null"
              ]
            }
          },
          "type": [
            "object",
            "null"
          ]
        },
        "ropeScaling": {
          "additionalProperties": false,
          "description": "RopeScaling configures RoPE-based context extension so a model can be\nserved past its native trained context (e.g. 128K served at 256K via\nYaRN). For the llamacpp runtime this maps to --rope-scaling /\n--rope-scale / --yarn-orig-ctx. Prefer this over raw spec.extraArgs:\nit is validated and discoverable via `kubectl explain`. If --rope-scaling\nis also present in spec.extraArgs, extraArgs wins and this is skipped.",
          "properties": {
            "factor": {
              "description": "Factor is the scale multiplier (--rope-scale), e.g. \"2.0\" to double the\nnative context. A string to avoid CRD float pitfalls; the runtime parses\nit as a float. Optional.",
              "pattern": "^[0-9]+(\\.[0-9]+)?$",
              "type": [
                "string",
                "null"
              ]
            },
            "originalContext": {
              "description": "OriginalContext is the model's native training context length\n(--yarn-orig-ctx), e.g. 131072 for a 128K model. Recommended with yarn.",
              "format": "int32",
              "minimum": 128,
              "type": [
                "integer",
                "null"
              ]
            },
            "type": {
              "description": "Type is the scaling method (--rope-scaling). \"yarn\" is the usual choice\nfor extending context (e.g. 128K to 256K).",
              "enum": [
                "linear",
                "yarn",
                "longrope"
              ],
              "type": "string"
            }
          },
          "required": [
            "type"
          ],
          "type": [
            "object",
            "null"
          ]
        },
        "runtime": {
          "default": "llamacpp",
          "description": "Runtime selects the inference server backend.\n\"llamacpp\" (default): llama.cpp server with auto-generated args and /health probes.\n\"generic\": user-provided container with custom command, args, env, and probes.\n\"personaplex\": NVIDIA PersonaPlex (Moshi) speech-to-speech server.\n\"vllm\": vLLM OpenAI-compatible server with PagedAttention.\n\"tgi\": HuggingFace Text Generation Inference server.",
          "enum": [
            "llamacpp",
            "personaplex",
            "vllm",
            "tgi",
            "generic"
          ],
          "type": [
            "string",
            "null"
          ]
        },
        "runtimeClassName": {
          "description": "RuntimeClassName selects a Kubernetes RuntimeClass for the inference Pod.\nMost commonly set to \"nvidia\" on clusters where the NVIDIA Container\nRuntime is not configured as the cluster default. Without it, GPU pods\nschedule onto the GPU node but never get the device files bind-mounted,\nand the container fails at runtime with \"no CUDA-capable device is\ndetected\". Maps directly to PodSpec.RuntimeClassName.\n\nMost clusters running the NVIDIA GPU Operator with the default toolkit\nenv do not need this set; it is a safety hatch for clusters where the\nruntime configuration is non-default.",
          "type": [
            "string",
            "null"
          ]
        },
        "securityContext": {
          "additionalProperties": false,
          "description": "SecurityContext defines container-level security attributes for the inference container.",
          "properties": {
            "allowPrivilegeEscalation": {
              "description": "AllowPrivilegeEscalation controls whether a process can gain more\nprivileges than its parent process. This bool directly controls if\nthe no_new_privs flag will be set on the container process.\nAllowPrivilegeEscalation is true always when the container is:\n1) run as Privileged\n2) has CAP_SYS_ADMIN\nNote that this field cannot be set when spec.os.name is windows.",
              "type": [
                "boolean",
                "null"
              ]
            },
            "appArmorProfile": {
              "additionalProperties": false,
              "description": "appArmorProfile is the AppArmor options to use by this container. If set, this profile\noverrides the pod's appArmorProfile.\nNote that this field cannot be set when spec.os.name is windows.",
              "properties": {
                "localhostProfile": {
                  "description": "localhostProfile indicates a profile loaded on the node that should be used.\nThe profile must be preconfigured on the node to work.\nMust match the loaded name of the profile.\nMust be set if and only if type is \"Localhost\".",
                  "type": [
                    "string",
                    "null"
                  ]
                },
                "type": {
                  "description": "type indicates which kind of AppArmor profile will be applied.\nValid options are:\n  Localhost - a profile pre-loaded on the node.\n  RuntimeDefault - the container runtime's default profile.\n  Unconfined - no AppArmor enforcement.",
                  "type": "string"
                }
              },
              "required": [
                "type"
              ],
              "type": [
                "object",
                "null"
              ]
            },
            "capabilities": {
              "additionalProperties": false,
              "description": "The capabilities to add/drop when running containers.\nDefaults to the default set of capabilities granted by the container runtime.\nNote that this field cannot be set when spec.os.name is windows.",
              "properties": {
                "add": {
                  "description": "Added capabilities",
                  "items": {
                    "description": "Capability represent POSIX capabilities type",
                    "type": "string"
                  },
                  "type": [
                    "array",
                    "null"
                  ],
                  "x-kubernetes-list-type": "atomic"
                },
                "drop": {
                  "description": "Removed capabilities",
                  "items": {
                    "description": "Capability represent POSIX capabilities type",
                    "type": "string"
                  },
                  "type": [
                    "array",
                    "null"
                  ],
                  "x-kubernetes-list-type": "atomic"
                }
              },
              "type": [
                "object",
                "null"
              ]
            },
            "privileged": {
              "description": "Run container in privileged mode.\nProcesses in privileged containers are essentially equivalent to root on the host.\nDefaults to false.\nNote that this field cannot be set when spec.os.name is windows.",
              "type": [
                "boolean",
                "null"
              ]
            },
            "procMount": {
              "description": "procMount denotes the type of proc mount to use for the containers.\nThe default value is Default which uses the container runtime defaults for\nreadonly paths and masked paths.\nNote that this field cannot be set when spec.os.name is windows.",
              "type": [
                "string",
                "null"
              ]
            },
            "readOnlyRootFilesystem": {
              "description": "Whether this container has a read-only root filesystem.\nDefault is false.\nNote that this field cannot be set when spec.os.name is windows.",
              "type": [
                "boolean",
                "null"
              ]
            },
            "runAsGroup": {
              "description": "The GID to run the entrypoint of the container process.\nUses runtime default if unset.\nMay also be set in PodSecurityContext.  If set in both SecurityContext and\nPodSecurityContext, the value specified in SecurityContext takes precedence.\nNote that this field cannot be set when spec.os.name is windows.",
              "format": "int64",
              "type": [
                "integer",
                "null"
              ]
            },
            "runAsNonRoot": {
              "description": "Indicates that the container must run as a non-root user.\nIf true, the Kubelet will validate the image at runtime to ensure that it\ndoes not run as UID 0 (root) and fail to start the container if it does.\nIf unset or false, no such validation will be performed.\nMay also be set in PodSecurityContext.  If set in both SecurityContext and\nPodSecurityContext, the value specified in SecurityContext takes precedence.",
              "type": [
                "boolean",
                "null"
              ]
            },
            "runAsUser": {
              "description": "The UID to run the entrypoint of the container process.\nDefaults to user specified in image metadata if unspecified.\nMay also be set in PodSecurityContext.  If set in both SecurityContext and\nPodSecurityContext, the value specified in SecurityContext takes precedence.\nNote that this field cannot be set when spec.os.name is windows.",
              "format": "int64",
              "type": [
                "integer",
                "null"
              ]
            },
            "seLinuxOptions": {
              "additionalProperties": false,
              "description": "The SELinux context to be applied to the container.\nIf unspecified, the container runtime will allocate a random SELinux context for each\ncontainer.  May also be set in PodSecurityContext.  If set in both SecurityContext and\nPodSecurityContext, the value specified in SecurityContext takes precedence.\nNote that this field cannot be set when spec.os.name is windows.",
              "properties": {
                "level": {
                  "description": "Level is SELinux level label that applies to the container.",
                  "type": [
                    "string",
                    "null"
                  ]
                },
                "role": {
                  "description": "Role is a SELinux role label that applies to the container.",
                  "type": [
                    "string",
                    "null"
                  ]
                },
                "type": {
                  "description": "Type is a SELinux type label that applies to the container.",
                  "type": [
                    "string",
                    "null"
                  ]
                },
                "user": {
                  "description": "User is a SELinux user label that applies to the container.",
                  "type": [
                    "string",
                    "null"
                  ]
                }
              },
              "type": [
                "object",
                "null"
              ]
            },
            "seccompProfile": {
              "additionalProperties": false,
              "description": "The seccomp options to use by this container. If seccomp options are\nprovided at both the pod \u0026 container level, the container options\noverride the pod options.\nNote that this field cannot be set when spec.os.name is windows.",
              "properties": {
                "localhostProfile": {
                  "description": "localhostProfile indicates a profile defined in a file on the node should be used.\nThe profile must be preconfigured on the node to work.\nMust be a descending path, relative to the kubelet's configured seccomp profile location.\nMust be set if type is \"Localhost\". Must NOT be set for any other type.",
                  "type": [
                    "string",
                    "null"
                  ]
                },
                "type": {
                  "description": "type indicates which kind of seccomp profile will be applied.\nValid options are:\n\nLocalhost - a profile defined in a file on the node should be used.\nRuntimeDefault - the container runtime default profile should be used.\nUnconfined - no profile should be applied.",
                  "type": "string"
                }
              },
              "required": [
                "type"
              ],
              "type": [
                "object",
                "null"
              ]
            },
            "windowsOptions": {
              "additionalProperties": false,
              "description": "The Windows specific settings applied to all containers.\nIf unspecified, the options from the PodSecurityContext will be used.\nIf set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence.\nNote that this field cannot be set when spec.os.name is linux.",
              "properties": {
                "gmsaCredentialSpec": {
                  "description": "GMSACredentialSpec is where the GMSA admission webhook\n(https://github.com/kubernetes-sigs/windows-gmsa) inlines the contents of the\nGMSA credential spec named by the GMSACredentialSpecName field.",
                  "type": [
                    "string",
                    "null"
                  ]
                },
                "gmsaCredentialSpecName": {
                  "description": "GMSACredentialSpecName is the name of the GMSA credential spec to use.",
                  "type": [
                    "string",
                    "null"
                  ]
                },
                "hostProcess": {
                  "description": "HostProcess determines if a container should be run as a 'Host Process' container.\nAll of a Pod's containers must have the same effective HostProcess value\n(it is not allowed to have a mix of HostProcess containers and non-HostProcess containers).\nIn addition, if HostProcess is true then HostNetwork must also be set to true.",
                  "type": [
                    "boolean",
                    "null"
                  ]
                },
                "runAsUserName": {
                  "description": "The UserName in Windows to run the entrypoint of the container process.\nDefaults to the user specified in image metadata if unspecified.\nMay also be set in PodSecurityContext. If set in both SecurityContext and\nPodSecurityContext, the value specified in SecurityContext takes precedence.",
                  "type": [
                    "string",
                    "null"
                  ]
                }
              },
              "type": [
                "object",
                "null"
              ]
            }
          },
          "type": [
            "object",
            "null"
          ]
        },
        "skipModelInit": {
          "description": "SkipModelInit disables the model-downloader init container.\nUse when the model is baked into the image or downloaded by the\ncontainer itself (e.g., via HF_TOKEN).",
          "type": [
            "boolean",
            "null"
          ]
        },
        "tensorOverrides": {
          "description": "TensorOverrides provides fine-grained tensor placement overrides for power users.\nEach entry specifies a tensor name and target device (e.g., \"exps=CPU\", \"token_embd=CUDA0\").\nMaps to llama.cpp --override-tensor flag (one flag per entry).",
          "items": {
            "type": "string"
          },
          "type": [
            "array",
            "null"
          ]
        },
        "tgiConfig": {
          "additionalProperties": false,
          "description": "TGIConfig holds configuration for the TGI runtime.\nOnly used when Runtime is \"tgi\".",
          "properties": {
            "dtype": {
              "description": "Dtype sets the model data type (float16, bfloat16).",
              "enum": [
                "float16",
                "bfloat16"
              ],
              "type": [
                "string",
                "null"
              ]
            },
            "hfTokenSecretRef": {
              "additionalProperties": false,
              "description": "HFTokenSecretRef references a Secret containing the HuggingFace token.",
              "properties": {
                "key": {
                  "description": "The key of the secret to select from.  Must be a valid secret key.",
                  "type": "string"
                },
                "name": {
                  "default": "",
                  "description": "Name of the referent.\nThis field is effectively required, but due to backwards compatibility is\nallowed to be empty. Instances of this type with an empty value here are\nalmost certainly wrong.\nMore info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names",
                  "type": [
                    "string",
                    "null"
                  ]
                },
                "optional": {
                  "description": "Specify whether the Secret or its key must be defined",
                  "type": [
                    "boolean",
                    "null"
                  ]
                }
              },
              "required": [
                "key"
              ],
              "type": [
                "object",
                "null"
              ],
              "x-kubernetes-map-type": "atomic"
            },
            "maxInputLength": {
              "description": "MaxInputLength sets the maximum input token length.",
              "format": "int32",
              "type": [
                "integer",
                "null"
              ]
            },
            "maxTotalTokens": {
              "description": "MaxTotalTokens sets the maximum total tokens (input + output).",
              "format": "int32",
              "type": [
                "integer",
                "null"
              ]
            },
            "quantize": {
              "description": "Quantize sets the quantization method (bitsandbytes, gptq, awq, eetq).",
              "enum": [
                "bitsandbytes",
                "gptq",
                "awq",
                "eetq"
              ],
              "type": [
                "string",
                "null"
              ]
            }
          },
          "type": [
            "object",
            "null"
          ]
        },
        "tolerations": {
          "description": "Tolerations for pod scheduling (e.g., GPU taints, spot instances)",
          "items": {
            "additionalProperties": false,
            "description": "The pod this Toleration is attached to tolerates any taint that matches\nthe triple \u003ckey,value,effect\u003e using the matching operator \u003coperator\u003e.",
            "properties": {
              "effect": {
                "description": "Effect indicates the taint effect to match. Empty means match all taint effects.\nWhen specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.",
                "type": [
                  "string",
                  "null"
                ]
              },
              "key": {
                "description": "Key is the taint key that the toleration applies to. Empty means match all taint keys.\nIf the key is empty, operator must be Exists; this combination means to match all values and all keys.",
                "type": [
                  "string",
                  "null"
                ]
              },
              "operator": {
                "description": "Operator represents a key's relationship to the value.\nValid operators are Exists, Equal, Lt, and Gt. Defaults to Equal.\nExists is equivalent to wildcard for value, so that a pod can\ntolerate all taints of a particular category.\nLt and Gt perform numeric comparisons (requires feature gate TaintTolerationComparisonOperators).",
                "type": [
                  "string",
                  "null"
                ]
              },
              "tolerationSeconds": {
                "description": "TolerationSeconds represents the period of time the toleration (which must be\nof effect NoExecute, otherwise this field is ignored) tolerates the taint. By default,\nit is not set, which means tolerate the taint forever (do not evict). Zero and\nnegative values will be treated as 0 (evict immediately) by the system.",
                "format": "int64",
                "type": [
                  "integer",
                  "null"
                ]
              },
              "value": {
                "description": "Value is the taint value the toleration matches to.\nIf the operator is Exists, the value should be empty, otherwise just a regular string.",
                "type": [
                  "string",
                  "null"
                ]
              }
            },
            "type": "object"
          },
          "type": [
            "array",
            "null"
          ]
        },
        "uBatchSize": {
          "description": "UBatchSize sets the micro-batch size for decoding.\nSmaller micro-batches reduce memory usage during generation.\nMaps to llama.cpp --ubatch-size flag.",
          "format": "int32",
          "minimum": 1,
          "type": [
            "integer",
            "null"
          ]
        },
        "vllmConfig": {
          "additionalProperties": false,
          "description": "VLLMConfig holds configuration for the vLLM runtime.\nOnly used when Runtime is \"vllm\".",
          "properties": {
            "attentionBackend": {
              "description": "AttentionBackend selects the attention implementation used by vLLM.\nFLASHINFER is typically fastest on recent NVIDIA GPUs (especially Blackwell);\nFLASH_ATTN is a solid default; XFORMERS and torch_sdpa are portability\nfallbacks. Requires a vLLM version that supports the chosen backend.\nBoth uppercase (vLLM's native form) and lowercase spellings are accepted\nfor backwards compatibility with earlier LLMKube releases.\nMaps to vLLM --attention-backend flag.",
              "enum": [
                "FLASH_ATTN",
                "FLASHINFER",
                "XFORMERS",
                "flashinfer",
                "flash_attn",
                "xformers",
                "torch_sdpa"
              ],
              "type": [
                "string",
                "null"
              ]
            },
            "cpuOffloadGB": {
              "description": "CPUOffloadGB increases the GPU memory size. When set, passes\n--cpu-offload-gb to vLLM. Per-rank, so 4 on TP=2 means 4 GB of\nCPU RAM per GPU. Use when FP8 model weights don't fit VRAM.\nThroughput hit is 2-5x on the offloaded path.",
              "format": "int32",
              "minimum": 0,
              "type": [
                "integer",
                "null"
              ]
            },
            "dtype": {
              "description": "Dtype sets the model data type (auto, float16, bfloat16).",
              "enum": [
                "auto",
                "float16",
                "bfloat16"
              ],
              "type": [
                "string",
                "null"
              ]
            },
            "enableChunkedPrefill": {
              "description": "EnableChunkedPrefill interleaves long prefills with decode steps so a\nlarge paste (e.g. a 32K-token file) does not starve concurrent decode\nstreams. Only emitted when explicitly set to true.\nMaps to vLLM --enable-chunked-prefill flag.",
              "type": [
                "boolean",
                "null"
              ]
            },
            "enableExpertParallel": {
              "description": "EnableExpertParallel distributes MoE experts across tensor-parallel ranks\ninstead of replicating them. Only meaningful for MoE models.\nMaps to vLLM --enable-expert-parallel flag.",
              "type": [
                "boolean",
                "null"
              ]
            },
            "enablePrefixCaching": {
              "description": "EnablePrefixCaching turns on vLLM's automatic prefix caching for repeated prompts.\nSignificantly reduces time-to-first-token for conversational and agentic workloads\nwhere requests share a common system prompt.\nOnly emitted when explicitly set to true — when nil or false, vLLM's own\ndefault is used (do not emit the flag).\nMaps to vLLM --enable-prefix-caching flag.",
              "type": [
                "boolean",
                "null"
              ]
            },
            "gpuMemoryUtilization": {
              "description": "GPUMemoryUtilization controls how much GPU memory each stage can use.\nWhen set, passes --gpu-memory-utilization to vLLM. Range from 0.1 - 0.99\nand default unset (vLLM uses 0.90).",
              "maximum": 0.99,
              "minimum": 0.1,
              "type": [
                "number",
                "null"
              ]
            },
            "hfTokenSecretRef": {
              "additionalProperties": false,
              "description": "HFTokenSecretRef references a Secret containing the HuggingFace token.",
              "properties": {
                "key": {
                  "description": "The key of the secret to select from.  Must be a valid secret key.",
                  "type": "string"
                },
                "name": {
                  "default": "",
                  "description": "Name of the referent.\nThis field is effectively required, but due to backwards compatibility is\nallowed to be empty. Instances of this type with an empty value here are\nalmost certainly wrong.\nMore info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names",
                  "type": [
                    "string",
                    "null"
                  ]
                },
                "optional": {
                  "description": "Specify whether the Secret or its key must be defined",
                  "type": [
                    "boolean",
                    "null"
                  ]
                }
              },
              "required": [
                "key"
              ],
              "type": [
                "object",
                "null"
              ],
              "x-kubernetes-map-type": "atomic"
            },
            "kvCacheCustomDtype": {
              "description": "KVCacheCustomDtype sets a custom vLLM KV cache element type that is not\nin the standard enum. Used for vLLM versions with additional cache\nformats such as TurboQuant 2-bit (turbo2, shipped in v0.20.0). Maps to\nvLLM --kv-cache-dtype. The runtime image must understand the value or\nvLLM will fail to start; LLMKube does not validate the string. Mirrors\nthe llama.cpp-side CacheTypeCustomK/V escape hatch.\nTakes precedence over KVCacheDtype when both are set.",
              "type": [
                "string",
                "null"
              ]
            },
            "kvCacheDtype": {
              "default": "auto",
              "description": "KVCacheDtype selects the KV cache element type. fp8_e5m2 and fp8_e4m3 cut\nKV cache memory roughly in half versus auto (which follows dtype), which\nis what unlocks 128K+ context on consumer VRAM for agentic workloads.\nMaps to vLLM --kv-cache-dtype flag.\nFor custom build types not in the enum (e.g. TurboQuant turbo2 from\nvLLM v0.20+), use KVCacheCustomDtype instead.",
              "enum": [
                "auto",
                "fp8_e5m2",
                "fp8_e4m3"
              ],
              "type": [
                "string",
                "null"
              ]
            },
            "maxModelLen": {
              "description": "MaxModelLen sets the maximum model context length.",
              "format": "int32",
              "type": [
                "integer",
                "null"
              ]
            },
            "maxNumBatchedTokens": {
              "description": "MaxNumBatchedTokens sets the maximum number of tokens batched together\nper step. This is the main throughput knob: too low means prefill-bound,\ntoo high risks OOM on long context. No default — only emitted when set.\nMaps to vLLM --max-num-batched-tokens flag.",
              "format": "int32",
              "minimum": 512,
              "type": [
                "integer",
                "null"
              ]
            },
            "quantization": {
              "description": "Quantization method.\nawq, gptq, squeezellm are classic 4-bit formats. fp8 targets 8-bit FP\ncheckpoints (Qwen FP8, Llama FP8, etc.). nvfp4 is NVIDIA's Blackwell-native\n4-bit format. compressed-tensors is the neuralmagic/vLLM cross-format\nloader used by Unsloth and other recent releases.",
              "enum": [
                "awq",
                "gptq",
                "squeezellm",
                "fp8",
                "nvfp4",
                "compressed-tensors"
              ],
              "type": [
                "string",
                "null"
              ]
            },
            "speculative": {
              "additionalProperties": false,
              "description": "Speculative enables draft-model speculative decoding. On single-stream\nagentic workloads this can be 30-60% faster than plain tensor-parallel\nexecution. Requires a second (smaller) Model CR to act as the draft.",
              "properties": {
                "enabled": {
                  "description": "Enabled toggles speculative decoding on. When false or nil, no\nspeculative flags are emitted regardless of other fields.",
                  "type": [
                    "boolean",
                    "null"
                  ]
                },
                "model": {
                  "description": "Model references the Model CR (in the same namespace as the\nInferenceService) to use as the speculative draft model.\nRequired when Enabled is true. If missing, speculative decoding is\nskipped and the InferenceService surfaces a SpeculativeInvalid\nstatus condition rather than failing the reconcile.\nMaps to vLLM --speculative-model flag.",
                  "type": [
                    "string",
                    "null"
                  ]
                },
                "numSpeculativeTokens": {
                  "default": 4,
                  "description": "NumSpeculativeTokens is the number of draft tokens proposed per step.\nTypical sweet spot is 3-5; higher values increase wasted work when the\ndraft disagrees with the target model.\nMaps to vLLM --num-speculative-tokens flag.",
                  "format": "int32",
                  "maximum": 16,
                  "minimum": 1,
                  "type": [
                    "integer",
                    "null"
                  ]
                }
              },
              "type": [
                "object",
                "null"
              ]
            },
            "tensorParallelSize": {
              "description": "TensorParallelSize sets the number of GPUs for tensor parallelism.",
              "format": "int32",
              "type": [
                "integer",
                "null"
              ]
            }
          },
          "type": [
            "object",
            "null"
          ]
        }
      },
      "required": [
        "modelRef"
      ],
      "type": "object"
    },
    "status": {
      "additionalProperties": false,
      "description": "status defines the observed state of InferenceService",
      "properties": {
        "conditions": {
          "description": "conditions represent the current state of the InferenceService resource.\nEach condition has a unique type and reflects the status of a specific aspect of the resource.\n\nStandard condition types include:\n- \"Available\": the resource is fully functional\n- \"Progressing\": the resource is being created or updated\n- \"Degraded\": the resource failed to reach or maintain its desired state\n\nThe status of each condition is one of True, False, or Unknown.",
          "items": {
            "additionalProperties": false,
            "description": "Condition contains details for one aspect of the current state of this API Resource.",
            "properties": {
              "lastTransitionTime": {
                "description": "lastTransitionTime is the last time the condition transitioned from one status to another.\nThis should be when the underlying condition changed.  If that is not known, then using the time when the API field changed is acceptable.",
                "format": "date-time",
                "type": "string"
              },
              "message": {
                "description": "message is a human readable message indicating details about the transition.\nThis may be an empty string.",
                "maxLength": 32768,
                "type": "string"
              },
              "observedGeneration": {
                "description": "observedGeneration represents the .metadata.generation that the condition was set based upon.\nFor instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date\nwith respect to the current state of the instance.",
                "format": "int64",
                "minimum": 0,
                "type": [
                  "integer",
                  "null"
                ]
              },
              "reason": {
                "description": "reason contains a programmatic identifier indicating the reason for the condition's last transition.\nProducers of specific condition types may define expected values and meanings for this field,\nand whether the values are considered a guaranteed API.\nThe value should be a CamelCase string.\nThis field may not be empty.",
                "maxLength": 1024,
                "minLength": 1,
                "pattern": "^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$",
                "type": "string"
              },
              "status": {
                "description": "status of the condition, one of True, False, Unknown.",
                "enum": [
                  "True",
                  "False",
                  "Unknown"
                ],
                "type": "string"
              },
              "type": {
                "description": "type of condition in CamelCase or in foo.example.com/CamelCase.",
                "maxLength": 316,
                "pattern": "^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$",
                "type": "string"
              }
            },
            "required": [
              "lastTransitionTime",
              "message",
              "reason",
              "status",
              "type"
            ],
            "type": "object"
          },
          "type": [
            "array",
            "null"
          ],
          "x-kubernetes-list-map-keys": [
            "type"
          ],
          "x-kubernetes-list-type": "map"
        },
        "desiredReplicas": {
          "description": "DesiredReplicas is the desired number of replicas",
          "format": "int32",
          "type": [
            "integer",
            "null"
          ]
        },
        "effectivePriority": {
          "description": "EffectivePriority shows the resolved priority value from the applied PriorityClass",
          "format": "int32",
          "type": [
            "integer",
            "null"
          ]
        },
        "endpoint": {
          "description": "Endpoint is the service URL where inference requests can be sent",
          "type": [
            "string",
            "null"
          ]
        },
        "lastUpdated": {
          "description": "LastUpdated is the timestamp of the last status update",
          "format": "date-time",
          "type": [
            "string",
            "null"
          ]
        },
        "modelReady": {
          "description": "ModelReady indicates if the referenced Model is in Ready state",
          "type": [
            "boolean",
            "null"
          ]
        },
        "phase": {
          "description": "Phase represents the current lifecycle phase of the InferenceService.\nPossible values: Pending, Creating, Progressing, Ready, WaitingForGPU,\nStopped, Failed. Stopped is the terminal state when spec.replicas=0\nhas caused the agent to tear down the workload; tooling polling for\nreadiness should treat Stopped the same as Pending (the user\nintentionally took the service offline; this is not an error).",
          "enum": [
            "Pending",
            "Creating",
            "Progressing",
            "Ready",
            "WaitingForGPU",
            "Stopped",
            "Failed"
          ],
          "type": [
            "string",
            "null"
          ]
        },
        "queuePosition": {
          "description": "QueuePosition indicates position among pending InferenceServices cluster-wide (0 = not queued)",
          "format": "int32",
          "type": [
            "integer",
            "null"
          ]
        },
        "readyReplicas": {
          "description": "Replicas tracks the number of ready vs desired pods",
          "format": "int32",
          "type": [
            "integer",
            "null"
          ]
        },
        "replicas": {
          "description": "Replicas is the current number of running inference pods",
          "format": "int32",
          "type": [
            "integer",
            "null"
          ]
        },
        "schedulingMessage": {
          "description": "SchedulingMessage provides details about scheduling issues",
          "type": [
            "string",
            "null"
          ]
        },
        "schedulingStatus": {
          "description": "SchedulingStatus indicates why pods cannot be scheduled (e.g., \"InsufficientGPU\")",
          "type": [
            "string",
            "null"
          ]
        },
        "waitingFor": {
          "description": "WaitingFor describes the resource constraint (e.g., \"nvidia.com/gpu: 1\")",
          "type": [
            "string",
            "null"
          ]
        }
      },
      "type": [
        "object",
        "null"
      ]
    }
  },
  "required": [
    "spec"
  ],
  "type": "object"
}