trait HasLlamaCppProperties extends AnyRef
Contains settable parameters for the AutoGGUFModel.
- Self Type
- HasLlamaCppProperties with ParamsAndFeaturesWritable with HasProtectedParams
- Grouped
- Alphabetic
- By Inheritance
- HasLlamaCppProperties
- AnyRef
- Any
- Hide All
- Show All
- Public
- All
Value Members
-
final
def
!=(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
##(): Int
- Definition Classes
- AnyRef → Any
-
final
def
==(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
asInstanceOf[T0]: T0
- Definition Classes
- Any
- val cachePrompt: BooleanParam
- val chatTemplate: Param[String]
-
def
clone(): AnyRef
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()
- val defragmentationThreshold: FloatParam
- val disableTokenIds: IntArrayParam
- val dynamicTemperatureExponent: FloatParam
- val dynamicTemperatureRange: FloatParam
- val embedding: BooleanParam
-
final
def
eq(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
def
equals(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
def
finalize(): Unit
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( classOf[java.lang.Throwable] )
- val flashAttention: BooleanParam
- val frequencyPenalty: FloatParam
- def getCachePrompt: Boolean
- def getChatTemplate: String
-
final
def
getClass(): Class[_]
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
- def getDefragmentationThreshold: Float
- def getDisableTokenIds: Array[Int]
- def getDynamicTemperatureExponent: Float
- def getDynamicTemperatureRange: Float
- def getEmbedding: Boolean
- def getFlashAttention: Boolean
- def getFrequencyPenalty: Float
- def getGrammar: String
- def getGrpAttnN: Int
- def getGrpAttnW: Int
- def getIgnoreEos: Boolean
-
def
getInferenceParameters: InferenceParameters
- Attributes
- protected
- def getInputPrefix: String
- def getInputPrefixBos: Boolean
- def getInputSuffix: String
- def getLookupCacheDynamicFilePath: String
- def getLookupCacheStaticFilePath: String
- def getLoraAdapters: Map[String, Float]
- def getMainGpu: Int
-
def
getMetadata: String
Get the metadata for the model
- def getMinKeep: Int
- def getMinP: Float
- def getMiroStat: String
- def getMiroStatEta: Float
- def getMiroStatTau: Float
- def getModelDraft: String
-
def
getModelParameters: ModelParameters
- Attributes
- protected
- def getNBatch: Int
- def getNChunks: Int
- def getNCtx: Int
- def getNDraft: Int
- def getNGpuLayers: Int
- def getNGpuLayersDraft: Int
- def getNKeep: Int
- def getNPredict: Int
- def getNProbs: Int
- def getNSequences: Int
- def getNThreads: Int
- def getNThreadsBatch: Int
- def getNThreadsBatchDraft: Int
- def getNThreadsDraft: Int
- def getNUbatch: Int
- def getNoKvOffload: Boolean
- def getNuma: String
- def getPSplit: Float
- def getPenalizeNl: Boolean
- def getPenaltyPrompt: String
- def getPoolingType: String
- def getPresencePenalty: Float
- def getRepeatLastN: Int
- def getRepeatPenalty: Float
- def getRopeFreqBase: Float
- def getRopeFreqScale: Float
- def getRopeScalingType: String
- def getSamplers: Array[String]
- def getSeed: Int
- def getSplitMode: String
- def getStopStrings: Array[String]
- def getSystemPrompt: String
- def getTemperature: Float
- def getTensorSplit: Array[Double]
- def getTfsZ: Float
- def getTokenBias: Map[String, Float]
- def getTokenIdBias: Map[Int, Float]
- def getTopK: Int
- def getTopP: Float
- def getTypicalP: Float
- def getUseChatTemplate: Boolean
- def getUseMlock: Boolean
- def getUseMmap: Boolean
- def getYarnAttnFactor: Float
- def getYarnBetaFast: Float
- def getYarnBetaSlow: Float
- def getYarnExtFactor: Float
- def getYarnOrigCtx: Int
-
val
gpuSplitMode: Param[String]
Set how to split the model across GPUs
Set how to split the model across GPUs
- NONE: No GPU split
- LAYER: Split the model across GPUs by layer
- ROW: Split the model across GPUs by rows
- val grammar: Param[String]
- val grpAttnN: IntParam
- val grpAttnW: IntParam
-
def
hashCode(): Int
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
- val ignoreEos: BooleanParam
- val inputPrefix: Param[String]
- val inputPrefixBos: BooleanParam
- val inputSuffix: Param[String]
-
final
def
isInstanceOf[T0]: Boolean
- Definition Classes
- Any
- val logger: Logger
- val lookupCacheDynamicFilePath: Param[String]
- val lookupCacheStaticFilePath: Param[String]
- val loraAdapters: StructFeature[Map[String, Float]]
- val mainGpu: IntParam
- val metadata: (HasLlamaCppProperties.this)#ProtectedParam[String]
- val minKeep: IntParam
- val minP: FloatParam
- val miroStat: Param[String]
- val miroStatEta: FloatParam
- val miroStatTau: FloatParam
- val modelDraft: Param[String]
- val nBatch: IntParam
- val nChunks: IntParam
- val nCtx: IntParam
- val nDraft: IntParam
- val nGpuLayers: IntParam
- val nGpuLayersDraft: IntParam
- val nKeep: IntParam
- val nPredict: IntParam
- val nProbs: IntParam
- val nSequences: IntParam
- val nThreads: IntParam
- val nThreadsBatch: IntParam
- val nThreadsBatchDraft: IntParam
- val nThreadsDraft: IntParam
- val nUbatch: IntParam
-
final
def
ne(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
- val noKvOffload: BooleanParam
-
final
def
notify(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
-
final
def
notifyAll(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
-
val
numaStrategy: Param[String]
Set optimization strategies that help on some NUMA systems (if available)
Set optimization strategies that help on some NUMA systems (if available)
Available Strategies:
- DISABLED: No NUMA optimizations
- DISTRIBUTE: Spread execution evenly over all
- ISOLATE: Only spawn threads on CPUs on the node that execution started on
- NUMA_CTL: Use the CPU map provided by numactl
- MIRROR: Mirrors the model across NUMA nodes
- val pSplit: FloatParam
- val penalizeNl: BooleanParam
- val penaltyPrompt: Param[String]
-
val
poolingType: Param[String]
Set the pooling type for embeddings, use model default if unspecified
Set the pooling type for embeddings, use model default if unspecified
- 0 UNSPECIFIED: Don't use any pooling
- 1 MEAN: Mean Pooling
- 2 CLS: CLS Pooling
- val presencePenalty: FloatParam
- val repeatLastN: IntParam
- val repeatPenalty: FloatParam
- val ropeFreqBase: FloatParam
- val ropeFreqScale: FloatParam
-
val
ropeScalingType: Param[String]
Set the RoPE frequency scaling method, defaults to linear unless specified by the model.
Set the RoPE frequency scaling method, defaults to linear unless specified by the model.
- UNSPECIFIED: Don't use any scaling
- LINEAR: Linear scaling
- YARN: YaRN RoPE scaling
- val samplers: StringArrayParam
- val seed: IntParam
-
def
setCachePrompt(cachePrompt: Boolean): HasLlamaCppProperties.this
Whether to remember the prompt to avoid reprocessing it
-
def
setChatTemplate(chatTemplate: String): HasLlamaCppProperties.this
The chat template to use
-
def
setDefragmentationThreshold(defragThold: Float): HasLlamaCppProperties.this
Set the KV cache defragmentation threshold
-
def
setDisableTokenIds(disableTokenIds: Array[Int]): HasLlamaCppProperties.this
Set the token ids to disable in the completion.
Set the token ids to disable in the completion. This corresponds to
setTokenBias
with a value of Float.NEGATIVE_INFINITY.
-
def
setDynamicTemperatureExponent(dynatempExponent: Float): HasLlamaCppProperties.this
Set the dynamic temperature exponent
-
def
setDynamicTemperatureRange(dynatempRange: Float): HasLlamaCppProperties.this
Set the dynamic temperature range
-
def
setEmbedding(embedding: Boolean): HasLlamaCppProperties.this
Whether to load model with embedding support
-
def
setFlashAttention(flashAttention: Boolean): HasLlamaCppProperties.this
Whether to enable Flash Attention
-
def
setFrequencyPenalty(frequencyPenalty: Float): HasLlamaCppProperties.this
Set the repetition alpha frequency penalty
-
def
setGpuSplitMode(splitMode: String): HasLlamaCppProperties.this
Set how to split the model across GPUs
Set how to split the model across GPUs
- NONE: No GPU split
- LAYER: Split the model across GPUs by layer
- ROW: Split the model across GPUs by rows
-
def
setGrammar(grammar: String): HasLlamaCppProperties.this
Set BNF-like grammar to constrain generations
-
def
setGrpAttnN(grpAttnN: Int): HasLlamaCppProperties.this
Set the group-attention factor
-
def
setGrpAttnW(grpAttnW: Int): HasLlamaCppProperties.this
Set the group-attention width
-
def
setIgnoreEos(ignoreEos: Boolean): HasLlamaCppProperties.this
Set whether to ignore end of stream token and continue generating (implies --logit-bias 2-inf)
-
def
setInputPrefix(inputPrefix: String): HasLlamaCppProperties.this
Set the prompt to start generation with
-
def
setInputPrefixBos(inputPrefixBos: Boolean): HasLlamaCppProperties.this
Whether to add prefix BOS to user inputs, preceding the
--in-prefix
string
-
def
setInputSuffix(inputSuffix: String): HasLlamaCppProperties.this
Set a suffix for infilling
-
def
setLookupCacheDynamicFilePath(lookupCacheDynamicFilePath: String): HasLlamaCppProperties.this
Set the path to a dynamic lookup cache file to use for lookup decoding (updated by generation)
-
def
setLookupCacheStaticFilePath(lookupCacheStaticFilePath: String): HasLlamaCppProperties.this
Set the path to a static lookup cache file to use for lookup decoding (not updated by generation)
-
def
setLoraAdapters(loraAdapters: HashMap[String, Double]): HasLlamaCppProperties.this
Sets paths to lora adapters with user defined scale.
Sets paths to lora adapters with user defined scale. (PySpark Override)
-
def
setLoraAdapters(loraAdapters: Map[String, Float]): HasLlamaCppProperties.this
Sets paths to lora adapters with user defined scale.
-
def
setMainGpu(mainGpu: Int): HasLlamaCppProperties.this
Set the GPU that is used for scratch and small tensors
-
def
setMetadata(metadata: String): HasLlamaCppProperties.this
Set the metadata for the model
-
def
setMinKeep(minKeep: Int): HasLlamaCppProperties.this
Set the amount of tokens the samplers should return at least (0 = disabled)
-
def
setMinP(minP: Float): HasLlamaCppProperties.this
Set min-p sampling
-
def
setMiroStat(mirostat: String): HasLlamaCppProperties.this
Set MiroStat sampling strategies.
Set MiroStat sampling strategies.
- DISABLED: No MiroStat
- V1: MiroStat V1
- V2: MiroStat V2
-
def
setMiroStatEta(mirostatEta: Float): HasLlamaCppProperties.this
Set the MiroStat learning rate, parameter eta
-
def
setMiroStatTau(mirostatTau: Float): HasLlamaCppProperties.this
Set the MiroStat target entropy, parameter tau
-
def
setModelDraft(modelDraft: String): HasLlamaCppProperties.this
Set the draft model for speculative decoding
-
def
setNBatch(nBatch: Int): HasLlamaCppProperties.this
Set the logical batch size for prompt processing (must be >=32 to use BLAS)
-
def
setNChunks(nChunks: Int): HasLlamaCppProperties.this
Set the maximal number of chunks to process
-
def
setNCtx(nCtx: Int): HasLlamaCppProperties.this
Set the size of the prompt context
-
def
setNDraft(nDraft: Int): HasLlamaCppProperties.this
Set the number of tokens to draft for speculative decoding
-
def
setNGpuLayers(nGpuLayers: Int): HasLlamaCppProperties.this
Set the number of layers to store in VRAM (-1 - use default)
-
def
setNGpuLayersDraft(nGpuLayersDraft: Int): HasLlamaCppProperties.this
Set the number of layers to store in VRAM for the draft model (-1 - use default)
-
def
setNKeep(nKeep: Int): HasLlamaCppProperties.this
Set the number of tokens to keep from the initial prompt
-
def
setNPredict(nPredict: Int): HasLlamaCppProperties.this
Set the number of tokens to predict
-
def
setNProbs(nProbs: Int): HasLlamaCppProperties.this
Set the amount of top token probabilities to output if greater than 0.
-
def
setNSequences(nSequences: Int): HasLlamaCppProperties.this
Set the number of sequences to decode
-
def
setNThreads(nThreads: Int): HasLlamaCppProperties.this
Set the number of threads to use during generation
-
def
setNThreadsBatch(nThreadsBatch: Int): HasLlamaCppProperties.this
Set the number of threads to use during batch and prompt processing
-
def
setNThreadsBatchDraft(nThreadsBatchDraft: Int): HasLlamaCppProperties.this
Set the number of threads to use during batch and prompt processing
-
def
setNThreadsDraft(nThreadsDraft: Int): HasLlamaCppProperties.this
Set the number of threads to use during draft generation
-
def
setNUbatch(nUbatch: Int): HasLlamaCppProperties.this
Set the physical batch size for prompt processing (must be >=32 to use BLAS)
-
def
setNoKvOffload(noKvOffload: Boolean): HasLlamaCppProperties.this
Whether to disable KV offload
-
def
setNumaStrategy(numa: String): HasLlamaCppProperties.this
Set optimization strategies that help on some NUMA systems (if available)
Set optimization strategies that help on some NUMA systems (if available)
Available Strategies:
- DISABLED: No NUMA optimizations
- DISTRIBUTE: spread execution evenly over all
- ISOLATE: only spawn threads on CPUs on the node that execution started on
- NUMA_CTL: use the CPU map provided by numactl
- MIRROR: Mirrors the model across NUMA nodes
-
def
setPSplit(pSplit: Float): HasLlamaCppProperties.this
Set the speculative decoding split probability
-
def
setPenalizeNl(penalizeNl: Boolean): HasLlamaCppProperties.this
Set whether to penalize newline tokens
-
def
setPenaltyPrompt(penaltyPrompt: String): HasLlamaCppProperties.this
Override which part of the prompt is penalized for repetition.
-
def
setPoolingType(poolingType: String): HasLlamaCppProperties.this
Set the pooling type for embeddings, use model default if unspecified
Set the pooling type for embeddings, use model default if unspecified
- UNSPECIFIED: Don't use any pooling
- MEAN: Mean Pooling
- CLS: CLS Pooling
-
def
setPresencePenalty(presencePenalty: Float): HasLlamaCppProperties.this
Set the repetition alpha presence penalty
-
def
setRepeatLastN(repeatLastN: Int): HasLlamaCppProperties.this
Set the last n tokens to consider for penalties
-
def
setRepeatPenalty(repeatPenalty: Float): HasLlamaCppProperties.this
Set the penalty of repeated sequences of tokens
-
def
setRopeFreqBase(ropeFreqBase: Float): HasLlamaCppProperties.this
Set the RoPE base frequency, used by NTK-aware scaling
-
def
setRopeFreqScale(ropeFreqScale: Float): HasLlamaCppProperties.this
Set the RoPE frequency scaling factor, expands context by a factor of 1/N
-
def
setRopeScalingType(ropeScalingType: String): HasLlamaCppProperties.this
Set the RoPE frequency scaling method, defaults to linear unless specified by the model.
Set the RoPE frequency scaling method, defaults to linear unless specified by the model.
- UNSPECIFIED: Don't use any scaling
- LINEAR: Linear scaling
- YARN: YaRN RoPE scaling
-
def
setSamplers(samplers: Array[String]): HasLlamaCppProperties.this
Set which samplers to use for token generation in the given order.
Set which samplers to use for token generation in the given order.
Available Samplers are:
- TOP_K: Top-k sampling
- TFS_Z: Tail free sampling
- TYPICAL_P: Locally typical sampling p
- TOP_P: Top-p sampling
- MIN_P: Min-p sampling
- TEMPERATURE: Temperature sampling
-
def
setSeed(seed: Int): HasLlamaCppProperties.this
Set the RNG seed
-
def
setStopStrings(stopStrings: Array[String]): HasLlamaCppProperties.this
Set strings upon seeing which token generation is stopped
-
def
setSystemPrompt(systemPrompt: String): HasLlamaCppProperties.this
Set a system prompt to use
-
def
setTemperature(temperature: Float): HasLlamaCppProperties.this
Set the temperature
-
def
setTensorSplit(tensorSplit: Array[Double]): HasLlamaCppProperties.this
Set how split tensors should be distributed across GPUs
-
def
setTfsZ(tfsZ: Float): HasLlamaCppProperties.this
Set tail free sampling, parameter z
-
def
setTokenBias(tokenBias: HashMap[String, Double]): HasLlamaCppProperties.this
Set the tokens to disable during completion.
Set the tokens to disable during completion. (Override for PySpark)
-
def
setTokenBias(tokenBias: Map[String, Float]): HasLlamaCppProperties.this
Set the tokens to disable during completion.
-
def
setTokenIdBias(tokenIdBias: HashMap[Integer, Double]): HasLlamaCppProperties.this
Set the token ids to disable in the completion.
Set the token ids to disable in the completion. (Override for PySpark)
-
def
setTokenIdBias(tokenIdBias: Map[Int, Float]): HasLlamaCppProperties.this
Set the token ids to disable in the completion.
-
def
setTopK(topK: Int): HasLlamaCppProperties.this
Set top-k sampling
-
def
setTopP(topP: Float): HasLlamaCppProperties.this
Set top-p sampling
-
def
setTypicalP(typicalP: Float): HasLlamaCppProperties.this
Set locally typical sampling, parameter p
-
def
setUseChatTemplate(useChatTemplate: Boolean): HasLlamaCppProperties.this
Set whether or not generate should apply a chat template
-
def
setUseMlock(useMlock: Boolean): HasLlamaCppProperties.this
Whether to force the system to keep model in RAM rather than swapping or compressing
-
def
setUseMmap(useMmap: Boolean): HasLlamaCppProperties.this
Whether to use memory-map model (faster load but may increase pageouts if not using mlock)
-
def
setYarnAttnFactor(yarnAttnFactor: Float): HasLlamaCppProperties.this
Set the YaRN scale sqrt(t) or attention magnitude
-
def
setYarnBetaFast(yarnBetaFast: Float): HasLlamaCppProperties.this
Set the YaRN low correction dim or beta
-
def
setYarnBetaSlow(yarnBetaSlow: Float): HasLlamaCppProperties.this
Set the YaRN high correction dim or alpha
-
def
setYarnExtFactor(yarnExtFactor: Float): HasLlamaCppProperties.this
Set the YaRN extrapolation mix factor
-
def
setYarnOrigCtx(yarnOrigCtx: Int): HasLlamaCppProperties.this
Set the YaRN original context size of model
- val stopStrings: StringArrayParam
-
final
def
synchronized[T0](arg0: ⇒ T0): T0
- Definition Classes
- AnyRef
- val systemPrompt: Param[String]
- val temperature: FloatParam
- val tensorSplit: DoubleArrayParam
- val tfsZ: FloatParam
-
def
toString(): String
- Definition Classes
- AnyRef → Any
- val tokenBias: StructFeature[Map[String, Float]]
- val tokenIdBias: StructFeature[Map[Int, Float]]
- val topK: IntParam
- val topP: FloatParam
- val typicalP: FloatParam
- val useChatTemplate: BooleanParam
- val useMlock: BooleanParam
- val useMmap: BooleanParam
-
final
def
wait(): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long, arg1: Int): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()
- val yarnAttnFactor: FloatParam
- val yarnBetaFast: FloatParam
- val yarnBetaSlow: FloatParam
- val yarnExtFactor: FloatParam
- val yarnOrigCtx: IntParam
Inherited from AnyRef
Inherited from Any
Parameter setters
Parameter getters
Parameters
A list of (hyper-)parameter keys this annotator can take. Users can set and get the parameter values through setters and getters, respectively.