trait HasLlamaCppModelProperties extends AnyRef
Contains settable model parameters for the AutoGGUFModel.
- Self Type
- HasLlamaCppModelProperties with ParamsAndFeaturesWritable with HasProtectedParams
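For orientation, here is a minimal sketch of how these parameters are typically chained on an AutoGGUFModel, which mixes in this trait. The import path follows Spark NLP's seq2seq annotators, and the concrete values are illustrative only:

```scala
import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel

// AutoGGUFModel mixes in HasLlamaCppModelProperties, so the setters
// documented below are available directly on the annotator.
val model = AutoGGUFModel
  .pretrained()
  .setInputCols("document")
  .setOutputCol("completions")
  .setNCtx(4096)     // prompt context size
  .setNGpuLayers(99) // offload as many layers as fit into VRAM
  .setNThreads(8)    // generation threads
```

Each setter returns the annotator itself, which is what makes this chaining style work.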
Value Members
- val chatTemplate: Param[String]
- protected val defaultGpuLayers: Int
- protected val defaultMainGpu: Int
- val defragmentationThreshold: FloatParam
- val embedding: BooleanParam
- val flashAttention: BooleanParam
- def getChatTemplate: String
- def getDefragmentationThreshold: Float
- def getEmbedding: Boolean
- def getFlashAttention: Boolean
- def getGrpAttnN: Int
- def getGrpAttnW: Int
- def getInputPrefixBos: Boolean
- def getLookupCacheDynamicFilePath: String
- def getLookupCacheStaticFilePath: String
- def getLoraAdapters: Map[String, Float]
- def getMainGpu: Int
- def getMetadata: String
Get the metadata for the model.
- def getMetadataMap: Map[String, String]
- def getModelDraft: String
- protected def getModelParameters: ModelParameters
- def getNBatch: Int
- def getNChunks: Int
- def getNCtx: Int
- def getNDraft: Int
- def getNGpuLayers: Int
- def getNGpuLayersDraft: Int
- def getNSequences: Int
- def getNThreads: Int
- def getNThreadsBatch: Int
- def getNThreadsBatchDraft: Int
- def getNThreadsDraft: Int
- def getNUbatch: Int
- def getNoKvOffload: Boolean
- def getNuma: String
- def getPSplit: Float
- def getPoolingType: String
- def getRopeFreqBase: Float
- def getRopeFreqScale: Float
- def getRopeScalingType: String
- def getSplitMode: String
- def getSystemPrompt: String
- def getTensorSplit: Array[Double]
- def getUseMlock: Boolean
- def getUseMmap: Boolean
- def getYarnAttnFactor: Float
- def getYarnBetaFast: Float
- def getYarnBetaSlow: Float
- def getYarnExtFactor: Float
- def getYarnOrigCtx: Int
- val gpuSplitMode: Param[String]
Set how to split the model across GPUs (see the sketch below):
- NONE: no GPU split
- LAYER: split the model across GPUs by layer
- ROW: split the model across GPUs by row
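A sketch of combining the split mode with the related GPU parameters; the two-GPU 60/40 ratio is an assumption for illustration:

```scala
import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel

// Split whole layers across two GPUs. The tensorSplit ratio only
// matters once layers are actually offloaded via nGpuLayers.
val multiGpuModel = AutoGGUFModel
  .pretrained()
  .setGpuSplitMode("LAYER")
  .setNGpuLayers(99)
  .setTensorSplit(Array(0.6, 0.4))
```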
- val grpAttnN: IntParam
- val grpAttnW: IntParam
- val inputPrefixBos: BooleanParam
- protected val logger: Logger
- val lookupCacheDynamicFilePath: Param[String]
- val lookupCacheStaticFilePath: Param[String]
- val loraAdapters: StructFeature[Map[String, Float]]
- val mainGpu: IntParam
- val metadata: (HasLlamaCppModelProperties.this)#ProtectedParam[String]
- val modelDraft: Param[String]
- val nBatch: IntParam
- val nChunks: IntParam
- val nCtx: IntParam
- val nDraft: IntParam
- val nGpuLayers: IntParam
- val nGpuLayersDraft: IntParam
- val nSequences: IntParam
- val nThreads: IntParam
- val nThreadsBatch: IntParam
- val nThreadsBatchDraft: IntParam
- val nThreadsDraft: IntParam
- val nUbatch: IntParam
- val noKvOffload: BooleanParam
- val numaStrategy: Param[String]
Set optimization strategies that help on some NUMA systems (if available). Available strategies (see the sketch below):
- DISABLED: no NUMA optimizations
- DISTRIBUTE: spread execution evenly over all nodes
- ISOLATE: only spawn threads on CPUs on the node that execution started on
- NUMA_CTL: use the CPU map provided by numactl
- MIRROR: mirror the model across NUMA nodes
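A sketch for a hypothetical CPU-only deployment on a two-socket NUMA machine:

```scala
import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel

// Spread llama.cpp worker threads evenly over all NUMA nodes
// instead of letting them pile up on one socket.
val cpuModel = AutoGGUFModel
  .pretrained()
  .setNumaStrategy("DISTRIBUTE")
  .setNGpuLayers(0) // keep all layers on the CPU in this scenario
  .setNThreads(32)
```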
- val pSplit: FloatParam
- val poolingType: Param[String]
Set the pooling type for embeddings; the model default is used if unspecified (see the sketch below):
- 0 NONE: don't use any pooling
- 1 MEAN: mean pooling
- 2 CLS: choose the CLS token
- 3 LAST: choose the last token
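A sketch of pairing the pooling type with embedding support (both parameters belong to this trait; the choice of MEAN is illustrative):

```scala
import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel

// Load the model with embedding support and average all token
// embeddings into a single vector per input (MEAN pooling).
val embeddingModel = AutoGGUFModel
  .pretrained()
  .setEmbedding(true)
  .setPoolingType("MEAN")
```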
- val ropeFreqBase: FloatParam
- val ropeFreqScale: FloatParam
- val ropeScalingType: Param[String]
Set the RoPE frequency scaling method; defaults to linear unless specified by the model (see the sketch below):
- UNSPECIFIED: don't use any scaling
- LINEAR: linear scaling
- YARN: YaRN RoPE scaling
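A sketch of linear scaling; per setRopeFreqScale below, the context expands by a factor of 1/N, so a scale of 0.5 roughly doubles it:

```scala
import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel

// Linear RoPE scaling: freqScale 0.5 expands the context by
// 1 / 0.5 = 2x, so nCtx is raised to match.
val longContextModel = AutoGGUFModel
  .pretrained()
  .setRopeScalingType("LINEAR")
  .setRopeFreqScale(0.5f)
  .setNCtx(8192)
```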
- def setChatTemplate(chatTemplate: String): HasLlamaCppModelProperties.this
Set the chat template to use.
- def setDefragmentationThreshold(defragThold: Float): HasLlamaCppModelProperties.this
Set the KV cache defragmentation threshold.
- def setEmbedding(embedding: Boolean): HasLlamaCppModelProperties.this
Whether to load the model with embedding support.
- def setFlashAttention(flashAttention: Boolean): HasLlamaCppModelProperties.this
Whether to enable Flash Attention.
- def setGpuSplitMode(splitMode: String): HasLlamaCppModelProperties.this
Set how to split the model across GPUs:
- NONE: no GPU split
- LAYER: split the model across GPUs by layer
- ROW: split the model across GPUs by row
- protected def setGpuSupportIfAvailable(spark: SparkSession): HasLlamaCppModelProperties.this
- def setGrpAttnN(grpAttnN: Int): HasLlamaCppModelProperties.this
Set the group-attention factor.
- def setGrpAttnW(grpAttnW: Int): HasLlamaCppModelProperties.this
Set the group-attention width.
- def setInputPrefixBos(inputPrefixBos: Boolean): HasLlamaCppModelProperties.this
Whether to add a prefix BOS to user inputs, preceding the --in-prefix string.
- def setLookupCacheDynamicFilePath(lookupCacheDynamicFilePath: String): HasLlamaCppModelProperties.this
Set the path to a dynamic lookup cache to use for lookup decoding (updated by generation).
- def setLookupCacheStaticFilePath(lookupCacheStaticFilePath: String): HasLlamaCppModelProperties.this
Set the path to a static lookup cache to use for lookup decoding (not updated by generation).
- def setLoraAdapters(loraAdapters: HashMap[String, Double]): HasLlamaCppModelProperties.this
Set paths to LoRA adapters with user-defined scales (PySpark override).
- def setLoraAdapters(loraAdapters: Map[String, Float]): HasLlamaCppModelProperties.this
Set paths to LoRA adapters with user-defined scales (see the sketch below).
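A sketch of the Scala overload; the adapter paths are placeholders, and the scales weight how strongly each adapter is applied:

```scala
import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel

// Attach two LoRA adapters with different scales. Map[String, Float]
// matches the Scala overload; PySpark passes a java.util.HashMap.
val adaptedModel = AutoGGUFModel
  .pretrained()
  .setLoraAdapters(Map(
    "/path/to/style-adapter.gguf" -> 1.0f,
    "/path/to/domain-adapter.gguf" -> 0.5f))
```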
- def setMainGpu(mainGpu: Int): HasLlamaCppModelProperties.this
Set the GPU that is used for scratch and small tensors.
- def setMetadata(metadata: String): HasLlamaCppModelProperties.this
Set the metadata for the model.
- def setModelDraft(modelDraft: String): HasLlamaCppModelProperties.this
Set the draft model for speculative decoding (see the sketch below).
- def setNBatch(nBatch: Int): HasLlamaCppModelProperties.this
Set the logical batch size for prompt processing (must be >= 32 to use BLAS).
- def setNChunks(nChunks: Int): HasLlamaCppModelProperties.this
Set the maximum number of chunks to process.
- def setNCtx(nCtx: Int): HasLlamaCppModelProperties.this
Set the size of the prompt context.
- def setNDraft(nDraft: Int): HasLlamaCppModelProperties.this
Set the number of tokens to draft for speculative decoding.
- def setNGpuLayers(nGpuLayers: Int): HasLlamaCppModelProperties.this
Set the number of layers to store in VRAM (-1 uses the default).
- def setNGpuLayersDraft(nGpuLayersDraft: Int): HasLlamaCppModelProperties.this
Set the number of layers to store in VRAM for the draft model (-1 uses the default).
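A sketch of a speculative decoding setup using the draft-model parameters above; the draft model path is a placeholder:

```scala
import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel

// A small draft model proposes nDraft tokens per step, which the
// main model then verifies in a single batched pass.
val speculativeModel = AutoGGUFModel
  .pretrained()
  .setModelDraft("/path/to/small-draft-model.gguf")
  .setNDraft(8)           // tokens drafted per step
  .setNGpuLayersDraft(99) // offload the draft model as well
```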
- def setNSequences(nSequences: Int): HasLlamaCppModelProperties.this
Set the number of sequences to decode.
- def setNThreads(nThreads: Int): HasLlamaCppModelProperties.this
Set the number of threads to use during generation.
- def setNThreadsBatch(nThreadsBatch: Int): HasLlamaCppModelProperties.this
Set the number of threads to use during batch and prompt processing.
- def setNThreadsBatchDraft(nThreadsBatchDraft: Int): HasLlamaCppModelProperties.this
Set the number of threads to use during batch and prompt processing for the draft model.
- def setNThreadsDraft(nThreadsDraft: Int): HasLlamaCppModelProperties.this
Set the number of threads to use during draft generation.
- def setNUbatch(nUbatch: Int): HasLlamaCppModelProperties.this
Set the physical batch size for prompt processing (must be >= 32 to use BLAS; see the sketch below).
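A sketch of context, batch, and thread sizing under the stated BLAS constraint; the concrete numbers are assumptions for illustration:

```scala
import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel

// nBatch is the logical batch, nUbatch the physical batch actually
// pushed through the model (nUbatch <= nBatch, both >= 32 for BLAS).
val tunedModel = AutoGGUFModel
  .pretrained()
  .setNCtx(4096)
  .setNBatch(512)
  .setNUbatch(512)
  .setNThreads(8)       // token generation
  .setNThreadsBatch(16) // batch and prompt processing
```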
- def setNoKvOffload(noKvOffload: Boolean): HasLlamaCppModelProperties.this
Whether to disable KV offload.
- def setNumaStrategy(numa: String): HasLlamaCppModelProperties.this
Set optimization strategies that help on some NUMA systems (if available). Available strategies:
- DISABLED: no NUMA optimizations
- DISTRIBUTE: spread execution evenly over all nodes
- ISOLATE: only spawn threads on CPUs on the node that execution started on
- NUMA_CTL: use the CPU map provided by numactl
- MIRROR: mirror the model across NUMA nodes
- def setPSplit(pSplit: Float): HasLlamaCppModelProperties.this
Set the speculative decoding split probability.
- def setPoolingType(poolingType: String): HasLlamaCppModelProperties.this
Set the pooling type for embeddings; the model default is used if unspecified:
- 0 NONE: don't use any pooling and return token embeddings (if the model supports it)
- 1 MEAN: mean pooling
- 2 CLS: choose the CLS token
- 3 LAST: choose the last token
- def setRopeFreqBase(ropeFreqBase: Float): HasLlamaCppModelProperties.this
Set the RoPE base frequency, used by NTK-aware scaling.
- def setRopeFreqScale(ropeFreqScale: Float): HasLlamaCppModelProperties.this
Set the RoPE frequency scaling factor, which expands the context by a factor of 1/N.
- def setRopeScalingType(ropeScalingType: String): HasLlamaCppModelProperties.this
Set the RoPE frequency scaling method; defaults to linear unless specified by the model:
- UNSPECIFIED: don't use any scaling
- LINEAR: linear scaling
- YARN: YaRN RoPE scaling
- def setSystemPrompt(systemPrompt: String): HasLlamaCppModelProperties.this
Set a system prompt to use.
- def setTensorSplit(tensorSplit: Array[Double]): HasLlamaCppModelProperties.this
Set how split tensors should be distributed across GPUs.
- def setUseMlock(useMlock: Boolean): HasLlamaCppModelProperties.this
Whether to force the system to keep the model in RAM rather than swapping or compressing it.
- def setUseMmap(useMmap: Boolean): HasLlamaCppModelProperties.this
Whether to memory-map the model (faster load, but may increase pageouts if not using mlock; see the sketch below).
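A sketch of the memory trade-off these two flags control:

```scala
import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel

// mmap loads weights lazily from disk; mlock additionally pins them
// in RAM so the OS never swaps them out (needs enough free memory).
val pinnedModel = AutoGGUFModel
  .pretrained()
  .setUseMmap(true)
  .setUseMlock(true)
```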
- def setYarnAttnFactor(yarnAttnFactor: Float): HasLlamaCppModelProperties.this
Set the YaRN scale sqrt(t) or attention magnitude.
- def setYarnBetaFast(yarnBetaFast: Float): HasLlamaCppModelProperties.this
Set the YaRN low correction dim or beta.
- def setYarnBetaSlow(yarnBetaSlow: Float): HasLlamaCppModelProperties.this
Set the YaRN high correction dim or alpha.
- def setYarnExtFactor(yarnExtFactor: Float): HasLlamaCppModelProperties.this
Set the YaRN extrapolation mix factor.
- def setYarnOrigCtx(yarnOrigCtx: Int): HasLlamaCppModelProperties.this
Set the YaRN original context size of the model (see the sketch below).
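A sketch of a YaRN configuration; the beta and attention values mirror llama.cpp's usual defaults and, like the context sizes, are shown only for illustration:

```scala
import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel

// Extend a model trained on a 4k context to a 16k window with YaRN.
val yarnModel = AutoGGUFModel
  .pretrained()
  .setRopeScalingType("YARN")
  .setYarnOrigCtx(4096)
  .setYarnAttnFactor(1.0f)
  .setYarnBetaFast(32.0f)
  .setYarnBetaSlow(1.0f)
  .setNCtx(16384)
```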
- val systemPrompt: Param[String]
- val tensorSplit: DoubleArrayParam
- val useMlock: BooleanParam
- val useMmap: BooleanParam
- val yarnAttnFactor: FloatParam
- val yarnBetaFast: FloatParam
- val yarnBetaSlow: FloatParam
- val yarnExtFactor: FloatParam
- val yarnOrigCtx: IntParam
Parameters
A list of (hyper-)parameter keys this annotator can take. Users can set and get the parameter values through setters and getters, respectively.