class QwenTokenizer extends Gpt2Tokenizer
- Alphabetic
- By Inheritance
- QwenTokenizer
- Gpt2Tokenizer
- BpeTokenizer
- AnyRef
- Any
- Hide All
- Show All
- Public
- All
Instance Constructors
- new QwenTokenizer(merges: Map[(String, String), Int], vocab: Map[String, Int], specialTokens: SpecialTokens, padWithSequenceTokens: Boolean = false, addPrefixSpaceToSentence: Boolean = false)
Value Members
-
final
def
!=(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
##(): Int
- Definition Classes
- AnyRef → Any
-
final
def
==(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
val
addPrefixSpaceToSentence: Boolean
- Definition Classes
- BpeTokenizer
-
val
alwaysAddPrefix: Boolean
- Definition Classes
- BpeTokenizer
-
final
def
asInstanceOf[T0]: T0
- Definition Classes
- Any
-
def
bpe(indToken: IndexedToken): Array[TokenPiece]
Do the BPE algorithm.
Do the BPE algorithm. Goal is to find the token as the largest words in the known vocabulary. If not possible, the word is split into smaller subwords, until they are known.
- returns
Array of TokenPieces, corresponding to encoded token
- Attributes
- protected
- Definition Classes
- BpeTokenizer
-
val
bpeRanks: Map[(String, String), Int]
- Attributes
- protected
- Definition Classes
- BpeTokenizer
-
val
bytesToUnicodeMapping: Map[Int, String]
Mapping for bytes to a different set of unicode characters (especially whitespace).
Mapping for bytes to a different set of unicode characters (especially whitespace). This improves model performance for GPT-2.
- Attributes
- protected
- Definition Classes
- Gpt2Tokenizer
-
val
cache: Map[String, Array[String]]
Cache for already encoded tokens.
Cache for already encoded tokens.
- Attributes
- protected
- Definition Classes
- BpeTokenizer
-
def
clone(): AnyRef
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()
-
def
decodeTokens(tokens: Array[Int]): String
- Definition Classes
- Gpt2Tokenizer
-
val
decoderVocab: Map[Int, String]
- Attributes
- protected
- Definition Classes
- Gpt2Tokenizer
-
def
encode(indTokens: Array[IndexedToken]): Array[TokenPiece]
- Definition Classes
- BpeTokenizer
-
def
encode(indToken: IndexedToken): Array[TokenPiece]
- Definition Classes
- BpeTokenizer
-
final
def
eq(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
def
equals(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
def
finalize(): Unit
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( classOf[java.lang.Throwable] )
-
def
getBpeRanking: ((String, String)) ⇒ Int
Rankings for the byte pairs.
Rankings for the byte pairs. Derived from merges.txt
- Attributes
- protected
- Definition Classes
- BpeTokenizer
-
def
getBytePairs(word: Array[String]): Array[(String, String)]
Create a sequence of byte-pairs of the word
Create a sequence of byte-pairs of the word
- Attributes
- protected
- Definition Classes
- BpeTokenizer
-
final
def
getClass(): Class[_]
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
-
def
getTokenPieces(indToken: IndexedToken, word: Array[String]): Array[TokenPiece]
- Attributes
- protected
- Definition Classes
- BpeTokenizer
-
def
hashCode(): Int
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
-
final
def
isInstanceOf[T0]: Boolean
- Definition Classes
- Any
-
val
merges: Map[(String, String), Int]
- Definition Classes
- BpeTokenizer
-
final
def
ne(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
final
def
notify(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
-
final
def
notifyAll(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
-
val
padWithSequenceTokens: Boolean
- Definition Classes
- BpeTokenizer
-
def
performMerges(wordChars: Array[String], charPairs: Array[(String, String)]): Array[String]
- Attributes
- protected
- Definition Classes
- BpeTokenizer
-
def
preProcessTokenForBpe(token: String): String
- Definition Classes
- Gpt2Tokenizer → BpeTokenizer
-
val
prefixForPieceId: Option[String]
- Definition Classes
- Gpt2Tokenizer → BpeTokenizer
-
val
sentencePadding: (String, String)
Special tokens of the model for processing
Special tokens of the model for processing
- Definition Classes
- BpeTokenizer
-
val
specialTokens: SpecialTokens
- Definition Classes
- BpeTokenizer
-
def
splitOnSpecialToken(specialToken: SpecialToken, text: String): ListBuffer[String]
Split the individual sub texts on special tokens, e.g.
Split the individual sub texts on special tokens, e.g. masking etc.
- Attributes
- protected
- Definition Classes
- BpeTokenizer
-
val
splitPattern: Regex
- Definition Classes
- Gpt2Tokenizer
-
val
suffixForPieceId: Option[String]
- Attributes
- protected
- Definition Classes
- BpeTokenizer
-
final
def
synchronized[T0](arg0: ⇒ T0): T0
- Definition Classes
- AnyRef
-
def
toString(): String
- Definition Classes
- AnyRef → Any
-
def
tokenize(sentence: Sentence): Array[IndexedToken]
Tokenize considering special tokens and split algorithm
Tokenize considering special tokens and split algorithm
- Definition Classes
- BpeTokenizer
-
def
tokenizeSubText(text: String, indexOffset: Int): Array[IndexedToken]
Needs to be implemented
Needs to be implemented
- Definition Classes
- Gpt2Tokenizer → BpeTokenizer
-
val
unicodeToByteMapping: Map[String, Int]
- Attributes
- protected
- Definition Classes
- Gpt2Tokenizer
-
val
vocab: Map[String, Int]
- Definition Classes
- BpeTokenizer
-
final
def
wait(): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long, arg1: Int): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()