class CLIPTokenizer extends Gpt2Tokenizer
- Alphabetic
- By Inheritance
- CLIPTokenizer
- Gpt2Tokenizer
- BpeTokenizer
- AnyRef
- Any
- Hide All
- Show All
- Public
- All
Instance Constructors
- new CLIPTokenizer(merges: Map[(String, String), Int], vocab: Map[String, Int], specialTokens: SpecialTokens)
Value Members
-
final
def
!=(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
##(): Int
- Definition Classes
- AnyRef → Any
-
final
def
==(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
val
addPrefixSpaceToSentence: Boolean
- Definition Classes
- BpeTokenizer
-
val
alwaysAddPrefix: Boolean
- Definition Classes
- BpeTokenizer
-
final
def
asInstanceOf[T0]: T0
- Definition Classes
- Any
-
def
bpe(indToken: IndexedToken): Array[TokenPiece]
CLIP-specific tokenization.
CLIP-specific tokenization. We append the end-of-word marker "</w>" to word ends.
- returns
Array of TokenPieces, corresponding to encoded token
- Attributes
- protected
- Definition Classes
- CLIPTokenizer → BpeTokenizer
-
val
bpeRanks: Map[(String, String), Int]
- Attributes
- protected
- Definition Classes
- BpeTokenizer
-
val
bytesToUnicodeMapping: Map[Int, String]
Mapping from bytes to a different set of Unicode characters (especially for white spaces).
Mapping from bytes to a different set of Unicode characters (especially for white spaces). This improved model performance for GPT-2.
- Attributes
- protected
- Definition Classes
- Gpt2Tokenizer
-
val
cache: Map[String, Array[String]]
Cache for already encoded tokens.
Cache for already encoded tokens.
- Attributes
- protected
- Definition Classes
- BpeTokenizer
-
def
clone(): AnyRef
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()
-
def
decodeTokens(tokens: Array[Int]): String
- Definition Classes
- Gpt2Tokenizer
-
val
decoderVocab: Map[Int, String]
- Attributes
- protected
- Definition Classes
- Gpt2Tokenizer
-
def
encode(indTokens: Array[IndexedToken]): Array[TokenPiece]
- Definition Classes
- BpeTokenizer
-
def
encode(indToken: IndexedToken): Array[TokenPiece]
- Definition Classes
- BpeTokenizer
-
final
def
eq(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
def
equals(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
def
finalize(): Unit
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( classOf[java.lang.Throwable] )
-
def
getBpeRanking: ((String, String)) ⇒ Int
Rankings for the byte pairs.
Rankings for the byte pairs. Derived from merges.txt
- Attributes
- protected
- Definition Classes
- BpeTokenizer
-
def
getBytePairs(word: Array[String]): Array[(String, String)]
Create a sequence of byte-pairs of the word
Create a sequence of byte-pairs of the word
- Attributes
- protected
- Definition Classes
- BpeTokenizer
-
final
def
getClass(): Class[_]
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
-
def
getTokenPieces(indToken: IndexedToken, word: Array[String]): Array[TokenPiece]
- Attributes
- protected
- Definition Classes
- BpeTokenizer
-
def
hashCode(): Int
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
-
final
def
isInstanceOf[T0]: Boolean
- Definition Classes
- Any
-
val
merges: Map[(String, String), Int]
- Definition Classes
- BpeTokenizer
-
final
def
ne(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
final
def
notify(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
-
final
def
notifyAll(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
-
val
padWithSequenceTokens: Boolean
- Definition Classes
- BpeTokenizer
-
def
performMerges(wordChars: Array[String], charPairs: Array[(String, String)]): Array[String]
- Attributes
- protected
- Definition Classes
- BpeTokenizer
-
def
preProcessTokenForBpe(token: String): String
- Definition Classes
- Gpt2Tokenizer → BpeTokenizer
-
val
prefixForPieceId: Option[String]
- Definition Classes
- Gpt2Tokenizer → BpeTokenizer
-
val
sentencePadding: (String, String)
Special tokens of the model for processing
Special tokens of the model for processing
- Definition Classes
- BpeTokenizer
-
val
specialTokens: SpecialTokens
- Definition Classes
- BpeTokenizer
-
def
splitOnSpecialToken(specialToken: SpecialToken, text: String): ListBuffer[String]
Split the individual sub texts on special tokens, e.g.
Split the individual sub texts on special tokens, e.g. masking etc.
- Attributes
- protected
- Definition Classes
- BpeTokenizer
-
val
splitPattern: Regex
- Definition Classes
- CLIPTokenizer → Gpt2Tokenizer
-
val
suffixForPieceId: Option[String]
- Attributes
- protected
- Definition Classes
- BpeTokenizer
-
final
def
synchronized[T0](arg0: ⇒ T0): T0
- Definition Classes
- AnyRef
-
def
toString(): String
- Definition Classes
- AnyRef → Any
-
def
tokenize(sentence: Sentence): Array[IndexedToken]
Tokenize considering special tokens and split algorithm
Tokenize considering special tokens and split algorithm
- Definition Classes
- BpeTokenizer
-
def
tokenizeSubText(text: String, indexOffset: Int): Array[IndexedToken]
Needs to be implemented
Needs to be implemented
- Definition Classes
- CLIPTokenizer → Gpt2Tokenizer → BpeTokenizer
-
val
unicodeToByteMapping: Map[String, Int]
- Attributes
- protected
- Definition Classes
- Gpt2Tokenizer
-
val
vocab: Map[String, Int]
- Definition Classes
- BpeTokenizer
-
final
def
wait(): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long, arg1: Int): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()