WordPiece
#deepLearning/wordPiece
The idea behind WordPiece
WordPiece builds its vocabulary by first splitting every word into characters, prefixing every character except the first with ## to mark that it continues a word. For example, "word" is split as
w ##o ##r ##d
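A one-liner (the same comprehension used later in the implementation) produces this split:

word = "word"
print([c if i == 0 else f"##{c}" for i, c in enumerate(word)])
# ['w', '##o', '##r', '##d']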
WordPiece merge rules
Core idea: at each step, merge the pair of subwords that most increases the likelihood of the language model on the training corpus.
\[ Score = \frac{freq\_of\_pair}{freq\_of\_first\_element \times freq\_of\_second\_element} \]
By dividing the frequency of the pair by the product of the frequencies of its two parts, the algorithm prefers to merge pairs whose individual parts occur less frequently on their own.
As an example, suppose the corpus contains the following words with these frequencies: ("hug", 10), ("pug", 5), ("pun", 12), ("bun", 4), ("hugs", 5).
After the initial split they look like ("h" "##u" "##g", 10), ("p" "##u" "##g", 5), ("p" "##u" "##n", 12), ("b" "##u" "##n", 4), ("h" "##u" "##g" "##s", 5),
so the initial vocabulary is ["b", "h", "p", "##g", "##n", "##s", "##u"].
pair("##u", "##g")
出现的频率是 20
次,freq("##u") == 36
, freq("##g") == 20
, 所以
pair("##u", "##g")
的分数为
score = pair("##u", "##g") / freq("##u") * freq("##g") = 20 / (36 * 20) = 1/36
。
同理 pair("##g", "##s")
的分数为
1/20
,所以第一个合并是 ("##g", "##s") -> ("##gs")
。
The vocabulary then becomes ["b", "h", "p", "##g", "##n", "##s", "##u", "##gs"].
Merging continues in the same way until the vocabulary reaches the desired size; the short sketch below double-checks the two scores computed above.
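A few lines of Python reproduce these two scores (toy_splits, token_freqs, and pair_freqs are illustrative names, not part of the implementation below):

from collections import defaultdict

# toy corpus: split word -> frequency, as in the example above
toy_splits = {
    ("h", "##u", "##g"): 10,
    ("p", "##u", "##g"): 5,
    ("p", "##u", "##n"): 12,
    ("b", "##u", "##n"): 4,
    ("h", "##u", "##g", "##s"): 5,
}

token_freqs = defaultdict(int)
pair_freqs = defaultdict(int)
for split, freq in toy_splits.items():
    for token in split:
        token_freqs[token] += freq
    for pair in zip(split, split[1:]):
        pair_freqs[pair] += freq

scores = {
    pair: freq / (token_freqs[pair[0]] * token_freqs[pair[1]])
    for pair, freq in pair_freqs.items()
}
print(scores[("##u", "##g")])  # 0.0277... == 1/36
print(scores[("##g", "##s")])  # 0.05 == 1/20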
Implementing WordPiece
We will use the following small corpus:

corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]
Pre-tokenize the corpus into words. Since we are replicating a WordPiece tokenizer like the one in BERT, we use the bert-base-cased tokenizer for pre-tokenization:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
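As a quick sanity check, the pre-tokenizer splits a sentence into words together with their character offsets (the offsets shown are illustrative):

tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("This is the Hugging Face Course.")
# roughly: [('This', (0, 4)), ('is', (5, 7)), ('the', (8, 11)), ('Hugging', (12, 19)),
#           ('Face', (20, 24)), ('Course', (25, 31)), ('.', (31, 32))]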
Then compute the frequency of each word in the corpus:

from collections import defaultdict

word_freqs = defaultdict(int)
for text in corpus:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    new_words = [word for word, offset in words_with_offsets]
    for word in new_words:
        word_freqs[word] += 1
word_freqs
# outputs:
defaultdict(
int, {'This': 3, 'is': 2, 'the': 1, 'Hugging': 1, 'Face': 1, 'Course': 1, '.': 4, 'chapter': 1, 'about': 1,
'tokenization': 1, 'section': 1, 'shows': 1, 'several': 1, 'tokenizer': 1, 'algorithms': 1, 'Hopefully': 1,
',': 1, 'you': 1, 'will': 1, 'be': 1, 'able': 1, 'to': 1, 'understand': 1, 'how': 1, 'they': 1, 'are': 1,
'trained': 1, 'and': 1, 'generate': 1, 'tokens': 1})
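Note that the BERT pre-tokenizer splits punctuation into separate tokens, which is why '.' and ',' appear as standalone entries above.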
Build the alphabet: the first character of each word is kept as is, and every other character gets the ## prefix:

alphabet = []
for word in word_freqs.keys():
    if word[0] not in alphabet:
        alphabet.append(word[0])
    for letter in word[1:]:
        if f"##{letter}" not in alphabet:
            alphabet.append(f"##{letter}")
alphabet.sort()

print(alphabet)
# outputs:
['##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##k', '##l', '##m', '##n', '##o', '##p', '##r', '##s',
'##t', '##u', '##v', '##w', '##y', '##z', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'g', 'h', 'i', 's', 't', 'u',
'w', 'y']
Since we are building a BERT-style tokenizer, we add BERT's special tokens ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] at the start of the vocabulary:
vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] + alphabet.copy()
Then split every word, prefixing all characters except the first with ##:

splits = {
    word: [c if i == 0 else f"##{c}" for i, c in enumerate(word)]
    for word in word_freqs.keys()
}
splits
# outputs:
{'This': ['T', '##h', '##i', '##s'],
'is': ['i', '##s'],
'the': ['t', '##h', '##e'],
'Hugging': ['H', '##u', '##g', '##g', '##i', '##n', '##g'],
'Face': ['F', '##a', '##c', '##e'],
'Course': ['C', '##o', '##u', '##r', '##s', '##e'],
'.': ['.'],
'chapter': ['c', '##h', '##a', '##p', '##t', '##e', '##r'],
'about': ['a', '##b', '##o', '##u', '##t'],
'tokenization': ['t',
'##o',
'##k',
'##e',
'##n',
'##i',
'##z',
'##a',
'##t',
'##i',
'##o',
'##n'],
'section': ['s', '##e', '##c', '##t', '##i', '##o', '##n'],
'shows': ['s', '##h', '##o', '##w', '##s'],
'several': ['s', '##e', '##v', '##e', '##r', '##a', '##l'],
'tokenizer': ['t', '##o', '##k', '##e', '##n', '##i', '##z', '##e', '##r'],
'algorithms': ['a',
'##l',
'##g',
'##o',
'##r',
'##i',
'##t',
'##h',
'##m',
'##s'],
'Hopefully': ['H', '##o', '##p', '##e', '##f', '##u', '##l', '##l', '##y'],
',': [','],
'you': ['y', '##o', '##u'],
'will': ['w', '##i', '##l', '##l'],
'be': ['b', '##e'],
'able': ['a', '##b', '##l', '##e'],
'to': ['t', '##o'],
'understand': ['u',
'##n',
'##d',
'##e',
'##r',
'##s',
'##t',
'##a',
'##n',
'##d'],
'how': ['h', '##o', '##w'],
'they': ['t', '##h', '##e', '##y'],
'are': ['a', '##r', '##e'],
'trained': ['t', '##r', '##a', '##i', '##n', '##e', '##d'],
'and': ['a', '##n', '##d'],
'generate': ['g', '##e', '##n', '##e', '##r', '##a', '##t', '##e'],
'tokens': ['t', '##o', '##k', '##e', '##n', '##s']}
Compute the score of each pair. This needs the frequency of every individual token and of every adjacent pair, gathered in a single pass over the splits:

def compute_pair_scores(splits):
    letter_freqs = defaultdict(int)
    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        split = splits[word]
        if len(split) == 1:
            letter_freqs[split[0]] += freq
            continue
        for i in range(len(split) - 1):
            pair = (split[i], split[i + 1])
            letter_freqs[split[i]] += freq
            pair_freqs[pair] += freq
        letter_freqs[split[-1]] += freq

    scores = {
        pair: freq / (letter_freqs[pair[0]] * letter_freqs[pair[1]])
        for pair, freq in pair_freqs.items()
    }
    return scores
pair_scores = compute_pair_scores(splits)
for i, key in enumerate(pair_scores.keys()):
    print(f"{key}: {pair_scores[key]}")
    if i >= 5:
        break
# outputs:
('T', '##h'): 0.125
('##h', '##i'): 0.03409090909090909
('##i', '##s'): 0.02727272727272727
('i', '##s'): 0.1
('t', '##h'): 0.03571428571428571
('##h', '##e'): 0.011904761904761904
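As a sanity check: ('T', '##h') occurs 3 times (only inside "This", which has frequency 3), 'T' occurs 3 times and '##h' occurs 8 times, so the score is 3 / (3 × 8) = 0.125, matching the first line above.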
Find the pair with the highest score:

best_pair = ""
max_score = None
for pair, score in pair_scores.items():
    if max_score is None or max_score < score:
        best_pair = pair
        max_score = score

print(best_pair, max_score)
# outputs:
('a', '##b') 0.2
So the first merge to learn is ('a', '##b') -> 'ab'. Add 'ab' to the vocabulary:
vocab.append("ab")
To continue, apply this merge in the splits dictionary:

def merge_pair(a, b, splits):
    for word in word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue
        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                # drop the ## of the second element when gluing the pair together
                merge = a + b[2:] if b.startswith("##") else a + b
                split = split[:i] + [merge] + split[i + 2 :]
            else:
                i += 1
        splits[word] = split
    return splits
The result of the first merge:

splits = merge_pair("a", "##b", splits)
splits["about"]
# outputs:
['ab', '##o', '##u', '##t']
Now repeat until the vocabulary reaches the target size, here 70:

vocab_size = 70
while len(vocab) < vocab_size:
    scores = compute_pair_scores(splits)
    best_pair, max_score = "", None
    for pair, score in scores.items():
        if max_score is None or max_score < score:
            best_pair = pair
            max_score = score
    splits = merge_pair(*best_pair, splits)
    new_token = (
        best_pair[0] + best_pair[1][2:]
        if best_pair[1].startswith("##")
        else best_pair[0] + best_pair[1]
    )
    vocab.append(new_token)
print(vocab)
# outputs:
['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##k',
'##l', '##m', '##n', '##o', '##p', '##r', '##s', '##t', '##u', '##v', '##w', '##y', '##z', ',', '.', 'C', 'F', 'H',
'T', 'a', 'b', 'c', 'g', 'h', 'i', 's', 't', 'u', 'w', 'y', 'ab', '##fu', 'Fa', 'Fac', '##ct', '##ful', '##full', '##fully',
'Th', 'ch', '##hm', 'cha', 'chap', 'chapt', '##thm', 'Hu', 'Hug', 'Hugg', 'sh', 'th', 'is', '##thms', '##za', '##zat',
'##ut']
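The final vocabulary has 70 tokens: the 5 special tokens, the 40-character alphabet, and 25 learned merges.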
To encode a single word, greedily find the longest prefix of the word that is in the vocabulary and split on it, then repeat on the remainder (prefixed with ##); if no prefix of the word is in the vocabulary, the whole word is encoded as [UNK].
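A minimal sketch of encode_word following this greedy longest-prefix-match rule (it relies on the vocab list built above):

def encode_word(word):
    tokens = []
    while len(word) > 0:
        i = len(word)
        # shrink the prefix until it is found in the vocabulary
        while i > 0 and word[:i] not in vocab:
            i -= 1
        if i == 0:
            # no prefix of the word is in the vocabulary
            return ["[UNK]"]
        tokens.append(word[:i])
        # the remainder continues a word, so it gets the ## prefix
        word = word[i:]
        if len(word) > 0:
            word = f"##{word}"
    return tokens

print(encode_word("Hugging"))
# expected: ['Hugg', '##i', '##n', '##g']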
To tokenize a text, pre-tokenize it into words, encode each word, and concatenate the results:

def tokenize(text):
    pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word, offset in pre_tokenize_result]
    encoded_words = [encode_word(word) for word in pre_tokenized_text]
    return sum(encoded_words, [])
tokenize("This is the Hugging Face course!")
# outputs:
['Th', '##i', '##s', 'is', 'th', '##e', 'Hugg', '##i', '##n', '##g', 'Fac', '##e', 'c', '##o', '##u', '##r', '##s',
'##e', '[UNK]']
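The trailing '!' never appears in the training corpus, so it is not in the vocabulary and is encoded as [UNK].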