-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path tiny-tokenize-bpe.py
More file actions
72 lines (56 loc) · 1.98 KB
/
tiny-tokenize-bpe.py
File metadata and controls
72 lines (56 loc) · 1.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from datasets import load_dataset
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, processors
from collections import Counter
import json
import os
# Load the TinyStories dataset from the Hugging Face Hub.
dataset = load_dataset("roneneldan/TinyStories")

# Initialize a byte-pair-encoding tokenizer that pre-splits on whitespace
# before learning subword merges.
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Train the BPE tokenizer with a 10k-entry vocabulary plus four special tokens.
trainer = trainers.BpeTrainer(
    vocab_size=10000,
    special_tokens=["<unk>", "<pad>", "<s>", "</s>"],
)
# Stream the story texts through a generator: train_from_iterator accepts any
# iterator, so there is no need to materialize every example in a Python list
# first (the original built a full list and never used its enumerate index).
tokenizer.train_from_iterator(
    (example['text'] for example in dataset['train']),
    trainer=trainer,
)
# Configure a post-processor so every encoded sequence is wrapped in the
# beginning-of-sequence and end-of-sequence special tokens.
bos_id = tokenizer.token_to_id("<s>")
eos_id = tokenizer.token_to_id("</s>")
tokenizer.post_processor = processors.TemplateProcessing(
    single="<s> $A </s>",
    special_tokens=[("<s>", bos_id), ("</s>", eos_id)],
)
def tokenize_text(text):
    """Return the list of BPE token strings for a single input string."""
    return tokenizer.encode(text).tokens
# Pull the vocabulary and both direction mappings out of the trained tokenizer.
token_to_idx = tokenizer.get_vocab()
vocab = list(token_to_idx)
vocab_size = tokenizer.get_vocab_size()
idx_to_token = {index: token for token, index in token_to_idx.items()}
from datasets import Dataset  # noqa: F401  (kept; the split is already a Dataset, see below)

# Create the output directory if it doesn't already exist.
os.makedirs('processed_data', exist_ok=True)

# dataset['train'] is already a `Dataset`, so save it directly. The original
# round-tripped it through `Dataset.from_list(list(dataset['train']))`, which
# materializes every example in a Python list and rebuilds an identical
# Dataset — a needless full in-memory copy of the corpus.
dataset['train'].save_to_disk('processed_data/tiny_stories')

# Persist the trained BPE tokenizer itself.
tokenizer.save('processed_data/bpe_tokenizer.json')

# Save the vocabulary and index mappings alongside it.
# NOTE: JSON object keys are always strings, so the integer keys of
# idx_to_token are stringified on disk; convert back with int() when loading.
vocab_data = {
    'vocab': vocab,
    'token_to_idx': token_to_idx,
    'idx_to_token': idx_to_token,
    'vocab_size': vocab_size,
    'tokenizer_type': 'BPE',
}
with open('processed_data/vocabulary.json', 'w', encoding='utf-8') as f:
    json.dump(vocab_data, f, ensure_ascii=False, indent=2)