-
Notifications
You must be signed in to change notification settings - Fork 19
Expand file tree
/
Copy pathrun.sh
More file actions
18 lines (15 loc) · 785 Bytes
/
run.sh
File metadata and controls
18 lines (15 loc) · 785 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# Author
# Soohwan Kim, Seyoung Bae, Cheolhwang Won, Soyoung Cho, Jeongwon Kwak
# Preprocessing configuration: edit these values before running the script.
# Each one is forwarded to main.py through the flag of the same name below.
DATASET_PATH="SET_YOUR_DATASET_PATH"
VOCAB_DEST='SET_LABELS_DESTINATION'
OUTPUT_UNIT='character'      # one of: character / subword / grapheme
PREPROCESS_MODE='phonetic'   # phonetic: "칠 십 퍼센트" (spelled out), spelling: "70%"
VOCAB_SIZE=5000              # set this when OUTPUT_UNIT is 'subword'

# To use the pretrained KoBERT tokenizer, see https://github.com/SKTBrain/KoBERT
# NOTE(review): the original comment said to uncomment a flag at the bottom of
# this command, but no commented-out flag is present in this file; confirm the
# flag name against main.py before adding it.

# Every expansion is double-quoted so that a value containing spaces or glob
# characters (typically the dataset path) reaches main.py as one argument.
# The last argument deliberately has no trailing backslash, so anything
# appended to this file later is not swallowed into the command line.
python main.py \
  --dataset_path "$DATASET_PATH" \
  --vocab_dest "$VOCAB_DEST" \
  --output_unit "$OUTPUT_UNIT" \
  --preprocess_mode "$PREPROCESS_MODE" \
  --vocab_size "$VOCAB_SIZE"