-
Notifications
You must be signed in to change notification settings - Fork 24
Expand file tree
/
Copy pathrun_eval.sh
More file actions
45 lines (35 loc) · 1.11 KB
/
run_eval.sh
File metadata and controls
45 lines (35 loc) · 1.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# Benchmark file handed to eval_baselines.py through --bench.
BENCHMARK_DATA=benchmark/WebMainBench_100.jsonl
# Root results directory; each run below passes $RESULT_DIR/<extractor> as --task_dir.
RESULT_DIR=benchmark_results
# -p keeps re-runs quiet: plain mkdir reports an error once the directory exists.
# Quoting and "--" keep the path a single, non-option argument.
mkdir -p -- "$RESULT_DIR"
# MinerU-HTML Extractors (Use GPU)
# Runs eval_baselines.py once per extractor name listed below, with the GPU
# default config and the model path set in MODEL_PATH.
EXTRACTORS=(
  "mineru_html_fallback-html-md"
  "mineru_html-html-md"
)
# Placeholder value: replace with the actual MinerU-HTML model path before running.
MODEL_PATH=YOUR_MINERUHTML_MODEL_PATH
# Every expansion is quoted so that paths containing spaces or glob characters
# reach python as single arguments instead of being word-split.
for extractor in "${EXTRACTORS[@]}"; do
  python eval_baselines.py \
    --bench "$BENCHMARK_DATA" \
    --task_dir "$RESULT_DIR/$extractor" \
    --extractor_name "$extractor" \
    --model_path "$MODEL_PATH" \
    --default_config gpu
done
# CPU Extractors
# Runs eval_baselines.py once per extractor name listed below; no --model_path
# or --default_config is passed for these runs.
EXTRACTORS=(
  "magichtml-html-md"
  "readability-html-md"
  "trafilatura-html-md"
  "resiliparse-text"
  "trafilatura-md"
  "trafilatura-text"
  "fullpage-html-md"
  "boilerpy3-text"
  "gne-html-md"
  "newsplease-text"
  "justtext-text"
  "boilerpy3-html-md"
  "goose3-text"
)
# Quoted expansions keep each array element and each path a single argument,
# even if RESULT_DIR or BENCHMARK_DATA contains spaces or glob characters.
for extractor in "${EXTRACTORS[@]}"; do
  python eval_baselines.py \
    --bench "$BENCHMARK_DATA" \
    --task_dir "$RESULT_DIR/$extractor" \
    --extractor_name "$extractor"
done
# ReaderLM Extractors (Use GPU)
# Single run of eval_baselines.py for the readerlm-text extractor with the GPU
# default config.
extractor=readerlm-text
# Placeholder value: replace with the actual ReaderLM model path before running.
MODEL_PATH=YOUR_READERLM_MODEL_PATH
# Quoted expansions so a model or result path with spaces stays one argument.
python eval_baselines.py \
  --bench "$BENCHMARK_DATA" \
  --task_dir "$RESULT_DIR/$extractor" \
  --extractor_name "$extractor" \
  --model_path "$MODEL_PATH" \
  --default_config gpu