dictionary/script/pure_dictionary.py at master · ionkaon/dictionary · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
AUTHOR : Shin Zoqchiuq
VERSION : v2021.07.17

從 `字表.tsv` 生成 `甬城.tsv`
"""
import pandas as pd
import numpy as np
from gninpou import compat2gninpou

# Load only the columns this script needs from the master character table.
# The file is opened in a `with` block so the handle is closed as soon as
# pandas has finished parsing (the bare `open(...)` argument leaked it).
with open("../字表.tsv", encoding="utf-8") as table_file:
    sheet = pd.read_csv(
        table_file,
        sep="\t",
        usecols=["繁體", "简体", "兼容格式", "出處（甬）", "備註"],
    )
sheet = sheet.rename(columns={"出處（甬）": "出處", "兼容格式": "吳拼"})

# Rows to discard:
#   * source code greater than 5, with the single exception of 14
#   * source code equal to 0, or missing (NaN)
#   * romanization cell that is not a string (an empty cell is read as NaN)
source = sheet["出處"]
has_no_reading = np.array(
    [not isinstance(reading, str) for reading in sheet["吳拼"]], dtype=bool
)
discard = (
    ((source > 5) & (source != 14))
    | (source == 0)
    | source.isna()
    | has_no_reading
)
# Select with a boolean mask rather than handing positional indices to
# `DataFrame.drop`, which interprets its argument as index *labels*.
sheet = sheet.loc[~discard].reset_index(drop=True)

# No NaN is left in the source column at this point, so the cast is safe;
# like `int()`, it truncates any fractional part toward zero.
sheet["出處"] = sheet["出處"].astype("int64")
sheet["吳拼"] = [compat2gninpou(reading) for reading in sheet["吳拼"]]
# Blank out every note that contains an ASCII "(": anything that is not a
# string (NaN in particular) is passed through unchanged.
sheet["備註"] = [
    np.nan if isinstance(note, str) and "(" in note else note
    for note in sheet["備註"]
]
# Rows can become identical after the conversions above, so de-duplicate
# only now and renumber the index afterwards.
sheet = sheet.drop_duplicates().reset_index(drop=True)

sheet[["繁體", "简体", "吳拼", "出處", "備註"]].to_csv(
    "../各地字表/甬城.tsv", encoding="utf-8", index=False, sep="\t"
)