Radical-Based Character Lookup

Because the Kindle's browser is garbage (and it's hard to get online at my high school), I decided to put together a radical-lookup index myself.

I first found 按部首查字 – 在线新华字典, but its CJK character coverage felt incomplete. After some searching I came across 漢字部首 | 古今文字集成 and decided to use that as my source material.

What follows is a record of my cursed process and janky code.

Fetching the radicals

Run curl 'http://ccamc.org/cjkv_radical.php' > b.txt, then read it line by line. The page structure is extremely simple (so why am I even writing this down), which makes everything easy.

So I simply opened one file per radical stroke count and dumped each radical and its corresponding link into it.

import sys
import io
import re

# Force UTF-8 stdout so CJK characters print correctly on Windows consoles
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')

with open('b.txt', encoding='utf8') as inp:
    K = 0
    p1 = r'(\d+)畫'           # stroke-count section header
    p2 = r'href="(.+?)">(.)'  # link and the radical character
    outp = open('0.txt', 'w', encoding='utf8')
    for line in inp:
        res1 = re.findall(p1, line)
        if res1 != []:
            # New stroke-count section: switch to a new output file
            K = res1[0]
            outp = open('%s.txt' % str(K), 'w', encoding='utf8')
            continue
        res2 = re.findall(p2, line)[0]
        # Two lines per radical: the character, then its link
        outp.write(res2[1] + '\n' + res2[0] + '\n')
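
For reference, each per-stroke-count file ends up holding two lines per radical, the glyph and then its link, which is exactly the pairing that split_radical_list in the later scripts consumes. A hypothetical excerpt (the real link values are whatever the page's hrefs contain):

一
http://ccamc.org/cjkv_radical.php?q=...
乙
http://ccamc.org/cjkv_radical.php?q=...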

Fetching every character under each radical

I randomly sampled a few 1- and 2-stroke radicals, looked at their pages, and started writing.

The plan: store the characters under each radical in their own file, recording each character's remaining stroke count and which CJK block it sits in. I don't actually need very obscure characters day to day, so when generating the .epub I only want to keep characters from CJK Extension A and earlier.

Then a big problem surfaced: the radicals and remaining-stroke-count pages I had sampled all listed too few characters, so I never hit the case where a character list spans multiple pages (for examples of that, see the pages 「口 | 剩餘8畫 | 漢字部首 | 古今文字集成」 and 「艹 | 剩餘2畫 | 漢字部首 | 古今文字集成」). I only realized characters could be missing when the generated .epub failed to find 「范」「品」「俪」「栅」.

Back home I confirmed this was indeed the case and patched it (you can see that part is written very ugly, though so is everything else).
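
As an aside, the final script filters by the CJK-block tag scraped from the site's char_img paths (Unified_KT / Ext_A_KT). A plain code point check would have worked too; a minimal sketch, with block ranges taken from the Unicode standard:

def in_basic_or_ext_a(ch):
    # CJK Unified Ideographs: U+4E00..U+9FFF; Extension A: U+3400..U+4DBF
    cp = ord(ch)
    return 0x4E00 <= cp <= 0x9FFF or 0x3400 <= cp <= 0x4DBF

assert in_basic_or_ext_a('范')              # U+8303, basic Unified block
assert not in_basic_or_ext_a('\U00020000')  # U+20000, Extension B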

import sys
import io
import os
import re
import requests

# Force UTF-8 stdout so CJK characters print correctly
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')

proxy = {"http": "http://127.0.0.1:7890", "https": "http://127.0.0.1:7890"}
logfile = open('f.log', 'w', encoding='utf-8')

def get_characters(url, rms, outp):
    # Scrape one radical + remaining-strokes page (and any follow-up pages).
    # Writes three lines per character: character, CJK block, remaining strokes.
    html = requests.get(url, proxies=proxy).text

    # The char_img path encodes the character's CJK block (e.g. Unified_KT, Ext_A_KT)
    pattern = r'<a href="cjkv\.php\?cjkv=."><span class="zb">.+?char_img_new/char_img_(.+?)/.+?<span>(.)</span>'
    for res in re.findall(pattern, html):
        outp.write('%s\n%s\n%s\n' % (res[1], res[0], rms))

    # A link titled 最後頁 ("last page") means the list spans several pages
    pattern2 = r'p=(\d+)" title="最後頁"'
    res2 = re.findall(pattern2, html)
    if res2:
        cnt = int(res2[0])
        for p in range(2, cnt + 1):
            url2 = url + '&p=' + str(p)
            print(url2)
            sys.stdout.flush()
            logfile.write(url2 + '\n')

            html2 = requests.get(url2, proxies=proxy).text
            for res in re.findall(pattern, html2):
                outp.write('%s\n%s\n%s\n' % (res[1], res[0], rms))

def get_pagelist(url, outp):
    # Follow every remaining-strokes link on a radical's index page
    html = requests.get(url, proxies=proxy).text

    pattern = r'cjkv_radical\.php\?q=.+?(&rms=\d+?)">(\d+?)<'
    for res in re.findall(pattern, html):
        print(url + res[0])
        sys.stdout.flush()
        logfile.write(url + res[0] + '\n')
        get_characters(url + res[0], res[1], outp)

def split_radical_list(inp):
    # Yield (radical, link) pairs from the two-lines-per-radical files
    radical = []
    for i in inp:
        radical.append(i[:-1])
        if len(radical) == 2:
            yield tuple(radical)  # copy, so clear() can't mutate the yielded value
            radical.clear()

for c in range(1, 16):
    with open(str(c) + ".txt", 'r', encoding='utf-8') as inp:
        dirname = "./char/" + str(c)
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        idx = 1
        for radical in split_radical_list(inp):
            print(radical[1], dirname + "/" + str(idx) + ".txt")
            sys.stdout.flush()
            logfile.write("%s %s\n" % (radical[1], dirname + "/" + str(idx) + ".txt"))

            with open(dirname + "/" + str(idx) + ".txt", 'a', encoding='utf8') as outp:
                get_pagelist(radical[1], outp)
            idx += 1

Assembling the .epub

Each radical doesn't cover that many characters anyway, so each radical simply gets its own .html file.

Another showcase of jank: the CSS is reused wholesale from the stylesheet I stole from who-knows-where back when compiling classical Chinese texts for high school (though this file is so simple it could honestly go without one); toc.ncx was brute-force handwritten; and the .opf (since I didn't want to generate the surrounding boilerplate tags) was generated as two fragments that I then glued together by hand (a sketch of that glue step follows the script below).

In short, a product of brute force and "good enough". And it did, in the end, work.

import sys
import io

# Force UTF-8 stdout so CJK characters print correctly
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')

def html_head():
    return ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" '
            '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
            '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="zh-CN" lang="zh-CN">'
            '<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />'
            '<link rel="stylesheet" type="text/css" href="../css/main.css" />')
def html_title(title):
    return '<title>%s</title></head><body>' % title
def html_tail():
    return '</body></html>'

def split_character_list(inp):
    # Three lines per character: the character, its CJK block, remaining strokes
    character = []
    for i in inp:
        character.append(i[:-1])
        if len(character) == 3:
            yield tuple(character)
            character.clear()
def split_radical_list(inp):
    # Two lines per radical: the radical, then its link
    radical = []
    for i in inp:
        radical.append(i[:-1])
        if len(radical) == 2:
            yield tuple(radical)
            radical.clear()

def should_be_in(ch):
    # Keep only CJK Unified and Extension A characters
    return ch[1] in ('Unified_KT', 'Ext_A_KT')

# Target structure of each radical page:
# <h2>目录</h2>
# <p><a href="#1">1畫</a></p>
# <h2 id="1">1畫</h2>
# <p>一<br/>丁<br/>丂<br/></p>
def make_html(html, inp, radical):
    html.write(html_head())
    html.write(html_title('部首 %s' % radical))
    html.write('<h1>部首 %s</h1><h2>目录</h2>' % radical)
    char_list = list(split_character_list(inp))

    # Table of contents: one link per remaining-stroke-count group
    rms = -1
    for ch in char_list:
        if not should_be_in(ch):
            continue
        if rms != int(ch[2]):
            rms = int(ch[2])
            html.write('<p><a href="#%d">%d畫</a></p>' % (rms, rms))

    # Body: one heading and one <p> per group
    html.write('<hr/>')
    rms = -1
    open_p = False
    for ch in char_list:
        if not should_be_in(ch):
            continue
        if rms != int(ch[2]):
            rms = int(ch[2])
            if open_p:
                html.write('</p>')
            html.write('<h2 id="%d">%d畫</h2><p>' % (rms, rms))
            open_p = True
        html.write('%s<br/>' % ch[0])
    if open_p:
        html.write('</p>')
    html.write(html_tail())

# Assumes the ./0/OEBPS/<n>/ directories already exist
for cnt in range(1, 16):
    with open("%d.txt" % cnt, 'r', encoding='utf-8') as inp1:
        idx = 0
        contents = open('./0/OEBPS/%d/content.html' % cnt, 'w', encoding='utf-8')
        contents.write(html_head())
        contents.write(html_title('%d畫' % cnt))
        contents.write('<h1>%d畫</h1>' % cnt)

        for radical in split_radical_list(inp1):
            idx += 1
            print(radical[0], idx)
            contents.write('<p><a href="%s.html">%s</a></p>' % (str(idx).zfill(3), radical[0]))
            with open("./char/%d/%d.txt" % (cnt, idx), 'r', encoding='utf-8') as inp2, \
                 open("./0/OEBPS/%d/%s.html" % (cnt, str(idx).zfill(3)), 'w', encoding='utf-8') as html:
                make_html(html, inp2, radical[0])

        # Append this stroke count's manifest / spine fragments
        with open('./ch.opf', 'a') as opf:
            opf.write('<item href="%d/content.html" id="%d_content" media-type="application/xhtml+xml"/>\n' % (cnt, cnt))
            for i in range(1, idx + 1):
                opf.write('<item href="%d/%s.html" id="%d_%s" media-type="application/xhtml+xml"/>\n' % (cnt, str(i).zfill(3), cnt, str(i).zfill(3)))
        with open('./ct.opf', 'a') as opf:
            opf.write('<itemref idref="%d_content" linear="yes"/>\n' % cnt)
            for i in range(1, idx + 1):
                opf.write('<itemref idref="%d_%s" linear="yes"/>\n' % (cnt, str(i).zfill(3)))
        contents.write(html_tail())
        contents.close()
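
The script above only appends the <item> and <itemref> fragments to ch.opf and ct.opf; I pasted those into the final .opf by hand. A sketch of what that glue step would look like in code (the skeleton and metadata below are an illustrative minimal EPUB 2 package, not the actual file):

OPF_TEMPLATE = '''<?xml version="1.0" encoding="utf-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="2.0" unique-identifier="uid">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:identifier id="uid">urn:uuid:00000000-0000-0000-0000-000000000000</dc:identifier>
<dc:title>部首檢字</dc:title>
<dc:language>zh-CN</dc:language>
</metadata>
<manifest>
<item href="toc.ncx" id="ncx" media-type="application/x-dtbncx+xml"/>
<item href="css/main.css" id="css" media-type="text/css"/>
%s</manifest>
<spine toc="ncx">
%s</spine>
</package>'''

with open('ch.opf', encoding='utf-8') as m, open('ct.opf', encoding='utf-8') as s:
    glued = OPF_TEMPLATE % (m.read(), s.read())
with open('content.opf', 'w', encoding='utf-8') as out:
    out.write(glued)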

Retrospective

Will someone please teach me how to write code.

The current version (v1.1) is available here: 部首檢字.epub / 部首檢字.azw3