BathyScapheのヘルプを作成してみよう
Python 3.0.1を使えるようにしたその理由はなにかというと、Mac OS X用の2ちゃんねるブラウザ、BathyScapheのヘルプを作成する時期が近づいてきたと感じられたからでした。
BathyScapheには、充実したオンスクリーンヘルプが用意されています。これは有志により作成されたもので、Wiki上で編集をし、最後にそれをHTML化するというプロセスを踏んで作られます。その最後のプロセス、HTML化において活躍するのがPythonです。Wikiシステムが出力するHTMLファイルをダウンロードし、メタ情報に従いHTMLを再構成します。もちろんリンクもきちんと整備されます。さらには、EUC-JPをShift_JISに変換までしてくれます。
以上の作業に必要な機能は、すべてPythonに標準装備されているので、追加でモジュールをインストールする必要はありません。特別なことをすることなしに、たいていの作業ができてしまう。これは本当にPythonのメリットであると実感します。
以下のスクリプトを実行することで、ヘルプファイルが簡単に作成できます。
makehelp.py
これはUTF-8で保存する必要があります。
# -*- coding: utf-8 -*-
"""Build the BathyScaphe on-screen help from the project's Wiki.

Pipeline:
  1. download the Wiki page list and every listed page (EUC-JP),
  2. cut each page down to its body, collect the meta data lines
     (filePath / description / keywords) into a CSV file,
  3. rewrite headings, img/i/b tags and Wiki links for the help layout,
  4. wrap each page body in the static HTML templates (Shift_JIS output).
"""
import urllib.request
import codecs
import re
import string  # NOTE(review): unused in this script; kept deliberately
import os
import csv

# --- file / directory layout -------------------------------------------
listfile = 'list.txt'
metadatafile = 'meta.txt'
storedir = 'store/'
repdir = 'rep/'
resdir = 'result/'
rsrcdir = 'resource/'
tempfile = 'temp.txt'
baseurl = 'http://bathyscaphe.sourceforge.jp/cgi-bin/helpwiki/'

# NOTE(review): URLopener is deprecated in later Python 3 releases;
# urllib.request.urlopen() is the modern replacement. Kept for the
# Python 3.0 target of this article.
u = urllib.request.URLopener()

# --- patterns for slicing up the Wiki HTML -----------------------------
# (raw strings where \s / \d appear, so they are regex escapes and not
# Python string escapes)
reh1 = re.compile('<h1 id="titleh1">.*</h1>')
recuth1b = re.compile('^.*<h1 id="titleh1"><img class="h1v" src="/wiki/gfx/h1v_help.png" width="15" height="18" alt=".">')
recuth1a = re.compile('</h1>.*$')
repagebody = re.compile('<div class="wikipagebody">')
refooter = re.compile('</div><!-- end of pagebody -->')
reli = re.compile('<li>.*$')
redivopen = re.compile('<div.*')
redivclose = re.compile('</div>')
recutb = re.compile(r'^\s*<li>\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2} - <a href="')
recuta = re.compile('" class="wikipage">.*$')
remetafilepath = re.compile('<li><em class="italic">filePath</em>:')
remetadesc = re.compile('<li><em class="italic">description</em>:')
remetakws = re.compile('<li><em class="italic">keywords</em>:')
reliclose = re.compile('</li>')

# --- patterns for the tag / link rewrite pass --------------------------
reimgrep = re.compile(r'src=".*?"\salt="')
reimgtag = re.compile('<img')
realt = re.compile('alt=')
reh4a = re.compile('<h4')
reh4b = re.compile('</h4')
reh5a = re.compile('<h5')
reh5b = re.compile('</h5')
reh6a = re.compile('<h6')
reh6b = re.compile('</h6')
reia = re.compile('<i>')
reib = re.compile('</i>')
reba = re.compile('<b>')
rebb = re.compile('</b>')
reul = re.compile('<ul>')
rerelh = re.compile('<h2.*?>関連項目</h2>')


def _prepare_dirs():
    """Create every working directory that does not exist yet."""
    for d in (rsrcdir, storedir, repdir, resdir):
        if not os.path.exists(d):
            os.mkdir(d)
    print('prepared directories')


def _cleanup_dirs():
    """Remove the leftovers of a previous run (result/ may be one level deep)."""
    # 'if' is sufficient here -- the original looped with 'while', but one
    # successful remove() already makes exists() False.
    for name in (listfile, metadatafile, tempfile):
        if os.path.exists(rsrcdir + name):
            os.remove(rsrcdir + name)
    for d in (storedir, repdir):
        for filename in os.listdir(d):
            os.remove(d + filename)
    for filename in os.listdir(resdir):
        if os.path.isfile(resdir + filename):
            os.remove(resdir + filename)
        if os.path.isdir(resdir + filename):
            for filename2 in os.listdir(resdir + filename):
                os.remove(resdir + filename + '/' + filename2)
            os.rmdir(resdir + filename)
    print('cleanuped directories')


def _fetch(urlfname):
    """Download baseurl + urlfname into resource/temp.txt as raw bytes."""
    tfile = u.open(baseurl + urlfname)
    with open(rsrcdir + tempfile, 'wb') as tmpf:
        tmpf.write(tfile.read())


def _create_list():
    """Build resource/list.txt: one Wiki page URL fragment per line.

    The 'inbody' counter tracks <div> nesting so that only list items
    inside <div class="wikipagebody"> are kept; each kept <li> line is
    stripped down to the page link target.
    """
    _fetch('wiki.cgi?a=list')
    inbody = 0
    with codecs.open(rsrcdir + tempfile, 'r', 'euc-jp') as f:
        for li in f.readlines():
            if repagebody.search(li):
                inbody = inbody + 1
            if inbody != 0:
                if redivopen.search(li) and not repagebody.search(li):
                    inbody = inbody + 1
                if redivclose.search(li):
                    inbody = inbody - 1
                if refooter.search(li):
                    inbody = 0
            if inbody != 0:
                if reli.search(li):
                    li = recutb.sub('', li)
                    li = recuta.sub('', li)
                    with codecs.open(rsrcdir + listfile, 'a', 'shift_jis') as f2:
                        f2.write(li)
    print('created list file')


def _create_stored_files():
    """Download every listed page, keep its body in store/<title>.txt and
    write the collected [title, url, filePath, description, keywords]
    rows to resource/meta.txt (CSV, Shift_JIS).

    Pages without a filePath meta line are skipped entirely.
    """
    metalist = list()
    with codecs.open(rsrcdir + listfile, 'r', 'shift_jis') as filelist:
        for urlfname in filelist.readlines():
            filecontent = list()
            _fetch(urlfname)
            inbody = 0
            # NOTE(review): 'pagename' is only bound when an <h1> matches;
            # a page without one would reuse the previous iteration's title
            # (original behavior, preserved).
            with codecs.open(rsrcdir + tempfile, 'r', 'euc-jp') as f:
                for li in f.readlines():
                    if reh1.search(li):
                        # page title = <h1> text minus the decoration icon
                        pagename = recuth1b.sub('', li)
                        pagename = recuth1a.sub('', pagename)
                        pagename = pagename.strip('\n')
                    if repagebody.search(li):
                        inbody = inbody + 1
                    if inbody != 0:
                        if redivopen.search(li) and not repagebody.search(li):
                            inbody = inbody + 1
                        if redivclose.search(li):
                            inbody = inbody - 1
                        if refooter.search(li):
                            inbody = 0
                    if inbody != 0:
                        filecontent.append(li.strip('\n').strip('\r'))
            metafilepath = ''
            metadesc = ''
            metakws = ''
            for lins in filecontent:
                if remetafilepath.search(lins):
                    lins = reliclose.sub('', lins)
                    metafilepath = remetafilepath.sub('', lins).strip('\n').strip()
                if remetadesc.search(lins):
                    lins = reliclose.sub('', lins)
                    metadesc = remetadesc.sub('', lins).strip('\n').strip()
                if remetakws.search(lins):
                    lins = reliclose.sub('', lins)
                    metakws = remetakws.sub('', lins).strip('\n').strip()
            if metafilepath != '':
                urlfname = urlfname.strip('\n')
                metalist.append([pagename, urlfname, metafilepath,
                                 metadesc, metakws])
                # drop the opening line, then pop trailing lines up to and
                # including the final <hr> (the meta data block)
                filecontent.pop(0)
                i = True
                while i:
                    if re.search(r'^<hr>\s?$', filecontent[-1]):
                        i = False
                    filecontent.pop()
                with codecs.open(storedir + pagename + '.txt', 'w',
                                 'shift_jis') as f2:
                    for lins in filecontent:
                        f2.write(lins + '\n')
                print('created ' + pagename)
    with codecs.open(rsrcdir + metadatafile, 'w', 'shift_jis') as f:
        csv.writer(f).writerows(metalist)
    print('created meta data file')


def _replace_parameters():
    """Rewrite store/ pages into rep/: fix <img> sources, demote h4-h6 to
    h2-h4, replace <i>/<b> with <em>/<strong>, map Wiki links onto the
    help file layout and tag the list after the related-items heading."""
    relchk = False
    with codecs.open(rsrcdir + metadatafile, 'r', 'shift_jis') as metafile:
        reader = csv.reader(metafile)
        for tmeta in reader:
            rfilepath = storedir + tmeta[0] + '.txt'
            sfilepath = repdir + tmeta[0] + '.txt'
            turl = tmeta[2].split('/')
            print('processing : ' + tmeta[2])
            with codecs.open(rfilepath, 'r', 'shift_jis') as rfs:
                with codecs.open(sfilepath, 'w', 'shift_jis') as sfs:
                    for rf in rfs.readlines():
                        # img: point src at ../gfx/<alt text>
                        rf = reimgrep.sub('alt="', rf)
                        if reimgtag.search(rf):
                            for strimg in rf.split(' '):
                                strimg = strimg.strip()
                                if realt.search(strimg):
                                    strimg = strimg[5:-1]
                                    rf = rf.replace('alt="' + strimg,
                                                    'src="../gfx/' + strimg +
                                                    '" alt="' + strimg)
                        # demote headings by two levels
                        rf = reh4a.sub('<h2', rf)
                        rf = reh4b.sub('</h2', rf)
                        rf = reh5a.sub('<h3', rf)
                        rf = reh5b.sub('</h3', rf)
                        rf = reh6a.sub('<h4', rf)
                        rf = reh6b.sub('</h4', rf)
                        # logical markup instead of presentational tags
                        rf = reia.sub('<em>', rf)
                        rf = reib.sub('</em>', rf)
                        rf = reba.sub('<strong>', rf)
                        rf = rebb.sub('</strong>', rf)
                        # links: rewrite every known Wiki URL to the
                        # relative path of the generated help page
                        with codecs.open(rsrcdir + metadatafile, 'r',
                                         'shift_jis') as mfs:
                            for mmeta in csv.reader(mfs):
                                murl = mmeta[2].split('/')
                                if turl[0] == murl[0]:
                                    rf = rf.replace('"' + mmeta[1] + '"',
                                                    '"../' + murl[1] + '"')
                                else:
                                    rf = rf.replace('"' + mmeta[1] + '"',
                                                    '"../' + mmeta[2] + '"')
                        # the <ul> right after the related-items heading
                        # gets the rel_items class
                        if relchk:
                            rf = reul.sub('<ul class="rel_items">', rf)
                            relchk = False
                        if rerelh.search(rf):
                            relchk = True
                        sfs.write(rf)


def _write_template(sfs, name):
    """Copy the Shift_JIS template resource/<name> verbatim into sfs."""
    with codecs.open(rsrcdir + name, 'r', 'shift_jis') as html:
        for hl in html.readlines():
            sfs.write(hl)


def _make_html():
    """Assemble result/<dir>/<file> from the templates plus rep/ bodies."""
    with codecs.open(rsrcdir + metadatafile, 'r', 'shift_jis') as metafile:
        reader = csv.reader(metafile)
        for tmeta in reader:
            sfilepath = resdir + tmeta[2]
            turl = tmeta[2].split('/')
            print('make html : ' + tmeta[2])
            if not os.path.exists(resdir + turl[0]):
                os.mkdir(resdir + turl[0])
            with codecs.open(repdir + tmeta[0] + '.txt', 'r',
                             'shift_jis') as rfs:
                with codecs.open(sfilepath, 'w', 'shift_jis') as sfs:
                    _write_template(sfs, 'htmla.dat')
                    if tmeta[4] != '':
                        sfs.write('<meta name="keywords" content="' +
                                  tmeta[4] + '">\n')
                    if tmeta[3] != '':
                        sfs.write('<meta name="description" content="' +
                                  tmeta[3] + '">\n')
                    sfs.write('<title>' + tmeta[0] + '</title>\n')
                    if turl[0] == 'xpgs':
                        # index pages: index template, plain <h1>
                        _write_template(sfs, 'htmlix.dat')
                        sfs.write('<h1>' + tmeta[0] + '</h1>\n')
                    else:
                        # standard pages: standard template, icon in <h1>
                        _write_template(sfs, 'htmlst.dat')
                        sfs.write('<h1><img src="../gfx/icon32.png" ' +
                                  'alt="" title="" width="32" height="32" ' +
                                  'id="h1icon">' + tmeta[0] + '</h1>\n')
                    for rf in rfs.readlines():
                        sfs.write(rf)
                    _write_template(sfs, 'htmlz.dat')


if __name__ == '__main__':
    _prepare_dirs()
    _cleanup_dirs()
    _create_list()
    _create_stored_files()
    _replace_parameters()
    _make_html()
    print('done.')
以下のファイルは、Shift_JIS(LF)で./resource/以下に保存しておきます。
htmla.dat
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> <html lang="ja"> <head> <meta http-equiv="Content-Type" content="text/html; charset=Shift_JIS"> <meta http-equiv="Content-Style-Type" content="text/css">
htmlix.dat
<link rel="start" href="../BathyScapheHelp.html"> <link rel="index" href="./xall.html"> <link rel="stylesheet" href="../sty/index.css" media="screen" type="text/css"> </head> <body> <div id="navbar"> <div id="home"><a href="../BathyScapheHelp.html">BathyScaphe ヘルプ</a></div> <div id="index"><a href="./xall.html">索引</a></div> </div> <div id="contents">
htmlst.dat
<link rel="start" href="../BathyScapheHelp.html"> <link rel="index" href="../xpgs/xall.html"> <link rel="stylesheet" href="../sty/standard.css" media="screen" type="text/css"> </head> <body> <div id="navbar"> <div id="home"><a href="../BathyScapheHelp.html" rel="start">BathyScaphe ヘルプ</a></div> <div id="index"><a href="../xpgs/xall.html" rel="index">索引</a></div> </div> <div id="contents">