是的,这两天我在玩Python!
为了给LAC添加内置的词典, 需要将通过Lingoes-Extractor解出的数据导入到LAC使用的Sqlite中. 这个过程并不复杂 -- 解开ld2文件数据输出每条记录到一个文本文件中,然后处理每一条记录导入到Sqlite中即可,要是用C++来实现,也就两个晚上的事情,但,但这次我又蛋疼了...
大约一个月前,工作中碰到一段Python脚本,对于我这样习惯C++的人来说,看类似Python脚本跟天书差不多了...虽然上半年看了几天Perl,但到今天也只记得名字了...
于是这次'痛定思痛',决定搞起一个脚本语言来.于是就有了这次蛋疼的事情 -- 用Python实现LD2到Sqlite的导入.
不多说,有兴趣的直接看Python脚本吧...
htmlparser.py
#!/usr/bin/python
# coding:utf-8
import string
from HTMLParser import HTMLParser
class MyParser(HTMLParser):
    """Streaming parser that fills a record object from one LD2 record body.

    A record body looks like::

        <C><E>forms</E><F><H><M>phonetic</M></H>
        <I><N><U>pos</U> meaning</N></I></F></C>

    HTMLParser reports tag names in lower case, so all comparisons below use
    lower-case tags.  The most recently opened recognised tag (``flag``)
    decides where the next run of text is stored.  Unrecognised tags such as
    ``<h>`` leave the state untouched.

    Info slot layout: the infos of field number ``k`` of a record are stored
    in slots ``k * 5 .. k * 5 + 4`` of that field's ``info`` list, which is
    the region ``data2xml.addInfo`` slices out for field ``k``.
    """

    # Width of the info region per field; must agree with the ``* 5`` slice
    # used by data2xml.addInfo.
    INFO_SLOTS_PER_FIELD = 5

    # Value of ``flag`` selected by each recognised start tag.
    TAG_FLAGS = {
        'c': 0,  # content (record container)
        'e': 1,  # extend
        'f': 2,  # field
        'l': 3,  # link
        'm': 4,  # symbol
        'i': 5,  # info
        'n': 6,  # meaning
        'u': 7,  # category
    }

    result = 0       # record being filled; replaced by parse()
    levelField = -1  # index of the current field, -1 before the first <f>
    levelInfo = -1   # index of the current info within the field, -1 before its first <i>
    flag = -1        # state of the last recognised tag, -1 before any

    def _current_field(self):
        """Return the field being filled, or None before the first ``<f>``."""
        if self.levelField < 0:
            return None
        return self.result.field[self.levelField]

    def _current_info(self):
        """Return the info slot being filled, or None outside any ``<i>``."""
        field = self._current_field()
        if field is None or self.levelInfo < 0:
            return None
        slot = self.levelField * self.INFO_SLOTS_PER_FIELD + self.levelInfo
        # Grow the slot list on demand instead of relying on a fixed size.
        while len(field.info) <= slot:
            field.info.append(DictInfo())
        return field.info[slot]

    def handle_starttag(self, tag, attrs):
        """Update the parser state for a start tag; ``attrs`` is ignored."""
        new_flag = self.TAG_FLAGS.get(tag)
        if new_flag is None:
            return
        if tag == 'f':
            self.result.field.append(DictField())
            self.levelField += 1
            # Info numbering restarts in every field so that each field's
            # infos begin at the first slot of its own region.
            self.levelInfo = -1
        elif tag == 'i':
            self.levelInfo += 1
        self.flag = new_flag

    def handle_endtag(self, tag):
        """After ``</u>`` the remaining text of the ``<n>`` is the meaning."""
        if tag == 'u':
            self.flag = self.TAG_FLAGS['n']

    def handle_data(self, data):
        """Store a run of text according to the current ``flag``.

        Text that belongs to a field or an info is ignored when no field or
        info has been opened yet, rather than indexing with -1.
        """
        if self.flag == 1:
            self.result.extend.append(data)
        elif self.flag == 3:
            field = self._current_field()
            if field is not None:
                field.link = data
        elif self.flag == 4:
            field = self._current_field()
            if field is not None:
                field.symbol = data
        elif self.flag == 6:
            info = self._current_info()
            if info is not None:
                info.meaning = data
        elif self.flag == 7:
            info = self._current_info()
            if info is not None:
                info.category = data

    def parse(self, html, data):
        """Feed ``html`` to the parser, storing what is found in ``data``.

        ``data`` must provide ``extend`` and ``field`` lists.
        """
        self.result = data
        self.feed(html)
class DictInfo:
    """One category/meaning pair belonging to a dictionary field.

    Both attributes default to the empty string, which the XML writer treats
    as "not set".
    """

    category = ''
    meaning = ''

    def __str__(self):
        """Return a bracketed debug representation of both attributes."""
        values = (self.category, self.meaning)
        return '[category = %s meaning = %s]' % values
class DictField:
    """One field of a dictionary record: symbol, link and info slots.

    ``info`` starts as 25 blank ``DictInfo`` slots.  It is created per
    instance: as a class attribute the list would be one object shared by
    every field ever created, so fields would overwrite each other's infos.
    """

    symbol = ''
    link = ''

    def __init__(self):
        self.info = [DictInfo() for _ in range(25)]

    def __str__(self):
        """Return a bracketed debug representation including every slot."""
        # ' '.join is the method form of string.join with its default
        # separator and is available on both Python 2 and 3.
        infos = ' '.join(map(str, self.info))
        return '[symbol = %s | link = %s info = %s]' % (self.symbol, self.link, infos)
class DictData:
    """Everything parsed from one record: headword, extends and fields.

    ``extend`` (list of strings) and ``field`` (list of field objects) are
    created per instance.  As class attributes they were single lists shared
    by all records, so every new record kept the entries of all earlier ones.
    """

    word = ''

    def __init__(self):
        self.extend = []  # strings collected from <e> elements
        self.field = []   # one entry per <f> element

    def __str__(self):
        """Return a one-line debug representation of the record."""
        # ' '.join is the method form of string.join with its default
        # separator and is available on both Python 2 and 3.
        extends = ' '.join(map(str, self.extend))
        fields = ' '.join(map(str, self.field))
        return 'word = %s extend = %s field = %s' % (self.word, extends, fields)
def parseHtml(html, output):
    """Parse the markup ``html`` of one record into ``output``.

    A fresh ``MyParser`` is used for every call; ``close()`` is called
    afterwards so the parser processes any text it is still buffering.
    """
    record_parser = MyParser()
    record_parser.parse(html, output)
    record_parser.close()
def analyseLine(str, output):
    """Split one ``word = markup`` line and parse the markup into ``output``.

    ``str`` is the line without its trailing newline (the parameter name
    shadows the builtin and is kept only for existing callers).  The text
    before the first ``' ='`` becomes ``output.word``; the text after it,
    minus one optional leading space, is handed to ``parseHtml``.

    A line without ``' ='`` is stored whole as the headword and nothing is
    parsed.  Previously ``find`` returned -1 there, which cut the last
    character off the word and parsed an arbitrary tail of the line.
    """
    line = str
    marker = line.find(' =')
    if marker < 0:
        output.word = line
        return
    output.word = line[:marker]
    html = line[marker + 2:]
    # Accept both "word = <C>..." and "word =<C>..." without eating the '<'.
    if html.startswith(' '):
        html = html[1:]
    # An empty body has nothing to parse, so skip creating a parser.
    if html:
        parseHtml(html, output)
这个实现了从解开后的ld2记录到内部数据的解析;
data2xml.py
import string
#<X>
#<D>dictid</D>
#<E>E1</E>
#<E>E2</E>
#<F>
#<S>Symbol</S>
#<L>Link</L>
#<I>
#<C>category</C>
#<M>Meaning</M>
#</I>
#<I>
#<C>category</C>
#<M>Meaning</M>
#</I>
#</F>
#<F>
#<S>Symbol</S>
#<L>Link</L>
#<I>
#<C>category</C>
#<M>Meaning</M>
#</I>
#</F>
#</X>
def addtag(list, stag, etag):
    """Wrap every item of ``list`` in ``stag``/``etag`` and concatenate them.

    Surrounding spaces are stripped from each item.  An empty ``list`` gives
    an empty string.  (The parameter name shadows the builtin and is kept
    only for existing callers.)

    The original loop assigned instead of appending, so only the last item
    survived; every item is emitted now.
    """
    return ''.join([stag + item.strip(' ') + etag for item in list])
def addExtend(extend):
    """Render the list of extend strings as ``<e>`` elements via ``addtag``."""
    open_tag, close_tag = '<e>', '</e>'
    return addtag(extend, open_tag, close_tag)
def addInfo(info, index):
    """Render the info slots of field number ``index`` as ``<i>`` elements.

    Only the slice ``info[index * 5:(index + 1) * 5]`` is looked at, and
    rendering stops at the first slot whose category and meaning are both
    empty.  Inside each ``<i>``, a non-empty category becomes ``<c>`` and a
    non-empty meaning becomes ``<m>``, each stripped of surrounding spaces.
    An empty ``info`` list gives an empty string.
    """
    if not len(info) > 0:
        return ''
    first = index * 5
    pieces = []
    for entry in info[first:first + 5]:
        if entry.category == '' and entry.meaning == '':
            break
        pieces.append('<i>')
        if entry.category != '':
            pieces.append('<c>' + entry.category.strip(' ') + '</c>')
        if entry.meaning != '':
            pieces.append('<m>' + entry.meaning.strip(' ') + '</m>')
        pieces.append('</i>')
    return ''.join(pieces)
def addSubField(f, index):
    """Render the content of one field: symbol, link, then its infos.

    ``f`` provides ``symbol``, ``link`` and ``info``; ``index`` is the
    position of the field within its record and is passed on to ``addInfo``.
    Empty symbol/link values are omitted; present ones are stripped of
    surrounding spaces.

    The link's closing tag was previously written as ``/l>`` (missing
    ``<``), which produced malformed markup.
    """
    parts = []
    if f.symbol != '':
        parts.append('<s>' + f.symbol.strip(' ') + '</s>')
    if f.link != '':
        parts.append('<l>' + f.link.strip(' ') + '</l>')
    parts.append(addInfo(f.info, index))
    return ''.join(parts)
def addField(field):
    """Render every field of a record as its own ``<f>...</f>`` element.

    The format described in the comment at the top of this module has one
    ``<F>`` element per field; the original code opened a single ``<f>``
    around all fields together.  Records with exactly one field produce the
    same output as before.  An empty ``field`` list gives an empty string.
    """
    parts = []
    for index, f in enumerate(field):
        parts.append('<f>' + addSubField(f, index) + '</f>')
    return ''.join(parts)
def data2xml(data):
    """Serialise a parsed record into the ``<x>...</x>`` string.

    The result is ``<x>``, a fixed ``<d>1</d>`` element, the rendered
    ``data.extend`` list, the rendered ``data.field`` list and ``</x>``, in
    that order.
    """
    pieces = [
        '<x>',
        '<d>1</d>',
        addExtend(data.extend),
        addField(data.field),
        "</x>",
    ]
    return ''.join(pieces)
这个实现了从内部数据到指定xml字串的处理;(蛋疼啊,ld2本来的数据也是xml格式的...但为了展现我'高超'的python功底,自己又定义了一次格式...)
dbaccess.py
#!/usr/bin/python
# coding:utf-8
import sqlite3 as sqlite
import re
def table_create(conn):
    """Create the Word, Src and Dict tables on ``conn`` if they are missing.

    The statements are executed in the order Word, Src, Dict.  No explicit
    commit is issued here.
    """
    schema = (
        'CREATE TABLE IF NOT EXISTS Word (wordid INTEGER PRIMARY KEY, word TEXT, flag INTEGER)',
        'CREATE TABLE IF NOT EXISTS Src (srcid INTEGER PRIMARY KEY, wordid INTEGER, fmt INTEGER, orig INTEGER, content TEXT)',
        'CREATE TABLE IF NOT EXISTS Dict (dictid INTEGER PRIMARY KEY, title TEXT)',
    )
    cur = conn.cursor()
    for statement in schema:
        cur.execute(statement)
def add_dict(conn, title):
    """Insert a dictionary named ``title`` into the Dict table and commit.

    The title is bound as a query parameter instead of being pasted into the
    SQL text, so titles containing quotes are stored correctly.
    """
    # Python 2's sqlite3 rejects non-ASCII byte strings as parameters, so
    # byte strings are decoded first (no-op for text on Python 3).
    # NOTE(review): assumes byte-string input is UTF-8 -- confirm.
    if isinstance(title, bytes):
        title = title.decode('utf-8')
    cursor = conn.cursor()
    cursor.execute('INSERT INTO Dict (title) VALUES (?)', (title,))
    conn.commit()
def add_record(conn, word, record):
    """Insert ``word`` into Word and its ``record`` text into Src.

    The Word row gets flag 1; the Src row references the new word's row id
    and gets fmt 3 and orig 1.  Nothing is committed here; the caller commits
    (for example through ``db_close``).

    Both values are bound as query parameters.  The original pasted them into
    double-quoted SQL text, which failed for words containing ``"`` and had
    to delete every ``"`` from the record; the record is now stored verbatim.
    """
    # Python 2's sqlite3 rejects non-ASCII byte strings as parameters, so
    # byte strings are decoded first (no-op for text on Python 3).
    # NOTE(review): assumes byte-string input is UTF-8 -- confirm.
    if isinstance(word, bytes):
        word = word.decode('utf-8')
    if isinstance(record, bytes):
        record = record.decode('utf-8')
    cursor = conn.cursor()
    cursor.execute('INSERT INTO Word (word, flag) VALUES (?, 1)', (word,))
    word_id = cursor.lastrowid
    cursor.execute(
        'INSERT INTO Src (wordid, fmt, orig, content) VALUES (?, 3, 1, ?)',
        (word_id, record))
def db_create(dbfile):
    """Open the SQLite database stored at ``dbfile`` and return the connection."""
    connection = sqlite.connect(dbfile)
    return connection
def db_close(conn):
    """Commit whatever is pending on ``conn`` and then close the connection."""
    conn.commit()
    conn.close()
def db_test(conn):
    """Smoke test: insert the word ``1234`` with flag 1 into Word and commit.

    Strings are immutable, so the original ``record.replace(...)`` call threw
    its result away and the quotes stayed in the value, producing invalid
    SQL.  The result is now kept and the value is bound as a parameter.
    """
    cursor = conn.cursor()
    record = '"1234"'
    record = record.replace('"', '')
    cursor.execute('INSERT INTO Word (word, flag) VALUES (?, 1)', (record,))
    conn.commit()
#######################################3
#conn = db_create('../data/lac.db')
#add_dict(conn, 'test')
#db_close(conn)
这个实现了相关的数据库功能,包括主要的创建,写入等;
#!/usr/bin/python
# coding:utf-8
import string
import htmlparser
import data2xml
import dbaccess
def main():
    """Import every record of ../data/output.txt into ../data/lac.db3.

    Creates the tables if needed, registers the dictionary title, then for
    each input line parses the record and stores its headword together with
    the generated XML string.

    The input file is always closed.  The connection is committed and closed
    through ``dbaccess.db_close`` only when every line was imported; on an
    error it is closed without that final commit.  ``dbaccess.add_dict``
    commits on its own, so the Dict row may persist even after a failure.
    """
    source = open("../data/output.txt", "r")
    try:
        conn = dbaccess.db_create("../data/lac.db3")
        finished = False
        try:
            dbaccess.table_create(conn)
            dbaccess.add_dict(conn, 'Vicon English-Chinese(S) Dictionary')
            for line in source:
                # A fresh record object per line keeps records independent.
                data = htmlparser.DictData()
                htmlparser.analyseLine(line.rstrip('\n'), data)
                dbaccess.add_record(conn, data.word, data2xml.data2xml(data))
            finished = True
        finally:
            if finished:
                dbaccess.db_close(conn)
            else:
                conn.close()
    finally:
        source.close()


# Executed unconditionally, as in the original script.
main()
这个就是main入口了...
如何?俺写的Python脚本如何?要是俺跟你说,一个月前,我连Python都会拼错,现在却可以写出如此'长'的Python脚本来了...你是觉得我很猛,还是觉得Python很简单呢...
虽然在编写Python脚本的时候,碰到了各种郁闷错误,各种坑爹的用法,但我还是满喜欢Python的,
总比使用由那位获得两届IOCCC大奖的家伙创建的Perl的感觉爽了很多很多啊....
<---- 松口气的分割线 ---->
昨晚终于搞定了这最终的Python脚本,但测试的结果不是很满意. 生成LAC的sqlite数据需要将近3个小时不说,这超过80MB的数据文件更加让人崩溃...不过总的来说,终于可以先暂时放下这个数据导入的问题,继续编写LAC了...
这个月工作上有些变故,也一直难以静心敲字,到上周也算告一段落了...终于可以放松下了...