话不多说，奉上代码。
#!/usr/bin/env python
#coding=utf8
#对提取的数据进行预处理
def pretreat(infile,outfile):
rfile = open(infile,'r')
wfile = open(outfile,'wa+')
while(1):
line = rfile.readline()
if not line:
break
line = line.split('>')
#数据的长度,避免重复计算
lens = len(line)
#获得有效信息
for i in range(lens):
line[i] = line[i].split('/')
for i in range(lens):
#处理三元组第三个元素
#print line[i]
flag = 0
if '@zh' in line[i][0]:
line[i][0] = line[i][0].replace('@zh .','')
line[i][0] = line[i][0].replace('/','')
if '^^<http:' in line[i][0]:
flag = 1
line[i][0] = line[i][0].replace('^^<http:','')
line[i][0] = line[i][0].replace('/','')
print line[i][0]
wfile.write(line[i][0].strip())
if len(line[i]) >= 1 and i != 3 and 0 == flag:
if '/' in line[i][len(line[i])-1]:
line[i][len(line[i])-1] = line[i][len(line[i])-1].replace('/','')
wfile.write(line[i][len(line[i])-1].strip()+' ')
wfile.write('\n')
wfile.close()
#判断是否含有字母
def is_alphabet(input):
input = unicode(input,"utf-8")
buf = []
for uchar in input:
if (uchar >= u'\u0041' and uchar<=u'\u005a') or (uchar >= u'\u0061' and uchar<=u'\u007a'):
return True
else:
return False
#去除国家名中含有字母的三元组
def removealp(infile,outfile):
rfile = open(infile,'r')
wfile = open(outfile,'w')
while(1):
line = rfile.readline()
if not line:
break
linetmp = line
line = line.split(' ')
if False == is_alphabet(line[0]):
wfile.write(linetmp)
wfile.close()
# Run the preprocessing step: read the .nt input file and write the
# converted lines to the .txt output file.
pretreat(
    'article_categories_en_uris_zh.nt',
    'tag_article_categories_en_uris_zh.txt',
)