#!/usr/bin/env python
#-*-coding:UTF-8 -*-
#
#
#Copyright (c)
#Laiseek Company 2012
#All rights reserved.
#
#Filename:nearsyn.py
#Function: preprocessing tool
#
#Current version:1.0
#author: Chen Yu
#Date: 05/07/2012
#
import re
class NearSyn:
    """Text preprocessing helpers operating on space-separated word lists.

    Attributes:
        syn: list of word lists, one per line of the file given to
            ``load`` as ``filename``.
        det: per-line entries of the file given to ``load`` as ``detfile``;
            raw strings after ``load``, converted to word lists by ``dete``.
    """

    def __init__(self):
        # The class originally only offered ``init``, which Python never
        # calls automatically; delegate so new instances start out empty
        # instead of lacking the ``syn``/``det`` attributes altogether.
        self.init()

    def init(self):
        """Reset the instance to an empty state (kept for existing callers)."""
        self.syn = []
        self.det = []

    def load(self, filename, detfile):
        """Load the word-list file and the deletion file.

        Args:
            filename: path of a text file; every line is split on single
                spaces and stored in ``self.syn``.
            detfile: path of a text file; its lines are stored unsplit in
                ``self.det``.
        """
        with open(filename, 'r') as sock:
            self.syn = [line.split(' ') for line in sock.read().split('\n')]
        with open(detfile, 'r') as sock:
            self.det = sock.read().split('\n')

    def empty(self):
        """Drop empty and single-space tokens from every word list in place."""
        for words in self.syn:
            # Slice assignment keeps the same list objects alive, so any
            # outside reference to a line still sees the cleaned content.
            words[:] = [w for w in words if w != " " and len(w) != 0]

    def dete(self):
        """Remove the deletion words of line ``i`` from word list ``i``.

        Only line numbers present in both ``self.syn`` and ``self.det`` are
        processed. Words that do not occur in the word list are skipped, and
        for each deletion word only its first occurrence is removed.
        Calling the method again is harmless.
        """
        paired = min(len(self.syn), len(self.det))
        for i in range(paired):
            # Entries are raw strings right after load(); an earlier dete()
            # call leaves lists behind, which must not be split again.
            if not isinstance(self.det[i], list):
                self.det[i] = self.det[i].split(' ')
            words = self.syn[i]
            for word in self.det[i]:
                if word and word in words:
                    words.remove(word)

    def merge(self, filename1, filename2, outfile='app'):
        """Join two files line by line, separated by a single space.

        Prints the number of lines read from ``filename1`` to stdout.

        Args:
            filename1: path of the file providing the left-hand parts.
            filename2: path of the file providing the right-hand parts; it
                must have at least as many lines as ``filename1``.
            outfile: path of the file to write (defaults to ``'app'``).

        Raises:
            IndexError: if ``filename2`` has fewer lines than ``filename1``.
        """
        with open(filename1, 'r') as sock:
            buf1 = sock.read().split('\n')
        with open(filename2, 'r') as sock:
            buf2 = sock.read().split('\n')
        print(len(buf1))
        # Build every line before touching the output so that a too-short
        # second file fails without leaving a truncated result behind.
        merged = [buf1[i] + ' ' + buf2[i] + '\n' for i in range(len(buf1))]
        with open(outfile, 'w') as sock:
            sock.writelines(merged)

    def unrep(self, filename, outfile='sims2'):
        """Write the lines of ``filename`` without duplicates.

        The first occurrence of every line is kept and the original order
        is preserved.

        Args:
            filename: path of the file to read.
            outfile: path of the file to write (defaults to ``'sims2'``).
        """
        with open(filename, 'r') as sock:
            buf = sock.read().split('\n')
        seen = set()
        unique = []
        for line in buf:
            if line not in seen:
                seen.add(line)
                unique.append(line)
        with open(outfile, 'w') as sock:
            sock.writelines(line + '\n' for line in unique)

    def offhtml(self, infile, outfile):
        """Copy ``infile`` to ``outfile`` with HTML tags removed.

        A tag is ``<`` followed by a character other than ``!`` or ``>``
        and everything up to the next ``>`` (newlines included), so
        ``<!-- ... -->`` style markup is left untouched.

        Args:
            infile: path of the file to read.
            outfile: path of the file to write.
        """
        with open(infile, 'r') as rfile:
            buf = rfile.read()
        # ``[^>]`` already matches newlines; the former ``(?:[^>]|\n)*``
        # matched the same text but could backtrack exponentially on an
        # unterminated tag followed by many newlines.
        buf = re.sub(r"<[^!>][^>]*>", '', buf)
        with open(outfile, 'w') as wfile:
            wfile.write(buf)

    def output(self, filename):
        """Write every word list holding more than one word to ``filename``.

        Each word is followed by a single space and each list ends with a
        newline; lists with zero or one word are omitted.

        Args:
            filename: path of the file to write.
        """
        with open(filename, 'w') as sock:
            for words in self.syn:
                if len(words) > 1:
                    sock.write(' '.join(words) + ' \n')
# Script entry point: only builds a NearSyn instance when run directly;
# no files are loaded or written here.
if __name__ == "__main__":
    t = NearSyn()