5173.com 希望抓取同行交易系统的业务信息用于数据分析,于是向我提出了这个需求;我花了 1 天时间用 Python 完成。
# -*- coding:utf-8 -*-
# Scan xunbao173.com web pages and record the listings that are found.
# zhangbin 2010.5.12 5173.com
#
# NOTE: this is a Python 2 script (httplib, byte-string decode/encode).
import sys
import os
import os.path

import traceback
import threading
import time
import struct
import zlib
import copy
import socket
import select
#import psycopg2
import httplib
import re

import log
import config


# Paging helper found in the scraped site's JavaScript, kept for reference:
#function Paging(total) {
#	this.pageSize = 10;  // records shown per page
#	this.step = 5;       // maximum number of page links shown
#	this.total = total;  // total number of records
#}

'''

CREATE DATABASE htmlgrep
  WITH OWNER = postgres
       ENCODING = 'UTF8';


CREATE TABLE htmlGrep
(
   id integer,
   item_name character(60) NOT NULL,
   price_s character(40),
   rank integer,
   appear_time integer NOT NULL,
   disappear_time integer NOT NULL
) WITH (OIDS=TRUE)
;

'''

g_conf = config.SimpleConfig()
g_conf.open('grep.conf')
g_dbconn = None
g_logger = log.Logger('hgrep.log')
# Raw-HTML dump file; opened by main(), stays None when nothing is dumped.
g_flog = None

# Session cookie captured by scanGameServers() and reused by getPageHtml().
g_cookie = ''

#def getDBConn():
#	global g_dbconn
#	try:
#		if g_dbconn == None:
#			dbhost=g_conf.getPropertyValue('dbhost','localhost')
#			dbname='gamegrep'
#			dbuser=g_conf.getPropertyValue('dbuser','postgres')
#			dbpasswd=g_conf.getPropertyValue('dbpasswd','111111')
#			g_dbconn = psycopg2.connect(host=dbhost,database=dbname,user=dbuser,password=dbpasswd)
#	except:
#		g_logger.error(traceback.format_exc())
#	return g_dbconn


def _buildHeaders(cookie):
    """Return the browser-like request headers used for every session request."""
    return {
        'Cookie': cookie,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'GB2312,utf-8;q=0.7,*;q=0.7',
        'Accept-Language': 'zh-cn,zh;q=0.5',
        'Keep-Alive': '300',
        'Connection': 'keep-alive',
        'Accept-Encoding': 'gzip,deflate',
    }


def _decodeBody(contentEncoding, body):
    """Undo the Content-Encoding of a response body.

    The request headers advertise ``gzip,deflate``, so a compressed reply
    must be inflated before the regular expressions can match anything.
    Bodies with any other (or no) encoding are returned unchanged.
    """
    encoding = (contentEncoding or '').lower()
    if encoding == 'gzip':
        # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
        return zlib.decompress(body, 16 + zlib.MAX_WBITS)
    if encoding == 'deflate':
        try:
            return zlib.decompress(body)
        except zlib.error:
            # Some servers send a raw deflate stream without the zlib wrapper.
            return zlib.decompress(body, -zlib.MAX_WBITS)
    return body


def _httpGet(path, headers=None):
    """GET ``path`` from the configured 'root.site' host.

    Returns a ``(body, setCookie)`` tuple where ``setCookie`` is the raw
    Set-Cookie header value, or None when the server sent none.  The
    connection is always closed, including when the request fails.
    """
    gamesite = g_conf.getPropertyValue('root.site')
    conn = httplib.HTTPConnection(gamesite)
    try:
        if headers is None:
            conn.request("GET", path)
        else:
            conn.request("GET", path, '', headers)
        resp = conn.getresponse()
        body = _decodeBody(resp.getheader('content-encoding'), resp.read())
        return body, resp.getheader('set-cookie')
    finally:
        conn.close()


# Retrieve the page total.
#
# The value is read from hidden inputs such as:
#   <input type="hidden" id="currentPage" value="1"/>
#   <input type="hidden" id="orderBy" value=""/>
#   <input type="hidden" id="pageTotal" value="24"/>

def getPageNum(html):
    """Return the ``pageTotal`` hidden-input value found in ``html``.

    The value is returned as a string, exactly as it appears in the page.
    '0' is returned when the input is missing, so callers can always pass
    the result to ``int()`` instead of hitting an IndexError.
    """
    ms = re.findall("id=\"pageTotal\" value=\"(.*?)\"", html, re.S)
    if not ms:
        return '0'
    return ms[0]


def getPageHtml(game, idx):
    """Fetch page ``idx`` of the buy listing for ``game`` and return its HTML.

    The request carries the session cookie stored in ``g_cookie``.
    """
    path = "/%s/buy.gsp?keyWord=&groupName=&orderBy=&page=%s" % (game, idx)
    body, _ = _httpGet(path, _buildHeaders(g_cookie))
    return body


def scanRecordsOfHtml(f, html, serverid, serverName):
    """Extract listing records from ``html`` and append them to file ``f``.

    One comma-separated line is written per match: the four regex groups
    (realName, rank, price, first linkTo argument) followed by ``serverid``
    and ``serverName``.
    """
    regex = r"""<span class="realName">(.*?)</span>.*?rank.*?>(.*?)</dd>.*?price.*?>(.*?)</dd>.*?linkTo\('(.*?)',"""
    for n in re.findall(regex, html, re.S):
        f.write("%s,%s,%s,%s,%s,%s\n" % (n[0], n[1], n[2], n[3], serverid, serverName))


def _scanOneServer(game, server, hdr):
    """Scrape every listing page of one server into '<server title>.txt'.

    ``server`` is an ``(aid, id, title)`` tuple taken from the server list.
    """
    print(server)
    f = open(server[2].decode('utf-8').encode('gb2312') + '.txt', 'w')
    try:
        print("/%s/%s" % (game, server[0]))
        url = "getServerList?aid=%s&id=%s" % (server[:2])
        print(url)

        # The body of this reply is not used.
        # NOTE(review): presumably this request selects the server for the
        # session so that buy.gsp lists its items — confirm.
        _httpGet("/%s/%s" % (game, url), hdr)
        print('have a sleep')
        time.sleep(.2)

        html, _ = _httpGet("/%s/%s" % (game, 'buy.gsp'), hdr)

        PAGE_SIZE = 10
        # NOTE(review): assumes pageTotal is a record count (10 per page),
        # as the Paging() snippet above suggests — confirm against the site.
        pageCount = int(getPageNum(html)) // PAGE_SIZE + 1

        for page in range(1, pageCount + 1):
            print('attempt to grep Game=%s Page=%s' % (game, page))
            html = getPageHtml(game, page)
            if g_flog is not None:
                g_flog.write(html)
            scanRecordsOfHtml(f, html, server[1], server[2])
    finally:
        # Close the output file even when a request or parse step fails.
        f.close()


# Scan game servers  [{name,url}]
def scanGameServers(game):
    """Discover the servers of ``game`` and scrape the listings of each one.

    The session cookie from the server-list reply is stored in ``g_cookie``.
    A failure on one server is logged through ``g_logger`` and does not stop
    the remaining servers.  Returns None.
    """
    global g_cookie
    html, setCookie = _httpGet("/%s/getServerList" % (game))
    ms = re.findall("<div class=\"ser_area_list\">(.*?)</div>", html, re.S)
    if len(ms) != 2:
        print('Html content invalid!')
        return
    html = ms[1]
    # Links look like: "getServerList?aid=15&id=1136"
    ms = re.findall("<a href=\".*?aid=(.*?)&id=(.*?)\">.*?title=\"(.*?)\".*?</a>", html, re.S)

    if len(ms) == 0:
        print('Game:%s is null!' % (game))
        return
    print('%s servers Found' % len(ms))
    # Keep only the first "name=value" pair; a missing Set-Cookie header
    # yields an empty cookie instead of an AttributeError.
    cookie = (setCookie or '').split(';')[0]
    g_cookie = cookie
    hdr = _buildHeaders(cookie)

    # Enter each game server in turn.
    for server in ms:
        try:
            _scanOneServer(game, server, hdr)
        except Exception:
            g_logger.error(traceback.format_exc())


#def scanGameServers2(game):
#	url = "/%s/buy.gsp"%(game)
#	gamesite=g_conf.getPropertyValue('root.site')
#	conn = httplib.HTTPConnection(gamesite)
#	conn.request("GET", url)
#	print url
#	r1 = conn.getresponse()
#	html = r1.read()
#	#print html
#	print html
#	g_flog.write( html)
#
#def scanRecords(file):
#	f = open(file,'r')
#	html = f.read()
#	f.close()
#	regex = '''<span class="realName">(.*?)</span>.*?rank.*?>(.*?)</dd>.*?price.*?>(.*?)</dd>.*?linkTo\('(.*?)','''
#	ms = re.findall(regex,html,re.S)
#	f = open('hgrep.rec.txt','w')
#	conn = getDBConn()
#
#	for n in ms:
#		f.write("%s,%s,%s,%s\n"%n)
#		try:
#			cr = conn.cursor()
#			sql = "select count(*) from htmlgrep where id=%s"%(n[3])
#			cr.execute(sql)
#
#			rs = cr.fetchone()
#			if rs[0] == 0 :
#			#if 1:
#				#cr = conn.cursor()
#				sql="insert into htmlgrep values(%s,%s,%s,%s,%s,%s);"
#				cr.execute(sql,( int(n[3]),n[0],n[2],n[1],int(time.time()),0,))
#				conn.commit()
#			else:
#				sql = "update htmlgrep set disappear_time=0 where id=%s"%(int(n[3]))
#				cr.execute(sql)
#				conn.commit()
#
#		except:
#			g_logger.error(traceback.format_exc())
#	# If a row in the db is not present in the current scrape, mark the
#	# item as disappeared and record the time it disappeared.
#	cr = conn.cursor()
#	cr.execute('select id from htmlgrep order by id')
#	rs = cr.fetchone()
#	while rs:
#		found = False
#		for n in ms:
#			if int(n[3]) == rs[0]:
#				found = True
#				break
#		if not found:
#			cr2 = conn.cursor()
#			sql = "update htmlgrep set disappear_time=%s where id=%s"%(int(time.time()),rs[0])
#			cr2.execute(sql)
#		rs = cr.fetchone()
#	conn.commit()
#	f.close()
#	#print str(ms)
##############################################################


class sepApp:
    """Application skeleton; currently only holds a config and a logger."""

    def __init__(self):
        self._conf = config.SimpleConfig()
        # getLogger() reads this attribute, so it must exist from the start.
        self._log = g_logger

    def getConfig(self):
        """Return the SimpleConfig instance created in __init__."""
        return self._conf

    #def getDBConn(self):
    #	try:
    #		if self.dbconn == None:
    #			dbhost=self.getPropertyValue('dbhost','localhost')
    #			dbname=self.getPropertyValue('dbname','IpRedirect')
    #			dbuser=self.getPropertyValue('dbuser','postgres')
    #			dbpasswd=self.getPropertyValue('dbpasswd','111111')
    #			self.dbconn = psycopg2.connect(host=dbhost,database=dbname,user=dbuser,password=dbpasswd)
    #	except:
    #		self._log.error(traceback.format_exc())
    #	return self.dbconn

    #def resetDBConn(self):
    #	self.dbconn = None

    def getLogger(self):
        """Return the logger used by this application object."""
        return self._log

    def run(self, args=None):
        """Run the application; a placeholder that always returns 0.

        ``args`` is optional so both ``run()`` and ``run(args)`` work.
        """
        return 0


##############################################################
##############################################################

#scanRecords('c:/test - Copy.html')


'''
sql test:
---------------------
--select count(*) from htmlgrep
--select id,count(id) from htmlgrep group by id limit 100
--select * from htmlgrep where id = 2310
--delete from htmlgrep

'''


def main(argv):
    """Script entry point; returns the process exit code.

    With no argument, or with ``scan``, the 'mhzx' game is scanned and the
    raw HTML is dumped to c:/test.txt.  Scanning with no argument matches
    what the script did before, when the scan ran unconditionally at import
    time.  Any other argument prints the usage line; ``build`` is listed
    there but its implementation is commented out.
    """
    global g_flog
    if len(argv) >= 2 and argv[1] != 'scan':
        print('usage: grep.py scan | build')
        return 1
    g_flog = open('c:/test.txt', 'w')
    try:
        scanGameServers('mhzx')
    finally:
        g_flog.close()
        g_flog = None
    #if sys.argv[1]=='build':
    #	scanRecords('c:/test.txt')
    #server = sepApp()
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
|