Posted on 2012-09-17 14:20
polly 阅读(444)
评论(0) 编辑 收藏 引用 所属分类:
Python
import cookielib
import sys
import urllib
import urllib2
from sgmllib import SGMLParser
class spider(SGMLParser):
    """Log in to renren.com and collect text from the page that comes back.

    The parser reacts to three tags:

    * ``<h3>``  -- text inside an ``<a>`` is accumulated into ``names``;
      any other text inside the ``<h3>`` is stored under that name.
    * ``<div class="content">`` that starts after an ``</h3>`` -- all text
      inside it (including nested ``<div>``s) is stored under the same name.
    * ``<a>`` -- only tracked while inside an ``<h3>`` or a content ``<div>``.

    Results end up in ``dic``, mapping the accumulated name string to the
    list of text fragments recorded for it.

    NOTE: this is Python 2 code (``sgmllib``, ``urllib2``, ``cookielib``).
    """

    def __init__(self, email, password):
        """Store the credentials and install a cookie-aware URL opener.

        email    -- account e-mail sent as the ``email`` form field.
        password -- account password sent as the ``password`` form field.
        """
        SGMLParser.__init__(self)

        # Parser state.
        self.h3 = False           # inside an <h3>
        self.h3_is_ready = False  # an </h3> was seen; a content <div> may follow
        self.div = False          # inside a <div class="content">
        self.h3_and_div = False   # the content <div> is tied to the last <h3>
        self.a = False            # inside an <a> within an <h3> or content <div>
        self.depth = 0            # <div>s opened inside the content <div>
        self.names = ""           # <a> text gathered inside the current <h3>
        self.dic = {}             # names -> list of recorded text fragments

        self.email = email
        self.password = password
        self.domain = 'renren.com'
        self.file = None          # response body; filled in by login()

        # install_opener() replaces the process-wide default opener, so the
        # urlopen() call in login() sends and stores cookies through this jar.
        cookie_jar = cookielib.CookieJar()
        cookie_proc = urllib2.HTTPCookieProcessor(cookie_jar)
        urllib2.install_opener(urllib2.build_opener(cookie_proc))

    def login(self):
        """POST the credentials to the login URL and keep the response body.

        The body is stored in ``self.file``. Errors raised by ``urllib2``
        propagate to the caller.
        """
        url = 'http://www.renren.com/PLogin.do'
        postdata = {
            'email': self.email,
            'password': self.password,
            'domain': self.domain,
        }
        req = urllib2.Request(url, urllib.urlencode(postdata))

        response = urllib2.urlopen(req)
        try:
            self.file = response.read()
        finally:
            # Release the connection even if read() fails.
            response.close()

    def start_h3(self, attrs):
        """Enter an ``<h3>``."""
        self.h3 = True

    def end_h3(self):
        """Leave an ``<h3>`` and allow a following content ``<div>``."""
        self.h3 = False
        self.h3_is_ready = True

    def start_a(self, attrs):
        """Enter an ``<a>``; only tracked inside an ``<h3>`` or content div."""
        if self.h3 or self.div:
            self.a = True

    def end_a(self):
        """Leave an ``<a>``."""
        self.a = False

    def start_div(self, attrs):
        """Enter a ``<div>``; ignored until an ``</h3>`` has been seen."""
        if not self.h3_is_ready:
            return
        if self.div:
            # Already inside the content div: remember the nesting so the
            # matching </div> does not end the content section early.
            self.depth += 1

        for k, v in attrs:
            if k == 'class' and v == 'content':
                self.div = True
                self.h3_and_div = True  # h3 and div are connected

    def end_div(self):
        """Leave a ``<div>``; at nesting depth 0 this resets the section."""
        if self.depth == 0:
            self.div = False
            self.h3_and_div = False
            self.h3_is_ready = False
            self.names = ""
        if self.div:
            self.depth -= 1

    def handle_data(self, text):
        """Route a text fragment to ``names`` or to ``dic``."""
        # Record the name.
        if self.h3 and self.a:
            self.names += text
        # Record what was said: <h3> text that is outside the <a>.
        if self.h3 and not self.a:
            if text:
                self.dic.setdefault(self.names, []).append(text)
            return
        if self.h3_and_div:
            self.dic.setdefault(self.names, []).append(text)

    def show(self):
        """Print every name and its joined text, with spaces removed.

        Text is decoded as UTF-8 and re-encoded for the local console.
        A decode failure is left to propagate because it means the page
        is not UTF-8; characters the console cannot represent are replaced
        instead of aborting the whole listing.
        """
        # getfilesystemencoding() may return None on some Unix systems.
        encoding = sys.getfilesystemencoding() or 'utf-8'
        for key in self.dic:
            name = key.replace(' ', '').decode('utf-8')
            says = ''.join(self.dic[key]).replace(' ', '').decode('utf-8')
            sys.stdout.write('%s %s\n' % (
                name.encode(encoding, 'replace'),
                says.encode(encoding, 'replace'),
            ))
88
89
90
91
# Example run: replace the placeholders with real account credentials.
renrenspider = spider('your email', 'your password')
renrenspider.login()
renrenspider.feed(renrenspider.file)
# feed() may keep trailing input buffered; close() forces it through the
# handlers so text at the very end of the page is not lost.
renrenspider.close()
renrenspider.show()
96