Posted on 2012-09-17 14:20
polly 阅读(463)
评论(0) 编辑 收藏 引用 所属分类:
Python
![](/Images/OutliningIndicators/ContractedBlock.gif)
![](/Images/OutliningIndicators/ExpandedBlockStart.gif)
1
from sgmllib import SGMLParser
2
import sys,urllib2,urllib,cookielib
3
class spider(SGMLParser):
    """Log in to renren.com and collect text from the HTML page it returns.

    The parser looks for ``<h3>`` headings followed by a
    ``<div class="content">`` block:

    * text inside an ``<a>`` within the ``<h3>`` is accumulated as the
      current name (``self.names``);
    * any other text inside the ``<h3>``, and all text inside the content
      ``<div>`` that follows it, is appended to ``self.dic[self.names]``.

    Typical use::

        s = spider(email, password)
        s.login()          # stores the response body in s.file
        s.feed(s.file)     # runs the SGML parser over it
        s.show()           # prints the collected name/text pairs

    This is Python 2 code (``sgmllib``, ``urllib2``, ``cookielib``).
    """

    def __init__(self, email, password):
        """Set up parser state and install a cookie-aware URL opener.

        Args:
            email: account e-mail sent as the ``email`` form field.
            password: account password sent as the ``password`` form field.

        Side effect: installs a process-wide ``urllib2`` opener with cookie
        support, so the session cookie set by the login is kept for later
        ``urllib2.urlopen`` calls.
        """
        SGMLParser.__init__(self)

        # --- parser state -------------------------------------------------
        self.h3 = False            # currently inside an <h3>
        self.h3_is_ready = False   # an <h3> was closed; a content <div> may follow
        self.div = False           # currently inside a <div class="content">
        self.h3_and_div = False    # content <div> belongs to the last <h3>
        self.a = False             # inside an <a> that sits in an <h3>/content <div>
        self.depth = 0             # nesting depth of <div>s inside the content <div>
        self.names = ""            # link text collected from the current <h3>
        self.dic = {}              # names -> list of text fragments

        # --- login data ---------------------------------------------------
        self.email = email
        self.password = password
        self.domain = 'renren.com'
        # Filled in by login(); defined here so that feed(self.file) before a
        # login parses an empty document instead of raising AttributeError.
        self.file = ''

        # The original wrapped this in try/except-raise/else, which is
        # equivalent to running the statements in sequence.
        cookie_jar = cookielib.CookieJar()
        cookie_handler = urllib2.HTTPCookieProcessor(cookie_jar)
        urllib2.install_opener(urllib2.build_opener(cookie_handler))

    def login(self):
        """POST the credentials to the login URL and keep the response body.

        The body of the response is stored in ``self.file``.  Errors raised
        by ``urllib2.urlopen`` propagate to the caller.
        """
        # NOTE(review): the credentials are posted over plain http; the URL
        # is left unchanged because an https endpoint cannot be verified here.
        url = 'http://www.renren.com/PLogin.do'
        postdata = {
            'email': self.email,
            'password': self.password,
            'domain': self.domain,
        }
        request = urllib2.Request(url, urllib.urlencode(postdata))

        response = urllib2.urlopen(request)
        try:
            self.file = response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()

    def start_h3(self, attrs):
        """Handle ``<h3>``: mark that we are inside a heading."""
        self.h3 = True

    def end_h3(self):
        """Handle ``</h3>``: leave the heading and allow a content div."""
        self.h3 = False
        self.h3_is_ready = True

    def start_a(self, attrs):
        """Handle ``<a>``: only links inside an ``<h3>`` or content div count."""
        if self.h3 or self.div:
            self.a = True

    def end_a(self):
        """Handle ``</a>``."""
        self.a = False

    def start_div(self, attrs):
        """Handle ``<div>``: detect the content div and track nesting.

        Divs are ignored until an ``<h3>`` has been closed.  A div opened
        while already inside the content div increases ``self.depth`` so the
        matching ``</div>`` does not end the content block prematurely.
        """
        if not self.h3_is_ready:
            return
        if self.div:
            self.depth += 1

        for key, value in attrs:
            if key == 'class' and value == 'content':
                self.div = True
                self.h3_and_div = True  # this div belongs to the preceding <h3>

    def end_div(self):
        """Handle ``</div>``: unwind nesting or close the content block."""
        # NOTE(review): any </div> seen while depth is 0 resets the state,
        # including one that does not close a content div - presumably this
        # matches the page layout; verify against a real page.
        if self.depth == 0:
            self.div = False
            self.h3_and_div = False
            self.h3_is_ready = False
            self.names = ""
        if self.div:
            self.depth -= 1

    def handle_data(self, text):
        """Route character data to the current name or to its text list."""
        # Link text inside the heading is the name.
        if self.h3 and self.a:
            self.names += text
        # Other text inside the heading is recorded under that name.
        if self.h3 and not self.a:
            if text:
                self.dic.setdefault(self.names, []).append(text)
            return
        # Text inside the content div that follows the heading.
        if self.h3_and_div:
            self.dic.setdefault(self.names, []).append(text)

    def _for_console(self, text, encoding):
        """Strip spaces from UTF-8 bytes *text* and re-encode for printing."""
        # 'replace' keeps one bad character from aborting the whole listing.
        stripped = text.replace(' ', '')
        return stripped.decode('utf-8', 'replace').encode(encoding, 'replace')

    def show(self):
        """Print every collected name followed by its concatenated text.

        Spaces are removed from both parts; the text is treated as UTF-8 and
        re-encoded to the file-system encoding before printing.
        """
        # getfilesystemencoding() may return None on Python 2; the original
        # passed that straight to encode() (and shadowed the builtin `type`).
        encoding = sys.getfilesystemencoding() or 'utf-8'
        for name in self.dic:
            shown_name = self._for_console(name, encoding)
            shown_text = self._for_console(''.join(self.dic[name]), encoding)
            # Single parenthesised argument: same output as the Python 2
            # statement `print a, b`, and also valid Python 3 syntax.
            print(shown_name + ' ' + shown_text)
88![](/Images/OutliningIndicators/None.gif)
89
90![](/Images/OutliningIndicators/None.gif)
91![](/Images/OutliningIndicators/None.gif)
92
# Run the scrape only when executed as a script, so that importing this
# module does not trigger a network login.
if __name__ == '__main__':
    # Replace the placeholders with real account credentials.
    renrenspider = spider('your email', 'your password')
    renrenspider.login()                  # fetch the page into renrenspider.file
    renrenspider.feed(renrenspider.file)  # parse it with the SGML handlers
    renrenspider.show()                   # print the collected name/text pairs
96![](/Images/OutliningIndicators/None.gif)