# -*- coding: utf-8 -*-
from HttpRequestModule import *
import os
import json
import traceback
import codecs
from lxml import etree
import StringIO, gzip
import sys
# Python 2 only: switch the interpreter-wide default string encoding to UTF-8
# so implicit str <-> unicode conversions in this script use UTF-8.
# `sys` is reloaded first because `setdefaultencoding` is removed from the
# module at interpreter start-up and only becomes available again on reload.
reload(sys)
sys.setdefaultencoding('utf-8')
def write_file(file_name, file_data, encoding, file_dir=None):
    r"""Write ``file_data`` to ``file_name`` inside ``file_dir``.

    Args:
        file_name: Name of the file to create, joined onto ``file_dir``.
        file_data: Text to write. Empty or ``None`` data is reported on
            stdout and nothing is written.
        encoding: Codec name handed to ``codecs.open`` (e.g. ``'utf-8'``).
        file_dir: Directory to write into. Defaults to the script's original
            hard-coded location ``D:\fs\test_data\qqzone``.

    Returns:
        None.
    """
    if not file_data:
        print("file_data is zero")
        return
    if file_dir is None:
        file_dir = r"D:\fs\test_data\qqzone"
    # Create the target directory on first use instead of failing in open().
    if not os.path.isdir(file_dir):
        os.makedirs(file_dir)
    file_path = os.path.join(file_dir, file_name)
    print(file_path)
    with codecs.open(file_path, "w", encoding) as f:
        f.write(file_data)
def decodeJson(json_string):
    """Parse ``json_string`` as JSON, returning ``None`` when parsing fails.

    Args:
        json_string: The JSON document to decode.

    Returns:
        The decoded Python object, or ``None`` if ``json.loads`` raised.
        Failures are reported on stdout rather than propagated.
    """
    try:
        return json.loads(json_string)
    except (TypeError, ValueError) as err:
        # Wrong input type or malformed JSON: short one-line report.
        print('TypeError or ValueError:{0}'.format(err))
    except Exception:
        # Anything unexpected: dump the full traceback for diagnosis.
        print(traceback.format_exc())
    return None
def getUserBlogList():
    """Fetch the blog list from the hard-coded QQ Zone ``get_abs`` URL.

    The response body is fetched with ``doGet``, trimmed, decoded from GBK
    and parsed as JSON.

    Returns:
        The value of ``data['list']`` from the decoded response, or an empty
        list when the request returns nothing, the body cannot be parsed, the
        response ``code`` is non-zero, or ``totalNum`` is not positive.
    """
    # Built without the surrounding newlines the original triple-quoted
    # literal carried, so the request target is exactly the URL.
    list_url = (
        'http://b1.qzone.qq.com/cgi-bin/blognew/get_abs'
        '?hostUin=859226880&blogType=0&cateName=&cateHex=&statYear=2015'
        '&reqInfo=7&pos=0&num=15&sortType=0&absType=0&source=0'
        '&rand=0.6346770680975169&ref=qzone&g_tk=1611717761&verbose=1'
    )
    data = doGet(list_url)
    if not data:
        print("data len is 0")
        return []

    # Drop the first 10 and last 2 bytes around the JSON payload.
    # NOTE(review): presumably a JSONP wrapper such as `_Callback(` ... `);`
    # -- confirm against a live response.
    data_json = data[10:-2]
    #write_file('bloglist.txt',data_json,'utf-8')
    decode_json = decodeJson(data_json.decode("gbk"))
    if decode_json is None:
        print("decode_json is None")
        return []

    # The values below are compared as numbers, so format them instead of
    # concatenating them onto a str (which raised TypeError).
    if decode_json['code'] != 0:
        print("server response code is {0}".format(decode_json['code']))
        return []

    data = decode_json['data']
    if data['totalNum'] <= 0:
        print("server response totalnum is {0}".format(data['totalNum']))
        return []

    return data['list']
def getUserBlog(uin, blogid):
    """Download the raw ``blog_output_data`` page for one blog entry.

    Args:
        uin: User id substituted into the ``uin`` query parameter.
        blogid: Blog id substituted into the ``blogid`` query parameter.

    Returns:
        The response body exactly as read from the server, or ``""`` when
        the request fails (the failure is reported on stdout). The request
        advertises ``Accept-Encoding: gzip,deflate,sdch``, so the body may
        still be compressed.
    """
    # `&timestamp=` was garbled to `×tamp=` in the original literal; the
    # literal also carried leading/trailing newlines, dropped here.
    url = (
        'http://b1.qzone.qq.com/cgi-bin/blognew/blog_output_data'
        '?uin=%(uin)s&blogid=%(blogid)s'
        '&styledm=ctc.qzonestyle.gtimg.cn&imgdm=ctc.qzs.qq.com'
        '&bdm=b.qzone.qq.com&mode=2&numperpage=15&timestamp=1437033537'
        '&dprefix=&inCharset=gb2312&outCharset=gb2312&ref=qzone'
    ) % {'uin': uin, 'blogid': blogid}
    my_headers = {
        "Accept-Encoding": "gzip,deflate,sdch",
        "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Referer": "http://ctc.qzs.qq.com/qzone/newblog/blogcanvas.html",
    }
    request = urllib2.Request(url, headers=my_headers)
    try:
        response = urllib2.urlopen(request)
    except urllib2.URLError as e:
        # Errors carrying a status code are checked first, then errors that
        # only carry a reason.
        if hasattr(e, 'code'):
            print('The server couldn\'t fulfill the request. errorcode:{0}'.format(e.code))
        elif hasattr(e, 'reason'):
            print('We failed to reach a server. reason:{0}'.format(e.reason))
        return ""
    try:
        return response.read()
    finally:
        # Release the connection even if read() fails.
        response.close()
def getText(elem):
    """Return all text inside ``elem`` joined into one string.

    Args:
        elem: An element exposing ``itertext()``.

    Returns:
        Every text fragment yielded by ``elem.itertext()``, each stripped of
        surrounding whitespace, concatenated with no separator.
    """
    return ''.join(fragment.strip() for fragment in elem.itertext())
def gzdecode(data):
    """Decompress a gzip-encoded byte string.

    Args:
        data: Raw bytes, e.g. an HTTP response body.

    Returns:
        The decompressed bytes. Input that is empty or does not start with
        the gzip magic bytes (``1f 8b``) is returned unchanged, so an
        uncompressed body passes straight through instead of raising.
    """
    import io

    if not data or data[:2] != b"\x1f\x8b":
        return data
    gziper = gzip.GzipFile(fileobj=io.BytesIO(data))
    try:
        # Read the decompressed data.
        return gziper.read()
    finally:
        gziper.close()
def _extract_blog_text(content):
    """Return the text of the ``blogDetailDiv`` element in HTML ``content``."""
    tree = etree.HTML(content)
    node = tree.xpath("//div[@id='blogDetailDiv']")[0]
    return getText(node)


def test(blogid):
    """Download one blog entry and save its text as ``<blogid>.txt``.

    The page is fetched for the hard-coded user ``859226880``, decompressed,
    and decoded first as UTF-8 and then, if that attempt fails at any step,
    as GBK. The text of the ``blogDetailDiv`` element is printed and written
    with ``write_file``.

    The output file is always written as UTF-8. The original wrote GBK when
    the page decoded as UTF-8 (and UTF-8 when it decoded as GBK), which made
    the output encoding depend on the input and could fail on characters
    that GBK cannot represent.

    Args:
        blogid: Blog id as a string; also used as the output file stem.

    Returns:
        None. Failures are reported on stdout, not raised.
    """
    print(blogid)
    blog_data = getUserBlog('859226880', blogid)
    if not blog_data:
        # getUserBlog returns "" on a failed request; nothing to parse.
        print("blog_data is empty")
        return
    blog_data = gzdecode(blog_data)
    #write_file( blogid+'.html',blog_data )

    # (log label, input encoding, banner character) for each attempt.
    attempts = (("111", 'utf-8', "*"), ("222", 'gbk', "_"))
    for label, encoding, banner in attempts:
        try:
            tgt_data = _extract_blog_text(blog_data.decode(encoding))
            print(banner * 30)
            print(tgt_data)
            write_file(blogid + '.txt', tgt_data, 'utf-8')
            return
        except Exception as ex:
            # Report the concrete exception type, then try the next encoding.
            print("{0} {1} : {2}".format(label, type(ex).__name__, ex))
def main():
    """Run the scraper: one fixed blog id, then every blog in the list.

    ``test`` is first called with the hard-coded id ``1288281044``, then
    once for each item returned by ``getUserBlogList`` using the item's
    ``blogId`` value.
    """
    print("main")
    test("1288281044")
    for blog_item in getUserBlogList():
        blogId = blog_item['blogId']
        print(blogId)
        test(str(blogId))


# Only start scraping when executed as a script, not when imported.
if __name__ == "__main__":
    main()