python抓取页面之网易
1 #!/usr/local/bin/python2 # -*- coding: utf8 -*-
3 import os,sys,datetime,time,re,logging
4 sys.path.append('../lib')
5 import common
6 import curl_crawler
7 import activity_common,activity_db
8
class wy163:
    """Crawler for the post lists of three NetEase (163.com) travel boards.

    The boards are identified by the names in ``bbs_type_list``; each has a
    paged list URL in ``list_url`` (same order). Pages are downloaded through
    a wrapper object supplied by the caller and parsed with a regular
    expression; each post becomes one ``|``-separated record string.

    This is Python 2 code (``unicode`` is used in ``catch_web``).
    """

    # One post row of a board list page. Flags: case-insensitive, multiline,
    # dot matches newline.
    # NOTE: the first group ``(:?whiteBg)?`` is capturing (it looks like a
    # typo for ``(?:...)``); it is kept as is because the tuple indices used
    # in parse_onepage_list count it as element 0.
    _POST_ROW_RE = re.compile(
        r'<div class="articleItem (:?whiteBg)?">\s+'
        r'<span class="sChk"><input type="checkbox" class="admin" style="display:none" value="(/bbs/\w+/(\d+)\.html)"/></span>\s+'
        r'<span class="s1">\s+'
        r'<a href="/bbs/\w+/\d+\.html" target=_blank><img src=.*?align="absmiddle" /></a>\s+'
        r'<a href="/bbs/\w+/\d+\.html" target="_blank" class="articleUrl" >\s+(.*?)</a>'
        r'.*?<span class="s2">(.*?)</span>\s+'
        r'<span class="s3">.*?'
        r'<span class="s4"><a href="http://bbs.163.com/(.*?)" target="_blank" title=".*?">(.*?)</a></span>\s+'
        r'<span class="s5"><span class=.*?>(.*?)</span>/(.*?)</span>\s+'
        r'<span class="s7">.*?</span>\s+'
        r'<div class="clear"></div>\s+'
        r'</div>',
        re.I | re.M | re.S)

    def __init__(self):
        """Set up the board/URL tables and file logging under ``./log/``."""
        self.bbs_type_list = ['chuyou', 'huwai', 'zijia']
        # Same order as bbs_type_list; %s is replaced by the page number.
        self.list_url = [
            'http://bbs.lady.163.com/list/chuyou,%s.html',
            'http://bbs.travel.163.com/list/huwai,%s.html',
            'http://bbs.travel.163.com/list/zijia,%s.html',
        ]
        logDir = './log/'
        if not os.path.exists(logDir):
            os.makedirs(logDir)
        Format = '%(asctime)s %(levelname)s %(lineno)d : %(message)s'
        Datefmt = '%Y-%m-%d %H:%M:%S'
        logName = 'log/wy163.log'
        # filemode='w' truncates the log file each time a crawler is created.
        logging.basicConfig(level=logging.DEBUG, format=Format, datefmt=Datefmt,
                            filename=logName, filemode='w')

    def get_activity_post_list(self, my_wrapper, website_id, latest_crawl_time,
                               latest_post_id, latest_post_time, last_id_set,
                               result_queue_inserter):
        """Entry point for the dispatcher: download the list of new posts.

        A full crawl (every page of every board) is done when ``last_id_set``
        is empty and ``latest_crawl_time`` is None; otherwise only the first
        page of each board is checked.

        Args:
            my_wrapper: downloader; must offer ``download(url)`` returning a
                status code and expose the body as ``writeback.contents``.
            website_id: stored on the instance; not otherwise used here.
            latest_crawl_time: None on the very first crawl.
            latest_post_id: passed through unchanged into the result.
            latest_post_time: passed through unchanged into the result.
            last_id_set: ids of posts seen by the previous run.
            result_queue_inserter: object with ``insert_to_resultqueue(item)``.

        Returns:
            Tuple ``(ret_code, ret_info, latest_post_id, latest_post_time,
            post_num, id_set)`` where ``ret_code`` is 1 (list download
            finished) and ``id_set`` holds the post ids seen in this run.
        """
        self.my_wrapper = my_wrapper
        self.website_id = website_id
        self.last_id_set = last_id_set
        self.cur_id_set = set()
        self.retInserter = result_queue_inserter

        if not last_id_set and latest_crawl_time is None:
            # First run: walk every page of every board.
            post_num = self.download_post_list()
        else:
            # Later runs: only new posts on the first page of each board
            # (posts whose reply/read counts changed are not re-reported).
            post_num = self.update_post_list()

        self._commit_seen_ids()

        ret_code = 1  # post list download finished
        ret_info = "共抓取最新帖子%d,163更新完毕" % (post_num)
        return (ret_code, ret_info, latest_post_id, latest_post_time,
                post_num, self.last_id_set)

    def _commit_seen_ids(self):
        """Make this run's ids the new ``last_id_set`` and reset ``cur_id_set``."""
        # Copy before clearing: assigning the same set object and then
        # clearing it would leave last_id_set empty as well.
        self.last_id_set = set(self.cur_id_set)
        self.cur_id_set.clear()

    def catch_web(self, url):
        """Download ``url``; return the body re-encoded as UTF-8, or None.

        None is returned when the wrapper reports a status other than 200.
        The body is decoded as gb18030, ignoring undecodable bytes.
        """
        ret = self.my_wrapper.download(url)
        if ret != 200:
            return None
        data = self.my_wrapper.writeback.contents
        data = unicode(data, 'gb18030', 'ignore').encode('utf-8', 'ignore')
        return data

    def download_post_list(self):
        """Walk every page of every board; return the number of new posts.

        Paging of a board stops at the first page that yields no new post.
        """
        count = 0
        for bbs_type in self.bbs_type_list:
            page_no = 1
            while True:
                post_list = []
                data = self.catch_onepage_data(bbs_type, page_no)
                found = self.parse_onepage_list(data, bbs_type, post_list)
                if found == 0:
                    break
                # NOTE(review): the original has the result-queue insertion
                # for this path commented out, so post_list is only counted
                # here, never published. Left disabled because the script
                # entry point in this file passes 0 as the inserter --
                # confirm whether the full crawl should publish too.
                count += found
                page_no += 1
        return count

    def update_post_list(self):
        """Check only the first page of every board for new posts.

        Each non-empty page is pushed to the result queue as
        ``(0, info, post_list)``. Returns the number of new posts found.
        """
        count = 0
        page_no = 1
        for bbs_type in self.bbs_type_list:
            post_list = []
            data = self.catch_onepage_data(bbs_type, page_no)
            found = self.parse_onepage_list(data, bbs_type, post_list)
            if found != 0:
                ret_code = 0
                ret_info = 'bbs_type=%s,page=%s,post_num=%s' % (bbs_type, page_no, found)
                self.retInserter.insert_to_resultqueue((ret_code, ret_info, post_list))
            count += found
        return count

    def catch_onepage_data(self, bbs_type, page_no):
        """Download one list page of a board.

        Returns the page body (see ``catch_web``), or None when ``bbs_type``
        is not a known board or the download fails.
        """
        url_by_board = dict(zip(self.bbs_type_list, self.list_url))
        template = url_by_board.get(bbs_type)
        if template is None:
            # None (not 0) so that parse_onepage_list treats it as "no data".
            return None
        url = template % page_no
        print(url)
        return self.catch_web(url)

    def parse_onepage_list(self, data, bbs_type, post_list):
        """Extract post records from one list page.

        Every post id on the page is remembered in ``cur_id_set``; posts whose
        id is already in ``last_id_set`` are skipped. For each remaining post
        a record ``id|url|theme|organizer_id|organizer_name|reply_count|
        read_count|latest_post_time|bbs_type`` is appended to ``post_list``.

        Returns the number of records appended (0 when ``data`` is empty).
        """
        if not data:
            return 0
        count = 0
        for item in self._POST_ROW_RE.findall(data):
            # Field meanings follow the original author's comments.
            # item[1] holds the relative post path, but the record's url
            # field was left empty by the author; kept that way.
            post_url = ''
            post_id = item[2]
            post_theme = item[3]
            # TODO (from the original): no check yet for "travel together" posts.
            organizer_id = item[5]
            organizer_name = item[6]
            reply_count = item[8]
            read_count = item[7]
            latest_post_time = item[4]
            record = '%s|%s|%s|%s|%s|%s|%s|%s|%s' % (
                post_id, post_url, post_theme, organizer_id, organizer_name,
                reply_count, read_count, latest_post_time, bbs_type)
            print(record)
            if self.check_last_id_set(post_id, reply_count):
                continue
            count += 1
            post_list.append(record)
        return count

    def check_last_id_set(self, post_id, reply_count):
        """Record ``post_id`` as seen now; return True if it was seen before.

        ``reply_count`` is accepted but not used.
        """
        id_mark = '%s' % (post_id)
        self.cur_id_set.add(id_mark)
        return id_mark in self.last_id_set
if __name__ == "__main__":
    # Manual smoke run: crawl with website id 9, no previous crawl state,
    # an empty "already seen" list and 0 standing in for the result-queue
    # inserter, then print the first five fields of the result.
    crawler = wy163()
    downloader = curl_crawler.curl_crawler_wrapper()
    (status, info, newest_post_id, newest_post_time,
     total_posts, _seen_ids) = crawler.get_activity_post_list(
        downloader, 9, None, None, None, [], 0)
    print('%s %s %s %s %s' % (status, info, newest_post_id,
                              newest_post_time, total_posts))
这是对网易旅游版 3 个版面的抓取,分别抓取帖子的主题、url,以及回帖数、发帖人等信息。进入每个 url 之后的抓取方式与磨房的类似,这里不再赘述。
页:
[1]