A typical crawler can be broken down into the following steps:
1. Open the target page
2. Parse the page
3. Handle/store the data and add newly discovered pages as tasks
In addition, asynchronous crawling needs a scheduler.
For a simple crawler, where no CAPTCHA handling is needed and the pages can be fetched with requests/urllib just by adjusting the cookie and headers, one opener class and one parser class are enough; data handling and new-task generation can be written straight into the parser class, and gevent can provide the asynchrony directly. A bare-bones sketch of the three steps follows.
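For reference, here is a minimal sketch of those three steps using requests and lxml. This is my own illustration, not part of the project code below; the URL handling and the XPath expressions are placeholders.

# minimal open -> parse -> handle/store sketch (illustrative placeholders only)
import requests
from lxml import html

def crawl_one_page(url):
    # 1. open the page; set headers/cookies here if the site requires them
    resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    # 2. parse the page with lxml + XPath
    etree = html.document_fromstring(resp.text)
    titles = etree.xpath('//a[@class="title"]/text()')   # placeholder XPath
    next_urls = etree.xpath('//a[@class="next"]/@href')  # placeholder XPath
    # 3. handle/store the data and return any newly found task pages
    results = [{'title': t} for t in titles]
    return results, next_urls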
Project path: ur'D:python_pymy_scrapy/scrapy_tools'
# add an empty __init__.py under scrapy_tools so that it can be used as a package
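In other words, the package layout looks roughly like this (the parent directory is whatever path gets appended to sys.path in the example at the end):

<project directory>        # the path appended to sys.path
    scrapy_tools/
        __init__.py        # empty; lets scrapy_tools be imported as a package
        itemparse.py
        web_opener.py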
itemparse.py
Build the XPath structure according to the structure of the data to be extracted:
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 07 17:24:34 2017
@author: willowj
"""
import sys
stdout, stdin, stderr = sys.stdout, sys.stdin, sys.stderr
reload(sys)
sys.stdout, sys.stdin, sys.stderr = stdout, stdin, stderr
sys.setdefaultencoding('utf8')

import gevent
import pandas as pd
import numpy as np
from lxml import html
import time
import codecs
import json


def list_0e(list_):
    """Return the first element of a list (warn if there is more than one)."""
    if isinstance(list_, list):
        if not list_:
            return None
        if len(list_) > 1:
            print 'warning : list>1, list[1]:', list_[1]
        return list_[0]
    else:
        return list_


class ItemParse(object):
    """Generic page parser; subclass it and override the class attributes.
    The defaults below point at a Zhihu topic page as an example."""

    name = 'ItemParse'
    base_url = 'https://www.zhihu.com/topic/19551147/top-answers'
    pageN_x = '//div[@class="zm-invite-pager"]//span[last()-1]/a/text()'
    new_urls_x = None

    # node that holds one record; every field below is searched inside
    # that node, so the field XPaths must start with '.'
    items_node_x = '//div[@class="feed-main"]'
    item_xs = dict(
        question_name = '''.//a[@class='question_link']/text()''',
        #question_href = '''.//a[@class='question_link']/@href''',
        author = './/div[@data-action="/answer/content"]/@data-author-name',
        author_href = '''.//a[@class='author-link']/@href''',
        ups_x = './/div[@class="zm-item-vote-info"]/@data-votecount',
        answers_text = ".//textarea/text()",
        commentN = './/a[@name="addcomment"]/text()[last()]',
        entry_url = './/div[@data-action="/answer/content"]/@data-entry-url',
        # a field may also be a compiled regex, e.g.
        #z = re.compile('.')
        )

    # paging URL pattern
    def getnextpages(self):
        # site-specific paging rule; returns nothing (falsy) when there is only one page
        if self.pageN > 1:
            urls = [self.base_url + '?page=%s' % n
                    for n in range(self.pageN, 1, -1)]
            return urls

    def __init__(self, html_):
        self.results = []
        self.new_urls = []
        self.pageN = self.update_page_n(html_)
        self.nextpages = self.getnextpages()
        self.parase(html_)

    def parase(self, html_):
        # prefer XPath, optionally fall back to re; items that are not found become None
        etree = html.document_fromstring(html_)
        items_nodes = etree.xpath(self.items_node_x)
        for ee in items_nodes:
            ee_str = None
            ite = {}
            for item, itemx in self.item_xs.items():
                # compiled regex
                if hasattr(itemx, 'findall'):
                    if ee_str is None:
                        ee_str = html.tostring(ee)
                    ite[item] = itemx.findall(ee_str)
                # xpath
                elif isinstance(itemx, str) or isinstance(itemx, unicode):
                    if itemx.startswith('./'):
                        ite[item] = ee.xpath(itemx)
                    else:
                        print item
                        raise ValueError('xpath does not start with ./')
                else:
                    print item
                    raise TypeError('not a re pattern object or an xpath str')

                if len(ite[item]) == 0:
                    ite[item] = None
                elif len(ite[item]) == 1:
                    ite[item] = ite[item][0]
                else:
                    ite[item] = ' '.join([str(__i) for __i in ite[item]])
            self.results.append(ite)
        # collect new task urls
        if self.new_urls_x:
            self.new_urls.extend(etree.xpath(self.new_urls_x))

    # find out how many pages there are
    def update_page_n(self, html_):
        if self.pageN_x:
            etree = html.document_fromstring(html_)
            pages = etree.xpath(self.pageN_x)
            pages = list_0e(pages)
            if isinstance(pages, basestring):
                pages = pages.strip()
            if pages and pages.isdigit():
                return int(pages)
        return 1

    # plain synchronous fetch of all the paging urls
    def get_nextpages(self, opener, sleep_sec=None):
        for url in self.nextpages:
            if sleep_sec:
                time.sleep(sleep_sec)
            _re = opener.get(url)
            print _re.status_code, _re.url
            self.parase(_re.text)
            print time.time()

    # the async control and the save methods live here for now
    # gevent coroutine worker
    def __gevent_get_nextpages(self, opener):
        print id(opener)
        while self.nextpages:
            url = self.nextpages.pop()
            print gevent.getcurrent()
            zhihu_re = opener.get(url)
            #gevent.sleep(5)
            print zhihu_re.status_code, url
            self.parase(zhihu_re.text)
            print time.time()

    # gevent entry point
    def get_nextpages_by_gevent(self, opener_class, g_n=4):
        '''
        param:
            opener_class: class used to create one web opener per greenlet
            g_n: number of greenlets, 4 by default
        '''
        from gevent import monkey; monkey.patch_all()
        start_time = time.time()
        gs = [gevent.spawn(self.__gevent_get_nextpages, opener_class())
              for i in range(g_n)]
        gevent.joinall(gs)
        print time.time() - start_time
        self.save_to_excel()

    def save_to_excel(self, path=None):
        if path:
            save_name = path
        else:
            save_name = (u'' + self.name
                         + time.strftime('%Y%m%d_%H_%M', time.localtime())
                         + '.xlsx')
        print save_name
        result_pd = pd.DataFrame(self.results)
        print 'pd ok'
        result_pd.to_excel(u'' + save_name, encoding='gb18030')
        print 'saved to ' + save_name

    def save_to_json(self, path=None):
        if path:
            save_name = path
        else:
            save_name = (u'' + self.name
                         + time.strftime('%Y%m%d_%H_%M', time.localtime())
                         + '.json')
        print save_name
        with codecs.open(save_name, 'w', encoding='gb18030') as f:
            f.write(json.dumps(self.results))
        print 'saved to ' + save_name
To use it, subclass ItemParse and override the class attributes and the getnextpages paging method.
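A bare skeleton of such a subclass might look like the following; the class name and XPaths here are made-up placeholders, and a full Dianping example is given further down.

# illustrative skeleton only; MySiteParse and its XPaths are placeholders
class MySiteParse(ItemParse):
    name = 'MySiteParse'
    base_url = 'http://example.com/list'
    pageN_x = '//div[@class="pager"]/a[last()]/text()'   # where the total page count lives
    items_node_x = '//div[@class="item"]'                # one node per record
    item_xs = dict(
        title = './/a[@class="title"]/text()',
        date = './/span[@class="date"]/text()',
        )

    def getnextpages(self):
        # this site's paging URL pattern
        if self.pageN > 1:
            return [self.base_url + '?page=%s' % n
                    for n in range(self.pageN, 1, -1)]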
web_opener.py
Using requests.Session to keep the session (and its keep-alive connections) roughly doubles the fetching speed.
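A quick way to sanity-check that claim is the rough sketch below; httpbin.org is only an example endpoint, and the exact ratio depends on the site and the network.

# rough timing sketch: a fresh connection per request vs. one keep-alive Session
import time
import requests

url = 'http://httpbin.org/get'   # example endpoint

t0 = time.time()
for _ in range(10):
    requests.get(url)            # opens a new connection every time
t1 = time.time()

sess = requests.Session()        # reuses the pooled connection
for _ in range(10):
    sess.get(url)
t2 = time.time()

print 'plain requests.get :', t1 - t0
print 'requests.Session   :', t2 - t1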
With the gevent async path, one session is created per greenlet, so each greenlet fetches pages with its own session and they do not interfere with one another. For now that method lives in itemparse.py.
# -*- coding: utf-8 -*-
"""
Created on Thursday, Aug 17, 2017, 17:22
@author: willowj
"""
import sys
reload(sys)
sys.setdefaultencoding('utf8')

import requests
#from requests.cookies import (
#    cookiejar_from_dict, extract_cookies_to_jar, RequestsCookieJar, merge_cookies)


class SessionFopener(object):
    """Web opener wrapped around requests.Session.

    param:
        headers     defaults to the class attribute; can be passed in at instantiation
        cookie_dic  disabled (empty) by default
        proxies     none by default
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        #'Cookie': 'q'
        #'Host': 'www.zhihu.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        }

    def __init__(self, headers=None, cookie_dic=None, proxies=None):
        self.req_s = requests.Session()
        self.req_s.adapters.DEFAULT_RETRIES = 3
        self.req_s.keep_alive = True
        if headers:
            self.req_s.headers = headers
        else:
            self.req_s.headers = self.headers
        if not cookie_dic:
            cookie_dic = {}
        self.req_s.cookies = requests.cookies.cookiejar_from_dict(cookie_dic)
        if proxies:
            self.req_s.proxies = proxies

    def close(self):
        self.req_s.close()

    def get(self, *arg, **karg):
        return self.req_s.get(*arg, **karg)

    def post(self, *arg, **karg):
        return self.req_s.post(*arg, **karg)

    def set_cookiejar(self, cookie_dic={}):
        self.req_s.cookies = requests.cookies.cookiejar_from_dict(cookie_dic)

    def add_cookiejar(self, cookie_dic):
        self.req_s.cookies = requests.cookies.merge_cookies(self.req_s.cookies, cookie_dic)

    def set_headers(self, headers={}):
        self.req_s.headers = headers

    def add_headers(self, headers_dic):
        for k, v in headers_dic.items():
            self.req_s.headers[k] = v

    def set_proxies(self, proxies):
        self.req_s.proxies = proxies

    @classmethod
    def cookiejar_from_dict(cls, cookie_dic):
        return requests.cookies.cookiejar_from_dict(cookie_dic)

    def __enter__(self):
        print 'enter'
        return self

    def __exit__(self, *used):
        self.req_s.close()
        del self.req_s
        print 'exit'


if __name__ == '__main__':
    with SessionFopener() as req_o:
        res_p = req_o.get('http://httpbin.org/get')
        print res_p.json()
Example: crawling a shop's reviews from Dianping (大众点评).
You only need to subclass and override the parsing-node XPaths and the paging URL pattern.
External links are not handled for now.
# -*- coding: utf-8 -*-
"""
Created on Thursday, Aug 17, 2017, 19:33
@author: Administrator
"""
import sys
stdout, stdin, stderr = sys.stdout, sys.stdin, sys.stderr
reload(sys)
sys.stdout, sys.stdin, sys.stderr = stdout, stdin, stderr
sys.setdefaultencoding('utf8')

sys.path.append(ur'D:python_pymy_scrapy')
from scrapy_tools.web_opener import SessionFopener
from scrapy_tools.itemparse import ItemParse


class DzdpItemParse(ItemParse):
    """Reviews of 广州酒家(文昌店) on Dianping."""
    name = u'DzdpItemParse广州酒家'
    base_url = 'https://www.dianping.com/shop/516983/review_more'
    pageN_x = ".//a[@class='PageLink'][last()]/text()"
    new_urls_x = None

    # node that holds one review; every field is searched inside that node,
    # so the field XPaths start with '.'
    items_node_x = './/div[@class="comment-list"]/ul/li'
    item_xs = dict(
        user_id = '''.//*[@class="J_card"]/@user-id''',
        #question_href = '''.//a[@class='question_link']/@href''',
        comm_per = """.//span[@class='comm-per']/text()""",
        total_mark = """.//*[@class="user-info"]/span[1]/@class""",
        taste = """.//*[@class="comment-rst"]/span[1]/text()""",
        environment = """.//*[@class="comment-rst"]/span[2]/text()""",
        sevice = """.//*[@class="comment-rst"]/span[3]/text()""",
        comments_agree = '''.//span[@class="heart-num"]/text()''',
        comment_text = """.//*[@class="J_brief-cont"]/text()""",
        comment_date = '''.//*[@class="time"]/text()''',
        recommend_food = u'''.//*[@class="comment-recommend"
            and (contains(text(),"推荐") or contains(text(),"喜欢"))]
            [1]/a/text()''',  # Chinese text must be written as a unicode literal
        #re:
        #z = re.compile('.')
        )

    # paging URL pattern; nothing is returned when there is only one page
    def getnextpages(self):
        if self.pageN > 1:
            urls = [self.base_url + '?pageno=%s' % n
                    for n in range(self.pageN, 1, -1)]
            return urls


open_s = SessionFopener()                      # create an opener
respon_ = open_s.get(DzdpItemParse.base_url)   # fetch the first page
gzjj_item = DzdpItemParse(respon_.text)        # build the parser from the first page's html

# synchronous: use the plain method
gzjj_item.get_nextpages(open_s, sleep_sec=None)

# asynchronous alternative:
#gzjj_item.get_nextpages_by_gevent(SessionFopener)
Result: fetching a single page takes 0.5279 s; with four greenlets, 613 pages were crawled in 77.71 s, about 0.13 s per page, roughly a 4x speedup. The tail of the console output:
200 https://www.dianping.com/shop/516983/review_more?pageno=600
1503074965.07
<Greenlet at 0x9c44620: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
200 https://www.dianping.com/shop/516983/review_more?pageno=602
1503074965.1
<Greenlet at 0x9c445d0: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
200 https://www.dianping.com/shop/516983/review_more?pageno=601
1503074965.14
<Greenlet at 0x9c44670: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
200 https://www.dianping.com/shop/516983/review_more?pageno=604
1503074965.54
<Greenlet at 0x9c44440: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
200 https://www.dianping.com/shop/516983/review_more?pageno=607
1503074965.59
<Greenlet at 0x9c44670: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
200 https://www.dianping.com/shop/516983/review_more?pageno=605
1503074965.64
<Greenlet at 0x9c44620: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
200 https://www.dianping.com/shop/516983/review_more?pageno=606
1503074965.67
<Greenlet at 0x9c445d0: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
200 https://www.dianping.com/shop/516983/review_more?pageno=611
1503074966.1
<Greenlet at 0x9c445d0: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
200 https://www.dianping.com/shop/516983/review_more?pageno=609
1503074966.15
<Greenlet at 0x9c44670: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
200 https://www.dianping.com/shop/516983/review_more?pageno=610
1503074966.18
<Greenlet at 0x9c44620: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
200 https://www.dianping.com/shop/516983/review_more?pageno=608
1503074966.22
<Greenlet at 0x9c44440: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
200 https://www.dianping.com/shop/516983/review_more?pageno=612
1503074966.7
200 https://www.dianping.com/shop/516983/review_more?pageno=614
1503074966.74
200 https://www.dianping.com/shop/516983/review_more?pageno=615
1503074967.05
200 https://www.dianping.com/shop/516983/review_more?pageno=613
1503074967.09
77.7100000381
DzdpItemParse广州酒家20170819_00_49.xlsx
pd ok
saved to DzdpItemParse广州酒家20170819_00_49.xlsx
For distributed multi-process crawling, or for writing results into a database, a separate scheduler and a data-persistence module still have to be written.
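As a very rough sketch of the data side only (my own illustration, not part of the project), the results list that ItemParse collects could be pushed into a sqlite table like this; a real module would map fields to proper columns.

# illustrative sketch: dump ItemParse.results (a list of dicts) into sqlite,
# storing each record as a JSON blob
import json
import sqlite3

def save_results_to_sqlite(results, db_path='items.db'):
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    cur.execute('CREATE TABLE IF NOT EXISTS items (data TEXT)')
    cur.executemany('INSERT INTO items (data) VALUES (?)',
                    [(json.dumps(ite),) for ite in results])
    conn.commit()
    conn.close()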