  • Optimizing the Web Crawler

    Date: 2019-07-03

    Author: Sun

    The optimized version of the earlier web crawler is shown below:

    # -*- coding: utf-8 -*-  
    __author__ = 'sun'
    __date__ = '2019/7/3 上午10:53' 
    
    from bs4 import BeautifulSoup as BSP4
    
    import requests
    
    g_set = set()   # URLs that have already been crawled (simple de-duplication)

    # (entry URL, category name, category slug) for each listing to crawl
    URL_LIST = [
       ('https://www.geyanw.com/lizhimingyan/list_33_1.html', '励志名言', 'lizhimingyan'),
       ('https://www.geyanw.com/renshenggeyan/list_32_1.html', '人生格言', 'renshenggeyan'),
       ('https://www.geyanw.com/mingyanjingju/list_37_1.html', '名言警句', 'mingyanjingju'),
       ('https://www.geyanw.com/html/mingrenmingyan/list_1_1.html', '名人名言', 'mingrenmingyan'),
       ('https://www.geyanw.com/html/dushumingyan/list_5_1.html', '读书名言', 'dushumingyan'),
       ('https://www.geyanw.com/html/jingdianmingyan/list_2_1.html', '经典名言', 'jingdianmingyan'),
    
    ]
    
    
    def store_file(filename, response):
       html_doc = response.text
       with open("geyan_%s.html" % filename, "w", encoding="utf-8") as f:
          f.write(html_doc)
    
    
    def download(url, filename="index", store_flag=True):
       '''
       :param url:        URL to crawl
       :param filename:   name of the HTML file to store locally
       :param store_flag: whether to persist the page to disk
       :return:           the requests Response object
       '''
       response = requests.get(url)
    
       if store_flag:
          store_file(filename, response)
    
       return response
    
    
    def parse_page(page, ctype, url):
       response = download(url, store_flag=False)
       html_doc = response.content
       soup = BSP4(html_doc, "lxml")
       link_list = soup.select("#p_left .newlist ul h2 a")
       #print(link_list)
       index = 0
       for link in link_list:
          url_link = "https://www.geyanw.com" + link['href']
          print("ctype:" + ctype + ", page: " + str(page) + ", url_link: " + url_link)
          if url_link not in g_set:
             g_set.add(url_link)   # remember the article URL so it is fetched only once
             index += 1
             # store the article page; store_file() adds the "geyan_" prefix and the
             # ".html" suffix, and including the page number avoids overwrites
             download(url_link, filename="%s_%s_%s" % (ctype, page, index), store_flag=True)
    
    
    def parse(response):
       url = response.url
       #print(url)
       base_urls = url.split("/list_")
       print(base_urls)
       domain = base_urls[0]
       init_html = base_urls[-1]
       print(domain)
       print(init_html)
       ctype = init_html.split("_")[0]    # numeric list id, e.g. "33" for lizhimingyan
       cindex = init_html.split("_")[1].split(".")[0]
       g_set.add(url)    # mark the entry page itself as crawled
    
       html_doc = response.content
       soup = BSP4(html_doc, "lxml")
       #page_list = soup.select("#p_left .newlist .pagelist li a") # pagination links
       #print(page_list)
    
       total_num = soup.select("#p_left .newlist .pagelist .pageinfo strong")[0]
       page_max = int(total_num.get_text())

       # crawl the remaining listing pages, 2..page_max
       for page in range(2, page_max + 1):
          parse_page(page, ctype, "%s/list_%s_%s.html" % (domain, ctype, page))
    
    
    def process(entry_url):
       try:
          response = download(entry_url, store_flag=False)
          parse(response)    # keep downloading and parsing as separate steps
          return True
       except Exception as e:
          print("failed to process %s: %s" % (entry_url, e))
          return False
    
    '''
    Crawl using a pool of worker processes
    '''
    def multiprocess_run():
       from multiprocessing import Pool
       pool = Pool(processes=8)
       result = []
       for (entry_url, name, ctype) in URL_LIST:
          # apply_async returns an AsyncResult; keep it so the outcome can be inspected
          pc = pool.apply_async(process, args=(entry_url,))
          result.append(pc)

       pool.close()
       pool.join()
    
    
    '''
    Use coroutines to handle the concurrency
    (a fully non-blocking aiohttp variant is sketched after this listing)
    '''
    import asyncio

    @asyncio.coroutine
    def async_io_loop(entry_url):
       # process() blocks on network I/O, so hand it to the default thread pool;
       # "yield from process(entry_url)" would raise a TypeError because process()
       # returns a bool rather than something awaitable
       loop = asyncio.get_event_loop()
       result = yield from loop.run_in_executor(None, process, entry_url)
       return result
    
    
    def async_run():
       loop = asyncio.get_event_loop()
       tasks = [async_io_loop(url) for (url, name, ctype) in URL_LIST]
       loop.run_until_complete(asyncio.wait(tasks))
       loop.close()
    
    
    import threading
    import queue
    
    class Worker(threading.Thread):
       def __init__(self, name, queue):
          # pass the name through so getName() reports "Thread0", "Thread1", ...
          threading.Thread.__init__(self, name=name)
          self.queue = queue
          self.start()

       def run(self):
          while True:
             try:
                # get_nowait() avoids blocking forever when another worker
                # drains the queue between an empty() check and a get()
                url = self.queue.get_nowait()
             except queue.Empty:
                break
             print(self.getName() + " process " + str(url))
             process(url)
             self.queue.task_done()
    
    
    def multithread_run():
       squeue = queue.Queue()
       for (url, name, ctype) in URL_LIST:
          squeue.put(url)
    
       for i in range(10):
          threadName = 'Thread' + str(i)
          Worker(threadName, squeue)
    
       squeue.join()
    
    
    def main():
    
       #multiprocess_run()
    
       #async_run()
    
       multithread_run()
    
       # for (url, name, ctype) in URL_LIST:
       #    process(url)
       #[process(url) for (url, name, ctype) in URL_LIST]
       # entry_url = "https://www.geyanw.com/lizhimingyan/list_33_1.html"
       # process(entry_url)
    
    if __name__ == "__main__":
       main()
    
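    The async_run() above still blocks inside process(), because requests performs synchronous network I/O. Below is a minimal sketch of a genuinely non-blocking variant; it assumes Python 3.5+ and the third-party aiohttp package (not used in the original script), and it only covers downloading the entry pages, leaving the BeautifulSoup parsing unchanged. The names fetch, crawl_all and aiohttp_run are illustrative.

    import asyncio
    import aiohttp

    async def fetch(session, url):
        # download one page without blocking the event loop
        async with session.get(url) as resp:
            return await resp.text()

    async def crawl_all(url_list):
        # one shared session; all entry pages are fetched concurrently
        async with aiohttp.ClientSession() as session:
            tasks = [fetch(session, url) for (url, name, ctype) in url_list]
            return await asyncio.gather(*tasks)

    def aiohttp_run():
        loop = asyncio.get_event_loop()
        pages = loop.run_until_complete(crawl_all(URL_LIST))
        loop.close()
        return pages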
    
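    For the threaded version, the standard library already ships a managed pool, so the hand-rolled Worker class and queue can be swapped out. The sketch below is an alternative, not what the original post does; threadpool_run and max_workers are illustrative names.

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def threadpool_run(max_workers=10):
        # submit one crawl job per entry URL and report results as they complete
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {executor.submit(process, url): url
                       for (url, name, ctype) in URL_LIST}
            for future in as_completed(futures):
                print(futures[future], "->", future.result())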
    
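    Whichever concurrency model is chosen, download() calls requests.get() without a timeout, so a single stalled connection can hang a worker indefinitely. Here is a hedged sketch of a more defensive download; the retries, timeout and back-off policy are assumptions, not part of the original code.

    import time
    import requests

    def download_with_retry(url, retries=3, timeout=10):
        # try the request a few times before giving up, with a simple linear back-off
        for attempt in range(1, retries + 1):
            try:
                response = requests.get(url, timeout=timeout)
                response.raise_for_status()
                return response
            except requests.exceptions.RequestException as e:
                print("attempt %d for %s failed: %s" % (attempt, url, e))
                time.sleep(attempt)
        return None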
  • Original post: https://www.cnblogs.com/sunBinary/p/11129828.html