  • requests + Redis distributed crawler

    # __author__ = ''
    # __createTime__ = '2019/1/7 13:49'
    # __description__ = ''
    # -*- coding:utf-8 -*-
    import random
    from itertools import chain
    from urllib.parse import quote
    from concurrent.futures import ThreadPoolExecutor
    from redis import Redis
    import pymysql
    import requests
    from lxml import etree
    '''redis + requests distributed crawler'''
    
    # note: the Redis host looks redacted here; fill in redis://<host>:6379 before running
    redis_connect = Redis.from_url("redis://:6379", decode_responses=True)
    db = pymysql.connect(host='193.112.41.49', user='', password="",
                                      database='spiders', port=3306,
                                      charset='utf8mb4')
    cursor = db.cursor()
    
    class Conton_Fair():
        def __init__(self,url):
            self.url = url
            self.headers = {
                            'Accept': '*/*',
                            'Accept-Encoding': 'gzip, deflate',
                            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7',
                            'Cache-Control': 'no-cache',
                            'Connection': 'keep-alive',
                            'Cookie': 'ASP.NET_SessionId=u1rolptswy22kite05yuu2dr; Hm_lvt_26d823f5326e82607b28c9dd5bb3276f=1546075438; Hm_lpvt_26d823f5326e82607b28c9dd5bb3276f=1546075438; _gcl_au=1.1.1828690268.1546075439; _ga=GA1.3.682141728.1546075439; _ym_uid=15460754431066088148; _ym_d=1546075443; ASPSESSIONIDSQARTRST=JBKMEFAABPPOIONCBCGLIDOM; cookie-notification=1; ASPSESSIONIDQASDDBCA=ODAOCGMCBGEJAHGFIDCKFJHL; _ctauu_469_1=%7B%22uuid%22%3A%22cp21gbzc66s18asqrg96%22%2C%22vsts%22%3A2%2C%22imps%22%3A%7B%7D%2C%22cvs%22%3A%7B%7D%7D; safedog-flow-item=; WT_FPC=id=2eedfbfb975c7db4e0b1546075438399:lv=1546830767948:ss=1546830613964',
                            'Host': 'www.cantonfair.org.cn',
                            'Pragma': 'no-cache',
                            'Referer':self.url,
                            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
                            }
        def Get_url(self):
            htmls = requests.get(url=self.url,headers = self.headers)
            html = etree.HTML(htmls.text)
            return self.Save_url(html)
    
        def Save_url(self,html):
            h4 = html.xpath('//li//h4')
            for Company in h4:
                if Company.xpath('.//text()'):
                    link =(Company.xpath('./a/@href')[0].replace('Product', 'Company').split('&productid')[
                              0] + '&corptype=1').replace('en', 'cn')
                    # add the company link to the shared cache (a Redis set)
                    redis_connect.sadd("urls", link)
            # next page
            Next = html.xpath('//a[text()="Next"]/@href')
            if Next:
                self.url = 'http://www.cantonfair.org.cn/en/search/%s'%Next[0]
                self.Get_url()
    def main(kw):
        url_datas = quote(kw)
        # the listing URL was truncated in the original post; reconstructed
        # from the Host header and the next-page URL above
        url = 'http://www.cantonfair.org.cn/en/search/list.aspx?k=%s&lang=2&len=100' % url_datas
        Class_Conton = Conton_Fair(url)
        Class_Conton.Get_url()
    
    
    if __name__ == '__main__':
        # while True:
        ssql = """SELECT kw FROM words WHERE status=0 or status=5 LIMIT 100 """
        cursor.execute(ssql)
        dataAll = cursor.fetchall()
        list_url = list(chain.from_iterable(dataAll))
        with ThreadPoolExecutor(3) as executor:
            for data_url in list_url:
                executor.submit(main,data_url)
                # mark the keyword as taken; use a parameterized query
                # rather than %r string formatting to avoid SQL injection
                upda = '''UPDATE words SET status=5 WHERE kw=%s'''
                cursor.execute(upda, (data_url,))
                db.commit()

    My approach to distributing the crawl: one machine fetches the listing pages and saves the extracted URLs to the cache. Fetching URLs is always faster than parsing them, and each listing page yields dozens of links, so even if every machine runs at the same speed, one crawling pass produces enough URLs to keep several parsing machines busy at once.
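
    A Redis set happens to be a convenient frontier for exactly this producer/consumer split: sadd silently ignores URLs that are already queued, and spop removes and returns a random member atomically, so the crawling machine and the parsing machines never need to coordinate. A minimal illustration of those two calls (the host and example URLs are placeholders):

    from redis import Redis

    r = Redis.from_url("redis://localhost:6379", decode_responses=True)  # placeholder host

    # sadd reports how many members were actually added, so re-queuing a
    # link the crawler has already seen is a harmless no-op
    print(r.sadd("urls", "http://example.com/a"))  # 1 (newly queued)
    print(r.sadd("urls", "http://example.com/a"))  # 0 (already queued)

    # spop atomically removes and returns a random member, so two parsing
    # machines can never receive the same URL
    print(r.spop("urls"))  # 'http://example.com/a'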

    Next comes the parsing side:

    The data this site serves is loaded dynamically. My JavaScript is weak and I didn't want to hunt down the page's functions, so I just render the pages with Splash. It fills much the same role as Selenium, but Splash is faster, so that's what I went with.

    It's worth reading up on.
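
    One practical Splash detail the script below does not use, so treat this as an optional extra: render.html also accepts a wait argument (seconds to pause after the page loads before snapshotting the HTML), which helps when the data arrives through late JavaScript requests. A minimal sketch, assuming a Splash instance on localhost:8050 and a function name of my own choosing:

    import requests

    def splash_render_waiting(url, wait=2.0):
        # wait gives late XHR/JS requests time to finish before the snapshot
        args = {"url": url, "timeout": 10, "wait": wait, "images": 0}
        response = requests.get("http://localhost:8050/render.html", params=args)
        return response.text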

    # __author__ = ''
    # __createTime__ = '2019/1/7 15:20'
    # __description__ = 'brief description of the code'
    
    import time
    import requests
    from redis import Redis
    
    # same Redis instance the crawler writes to; fill in the host as above
    redis_connect = Redis.from_url("redis://:6379", decode_responses=True)
    
    def splash_render(url):
        # assuming Splash runs locally on its default port 8050; the host
        # in the original post was garbled
        splash_url = "http://localhost:8050/render.html"
    
        args = {
            "url": url,
            "timeout": 5,
            "images": 0  # Splash's parameter is "images" (0 = skip image downloads)
        }
        response = requests.get(splash_url, params=args)
        return response.text
    
    
    if __name__ == '__main__':
        # check whether the cache holds any URLs
        if redis_connect.exists("urls"):
            # pop one URL at random and remove it; if you need dedup on top,
            # consider a Bloom filter (a sketch follows this script)
            url = redis_connect.spop("urls")
            html = splash_render(url)
            print(html)
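
    On the Bloom filter mentioned in the comment above: a minimal sketch of one kept in a Redis bitmap, using only setbit. The key name, bitmap size, and hash count are illustrative choices, not values from the original post.

    from hashlib import md5

    BLOOM_KEY = "urls_bloom"   # illustrative key
    BLOOM_BITS = 1 << 24       # ~16M bits; size this to the expected URL volume
    NUM_HASHES = 4

    def _bit_offsets(url):
        # derive NUM_HASHES bit positions from salted md5 digests
        for seed in range(NUM_HASHES):
            digest = md5(("%d:%s" % (seed, url)).encode("utf-8")).hexdigest()
            yield int(digest, 16) % BLOOM_BITS

    def seen_before(url):
        """Mark url in the filter; return True if it was (probably) seen."""
        previous_bits = 0
        for offset in _bit_offsets(url):
            # setbit returns the bit's previous value (0 or 1)
            previous_bits += redis_connect.setbit(BLOOM_KEY, offset, 1)
        return previous_bits == NUM_HASHES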

    This parsing script can be copied to any number of machines and run on all of them at once, in practice as a long-running loop (see the sketch below). Of course, the above is only a bare-bones version, so don't assume that a distributed crawler ends here.
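
    In practice each machine would run the pop-and-render step as a long-lived loop rather than the one-shot if above. A minimal sketch that reuses redis_connect and splash_render from the script; the sleep interval and the print stand-in for real parsing are illustrative:

    import time

    def worker():
        while True:
            url = redis_connect.spop("urls")
            if url is None:
                # nothing queued right now; wait for the crawler to refill
                time.sleep(5)
                continue
            html = splash_render(url)
            # a real worker would parse html with lxml and write to MySQL here
            print(url, len(html))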

    The above is intended as class notes; if it overlaps with anything else, please contact me.
  • Original post: https://www.cnblogs.com/ArtisticMonk/p/10255792.html