zoukankan      html  css  js  c++  java
  • dogedoge浏览器爬取标题

    # coding:utf-8
    import hashlib
    
    import datetime
    import lxml
    import pymysql
    import requests
    
    from lxml import etree
    import sys
    
    # Python 2-only workaround: site.py removes sys.setdefaultencoding at
    # interpreter startup, so the sys module is reloaded to get it back.
    reload(sys)

    # Make implicit str <-> unicode conversions use UTF-8 instead of ASCII.
    sys.setdefaultencoding('utf-8')
    
    def _extract_domain(url):
        """Return the host part of ``url``, with or without a scheme prefix."""
        # Only strip a scheme when '://' is really present; a bare 'http'
        # substring somewhere in the path must not shift the split index.
        if '://' in url:
            url = url.split('://', 1)[1]
        return url.split('/')[0]


    def search_data(kw, n, timeout=10):
        """Fetch one dogedoge result page, store its hits and report paging.

        :param kw: search keyword.
        :param n: 1-based page number; the ``p`` query argument is only sent
            for pages after the first.
        :param timeout: seconds to wait for the HTTP response.
        :return: the list of ``rld-2`` elements found on the page (truthy
            when such an element exists), or ``''`` when there is none.
        """
        # Let requests build the query string so the keyword is URL-escaped,
        # and issue exactly one request per page.
        params = {'q': kw}
        if n > 1:
            params['p'] = n
        res = requests.get('https://www.dogedoge.com/results',
                           params=params, timeout=timeout)

        con = etree.HTML(res.text)
        if con is None:
            # NOTE(review): lxml appears to yield None for an empty document;
            # treat that as "no results, no next page" - confirm.
            print('没有下一页了')
            return ''

        results = con.xpath('//div[@class="result results_links_deep highlight_d result--url-above-snippet"]')
        ll = []
        for node in results:
            title = ''.join(node.xpath('./div/h2/a//text()'))
            link = ''.join(node.xpath('./div/div/div/a/span//text()'))
            ll.append({
                'keywd': kw,
                'domain': _extract_domain(link),
                'title': title,
                # Encode explicitly so hashing does not depend on the
                # interpreter's default encoding.
                'md5': hashlib.md5(link.encode('utf-8')).hexdigest(),
                'url': link,
                'searcher': 'dogedoge',
            })
        save(ll)

        # xpath() returns an empty list rather than raising when nothing
        # matches, so emptiness is the actual "last page" signal.
        next_page = con.xpath('//div[@id="rld-2"]')
        if not next_page:
            print('没有下一页了')
            return ''
        return next_page
    
    
    def main(kw):
        """Crawl successive result pages for ``kw``.

        Pages are requested starting at 1; crawling stops as soon as
        ``search_data`` returns a falsy value for a page.
        """
        page = 1
        while search_data(kw, page):
            page += 1
    
    
    def save(ll):
        """Insert scraped result rows into the ``weixintb`` MySQL table.

        Every row is committed on its own; a row that fails is rolled back
        and logged, and the remaining rows are still attempted.

        :param ll: list of dicts with the keys ``md5``, ``keywd``, ``title``,
            ``url``, ``domain`` and ``searcher``.
        """
        import logging

        # Nothing to store: do not open a database connection at all.
        if not ll:
            return

        logger = logging.getLogger(__name__)
        db = pymysql.connect(
            host=MYSQL_HOST,
            db=MYSQL_DBNAME,
            user=MYSQL_USER,
            passwd=MYSQL_PASSWD,
            charset='utf8',
            use_unicode=True)
        # try/finally guarantees the cursor and connection are released even
        # if something outside the per-row handler raises.
        try:
            cursor = db.cursor()
            try:
                for item in ll:
                    try:
                        # Insert the row.
                        cursor.execute(
                            "insert into weixintb(md5,keyword,title,url,`date`,`domain`, browser) value(%s, %s, %s, %s, %s, %s,%s)",
                            (item['md5'],
                             item['keywd'],
                             item['title'],
                             item['url'],
                             datetime.datetime.now(),
                             item['domain'],
                             item['searcher']
                             ))
                        # Commit this row immediately.
                        db.commit()
                    except Exception:
                        # Best effort per row: record the failure instead of
                        # hiding it, undo the partial work and carry on.
                        logger.exception('failed to insert search result row')
                        db.rollback()
            finally:
                cursor.close()
        finally:
            db.close()
    
    # Script entry point: runs unconditionally (also when the module is
    # imported) and crawls result pages for the hard-coded keyword.
    main('爬取关键词')
  • 相关阅读:
    第二节. SignalR开篇以及如何指定传输协议
    第一节:.Net版基于WebSocket的聊天室样例
    Maven常用dependency记录
    Linux学习笔记
    Maven 学习笔记
    SQL脚本去重分组统计
    版本号定义
    C# Random循环生成随机数重复问题解决方案
    C#通过Oracle.ManagedDataAccess无法访问Oralce
    Java注解的使用,类似于C#的Attribute
  • 原文地址:https://www.cnblogs.com/qxh-beijing2016/p/12770181.html
Copyright © 2011-2022 走看看