  • A simple Python crawler that saves Baidu and 360 search results to a database

    import requests
    import re
    from pyquery import PyQuery as Pq
    import pymysql.cursors

    # MySQL connection; results go into the `gamble` table of the `report` database
    connection = pymysql.connect(host='localhost', user='root', password='lihang',
                                 db='report', charset='utf8',
                                 cursorclass=pymysql.cursors.DictCursor)

    inssql = "INSERT INTO `gamble` (`url`, `title`, `detailurl`) VALUES (%s, %s, %s)"
    selsql = "SELECT * FROM `gamble` WHERE `url`=%s"

    s = requests.session()
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0",
        "Content-Type": "application/x-www-form-urlencoded",
        "Connection": "keep-alive",
        "DNT": "1"
    }

    url_360 = "https://www.so.com/s"
    pqyload_360 = {
        'q': '房地产',   # search keyword
        'pn': 1,         # 360 paginates by page number
        'ie': 'utf8'
    }

    url_baidu = "https://www.baidu.com/s"
    pqyload_baidu = {
        'wd': '房地产',  # search keyword
        'pn': 0,         # Baidu paginates by result offset (10 per page)
        'tn': 'monline_4_dg',
        'ie': 'utf-8'
    }

    # Skip any result whose detail URL contains one of these domains
    baimingdan = {
        "baidu.com",
        "douban.com",
        "tianya.cn"
    }
    
    def getbaidu():
        for i in range(100):  # number of result pages to crawl
            print(i)
            r = s.get(url_baidu, params=pqyload_baidu, headers=headers)
            page = Pq(r.content.decode('utf-8'))
            baiduUrls = []
            for site in page('div.result.c-container h3.t a').items():
                baiduUrls.append((site.attr('href'), site.text()))
            for tmpurl in baiduUrls:
                flag = True
                try:
                    # Baidu result links are redirects; read the Location
                    # header to recover the real target URL
                    tmpPage = s.get(tmpurl[0], allow_redirects=False)
                    try:
                        location = tmpPage.headers.get('location')
                        Ehttpurl = re.match(r"https?://.*?/", location).group(0)
                        for bb in baimingdan:  # apply the skip list
                            if bb in location:
                                flag = False
                                break
                        if flag:
                            with connection.cursor() as cursor:
                                cursor.execute(selsql, (Ehttpurl,))
                                if cursor.fetchone() is None:  # not seen before
                                    cursor.execute(inssql, (Ehttpurl, tmpurl[1], location))
                                    connection.commit()
                    except Exception as e:
                        print(e)
                except Exception as e:
                    print(e)
            pqyload_baidu["pn"] += 10  # next page: offset advances by 10
    
    def get360():
        for i in range(100):  # number of result pages to crawl
            print(i)
            r = s.get(url_360, params=pqyload_360, headers=headers)
            page = Pq(r.content.decode('utf-8'))
            urls = []
            for site in page('ul.result h3.res-title a').items():
                urls.append((site.attr('href'), site.text()))
            for tmpurl in urls:
                flag = True
                try:
                    tmpPage = s.get(tmpurl[0])  # fetch the intermediate page
                    try:
                        # 360 embeds the real target in a meta refresh: URL='...'
                        detailurl = re.search(r"URL='(.*?)'", tmpPage.content.decode('utf-8'), re.S)
                        httpurl = re.match(r"https?://.*?/", detailurl.group(1)).group(0)
                        for bb in baimingdan:  # apply the skip list
                            if bb in detailurl.group(1):
                                flag = False
                                break
                        if flag:
                            with connection.cursor() as cursor:
                                cursor.execute(selsql, (httpurl,))
                                if cursor.fetchone() is None:  # not seen before
                                    cursor.execute(inssql, (httpurl, tmpurl[1], detailurl.group(1)))
                                    connection.commit()
                    except Exception as e:
                        print(e)
                except Exception as e:
                    print(e)
            pqyload_360["pn"] += 1  # next page: page number advances by 1
    
    get360()
    getbaidu()
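
    To check what the crawler has collected, here is a minimal read-back sketch; it assumes the same `report` database and `gamble` table (and the `time` column from the schema below):

    import pymysql.cursors

    connection = pymysql.connect(host='localhost', user='root', password='lihang',
                                 db='report', charset='utf8',
                                 cursorclass=pymysql.cursors.DictCursor)
    with connection.cursor() as cursor:
        # Show the ten most recently collected results
        cursor.execute("SELECT `url`, `title`, `detailurl` FROM `gamble` "
                       "ORDER BY `time` DESC LIMIT 10")
        for row in cursor.fetchall():
            print(row['url'], row['title'])
    connection.close()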
    
    Database schema (the table name must match the `gamble` table used by the INSERT above):
    
    CREATE TABLE `gamble` (
      `id` int(11) NOT NULL AUTO_INCREMENT,
      `url` varchar(255) DEFAULT NULL,
      `detailurl` varchar(255) DEFAULT NULL,
      `time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP,
      `lv` varchar(255) DEFAULT NULL,
      `subtime` datetime DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP,
      `title` varchar(255) DEFAULT NULL,
      PRIMARY KEY (`id`)
    ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
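
    The script deduplicates by running a SELECT before every INSERT. As an alternative, MySQL can enforce uniqueness itself; the sketch below is not part of the original script (the index name `uniq_url` and the `INSERT IGNORE` rewrite are assumptions):

    import pymysql.cursors

    connection = pymysql.connect(host='localhost', user='root', password='lihang',
                                 db='report', charset='utf8',
                                 cursorclass=pymysql.cursors.DictCursor)
    with connection.cursor() as cursor:
        # One-time setup: reject duplicate `url` values at the database level
        # (`uniq_url` is a made-up index name)
        cursor.execute("ALTER TABLE `gamble` ADD UNIQUE KEY `uniq_url` (`url`)")
        # INSERT IGNORE silently skips rows whose `url` already exists
        cursor.execute("INSERT IGNORE INTO `gamble` (`url`, `title`, `detailurl`) "
                       "VALUES (%s, %s, %s)",
                       ('http://example.com/', 'sample title', 'http://example.com/page.html'))
    connection.commit()
    connection.close()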
    
    For questions, contact hudcan@sina.com. Personal site: http://ext.123456cc.cc
  • Original article: https://www.cnblogs.com/mysgk/p/9427007.html