zoukankan      html  css  js  c++  java
  • 团队项目——爬取搜狐新闻

    通过json爬取新闻数据(1)——搜狐不同的新闻主页有不同的页码格式

    #-*-coding:utf-8-*-
    # @Time :2021/4/17 14:58
    # @Author:shuaichao
    # @File :.py
    # @Software: PyCharm
    import gzip
    import json
    import re                            # regular expressions
    import time
    import traceback
    import urllib.error
    import urllib.request                # build requests and fetch page data

    import pymysql
    import requests
    from bs4 import BeautifulSoup        # HTML parsing
    from pip._vendor.six import BytesIO
    # Fetch the page content of a given URL
    
    
    def askUrl(url):
        """Fetch *url* and return the response body decoded as UTF-8.

        The request is sent with a desktop-browser ``User-Agent`` header.

        Args:
            url: Address to request.

        Returns:
            The decoded response body, or an empty string when the request
            fails with ``urllib.error.URLError``; in that case the error's
            ``code`` and/or ``reason`` (whichever are present) are printed.
        """
        head = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
        }
        # No ``__name__`` guard here: the function must also work when this
        # module is imported, otherwise it would silently return None.
        request = urllib.request.Request(url, headers=head)
        html = ""
        try:
            # The context manager closes the response even if decoding fails.
            with urllib.request.urlopen(request) as response:
                html = response.read().decode("utf-8")
        except urllib.error.URLError as e:
            # HTTPError carries ``code``; a plain URLError only has ``reason``.
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
        return html
    # Connect to the database
    def get_conn(host="localhost", user="root", password="qwer1234",
                 database="news", charset="utf8mb4"):
        """Open a MySQL connection and a cursor on it.

        Called without arguments, this connects with the same settings the
        script has always used.

        Args:
            host: MySQL server host name.
            user: Account used to log in.
            password: Password for *user*.
            database: Schema selected after connecting.
            charset: Character set of the connection.

        Returns:
            A ``(conn, cursor)`` tuple; release both with ``close_conn``.
        """
        # TODO(review): the default password is still stored in source; move it
        # to configuration or the environment before this is shared or deployed.
        conn = pymysql.connect(
            host=host,
            user=user,
            # ``password``/``database`` replace the deprecated ``passwd``/``db``.
            password=password,
            database=database,
            charset=charset,
        )
        cursor = conn.cursor()
        return conn, cursor
    #关闭数据库
    def close_conn(conn, cursor):
        """Close the cursor and then the connection, skipping any falsy one.

        Args:
            conn: Connection object exposing ``close()``, or ``None``.
            cursor: Cursor object exposing ``close()``, or ``None``.
        """
        # Order matters: the cursor is released before its connection.
        for resource in (cursor, conn):
            if resource:
                resource.close()
    #更新新闻数据
    def update_news(allinfo):
        """Insert scraped news rows into the ``new`` table.

        Args:
            allinfo: Sequence of rows, each a ``(title, article, fenlei)``
                sequence matching the three placeholders of the insert.

        Any ordinary exception raised while connecting, inserting or
        committing is printed with its traceback and not re-raised. The
        commit runs only after every row was inserted, and the connection
        and cursor are always closed.
        """
        cursor = None
        conn = None
        sql = "insert into new(title, article, fenlei) values(%s,%s,%s)"
        try:
            conn, cursor = get_conn()
            print(f"{time.asctime()}开始更新最新数据")
            # One batched call instead of one execute() per row.
            cursor.executemany(sql, allinfo)
            conn.commit()
            print(f"{time.asctime()}更新最新数据完毕")
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
            # SystemExit still stop the script.
            traceback.print_exc()
        finally:
            close_conn(conn, cursor)
    
    #爬取网页信息
    def get_info(baseurl):
        """Download *baseurl* with ``askUrl`` and parse the result as HTML.

        Args:
            baseurl: Address of the page to fetch.

        Returns:
            A ``BeautifulSoup`` tree built with the ``html.parser`` backend.
        """
        return BeautifulSoup(askUrl(baseurl), "html.parser")
    #soup处理并转换成字符串
    def transport(bs, info):
        """Look up elements by CSS class and also return them as a string.

        Args:
            bs: Parsed document exposing ``find_all(class_=...)``.
            info: CSS class name to search for.

        Returns:
            A ``(matches, text)`` tuple where ``matches`` is whatever
            ``find_all`` returned and ``text`` is ``str(matches)``.
        """
        matches = bs.find_all(class_=info)
        return matches, str(matches)
    if __name__=="__main__":
        head = {
            # "Cookie": "pgv_pvid = 2445437098;RK = IWJFENCj / 2;ptcz = 0dc31e9c452a0701259378ea4d93881f2a4d4ab7d29d637d6da1b0b24d857f4c;Qs_lvt_323937 = 1588214559;Qs_pv_323937 = 3783410537228747000;pgv_pvi = 5491528704;eas_sid = t196y05258V4B6g478m7t073P2;luin = o0775929901;lskey = 000100001264ed0bece633b72b741fb54e5137a729bfa3647db8a18c0ee96579fd05aff03206e6cafbeb0f88",
            # "Connection": "keep-alive",
            # "Cache-Control": "max-age = 0",
            # "Accept-Language": "zh - CN, zh;q = 0.9",
            # "Accept-Encoding": "gzip, deflate, br",
            # "Accept": "text / html, application / xhtml + xml, application / xml;q = 0.9, image / webp, image / apng, * / *;q = 0.8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
        }
        # 存放所有的新闻网址
        linkall = []
        # 所有存放新闻的.js文件
        linkJQ = []
        # 所有超链接id
        Linkid = []
        # 所有超链接Authorid
        LinkAid = []
        # 存放所有标题
        allTitle = []
        # 存放所有文章
        allArticle = []
        # 存放所有图片链接
        allImg = []
        # 汇总所有存入mysql的数据
        allinfo = []
        #去除json数据中的非法字符
        # str = 'https://v2.sohu.com/integration-api/mix/region/10459?size=25&adapter=pc&secureScore=50&page=2&pvId=16193134284174ePlzLC&requestId=2011032041009993_1619313472935&callback=jQuery1124048872472139460466_1619313428177&_=1619313428180'
        # res = requests.get(str, headers=head)
        # r = res.text.replace('/**/jQuery1124048872472139460466_1619313428177(', '')[:-2]
        # response_data = json.loads(r, strict=False)
        # print(response_data['data'][0]['authorId'])
        # print(response_data['data'][0]['id'])
    
        # 制作每个js网页的链接
    
        index = 'https://v2.sohu.com/integration-api/mix/region/10459?page=1&size=500'
        res = requests.get(index, headers=head)
        response_data = json.loads(res.text)
        #存入每个新闻的id和authorid
        for index, value in enumerate(response_data['data']):
            if int(response_data['data'][index]['id']) > 1000000:
                Linkid.append(response_data['data'][index]['id'])
                LinkAid.append(response_data['data'][index]['authorId'])
        # 制作旅游新闻所有网址
        for index, value in enumerate(Linkid):
            linkall.append('https://www.sohu.com/a/' + str(Linkid[index])+'_' + str(LinkAid[index]) + '?scm=1002.590044.0.10372-1021&spm=smpc.ch25.content1-n-1.3.1619334835898hGaWxuo')
        # 最后一个链接是广告,删除
        # linkall.pop()
        #开始爬取主要数据
        for index, value in enumerate(linkall):
            bs = get_info(value)
            title = bs.select("h1")
            article = bs.select("article > p")
            if title and article:
                a = []
                str = ''
                # 总标题表添加标题
                allTitle.append(title[0].get_text().strip().replace("原创", "").replace("
    ", ""))
                print(index)
                print(value)
                print(title[0].get_text().strip().replace("原创", ""))
                # 总文章表添加文章
                for item in range(1, len(article)):
                    str += article[item].get_text()
                # article = article[0].get_text().replace("返回搜狐,查看更多", "").replace("责任编辑:", "").replace(r"
    ", "")
                allArticle.append(str.replace("返回搜狐,查看更多", "").replace("责任编辑:", ""))
                # 总图片表添加图片
                # ex_info, info = transport(bs, "ql-align-center")
                # findImg = re.compile(r'<p class="ql-align-center"><img max-width="600" src="(.*?)"/></p>')
                # Img = re.findall(findImg, info)
                # if Img:
                #     allImg.append(Img)
                # else:
                #     allImg.append("")
            else:
                print(index)
                print(value)
                del linkall[index]
        for index, value in enumerate(allTitle):
            allinfo.append([value])
            allinfo[index].append(allArticle[index])
            allinfo[index].append('教育')
        update_news(allinfo)
  • 相关阅读:
    Hadoop笔记
    InnoDB存储引擎概述--文件,表,索引,锁,事务的原理与实现
    SpringCloud-Eureka
    spring boot启动报错Error starting ApplicationContext(未能配置数据源)
    SSM框架配置
    SpringMvc笔记
    MySql笔记-->3
    MySql笔记-->2
    MySql笔记 -->1
    C# Lambda表达式
  • 原文地址:https://www.cnblogs.com/chaogehahaha/p/14762958.html
Copyright © 2011-2022 走看看