  • Daily Learning

    Today I started crawling data:

    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import json
    import pymysql

    def getnewsdetail(newsurl):
        res = requests.get(newsurl)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        # Article title
        if soup.select('.main-title'):
            title = soup.select('.main-title')[0].text
        else:
            title = '异常爬取'  # sentinel meaning "crawl failed"
        # Publication time, normalized to YYYY-MM-DD
        if soup.select('.date-source span'):
            timesource = soup.select('.date-source span')[0].text
            dt = datetime.strptime(timesource, '%Y年%m月%d日 %H:%M')
            timesource = dt.strftime('%Y-%m-%d')  # keep the formatted result
        else:
            timesource = '异常爬取'
        # News source
        if soup.select('.date-source a'):
            place = soup.select('.date-source a')[0].text
        elif soup.select('#top_bar > div > div.date-source > span.source'):
            place = soup.select('#top_bar > div > div.date-source > span.source')[0].text
        else:
            place = '异常爬取'
        # Article body: store the text rather than the Tag object so it can be written to MySQL
        if soup.select('#artibody'):
            articleall = soup.select('#artibody')[0].text.strip()
        elif soup.select('#article'):
            articleall = soup.select('#article')[0].text.strip()
        else:
            articleall = '异常爬取'
        # Editor's name; the last paragraph reads like "责任编辑:XXX"
        if soup.select('#article p'):
            editor = soup.select('#article p')[-1].text.strip().replace('责任编辑:', '')
        else:
            editor = '异常爬取'
        return [title, timesource, place, editor, articleall]
    def parseListLinks(url):
        newsdetail = []
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
        res = requests.get(url, headers=headers)
        # The list API returns JSONP wrapped as "try{jQuery...(...);}catch(e){};",
        # so slice off the fixed-length prefix and suffix before parsing
        jd = json.loads(res.text[47:-14])
        for ent in jd['result']['data']:
            newsdetail.append(getnewsdetail(ent['url']))
        return newsdetail
    url = 'https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2509&k=&num=50&page={}&r=0.7778780795677847&callback=jQuery1112046350965709357705_1620651288029&_=1620651288032'
    news_total = []
    for i in range(2):  # fetch the first two pages of the news list
        newsurl = url.format(i)
        newsary = parseListLinks(newsurl)
        news_total.extend(newsary)
        print(i)
    tuplist = tuple(news_total)
    db = pymysql.connect(host="localhost", user="root", password="1229", database="lianxi", charset='utf8')
    cursor = db.cursor()
    sql_xinwen2 = "INSERT INTO xinwen2 values (%s,%s,%s,%s,%s)"
    try:
        cursor.executemany(sql_xinwen2, tuplist)
        db.commit()
    except Exception:
        print('Insert failed, rolling back')
        db.rollback()
    db.close()
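
    The INSERT above assumes a table xinwen2 with five columns, but the post never shows its definition. A plausible setup script is sketched below; the column names and types are my assumption (ordered to match [title, timesource, place, editor, articleall]), not taken from the original:

    import pymysql

    # Hypothetical schema for xinwen2 -- names and types are assumed, not from the original post
    create_sql = """
    CREATE TABLE IF NOT EXISTS xinwen2 (
        title      VARCHAR(255),
        timesource VARCHAR(64),
        place      VARCHAR(128),
        editor     VARCHAR(128),
        article    TEXT
    ) DEFAULT CHARSET=utf8mb4;
    """

    db = pymysql.connect(host="localhost", user="root", password="1229", database="lianxi", charset='utf8')
    with db.cursor() as cursor:
        cursor.execute(create_sql)
    db.commit()
    db.close()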
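
    One fragile spot: res.text[47:-14] only works while the callback name in the URL keeps exactly the same length. A more defensive sketch (my addition, not the original author's approach) locates the payload by the positions of the wrapping parentheses instead:

    import json

    def strip_jsonp(text):
        # Extract the JSON payload from a JSONP response such as
        # "try{jQueryXXX({...});}catch(e){};" or "jQueryXXX({...});".
        # Assumes ');' does not occur inside the JSON itself.
        start = text.find('(') + 1   # first '(' opens the callback call
        end = text.rfind(');')       # last ');' closes it
        if start == 0 or end == -1:
            raise ValueError('no JSONP payload found')
        return json.loads(text[start:end])

    With this helper, jd = strip_jsonp(res.text) could replace the fixed slice in parseListLinks.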

    This is a crawler for Sina news data (the list API above belongs to feed.mix.sina.com.cn); I will crawl other news sites later as well.

    Author: 哦心有
    The copyright of this article belongs to the author and 博客园 (cnblogs). You are welcome to repost it, but you must include a link to the original article and keep this statement; otherwise the author reserves the right to pursue legal liability.
  • Original article: https://www.cnblogs.com/haobox/p/14911336.html