zoukankan      html  css  js  c++  java
  • python爬虫实例

    python爬虫实例

     

    这里有两个爬虫的实例,是刚开始学python用的,一个是爬取京东茅台酒评论的,另一个是爬取新浪网国内新闻的,两个都是网上的教程里边的,代码略微有些不同,供参考学习。

    都可以在Anaconda里跑

    复制代码
    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import re
    import json
    import pandas
    # Accumulates one dict per scraped article across all listing pages.
    news_total=[]
    # Sina comment-count API; {} is filled with the article id extracted from
    # the article URL.  Responds with JavaScript of the form 'var data={...}'.
    commentURL='http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-{}&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20'
    # Domestic-news listing API; {} is filled with the page number.  Responds
    # with JSONP of the form 'newsloadercallback({...});'.
    url='http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&format=json&page={}&callback=newsloadercallback&_=1509373193047'
    def parseListLinks(url):
        """Fetch one page of the Sina news listing and return a list of
        article dicts (one per listing entry, parsed by getNewsDetail).

        The endpoint answers with JSONP: ``newsloadercallback({...});``.
        """
        newsdetails = []
        res = requests.get(url)
        # Bug fix: the original used lstrip('newsloadercallback(') /
        # rstrip(');'), but the str.strip family removes *character sets*,
        # not prefixes/suffixes, so valid leading/trailing JSON characters
        # could be eaten.  Extract the payload between the outermost braces
        # instead.
        body = res.text
        jd = json.loads(body[body.find('{'):body.rfind('}') + 1])
        for ent in jd['result']['data']:
            newsdetails.append(getNewsDetail(ent['url']))
        return newsdetails
            
    def getNewsDetail(newsurl):
        """Scrape one Sina news article page into a dict.

        Returns keys: title, newssource, dt (formatted timestamp), article
        (body text), editor, comments (count).  Assumes the legacy Sina
        article markup (#artibodyTitle, .time-source, #artibody); an
        IndexError escapes if a selector matches nothing.
        """
        result = {}
        res = requests.get(newsurl)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        result['title'] = soup.select('#artibodyTitle')[0].text
        result['newssource'] = soup.select('.time-source span a')[0].text
        timesource = soup.select('.time-source')[0].contents[0].strip()
        dt1 = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')
        result['dt'] = dt1.strftime('%Y-%m-%d-%H:%M')
        # Drop the last <p>: it carries the editor byline, not article text.
        result['article'] = ' '.join([p.text.strip() for p in soup.select('#artibody p')[:-1]])
        # Bug fix: .strip('责任编辑:') strips any of those *characters* from
        # both ends, so an editor name starting/ending with one of them
        # would be truncated.  Remove the literal prefix instead.
        editor = soup.select('.article-editor')[0].text.strip()
        prefix = '责任编辑:'
        if editor.startswith(prefix):
            editor = editor[len(prefix):]
        result['editor'] = editor
        result['comments'] = getCommentCounts(newsurl)
        print('获得一条新闻')
        return result
           
        
    def getCommentCounts(newsurl):
        """Return the comment total for an article via Sina's comment API.

        The article id is embedded in the URL as ``doc-i<ID>.shtml``.  The
        API answers with JavaScript of the form ``var data={...}``, so the
        JSON object is extracted between the outermost braces (the original
        ``strip('var data=')`` removed a *character set*, a latent bug if
        the JSON ever began or ended with one of those characters).
        Raises AttributeError if the URL does not match the expected shape.
        """
        # Raw string and escaped dot: the original 'doc-i(.+).shtml' let
        # '.' match any character before 'shtml'.
        m = re.search(r'doc-i(.+)\.shtml', newsurl)
        newsid = m.group(1)
        comments = requests.get(commentURL.format(newsid))
        body = comments.text
        jd = json.loads(body[body.find('{'):body.rfind('}') + 1])
        return jd['result']['count']['total']
    
    # Driver: scrape listing pages 1..7, accumulate the article dicts,
    # then dump everything to an Excel file in the working directory.
    for i in range(1,8):
        print('正在爬取第'+str(i)+'页......')
        newsurl=url.format(i)
        newsary= parseListLinks(newsurl)
        news_total.extend(newsary)
    print('抓取结束')
    df=pandas.DataFrame(news_total)
    df.to_excel('news.xlsx')
    复制代码
    复制代码
    import requests 
    import re
    import json
    import time
    import xlwt
    
    #
    # Spreadsheet setup.
    # The next four lines build an xlwt style whose font (SimSun) can
    # render Chinese, so CJK comment text writes into the sheet correctly.
    # NOTE(review): `style` is constructed but never passed to ws.write
    # below — confirm whether it should be supplied to the write calls.
    #
    style=xlwt.XFStyle()
    font=xlwt.Font()
    font.name='SimSun'
    style.font=font
    
    # Create the workbook...
    w=xlwt.Workbook(encoding='utf-8')
    # ...and a single sheet; cell_overwrite_ok lets re-runs rewrite cells.
    ws=w.add_sheet('sheet 1',cell_overwrite_ok=True)
    # Index of the next data row to write; row 0 is the header.
    row=1
    #
    # Header row: one column per comment field taken from the JD API.
    #
    ws.write(0,0,'content')
    ws.write(0,1,'userClientShow')
    ws.write(0,2,'creationTime')
    ws.write(0,3,'userLevelName')
    ws.write(0,4,'productColor')
    ws.write(0,5,'userLevelId')
    ws.write(0,6,'score')
    ws.write(0,7,'referenceName')
    ws.write(0,8,'referenceTime')
    ws.write(0,9,'isMobile')
    ws.write(0,10,'nickname')
    
    #
    # Take one decoded JSON object (one page of comments)
    # and write its contents into the spreadsheet,
    # one page per call.
    #
    def write_json_to_xls(dat):
        """Append one page of JD comment data to the module-level worksheet.

        ``dat`` is the decoded JSON from the productPageComments endpoint;
        each element of ``dat['comments']`` becomes one spreadsheet row.
        Advances the module-level ``row`` cursor as a side effect.
        Raises KeyError if a comment lacks one of the expected fields
        (same behavior as the original).
        """
        global row
        # Column order must match the header row written at module level
        # (columns 0..10).  Driving the writes from this tuple replaces the
        # original 11 copy-pasted ws.write calls — same cells, same order.
        fields = ('content', 'userClientShow', 'creationTime',
                  'userLevelName', 'productColor', 'userLevelId', 'score',
                  'referenceName', 'referenceTime', 'isMobile', 'nickname')
        for comment in dat['comments']:
            for col, key in enumerate(fields):
                ws.write(row, col, comment[key])
            row += 1
    
    #
    #
    # Driver: fetch up to 10 pages of comments for one JD product
    # (productId hard-coded in the URL) and write each page to the sheet.
    #
    #
    for i in range(1,10+1):
        # page=%d selects the page; pageSize=100 comments per request.
        url='https://club.jd.com/comment/productPageComments.action?productId=1475512465&score=0&sortType=5&page=%d&pageSize=100&isShadowSku=0&fold=' % i
        try:
            json_req = requests.get(url)
            dat = json_req.json()
            write_json_to_xls(dat)
            print(u'写入一页数据')
        # Broad catch is a deliberate best-effort: a bad page is logged
        # and skipped rather than aborting the whole run.
        except Exception as e:
           print(u'获取数据失败数据',e)
        # Throttle slightly between requests.
        time.sleep(0.5)
    
    
    # Persist the workbook to disk.
    w.save('result.xls')
    复制代码
     
  • 相关阅读:
    ansible执行命令或playbook报错原因及解决方法整理
    一款基于WordPress生成的微信小程序源码,免费开源
    WordPress小程序之酱茄Free小程序开源版更新敏感词检测功能
    酱茄pro小程序发布直播和地理位置功能(WordPress小程序)
    WordPress小程序-酱茄cms(积分阅读小程序)V1.1.0发布
    酱茄Free主题 – 酱茄WordPress资讯主题免费开源版下载
    酱茄Pro小程序V1.6.6更新之订阅消息发布
    WordPress小程序源码下载 酱茄开源版小程序源码
    酱茄助你三分钟打造专属WordPress社区论坛小程序
    酱茄cms小程序专为WordPress内容/知识付费场景设计
  • 原文地址:https://www.cnblogs.com/Jeremy2001/p/10203323.html
Copyright © 2011-2022 走看看