zoukankan      html  css  js  c++  java
  • python爬虫实例

    这里有两个爬虫的实例,是刚开始学python用的,一个是爬取京东茅台酒评论的,另一个是爬取新浪网国内新闻的,两个都是网上的教程里边的,代码略微有些不同,供参考学习。

    都可以在 Anaconda 里跑

    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import re
    import json
    import pandas
    # Accumulates the detail dicts of every article scraped across all list pages.
    news_total=[]
    # Comment API URL template; `{}` is filled with the news id extracted from an article URL.
    commentURL='http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-{}&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20'
    # News list API URL template; `{}` is filled with the page number.
    url='http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&format=json&page={}&callback=newsloadercallback&_=1509373193047'
    def parseListLinks(url):
        """Fetch one page of the news list API and scrape every article on it.

        Args:
            url: Fully formatted list-API URL. The response is expected to be a
                JSONP payload wrapped as ``newsloadercallback(...);``.

        Returns:
            A list holding one ``getNewsDetail`` result per entry found under
            ``result -> data`` in the decoded payload.
        """
        res = requests.get(url)
        payload = res.text.strip()
        # str.lstrip/rstrip take a *set of characters*, not a prefix/suffix, so
        # the JSONP wrapper is removed explicitly instead.
        prefix = 'newsloadercallback('
        if payload.startswith(prefix):
            payload = payload[len(prefix):]
        if payload.endswith(');'):
            payload = payload[:-2]
        elif payload.endswith(')'):
            payload = payload[:-1]
        jd = json.loads(payload)
        return [getNewsDetail(ent['url']) for ent in jd['result']['data']]
            
    def getNewsDetail(newsurl):
        """Download one article page and extract its fields.

        Args:
            newsurl: URL of the article page.

        Returns:
            A dict with the keys ``title``, ``newssource``, ``dt`` (formatted as
            ``%Y-%m-%d-%H:%M``), ``article``, ``editor`` and ``comments``.

        Raises:
            IndexError: If one of the expected page elements is missing.
            ValueError: If the timestamp text does not match the expected format.
        """
        result={}
        res=requests.get(newsurl)
        res.encoding='utf-8'
        soup=BeautifulSoup(res.text,'html.parser')
        result['title']=soup.select('#artibodyTitle')[0].text
        result['newssource']=soup.select('.time-source span a')[0].text
        # The first child of .time-source is the timestamp text.
        timesource=soup.select('.time-source')[0].contents[0].strip()
        dt1=datetime.strptime(timesource,'%Y年%m月%d日%H:%M')
        result['dt'] =dt1.strftime('%Y-%m-%d-%H:%M')
        # The last paragraph of #artibody is skipped when joining the body text.
        result['article']=' '.join([p.text.strip() for p in soup.select('#artibody p')[:-1]])
        # str.strip(chars) removes any of the given characters from both ends, so
        # it could eat into the editor's name. Remove only the leading label,
        # accepting either an ASCII or a full-width colon after it.
        editor_text=soup.select('.article-editor')[0].text.strip()
        result['editor']=re.sub(r'^责任编辑\s*[:：]?\s*','',editor_text)
        result['comments']=getCommentCounts(newsurl)
        print('获得一条新闻')
        return result
           
        
    def getCommentCounts(newsurl):
        """Return the total comment count reported for an article.

        Args:
            newsurl: Article URL containing ``doc-i<newsid>.shtml``.

        Returns:
            The value found at ``result -> count -> total`` in the comment API
            response.

        Raises:
            ValueError: If no news id can be extracted from ``newsurl``.
        """
        # Raw string with an escaped dot so only a literal ".shtml" matches.
        m=re.search(r'doc-i(.+)\.shtml',newsurl)
        if m is None:
            raise ValueError('cannot extract news id from URL: '+str(newsurl))
        newsid=m.group(1)
        comments=requests.get(commentURL.format(newsid))
        # The response is a JavaScript assignment ("var data=..."). Drop that
        # prefix explicitly; str.strip(chars) would treat it as a character set.
        body=re.sub(r'^var\s+data\s*=\s*','',comments.text.strip())
        jd=json.loads(body)
        return jd['result']['count']['total']
    
    # Crawl list pages 1 through 7 and collect the details of every article.
    for page in range(1,8):
        print('正在爬取第{}页......'.format(page))
        news_total.extend(parseListLinks(url.format(page)))
    print('抓取结束')
    # Dump everything that was scraped into an Excel file.
    df=pandas.DataFrame(news_total)
    df.to_excel('news.xlsx')
    import requests 
    import re
    import json
    import time
    import xlwt
    
    #
    #
    # Spreadsheet configuration.
    # The four font/style statements below are boilerplate that, per the original
    # author, is what allows Chinese text to be written to the sheet.
    # NOTE(review): `style` is not passed to any ws.write call visible here.
    font=xlwt.Font()
    font.name='SimSun'
    style=xlwt.XFStyle()
    style.font=font

    # Create the workbook and add a single sheet to it.
    w=xlwt.Workbook(encoding='utf-8')
    ws=w.add_sheet('sheet 1',cell_overwrite_ok=True)
    # Index of the next sheet row to write; row 0 holds the header.
    row=1
    # Header row: one column per comment field, in output order.
    for col,title in enumerate((
        'content',
        'userClientShow',
        'creationTime',
        'userLevelName',
        'productColor',
        'userLevelId',
        'score',
        'referenceName',
        'referenceTime',
        'isMobile',
        'nickname',
    )):
        ws.write(0,col,title)
    
    #
    def write_json_to_xls(dat):
        """Write one page of comments to the sheet, one row per comment.

        Args:
            dat: Decoded JSON object; its ``'comments'`` entry is iterated and
                each comment must provide every key listed in ``fields``.

        Side effects:
            Writes cells to the module-level sheet ``ws`` and advances the
            module-level ``row`` counter by one for each comment written.
        """
        global row
        # Column order of the output sheet; the position is the column index.
        fields=(
            'content',
            'userClientShow',
            'creationTime',
            'userLevelName',
            'productColor',
            'userLevelId',
            'score',
            'referenceName',
            'referenceTime',
            'isMobile',
            'nickname',
        )
        for comment in dat['comments']:
            for col,key in enumerate(fields):
                ws.write(row,col,comment[key])
            row+=1
    
    #
    #
    # Fetch comment pages 1 through 10 and append each page to the sheet.
    for page in range(1,11):
        url='https://club.jd.com/comment/productPageComments.action?productId=1475512465&score=0&sortType=5&page=%d&pageSize=100&isShadowSku=0&fold=' % page
        try:
            dat=requests.get(url).json()
            write_json_to_xls(dat)
            print(u'写入一页数据')
        except Exception as e:
            # Best effort: report the failure and carry on with the next page.
            print(u'获取数据失败数据',e)
        # Short pause between consecutive requests.
        time.sleep(0.5)

    # Save the collected data to the spreadsheet file.
    w.save('result.xls')
  • 相关阅读:
    706. Design HashMap 实现哈希表
    5. Longest Palindromic Substring 返回最长的回文子串
    8. String to Integer (atoi) 字符串转成整数
    22. Generate Parentheses产生所有匹配括号的方案
    245. Shortest Word Distance III 单词可以重复的最短单词距离
    java之spring之初始spring
    java之hibernate之hibernate缓存
    java之hibernate之hibernate查询
    java之hibernate之加载策略和抓取策略
    java之hibernate之 cascade和inverse
  • 原文地址:https://www.cnblogs.com/wwwhza/p/7762461.html
Copyright © 2011-2022 走看看