zoukankan      html  css  js  c++  java
  • python 爬取文章后存储excel 以及csv

    import requests
    from bs4 import BeautifulSoup
    import random
    import openpyxl
    # Scrape the Douban Top 250 movie list and store rank, name, rating,
    # tagline and detail link in the "movies" sheet of douban.xlsx.
    xls = openpyxl.Workbook()
    sheet = xls.active
    sheet.title = 'movies'
    sheet['A1'] = '序号'
    sheet['B1'] = '名称'
    sheet['C1'] = '评分'
    sheet['D1'] = '推荐语'
    sheet['E1'] = '链接'

    # Request settings do not change between pages, so build them once.
    url = 'https://movie.douban.com/top250'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
    }

    # 250 entries at 25 per page -> start offsets 0, 25, ..., 225 (10 pages).
    for i in range(10):
        params = {
            'start': str(i * 25),
            'filter': '',
        }
        # A timeout keeps the script from hanging forever on a stalled connection.
        res = requests.get(url, params=params, headers=headers, timeout=10)
        soup = BeautifulSoup(res.text, 'html.parser')
        maindiv = soup.find(class_="grid_view")
        if maindiv is None:
            # The response carries no movie list; skip this page.
            print('page ' + str(i + 1) + ': movie list not found, skipped')
            continue
        for titles in maindiv.find_all('li'):
            num_tag = titles.find('em', class_="")                # rank
            title_tag = titles.find('span', class_="title")       # movie name
            rating_tag = titles.find('span', class_="rating_num")  # rating
            link_tag = titles.find('a')                           # detail page link
            if (num_tag is None or title_tag is None
                    or rating_tag is None or link_tag is None):
                # Not a complete movie entry; nothing useful to record.
                continue
            # The tagline is optional: some entries have none. Record an empty
            # string instead of dropping the whole movie.
            inq_tag = titles.find('span', class_="inq")
            tes = inq_tag.text if inq_tag is not None else ''

            num = num_tag.text
            title = title_tag.text
            comment = rating_tag.text
            url_movie = link_tag.get('href', '')
            print(num + '.' + title + '——' + comment + '\n' + '推荐语:' + tes + '\n' + url_movie)
            # Column order matches the header row: rank, name, rating, tagline, link.
            sheet.append([num, title, comment, tes, url_movie])
    xls.save('douban.xlsx')

    csv:

    import requests
    from bs4 import BeautifulSoup
    import random
    import openpyxl
    import csv
    
    # Page through a Zhihu member's articles (sorted by votes) via the JSON API
    # and write title, excerpt and link of every article to dazhangwei.csv.
    url = "https://www.zhihu.com/api/v4/members/zhang-jia-wei/articles"
    headers = {
        'referer': 'https://www.zhihu.com/people/zhang-jia-wei/posts/posts_by_votes?page=1',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    # The field selection is the same for every page, so build it once.
    include = 'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics'

    # The context manager closes the file even if a request or parse step raises.
    with open('dazhangwei.csv', 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        header = ['标题', '简介', '连接']
        writer.writerow(header)
        x = 0
        while True:
            params = {
                'include': include,
                'offset': str(x * 10),
                'limit': '10',
                'sort_by': 'voteups',
            }
            # A timeout keeps the script from hanging forever on a stalled connection.
            res = requests.get(url, headers=headers, params=params, timeout=10)
            res_json = res.json()
            con = res_json['data']
            for i in con:
                # Same order as the header row: title, summary (excerpt), link.
                writer.writerow([i['title'], i['excerpt'], i['url']])
            # Stop on the last page; also stop on an empty page so a response
            # that never reports is_end cannot loop forever.
            if res_json['paging']['is_end'] or not con:
                break
            x += 1
  • 相关阅读:
    6. 模块pickle\json\random\os\zipfile\面对对象(类的封装 操作 __init__)
    xlwt模块,(Excel表格)
    5. 迭代器生成器高阶函数推导式内置函数模块(math.time)
    4. 函数参数变量闭包递归
    3. 深浅拷贝/格式化/字符串/列表/字典/集合/文件操作
    2. 运算/循环/字符串操作
    1. 变量/数据类型
    Puppet自动化部署-安装及配置(3)
    Puppet自动化部署-前期环境准备(2)
    Puppet自动化运维-资源介绍篇(4)
  • 原文地址:https://www.cnblogs.com/houdj/p/12034612.html
Copyright © 2011-2022 走看看