  • Python: scraping articles and saving the results to Excel and CSV

    import requests
    from bs4 import BeautifulSoup
    import openpyxl
    xls=openpyxl.Workbook()
    sheet=xls.active
    sheet.title='movies'
    # header row: 序号 (rank), 名称 (title), 评分 (rating), 推荐语 (recommendation), 链接 (link)
    sheet['A1']='序号'
    sheet['B1']='名称'
    sheet['C1']='评分'
    sheet['D1']='推荐语'
    sheet['E1']='链接'
    
    # the Top 250 list spans 10 pages of 25 movies each; 'start' selects the page offset
    for i in range(10):
        params={
            'start': str(i*25),
            'filter':''
        }
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
        }
        url='https://movie.douban.com/top250'
        res=requests.get(url,params=params,headers=headers)
        con=res.text
        soup=BeautifulSoup(con,'html.parser')
        maindiv=soup.find(class_="grid_view")   # container that holds the movie entries on this page
        for titles in maindiv.find_all('li'):
            try:
                num = titles.find('em',class_="").text                  # rank number
                title = titles.find('span', class_="title").text        # movie title
                tes = titles.find('span',class_="inq").text             # one-line recommendation
                comment = titles.find('span',class_="rating_num").text  # rating
                url_movie = titles.find('a')['href']                    # detail-page link
                print(num + '.' + title + '——' + comment + '\n' + '推荐语:' + tes + '\n' + url_movie)
                sheet.append([num,title,comment,tes,url_movie])
            except AttributeError:
                # some entries have no recommendation line; skip them
                continue
    xls.save('douban.xlsx')
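
    To spot-check the result (a minimal sketch, assuming the script above has already written douban.xlsx), the workbook can be read back with openpyxl and the first few data rows printed:

    import openpyxl

    # reopen the workbook saved above and show the first few data rows (row 1 is the header)
    wb=openpyxl.load_workbook('douban.xlsx')
    sheet=wb['movies']
    for row in sheet.iter_rows(min_row=2,max_row=6,values_only=True):
        num,title,rating,blurb,link=row
        print(num,title,rating,blurb,link)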

    CSV: the same idea applied to Zhihu's paginated article API. Each request fetches 10 articles via the offset and limit parameters, the title, excerpt, and link of each article are written as a CSV row, and the loop stops when paging.is_end is true.

    import requests
    from bs4 import BeautifulSoup
    import csv
    
    url="https://www.zhihu.com/api/v4/members/zhang-jia-wei/articles"
    headers={
        'referer': 'https://www.zhihu.com/people/zhang-jia-wei/posts/posts_by_votes?page=1',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    # newline='' avoids blank rows on Windows; utf-8 keeps the Chinese text intact
    csv_file=open('dazhangwei.csv','w',newline='',encoding='utf-8')
    writer=csv.writer(csv_file)
    header=['标题','简介','链接']   # title, excerpt, link
    writer.writerow(header)
    x=0   # page counter used for the offset parameter
    while True:
        params={
            'include': 'data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,thumbnail,can_comment,comment_permission,admin_closed_comment,content,voteup_count,created,updated,upvoted_followees,voting,review_info,is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics',
            'offset': str(x*10),   # 10 articles per page
            'limit': '10',
            'sort_by': 'voteups'
        }
        res=requests.get(url,headers=headers,params=params)
        res_json=res.json()
        con=res_json['data']   # list of article objects on this page
        for i in con:
            # keep the column order in sync with the header row: 标题, 简介, 链接
            lists=[i['title'],i['excerpt'],i['url']]
            writer.writerow(lists)
        if res_json['paging']['is_end']:
            # the API flags the last page; stop requesting more
            break
        x+=1
    csv_file.close()
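
    For a quick check of the CSV output (a minimal sketch, assuming dazhangwei.csv was produced by the script above), the file can be read back with csv.DictReader, which maps each row to the header names:

    import csv

    # reopen the CSV written above, count the rows and print a few titles with their links
    with open('dazhangwei.csv',newline='',encoding='utf-8') as f:
        reader=csv.DictReader(f)    # keys come from the header row: 标题, 简介, 链接
        rows=list(reader)
    print(len(rows),'articles saved')
    for row in rows[:5]:
        print(row['标题'],row['链接'])
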
  • Original article: https://www.cnblogs.com/houdj/p/12034612.html