  • spider

    # Scrape the Douban book Top 250 (https://book.douban.com/top250) with requests + lxml and save the results to a CSV file

    from lxml import etree
    import requests
    import csv
    
    fp = open('./douban.csv','w+',encoding='utf-8',newline='')
    writer = csv.writer(fp)
    writer.writerow(('name','url','author','publisher','date','price','rate','comment'))  # write the header row
    
    urls = ['https://book.douban.com/top250?start={}'.format(num) for num in range(0,250,25)]
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'}
    
    for url in urls:
        page = requests.get(url, headers=headers).text
        tree = etree.HTML(page)
        infos = tree.xpath('//tr[@class="item"]')
        for info in infos:
            name = info.xpath('td/div/a/@title')[0]
            book_url = info.xpath('td/div/a/@href')[0]
            # publication info has the form "author / publisher / date / price"
            book_infos = info.xpath('td/p/text()')[0]
            author = book_infos.split('/')[0].strip()
            pub = book_infos.split('/')[-3].strip()
            date = book_infos.split('/')[-2].strip()
            price = book_infos.split('/')[-1].strip()
            rate = info.xpath('td/div/span[2]/text()')[0]
            comments = info.xpath('td/p/span/text()')
            comment = comments[0] if len(comments) != 0 else ''
            writer.writerow((name,book_url,author,pub,date,price,rate,comment))
    fp.close()
    The Douban book Top 250 list, saved to a csv file (douban.csv)
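
    To sanity-check the result, the file can be read back with the same csv module; a minimal sketch, assuming the script above has already run and douban.csv sits in the current directory:

    import csv

    with open('./douban.csv', encoding='utf-8', newline='') as fp:
        reader = csv.reader(fp)
        header = next(reader)              # ['name', 'url', 'author', 'publisher', 'date', 'price', 'rate', 'comment']
        rows = list(reader)

    print(len(rows))                       # should be 250 if every page parsed cleanly
    print(dict(zip(header, rows[0])))      # the first book as a field -> value mapping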
  • Original post: https://www.cnblogs.com/zhangchen-sx/p/11295690.html