zoukankan      html  css  js  c++  java
  • python爬虫1——获取网站源代码(豆瓣图书top250信息)

    # -*- coding: utf-8 -*-
    import requests
    import re
    import sys
    # Python 2-only workaround: `reload` is a builtin only on Python 2, and
    # `sys.setdefaultencoding` is removed from `sys` at interpreter start-up, so
    # the module has to be reloaded to get it back. Switching the default
    # encoding to UTF-8 makes implicit str/unicode conversions use UTF-8 --
    # presumably needed when the scraped text is formatted into the byte-string
    # templates before being written to the output file (TODO confirm).
    # NOTE(review): neither line works on Python 3.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    class Spider(object):
        """Download book-list pages and append the extracted book data to a file.

        Each page is fetched over HTTP, split into one HTML fragment per book
        (every ``<table width="100%">`` element), reduced to a small dict of
        fields, and appended to ``bookList.txt`` in the working directory.
        """

        # Seconds to wait for the server before giving up; without a timeout
        # ``requests.get`` can block indefinitely on a stalled connection.
        _REQUEST_TIMEOUT = 10

        def __init__(self):
            print('开始爬取豆瓣图书top250的内容。。。。。。')

        def getSourceCode(self, url):
            """Fetch ``url`` and return the response body decoded as text.

            Raises whatever ``requests`` raises on connection failure or when
            the timeout expires.
            """
            response = requests.get(url, timeout=self._REQUEST_TIMEOUT)
            return response.text

        def getEveryBookContent(self, sourceCode):
            """Return the inner HTML of every ``<table width="100%">`` element.

            Each returned string is the content block of one book (title,
            author/publisher line, rating, number of raters). An input without
            any such table yields an empty list.
            """
            return re.findall(r'<table width="100%">(.*?)</table>', sourceCode, re.S)

        @staticmethod
        def _firstGroup(pattern, text):
            """Return group 1 of the first match of ``pattern``, or '' if none.

            ``re.S`` is applied so ``.`` also spans line breaks inside a block.
            """
            match = re.search(pattern, text, re.S)
            return match.group(1) if match else ''

        def getBookInfo(self, eachBookContent):
            """Extract the fields of one book from its content block.

            Returns a dict with the keys ``title``, ``author``, ``discussNum``
            and ``score``. A field whose markup is absent from the block is
            returned as an empty string instead of aborting the whole run.
            """
            rawTitle = self._firstGroup(r'<a href=.*?>(.*?)</a>', eachBookContent)
            rawDiscussNum = self._firstGroup(
                r'<span class="pl">\((.*?)\)</span>', eachBookContent)

            bookInfo = {}
            # Titles are spread over several lines and may wrap a subtitle in
            # <span> tags; strip the layout whitespace and that markup.
            bookInfo['title'] = re.sub(r'( |\n|<br/>|</?span.*?>)', "", rawTitle)
            bookInfo['author'] = self._firstGroup(
                r'<p class="pl">(.*?)</p>', eachBookContent)
            # The count sits inside parentheses padded with blanks and newlines.
            bookInfo['discussNum'] = re.sub(r'( |\n|<br/>)', "", rawDiscussNum)
            bookInfo['score'] = self._firstGroup(
                r'<span class="rating_nums">(.*?)</span>', eachBookContent)
            return bookInfo

        def saveBookInfo(self, bookList):
            """Append every book in ``bookList`` to ``bookList.txt``.

            The file is opened in append mode, so repeated calls (one per page)
            accumulate. The context manager closes it even if a write fails.
            """
            with open("bookList.txt", "a") as f:
                for each in bookList:
                    f.write('书  名:\t {}\n'.format(each['title']))
                    f.write('作  者:\t {}\n'.format(each['author']))
                    f.write('评论数:\t {}\n'.format(each['discussNum']))
                    f.write('评  分:\t {}\n\n'.format(each['score']))

        def start(self, url):
            """Scrape the single page at ``url`` and append its books to the file."""
            sourceCode = self.getSourceCode(url)
            bookList = [
                self.getBookInfo(each)
                for each in self.getEveryBookContent(sourceCode)
            ]
            self.saveBookInfo(bookList)
    
    
    if __name__ == '__main__':
        # The list is paginated through the `start` query parameter in steps of
        # 25 entries, so offsets 0, 25, ..., 225 are requested (10 pages).
        douban = Spider()
        for offset in range(0, 250, 25):
            douban.start('http://book.douban.com/top250?start={}'.format(offset))
  • 相关阅读:
    C# 串口通信总结
    客户端下载文件和服务器端下载文件总结
    Android 上传图片到 Asp.Net 服务器的问题
    iOS内存泄漏自动检测工具PLeakSniffer
    让iOS开发变得更有效率-分类、工具类
    分分钟解决iOS开发中App启动广告的功能
    响应者链及相关机制总结
    stackoverflow上关于iOS的票数最多(最常见)的15个问题
    iOS 开发之 ReactiveCocoa(进阶)
    iOS 开发之 ReactiveCocoa(基础)
  • 原文地址:https://www.cnblogs.com/everSeeker/p/4977856.html
Copyright © 2011-2022 走看看