zoukankan      html  css  js  c++  java
  • python爬虫1——获取网站源代码(豆瓣图书top250信息)

    # -*- coding: utf-8 -*-
    import requests
    import re
    import sys
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    class Spider(object):
        """Scrape book entries from a Douban "top 250 books" listing page.

        The pipeline is: download a page (``getSourceCode``), cut it into one
        HTML fragment per book (``getEveryBookContent``), pull the fields out
        of each fragment (``getBookInfo``) and append them to ``bookList.txt``
        (``saveBookInfo``).  ``start`` runs the whole pipeline for one URL.
        """

        # Seconds to wait for the server before giving up on a request;
        # without a timeout ``requests.get`` can block forever.
        REQUEST_TIMEOUT = 10

        # Patterns are compiled once at class-creation time and written as raw
        # strings so that regex escapes such as ``\(`` are not interpreted (and
        # warned about) as Python string escapes.
        _BOOK_BLOCK_RE = re.compile(r'<table width="100%">(.*?)</table>', re.S)
        _TITLE_RE = re.compile(r'<a href=.*?>(.*?)</a>', re.S)
        _AUTHOR_RE = re.compile(r'<p class="pl">(.*?)</p>', re.S)
        _DISCUSS_RE = re.compile(r'<span class="pl">\((.*?)\)</span>', re.S)
        _SCORE_RE = re.compile(r'<span class="rating_nums">(.*?)</span>', re.S)
        # Layout noise stripped from extracted text: spaces, newlines, <br/>
        # and (for titles only) opening/closing <span> tags.
        _TITLE_NOISE_RE = re.compile(r'( |\n|<br/>|</?span.*?>)')
        _DISCUSS_NOISE_RE = re.compile(r'( |\n|<br/>)')

        def __init__(self):
            print('开始爬取豆瓣图书top250的内容。。。。。。')

        def getSourceCode(self, url):
            """Download ``url`` and return the response body as text.

            Raises whatever ``requests.get`` raises, including a timeout error
            when the server does not answer within ``REQUEST_TIMEOUT`` seconds.
            """
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            return response.text

        def getEveryBookContent(self, sourceCode):
            """Return the inner HTML of every ``<table width="100%">`` element.

            Each returned string is the fragment that ``getBookInfo`` parses:
            title, author/publisher line, rating and number of ratings.
            An empty list is returned when the page contains no such table.
            """
            return Spider._BOOK_BLOCK_RE.findall(sourceCode)

        @staticmethod
        def _firstGroup(pattern, text):
            """Return group 1 of the first ``pattern`` match in ``text``, or ''."""
            match = pattern.search(text)
            return match.group(1) if match else ''

        def getBookInfo(self, eachBookContent):
            """Extract one book's fields from its HTML fragment.

            Returns a dict with the keys ``title``, ``author``, ``discussNum``
            and ``score``.  A field whose markup is absent from the fragment is
            returned as an empty string instead of raising, so one incomplete
            entry does not abort the processing of a whole page.
            """
            first = Spider._firstGroup
            rawTitle = first(Spider._TITLE_RE, eachBookContent)
            rawDiscuss = first(Spider._DISCUSS_RE, eachBookContent)
            return {
                'title': Spider._TITLE_NOISE_RE.sub('', rawTitle),
                'author': first(Spider._AUTHOR_RE, eachBookContent),
                'discussNum': Spider._DISCUSS_NOISE_RE.sub('', rawDiscuss),
                'score': first(Spider._SCORE_RE, eachBookContent),
            }

        def saveBookInfo(self, bookList):
            """Append every book in ``bookList`` to ``bookList.txt``.

            ``bookList`` is an iterable of dicts as produced by ``getBookInfo``.
            The file is opened in append mode, so repeated calls accumulate.
            """
            # ``with`` guarantees the handle is closed even if formatting or
            # writing one of the entries raises.
            with open("bookList.txt", "a") as f:
                for each in bookList:
                    f.write('书  名:\t {}\n'.format(each['title']))
                    f.write('作  者:\t {}\n'.format(each['author']))
                    f.write('评论数:\t {}\n'.format(each['discussNum']))
                    f.write('评  分:\t {}\n\n'.format(each['score']))

        def start(self, url):
            """Fetch ``url``, parse every book on the page and save the results."""
            sourceCode = self.getSourceCode(url)
            bookList = [self.getBookInfo(each)
                        for each in self.getEveryBookContent(sourceCode)]
            self.saveBookInfo(bookList)
    
    
    if __name__ == '__main__':
        douban = Spider()
        # Request the listing with start=0, 25, ..., 225 (ten requests in all);
        # each call appends that page's books to the output file.
        for start in range(0, 250, 25):
            douban.start('http://book.douban.com/top250?start={}'.format(start))
  • 相关阅读:
    快速排序
    常见的正则表达式验证(更新中)
    中介者模式
    RadioButtonList控件如何取得选中的值
    职责链模式
    设计模式之GOF23建造者模式
    设计模式之GOF23工厂模式02
    设计模式GOF23之工厂模式01
    多线程测试时的辅助类--CountDownLatch
    设计模式GOF23之单例模式
  • 原文地址:https://www.cnblogs.com/everSeeker/p/4977856.html
Copyright © 2011-2022 走看看