zoukankan      html  css  js  c++  java
  • Python简易爬虫

    #  coding: utf-8
    import urllib
    import urllib2
    import re
    import os
    
    if __name__ == '__main__':
        # Flat-script crawler: fetch listing pages 1..34, pull the text of every
        # <div class="content"><span>...</span></div> block out of each page and
        # write each one to qiubai/<n>.txt, numbering files across all pages.
        print("抓取开始...")

        # These values are the same for every page, so build them once instead
        # of on each loop iteration.
        header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36'}
        pattern = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)
        path = "qiubai"
        if not os.path.exists(path):
            os.makedirs(path)

        j = 1  # running file number, shared by all pages
        for i in range(1, 35):
            url = 'http://www.qiushibaike.com/8hr/page/' + str(i) + '/?s=4981088'
            try:
                response = urllib2.urlopen(urllib2.Request(url=url, headers=header))
                try:
                    content = response.read()
                finally:
                    # Release the connection even if read() fails.
                    response.close()
            except urllib2.URLError as e:
                # HTTPError is a subclass of URLError, so one handler covers both.
                print(e)
                # exit() is an interactive helper installed by the site module;
                # raise SystemExit directly and report the failure with a
                # non-zero status.
                raise SystemExit(1)

            for item in pattern.findall(content):
                file_path = path + "/" + str(j) + '.txt'
                # 'with' closes the file even if write() raises.
                with open(file_path, 'w') as f:
                    # Turn HTML line breaks into real newlines.
                    f.write(item.replace('<br/>', '\n'))
                j = j + 1
        print("内容抓取完成...")

    重构后

    #  coding: utf-8
    import urllib
    import urllib2
    import re
    import os
    
    class Spider(object):
        """Crawler that downloads listing pages and saves each extracted post.

        Pages are fetched from ``self.url`` (a template with one ``%s`` slot for
        the page number), the text inside every
        ``<div class="content"><span>...</span></div>`` block is extracted, and
        each extracted text is written to its own ``.txt`` file.
        """

        # Compiled once for the class instead of on every analysis() call.
        # re.S lets '.' match newlines so a post may span several lines.
        CONTENT_PATTERN = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)

        def __init__(self):
            """Set the URL template and the User-Agent sent with each request."""
            self.url = 'http://www.qiushibaike.com/8hr/page/%s/?s=4981088'
            self.user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36'

        def get_page(self, page_index):
            """Fetch the page with number ``page_index`` and return its raw body.

            On any ``urllib2.URLError`` (which includes ``HTTPError``) the error
            is printed and ``SystemExit`` is raised with status 1.
            """
            header = {'User-Agent': self.user_agent}
            request = urllib2.Request(url=self.url % str(page_index), headers=header)
            try:
                response = urllib2.urlopen(request)
                try:
                    return response.read()
                finally:
                    # Release the connection even if read() fails.
                    response.close()
            except urllib2.URLError as e:
                # HTTPError is a subclass of URLError, so one handler covers both.
                print(e)
                # exit() is an interactive helper installed by the site module;
                # raise SystemExit directly with a failure status instead.
                raise SystemExit(1)

        def analysis(self, content):
            """Return the list of post texts found in the page source ``content``."""
            return self.CONTENT_PATTERN.findall(content)

        def save(self, items, path, page_index):
            """Write each entry of ``items`` to its own file under ``path``.

            Files are named ``<page><item>.txt`` where both numbers are
            zero-padded to at least two digits (page 3, item 1 -> ``0301.txt``).
            ``path`` is created if it does not exist. ``<br/>`` tags in an item
            are replaced by newlines before writing.
            """
            # Honour the caller-supplied directory (it used to be overwritten
            # with a hard-coded "qiubai").
            if not os.path.exists(path):
                os.makedirs(path)
            page_tag = str(page_index).zfill(2)
            for j, item in enumerate(items, 1):
                file_path = os.path.join(path, page_tag + str(j).zfill(2) + '.txt')
                # 'with' closes the file even if write() raises.
                with open(file_path, 'w') as f:
                    f.write(item.replace('<br/>', '\n'))

        def run(self, first_page=1, last_page=34, path='qiubai'):
            """Crawl pages ``first_page``..``last_page`` (inclusive) into ``path``.

            The defaults reproduce the original fixed crawl: pages 1 to 34
            saved under ``qiubai``.
            """
            print('开始抓取内容了...')
            for i in range(first_page, last_page + 1):
                content = self.get_page(i)
                items = self.analysis(content)
                self.save(items, path, i)
            print('内容抓取完了...')
    
    
    if __name__ == '__main__':
        # Script entry point: build a crawler and run the whole crawl.
        Spider().run()
  • 相关阅读:
    python计算机视觉项目实践
    Codeforces Round #256 (Div. 2) B (448B) Suffix Structures
    SonarLint插件的安装与使用
    后缀表达式求值
    有用代码段2
    提高Java代码质量的Eclipse插件之Checkstyle的使用具体解释
    Intellij Idea搭建Spark开发环境
    代码备忘, TODO宏实现
    浏览器自己主动填表安全漏洞:查看浏览器保存的password
    PDO 查询mysql返回字段整型变为String型解决方法
  • 原文地址:https://www.cnblogs.com/beast-king/p/6832539.html
Copyright © 2011-2022 走看看