zoukankan      html  css  js  c++  java
  • 自己写一个爬虫 copider

    copider 模仿scrapy的一些写法,当然我这个是单进程的,不是异步的

    1.目录 copider/copider.py

    #coding=utf-8
    
    '''
    Created on 2015年10月8日
    
    @author: snt1
    '''
    
    import urllib2
    import lxml.html
    import StringIO
    
    
    
    class Spider(object):
        def __init__(self, url, meta=None):
            self.URL = url
            self.META = meta
            self.TEXTMARK = self.get(url)
            self.SEL = self.selector(doc=self.TEXTMARK)
            
            
        def get(self, url):
            try:
                req = urllib2.Request(url)
                req.add_header('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.155 Safari/537.36')
                shtml = urllib2.urlopen(req, timeout=15).read()
            except Exception, e:
                print e,"...next.."
                
            data = StringIO.StringIO(shtml)
            HTML = data.read()
            return(HTML)
        
        # 返回html
        @property
        def html(self):
            return self.TEXTMARK
        
        @property
        def url(self):
            return self.URL
        
        @property
        def meta(self):
            return self.META
        
        def selector(self, doc=None):
            if doc:
                HTML = doc
            else:
                HTML = self.HTML
            return lxml.html.fromstring(HTML)
        
        def xpath(self, rule):
            iter_list = self.SEL.xpath(rule)
            attrList = []
            try:
                for ele in iter_list:
                    attrList.append(ele.attrib)
                    #attrList.append(ele.attrib)
                return attrList
            except Exception, e:
                return iter_list
            
            
    def Request(url, func, **meta):
        """Mimic scrapy.Request synchronously: fetch *url* with Spider and
        invoke the callback *func* with the resulting response right away
        (single process, no scheduler)."""
        payload = meta['meta'] if meta else None
        func(Spider(url, payload))
    
     

    2.copider/aero.py

    #coding=utf-8
    
    '''
    Created on 2015年10月8日
    
    @author: snt1
    '''
    
    
    import re
    import time
    from copider import Spider, Request
    
    
    
    
    
    class AeroCopider(object):
        """Crawl the site's category tree and print every product link."""

        name = "aero"
        storeId = "554b14c97b010cc731e81b35" # 站点ID
        allowed_domains = ["www.xxxx.com"]

        root_url = 'http://www.xxxx.com'
        category_url = root_url + '/category/index.jsp?numResultsPerPage=100&categoryId=%s'
        cap_category_url = root_url + '/family/index.jsp?categoryId=%s&page=%d&numResultsPerPage=100'
        # Top-level category id -> human-readable label.
        url_dicts = {'3534623':'Girls', '3534624':'Guys'}

        def __init__(self):
            # The crawl starts immediately on construction.
            self.start_urls()

        def start_urls(self):
            """Walk each top category, extract child-category links and queue
            a parse_page request for every one found."""
            # Patterns are loop-invariant, so compile them once.
            # BUGFIX: '.' and '?' must be escaped -- the original pattern's bare
            # '?' made the preceding 'p' optional instead of matching '?'.
            parttern = re.compile(r'family\.jsp\?categoryId=')
            # BUGFIX: flags belong in compile(); the original passed
            # re.S|re.I|re.M as sub()'s positional *count* argument.
            pattern_sub = re.compile(r'&cp=.*?$', re.S | re.I | re.M)
            # BUGFIX: r'(d+)' matched the literal letter 'd'; \d+ captures digits.
            pattern_fin = re.compile(r'family\.jsp\?categoryId=(\d+)')

            for fid in self.url_dicts.keys():
                url = self.category_url %fid
                response = Spider(url)
                node_a = response.xpath('//*[@id="sidebar-left"]/div/dl[2]/dd//dt/a/@href')
                node_text = response.xpath('//*[@id="sidebar-left"]/div/dl[2]/dd//dt/a/text()')

                url_list, cid_list  = [],[]
                for num, preparing in enumerate(node_a):
                    if parttern.search(preparing):
                        chd_url = self.root_url+preparing
                        chd_url = pattern_sub.sub('', chd_url)   # strip trailing '&cp=...' page marker
                        cateid = pattern_fin.findall(chd_url)[0]
                        url_list.append(chd_url)
                        cid_list.append(cateid)
                        print(u'产品分类链接:%s -> %s' %(node_text[num], chd_url))
                        # BUGFIX: the original used cid_list[num], but num indexes
                        # node_a while cid_list only grows on matches -> IndexError
                        # as soon as one link fails the filter.
                        Request(chd_url, self.parse_page, meta={'cateid':cateid})
                        print

        def parse_page(self, response):
            """Read the total item count, derive the page count (100 items per
            page) and queue a parse_product request for every page."""
            total_items = int(response.xpath('//*[@id="main-wrap"]//li[@class="count"]/span/text()')[0])
            # BUGFIX: ceil(total_items / 100), minimum 1 page.  The original
            # only rounded up when the quotient exceeded 1, so e.g. 150 items
            # produced 1 page instead of 2.
            total_page, rem = divmod(total_items, 100)
            if rem > 0 or total_page == 0:
                total_page += 1

            print(u'产品总分页数: %s -> %s' %(total_page,response.url))

            cateid = response.meta['cateid']
            for page in range(1, total_page+1):
                url = self.cap_category_url %(cateid, page)
                Request(url, self.parse_product)

        def parse_product(self, response):
            """Print the product links found on one listing page."""
            product = response.xpath('//*[@id="products"]//h4/a/@href')
            print(u'以下来自哪个页面:%s' %response.url)
            print(u'产品:%s个 -> 路径:%s' %(len(product), product))
            
    
        
                    
    # Running the module directly starts the crawl (all work happens in __init__).
    if __name__ == '__main__':
        AeroCopider()
    
        
  • 相关阅读:
    测试Remoting服务端和客户端的双向通信
    对T4模板研究针对SQL SERVER的EF代码生成
    菜鸟级WinForm分页控件
    [小技术应用]框架下动态调用用户控件的模态弹出窗体
    根据数据库表动态添加菜单及打开窗体
    Windows Mobile 5.0下蓝牙移动打印测试
    Window Mobile/CE的PC端安装测试
    基于Dev控件,在WinForm下动态配置界面
    TortoiseSVN使用简介
    SQL的一些经典算法
  • 原文地址:https://www.cnblogs.com/caoguo/p/4915570.html
Copyright © 2011-2022 走看看