zoukankan      html  css  js  c++  java
  • python例子-PyQuery抓取信息.

    #!/usr/bin/python
    #coding:utf-8
    
    from pyquery import PyQuery
    import re
    
    # 抓取:http://www.stylebop.com/cn/product_details.php?id=606526&special=sale
    # 获得   产品名 品牌 价格 size  图片(大图)
    def main():
        pqhtml = PyQuery(url = 'http://www.stylebop.com/cn/product_details.php?id=606526&special=sale')
        #产品图片:
        img_li = pqhtml('li').filter('.image_click_rotator')
        pattern_img = re.compile(".*?'(.*?jpg)'.*?'.*?'.*?'.*?'.*?'(.*?jpg)'.*?")
        img_list = []
        for li in img_li:
            #div = li.getchildren()[0]
            #a = div.getchildren()[0]
            href = li.getchildren()[0].getchildren()[0].get('href')
            items = re.findall(pattern_img,href)
            img_large = list(items[0])[1]
            if img_large[0:4] != 'http' :
                img_large = 'http://www.stylebop.com%s' %img_large
            img_list.append(img_large)
        print '产品图片:' , img_list
    
        #产品品牌:
        brand = pqhtml('div').filter('.productInfo')('a:first').text()
        print '品牌:%s' %brand
    
        #价格
        price_div = pqhtml('div').filter('#product_price')  #根据ID获取价格的div
        price_first_span = price_div('span:first') #获取第一个span
        old_price = ''
        new_price = ''
        if price_first_span.hasClass('old_price'):
            old_price = price_first_span.text
            new_price = price_div('span:eq(1)').text() + ' / ' + price_div('span:eq(3)').text()
        else:
            new_price = price_div.text() + ' / ' + price_div('span:first').text
        print '价格:' , new_price
        #print '价格:%s' % new_price #这样打印会报编码错误:'ascii' codec can't encode character u'u20ac' in position 21: ordinal not in range(128)
    
        #size
        size_option = pqhtml('select').filter('.newInput2')('option')
        size_list = []
        for size in size_option:        #为HTMLElement对象
            size_list.append(size.text)
        print 'size:', size_list
    
        #产品名:
        pname = pqhtml('div').filter('.productInfo')('span:first').text()
        print '产品名:%s' % pname
    
    # Run the scraper only when this file is executed directly as a script.
    if __name__ == '__main__':
        main()
  • 相关阅读:
    JAVA for(i = 0; i<a.length; i++) 解析
    3.2.2多维数组 3.3 排序
    3.2数组
    字符串和数组
    2.7.3与程序转移有关的跳转语句
    2.7.2 循环语句
    读书共享 Primer Plus C-part 4
    Linux 批量修改文件名
    关于/usr/local/lib/libz.a(zutil.o): relocation R_X86_64_32 against `.rodata.str1.1' can not be used when making a shared object; recompile with -fPIC解决办法
    做一个有深度的程序猿
  • 原文地址:https://www.cnblogs.com/xccnblogs/p/4894405.html
Copyright © 2011-2022 走看看