zoukankan      html  css  js  c++  java
  • python爬虫学习(1)__抓取煎蛋图片

    #coding=utf-8
    # Demo crawler: download images from jandan.net's "ooxx" board into a local folder.
    import requests
    import threading
    import time
    import os
    from bs4 import BeautifulSoup
    # Forged browser request headers: desktop Chrome User-Agent plus a captured
    # session cookie so the site serves pages as it would to a real browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
        'Accept-Encoding': 'gzip',
        'Cookie': '1024679722=aada4mZxRMxqvInd7D6PSgq%2FIkpGFeGlZWAH1gqP8Q; __auc=57bffd35154a91de3cd5d3b1ddb; 1024679722=ebeaLZUFikSR1OE6lm5MJYJSV0V1DbcooxQr0CHu; jdna=596e6fb28c1bb47f949e65e1ae03f7f5#1467948344088; Hm_lvt_fd93b7fb546adcfbcf80c4fc2b54da2c=1467001661,1467189261,1467685014,1467857178; Hm_lpvt_fd93b7fb546adcfbcf80c4fc2b54da2c=1467948345; _ga=GA1.2.1739476572.1438849462; _gat=1'}
    def saveImgs(*allUrl):
        """Download every image URL in *allUrl* into /home/zhanyunwu/jiandanpic.

        Each URL is handed to saveImg(); the local file name is derived by
        parseName().  Sleeps one second between downloads to keep the request
        rate polite.

        Fixes vs. original: Python 2 print statements ported to the Python 3
        print() function; the racy exists()+mkdir() pair replaced by a single
        os.makedirs(..., exist_ok=True) call (also creates missing parents).
        """
        target_dir = '/home/zhanyunwu/jiandanpic'
        os.makedirs(target_dir, exist_ok=True)  # create local folder if absent
        print(allUrl)
        if allUrl:
            print('当前页面有', len(allUrl), '张图片即将下载')
            for url in allUrl:
                filename = target_dir + '/' + parseName(url)
                saveImg(url, filename)
                time.sleep(1)  # throttle: at most one download per second
        else:
            print('当前页面无图片下载')
    def saveImg(url, filename):
        """Fetch the image at *url* and write its raw bytes to *filename*.

        Uses the module-level forged *headers*; silently overwrites *filename*
        if it already exists.

        Fixes vs. original: Python 2 print statement ported to print();
        'reponse' typo corrected; dead commented-out urllib2 code removed.
        """
        print('当前图片url:', str(url), '当前图片名称', filename)
        response = requests.get(str(url), headers=headers)
        # .content is the undecoded body bytes, suitable for a binary file.
        image = response.content
        with open(filename, 'wb') as f:
            f.write(image)
    
    def parseName(url):
        """Derive a local file name for *url*.

        The base name is the fixed character span [30:55] of the URL (for the
        jandan.net image CDN URLs this lands on the unique part of the path)
        and the extension is the URL's final dot-separated segment.
        """
        text = str(url)
        extension = text.split('.')[-1]
        return text[30:55] + '.' + extension
    # Collect all image URLs from one listing page.
    def getAllImgUrl(url):
        """Return the ``src`` attribute of every comment-list image on *url*.

        Returns an empty list when the page request does not come back with
        HTTP 200.  Entries may be None for <img> tags without a src attribute
        (``attrs.get`` semantics preserved from the original).

        Fixes vs. original: Python 2 print statement ported to print();
        manual append loop replaced by a list comprehension.
        """
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            return []
        soup = BeautifulSoup(response.content, "lxml")
        links = soup.select('ol.commentlist img')
        print(links)  # debug: show the matched <img> tags
        return [link.attrs.get('src') for link in links]
    # Worker body for one crawler thread: handles a half-open page range.
    def crawler(n, m):
        """Fetch and save the images of jandan.net ooxx pages n .. m-1."""
        page = n
        while page < m:
            page_url = 'http://jandan.net/ooxx/page-' + str(page) + '#comments'
            saveImgs(*getAllImgUrl(page_url))
            page += 1
    # Fan the page range out over three worker threads and wait for all of
    # them to finish.
    # NOTE(review): pages 1900-1949 and page 2000 fall between the ranges and
    # are never fetched — looks unintended; confirm the intended page ranges.
    c1 = threading.Thread(target=crawler, args=(1850, 1900))
    c2 = threading.Thread(target=crawler, args=(1950, 2000))
    c3 = threading.Thread(target=crawler, args=(2001, 2064))
    for worker in (c1, c2, c3):
        worker.start()
    for worker in (c1, c2, c3):
        worker.join()
    print('success')
    

      

  • 相关阅读:
    微信小程序开发工具初始化代码仓库
    微信小程序开发如何初始化远程仓库并 PUSH
    Git 提示用户密码错误如何修改
    MariaDB 在创建外键的时候提示 1005 错误
    Fiddler实现移动端手机抓包
    Windows 安装 Go语言开发环境以及使用
    laravel队列,事件简单使用方法
    视图
    laravel Passport
    多台服务器 同时部署一套代码
  • 原文地址:https://www.cnblogs.com/yunwuzhan/p/5719786.html
Copyright © 2011-2022 走看看