zoukankan      html  css  js  c++  java
  • Python 实现的下载op海贼王网的图片(网络爬虫)

    闲来无事,写了个爬虫来抓取我喜欢的海贼王网站上的图片。

    运行前需要先在 D 盘下建立一个 imgcache 目录(图片会保存到 d:/imgcache/ 下)。

    # -*- coding: utf-8 -*-
    
    import urllib
    import urllib2
    
    import json
    from bs4 import BeautifulSoup
    import threadpool 
    import thread
    class htmlpaser:
        """Fetch pages, parse article lists/articles and download their images.

        This is Python 2 code (it relies on ``urllib``/``urllib2``).  The
        parsing helpers expect pages that contain a ``<ul class="list">`` of
        article links and article pages with ``<div class="showbox">`` and
        ``<div class="msg">`` / ``<div class="m_left">`` nodes.
        """

        def __init__(self, url='http://1.hzfans.sinaapp.com/process.php',
                     cache_dir=r"d:/imgcache/"):
            """Create a crawler helper.

            :param url: endpoint that :meth:`Post` sends form data to.
            :param cache_dir: directory prefix (with trailing separator) that
                :meth:`DownImg` saves images under.
            """
            self.url = url
            self.cache_dir = cache_dir

        def Post(self, postdata):
            """POST *postdata* (a mapping or sequence of pairs) to ``self.url``.

            The response body is printed, as before, and also returned so that
            callers can use it.
            """
            data = urllib.urlencode(postdata)
            # Use the configured endpoint; the bare name ``url`` is not defined
            # in this method.
            req = urllib2.Request(self.url, data)
            resp = urllib2.urlopen(req)
            try:
                html = resp.read()
            finally:
                resp.close()
            print(html)
            return html

        def GetHtml(self, url):
            """Return the raw body of *url*, fetched with a browser User-Agent.

            Uses a 5 second timeout.  The body is returned undecoded.
            """
            headers = {
                'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
            }
            req = urllib2.Request(url, None, headers)
            resp = urllib2.urlopen(req, None, 5)
            try:
                return resp.read()
            finally:
                resp.close()

        def GetHtml2(self, url):
            """Return the raw body of *url* using plain ``urllib.urlopen``."""
            page = urllib.urlopen(url)
            try:
                return page.read()
            finally:
                # Close the connection even when read() fails.
                page.close()

        def GetHtml3(self, url):
            """Return the body of *url*, requesting (and undoing) gzip encoding.

            Sends a fuller set of browser-like headers with a 5 second timeout.
            """
            import zlib

            # No 'Referer' entry: None is not a valid header value (Python 2
            # would send it as the text "None").  If the site still refuses
            # the request, add 'Referer' set to the crawled site's host.
            req_header = {
                'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                'Accept':'text/html;q=0.9,*/*;q=0.8',
                'Accept-Charset':'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Accept-Encoding':'gzip',
                'Connection':'close',
            }
            req_timeout = 5
            req = urllib2.Request(url, None, req_header)
            resp = urllib2.urlopen(req, None, req_timeout)
            try:
                html = resp.read()
                encoding = resp.info().get('Content-Encoding', '') or ''
            finally:
                resp.close()
            # We asked for gzip, so the server may answer with a compressed
            # body; 16 + MAX_WBITS tells zlib to expect a gzip header.
            if 'gzip' in encoding.lower():
                html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
            return html

        def GetList(self, html):
            """Return the ``<a>`` tags inside ``<ul class="list">`` of *html*.

            Returns an empty list when the page has no such ``<ul>``.
            """
            soup = BeautifulSoup(''.join(html))
            baseitem = soup.find('ul', {'class': 'list'})
            if baseitem is None:
                return []
            return baseitem.select('li a')

        def DownImg(self, imgurl):
            """Download *imgurl* into the cache directory.

            The file is named after the last path segment of the URL.  Returns
            the ``urllib.urlretrieve`` result, or ``None`` when no file name
            can be derived from *imgurl* (``None``, empty, or ending in "/").
            """
            filename = self.gGetFileName(imgurl)
            if not filename:
                return None
            path = self.cache_dir + filename
            return urllib.urlretrieve(imgurl, path)

        def gGetFileName(self, url):
            """Return the text after the last "/" in *url*.

            ``None`` and the empty string are returned unchanged.
            """
            if url is None:
                return None
            if url == "":
                return ""
            return url.split("/")[-1]

        @staticmethod
        def mkdir(path):
            """Create directory *path* (including parents) if it is missing.

            Surrounding whitespace and trailing backslashes are stripped first.
            Returns ``True`` when the directory was created and ``False`` when
            the path already existed.
            """
            import os

            path = path.strip()
            path = path.rstrip("\\")
            if os.path.exists(path):
                return False
            os.makedirs(path)
            return True

        def ParseContent(self, html):
            """Parse an article page and download the images it contains.

            Returns a ``(title, content)`` pair: *title* is the text of the
            ``m_left`` div inside the ``msg`` div, *content* is the UTF-8
            encoded text of the ``showbox`` div, cut off where the
            "热点推荐" block starts.
            """
            soup = BeautifulSoup(''.join(html))
            baseitem = soup.find('div', {'class': 'showbox'})
            title = soup.find('div', {'class': 'msg'}).find('div', {'class': 'm_left'}).get_text()
            for img in baseitem.find_all('img'):
                imgurl = img.get('src')
                # <img> tags without a src have nothing to download.
                if imgurl:
                    self.DownImg(imgurl)
            text = baseitem.get_text()
            # Search the unicode text before encoding it.  When the marker is
            # absent find() returns -1, and slicing with that would silently
            # drop the last character, so only cut when it was found.
            position = text.find(u'热点推荐')
            if position != -1:
                text = text[:position]
            return title, text.encode('utf8')

        def ParseItem(self, item):
            """Fetch and parse the article that the link *item* points to.

            Returns the article title, or ``None`` when *item* has no ``href``.
            """
            url = item.get('href')
            if url is None:
                return None
            # Go through ``self`` rather than a module-level instance so the
            # method works on any instance.
            html = self.GetHtml2(url)
            title, _content = self.ParseContent(html)
            return title
    def print_result(request, result):
        """Thread-pool callback: print "<request id>:<result>" for a finished job.

        *result* is whatever the worker returned; it may be ``None`` (a worker
        that returns nothing), so it is formatted with ``%s`` rather than
        concatenated, which would raise ``TypeError`` for non-string values.
        """
        print("%s:%s" % (request.requestID, result))
            
            
    # Crawl list pages 1..39, queueing one thread-pool job per article link.
    obj=htmlpaser()

    # Ten worker threads download and parse articles concurrently.
    pool = threadpool.ThreadPool(10)
    for i in range(1,40):
            url="http://op.52pk.com/shtml/op_wz/list_2594_%d.shtml"%(i)
            html=obj.GetHtml2(url)
            items=obj.GetList(html)
            print('add job %d\n' % (i))
            requests = threadpool.makeRequests(obj.ParseItem, items, print_result)
            # Plain loop: putRequest is called only for its side effect.
            for req in requests:
                    pool.putRequest(req)
    # Block until every queued job has finished.
    pool.wait()
    


  • 相关阅读:
    python从zk获取连接并测试dubbo接口
    利用python脚本和telnet调试dubbo接口
    python制造有序中文json串的方法
    unittest用pycharm执行报错
    安装jenkins时无法解析主机:www.jenkins.io
    mui 左右滑动效果
    mui 日期控件的用法
    sql 不同where下的统计
    使用EF关于分页查询时遇到的一点疑问
    css按钮定位在div底部
  • 原文地址:https://www.cnblogs.com/bhlsheji/p/5152931.html
Copyright © 2011-2022 走看看