  • Downloading images from a One Piece (OP) fan site with Python (a web crawler)

    With some time to kill, I wrote a crawler to grab images from my favorite One Piece site.

    Before running it, create an imgcache directory on the D: drive (the script saves every image to d:/imgcache/).
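    If you would rather have the script create it, a minimal sketch (using the same d:/imgcache path the script assumes) is:

    import os
    if not os.path.exists(r"d:/imgcache"):
            os.makedirs(r"d:/imgcache")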

    # -*- coding: utf-8 -*-
    
    import urllib
    import urllib2
    
    from bs4 import BeautifulSoup
    import threadpool
    class htmlpaser:
            def __init__(self):
                    self.url='http://1.hzfans.sinaapp.com/process.php'
            #POST data to the API endpoint and return the response body
            def Post(self,postdata):
                    headers = {
                        'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
                    }
                    data = urllib.urlencode(postdata)
                    req = urllib2.Request(self.url,data,headers)
                    resp = urllib2.urlopen(req,None,20)
                    html = resp.read()
                    return html
            #fetch the HTML for a url, sending a browser User-Agent
            def GetHtml(self,url):
                    headers = {  
                        'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'  
                    }  
                    req = urllib2.Request(url,None,headers)
                    resp = urllib2.urlopen(req,None,5)
                    html = resp.read()
                    #return html.decode('utf8')
                    return html
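            #simplest fetch: no headers, no timeout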
            def GetHtml2(self,url): 
                    page = urllib.urlopen(url) 
                    html = page.read() 
                    page.close() 
                    return html
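            #fetch with fuller browser headers and an explicit timeout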
            def GetHtml3(self,url):
                    req_header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                    'Accept':'text/html;q=0.9,*/*;q=0.8',
                    'Accept-Charset':'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                    'Accept-Encoding':'identity', #advertising gzip here would hand resp.read() compressed bytes that this code never decompresses
                    'Connection':'close',
                    'Referer':None #note: if the site still refuses the request, set this to the target site's host
                    }
                    req_timeout = 5
                    req = urllib2.Request(url,None,req_header)
                    resp = urllib2.urlopen(req,None,req_timeout)
                    html = resp.read()
                    return html
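            #extract the article links (li a) from the index page's ul.list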
            def GetList(self,html):
                    soup = BeautifulSoup(''.join(html))
                    baseitem=soup.find('ul',{'class':'list'})
                    slist=baseitem.select('li a')
                    return slist
    
            #download one image into d:/imgcache, keeping its original filename
            def DownImg(self,imgurl):
                    path= r"d:/imgcache/"+self.gGetFileName(imgurl)
                    data = urllib.urlretrieve(imgurl,path)
                    return data
            def gGetFileName(self,url):
                    if url is None: return None
                    if url=="" : return ""
                    arr=url.split("/")
                    return arr[-1]
            def mkdir(self,path):
                    import os
                    path=path.strip()
                    path=path.rstrip("\\")
                    #check whether the path already exists
                    isExists=os.path.exists(path)
                    if not isExists:
                            #create the directory (and any missing parents)
                            os.makedirs(path)
                            return True
                    else:
                            #the directory already exists, so nothing to create
                            return False
            #returns two values: the article title and its text content
            def ParseContent(self,html):
                    soup = BeautifulSoup(''.join(html))
                    baseitem=soup.find('div',{'class':'showbox'})
                    title=soup.find('div',{'class':'msg'}).find('div',{'class':'m_left'}).get_text()
                    imglist=baseitem.find_all('img')
                    for img in imglist:
                            imgurl=img.get('src')
                            self.DownImg(imgurl)
                    content=baseitem.get_text().encode('utf8')
                    #cut the text off where the "热点推荐" (hot recommendations) block starts
                    position=content.find('热点推荐')
                    return title,content[0:position]
            
            def ParseItem(self,item):
                    url=item.get('href')
                    if url is None:
                            return
                    #print url+'\n'
                    html=self.GetHtml2(url)
                    title,content=self.ParseContent(html)
                    #print title+'\n'
                    return title
    def print_result(request, result):
            #callback for the thread pool; result may be None when an item had no href
            print str(request.requestID)+":"+str(result)
            
            
    obj=htmlpaser()
    
    pool = threadpool.ThreadPool(10) 
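    #each index page yields a batch of article links; makeRequests wraps each one
    #in a job, and print_result is called back with the returned title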
    for i in range(1,40):
            url="http://op.52pk.com/shtml/op_wz/list_2594_%d.shtml"%(i)
            html=obj.GetHtml2(url)
            items=obj.GetList(html)
            print 'add job %d\n' % (i)
            requests = threadpool.makeRequests(obj.ParseItem, items, print_result)
            [pool.putRequest(req) for req in requests] 
    pool.wait()
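
    The script above targets Python 2: urllib2, print statements, and the third-party threadpool package. On Python 3, urllib2 lives in urllib.request and the standard library's concurrent.futures replaces threadpool, so a minimal sketch of the same fetch-and-parse loop under those assumptions (the helper names here are my own, and the page structure is assumed unchanged) could look like:

    import urllib.request
    from concurrent.futures import ThreadPoolExecutor
    from bs4 import BeautifulSoup

    HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'}

    def get_html(url, timeout=5):
            #same fetch-with-headers idea as GetHtml, on the Python 3 stdlib
            req = urllib.request.Request(url, headers=HEADERS)
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                    return resp.read()

    def parse_item(item):
            #mirrors ParseItem: follow each article link and pull out its title
            url = item.get('href')
            if url is None:
                    return None
            soup = BeautifulSoup(get_html(url), 'html.parser')
            msg = soup.find('div', {'class': 'msg'})
            left = msg.find('div', {'class': 'm_left'}) if msg else None
            return left.get_text() if left else None

    with ThreadPoolExecutor(max_workers=10) as pool:
            for i in range(1, 40):
                    html = get_html('http://op.52pk.com/shtml/op_wz/list_2594_%d.shtml' % i)
                    items = BeautifulSoup(html, 'html.parser').find('ul', {'class': 'list'}).select('li a')
                    for title in pool.map(parse_item, items):
                            print(title)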
    


  • Original article: https://www.cnblogs.com/bhlsheji/p/5152931.html