zoukankan      html  css  js  c++  java
  • 【Python爬虫基础】抓取知乎页面所有图片

    抓取指定地址页面中的所有图片

    #! /usr/bin/env python
    from urlparse import urlsplit
    from os.path import basename
    import urllib2
    import re
    import requests
    import os
    import json
    
    
    # Scrape all answer images from one Zhihu question.
    #
    # Flow: fetch the question page, read the total answer count from the
    # "h3 data-num" attribute, then page through the QuestionAnswerListV2
    # endpoint in page_size steps, extracting every <img> src containing
    # the "_b" (big-image) marker and saving it under ./images.
    url = 'https://www.zhihu.com/question/37787176'

    if not os.path.exists('images'):
        os.mkdir("images")

    print("start>>>>>>>")

    page_size = 50
    offset = 0
    url_content = urllib2.urlopen(url).read()
    # The total number of answers is embedded in the page markup.
    answers = re.findall('h3 data-num="(.*?)"', url_content)
    limits = int(answers[0])

    while offset < limits:
        post_url = "http://www.zhihu.com/node/QuestionAnswerListV2"
        params = json.dumps({
            'url_token': 37787176,
            'pagesize': page_size,
            'offset': offset
        })
        data = {
            '_xsrf': '',  # NOTE(review): empty CSRF token; the server may reject this — confirm
            'method': 'next',
            'params': params
        }
        header = {
            'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
            'Host': "www.zhihu.com",
            'Referer': url
        }
        response = requests.post(post_url, data=data, headers=header)
        answer_list = response.json()["msg"]
        img_urls = re.findall('img .*?src="(.*?_b.*?)"', ''.join(answer_list))
        for img_url in img_urls:
            try:
                img_data = urllib2.urlopen(img_url).read()
                file_name = basename(urlsplit(img_url)[2])
                print(file_name)
                # "with" guarantees the handle is closed even if write() fails
                # (the original leaked the handle on error).
                with open('images/' + file_name, 'wb') as output:
                    output.write(img_data)
            except Exception as e:
                # Narrowed from a bare "except: pass": keep the best-effort
                # behaviour (skip bad URLs) but surface the failure.
                print("error: " + str(e))
        offset += page_size

    print("end>>>>>>>")

    正则抓取网页title

    #!/usr/bin/python  
    # coding:utf-8   
    import httplib2  
    import urllib2  
    import re #正则表达式模块  
    
    class PageClass:
        """Thin wrapper around httplib2 for fetching a single web page."""

        def get_page(self, url, headers):
            """GET *url* with the given headers and return the body as unicode (UTF-8)."""
            client = httplib2.Http()
            _, body = client.request(url, 'GET', headers=headers)
            return body.decode('utf-8')
    
    def main():
        """Fetch the demo page and return its HTML text."""
        # Placeholder cookie: replace 'your cookie' with a real session cookie.
        headers = {"cookie": 'your cookie'}
        url = 'http://v.ktgj.com'
        fetcher = PageClass()
        return fetcher.get_page(url, headers)
    
    if __name__ == "__main__":
        htmltext = main()
        pattern = re.compile(r'<title>(.*?)</title>')
        match = pattern.match(htmltext)
        if match:
            print match.group()
        print htmltext

    下载网页图片

    #! /usr/bin/env python
    from urlparse import urlsplit
    from os.path import basename
    import urllib2
    import re
    import requests
    import os
    import json
    import datetime
    
    # Download every image referenced by <img src="..."> on one page,
    # saving each under ./images with a timestamped progress line.
    if not os.path.exists('images'):
        os.mkdir("images")

    print("start>>>>>>>>>>>>>>>>>>>>>>>")

    url = "http://www.ssff66.com/se/jingpintaotu/519271.html"
    response = requests.get(url)
    # Collect every <img ... src="..."> URL on the page.
    img_urls = re.findall('img .*?src="(.*?)"', response.text)

    for img_url in img_urls:
        try:
            img_data = urllib2.urlopen(img_url, timeout=5).read()
            file_name = basename(urlsplit(img_url)[2])
            if not file_name:
                # URL path ends in "/" — no sensible filename to save under;
                # the original would have crashed opening 'images/'.
                continue
            print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + "  " + file_name)
            # "with" closes the handle even when write() raises.
            with open('images/' + file_name, 'wb') as output:
                output.write(img_data)
        except Exception as e:
            # BUG FIX: e.message is deprecated and missing on many exception
            # types (accessing it can itself raise); str(e) is always safe.
            print("error : " + str(e))

    print("end>>>>>>>>>>>>>>>>>>>>>>>")
  • 相关阅读:
    太tmd恐怖了,一个搞破解的过程分析。
    JQuery爱好者们的福音:jQuery EasyUI 开源插件套装 完全替代ExtJS
    期待5月的灿烂阳光
    2010 2月记
    JQuery 的跨域方法 可跨任意网站
    准备写个ASP.NET MVC 2开发的系列文章
    Win7 访问网络共享文件夹显示空白目录的问题解决
    4月的长沙
    将ASP.NET MVC 2.0 部署在IIS6和IIS7上的教程
    谈谈年底感想
  • 原文地址:https://www.cnblogs.com/jhli/p/5915329.html
Copyright © 2011-2022 走看看