zoukankan      html  css  js  c++  java
  • 【Python爬虫基础】抓取知乎页面所有图片

    抓取地址所有图片

    #! /usr/bin/env python
    from urlparse import urlsplit
    from os.path import basename
    import urllib2
    import re
    import requests
    import os
    import json
    
    
    # Download every image matching the "_b" URL pattern from the answers of one
    # Zhihu question into ./images. Answers are fetched page by page from the
    # QuestionAnswerListV2 endpoint.
    question_id = 37787176
    url = 'https://www.zhihu.com/question/%d' % question_id

    image_dir = 'images'
    if not os.path.exists(image_dir):
        os.mkdir(image_dir)

    print("start>>>>>>>")

    page_size = 50
    offset = 0
    download_timeout = 10  # seconds; keeps one stalled image from hanging the run

    # The first h3 data-num value found on the question page is used as the
    # number of answers to page through.
    url_content = urllib2.urlopen(url).read()
    answers = re.findall(r'h3 data-num="(.*?)"', url_content)
    # A missing or non-numeric count means there is nothing to page through;
    # fall back to 0 instead of crashing with IndexError/ValueError.
    limits = int(answers[0]) if answers and answers[0].isdigit() else 0
    if not limits:
        print("no answer count found on " + url)

    # Everything below is identical for every page, so build it once.
    post_url = "http://www.zhihu.com/node/QuestionAnswerListV2"
    header = {
        'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
        'Host': "www.zhihu.com",
        'Referer': url
    }
    img_pattern = re.compile(r'img .*?src="(.*?_b.*?)"')

    while offset < limits:
        params = json.dumps({
            'url_token': question_id,
            'pagesize': page_size,
            'offset': offset
        })
        data = {
            '_xsrf': '',
            'method': 'next',
            'params': params
        }
        response = requests.post(post_url, data=data, headers=header)
        # "msg" holds a list of HTML fragments; join them so one regex pass
        # finds every image URL on this page of answers.
        answer_list = response.json()["msg"]
        img_urls = img_pattern.findall(''.join(answer_list))
        for img_url in img_urls:
            try:
                img_data = urllib2.urlopen(img_url, timeout=download_timeout).read()
                file_name = basename(urlsplit(img_url)[2])
                print(file_name)
                # "with" closes the file even if the write fails
                with open(os.path.join(image_dir, file_name), 'wb') as output:
                    output.write(img_data)
            except Exception as e:
                # Best effort: one bad image must not stop the crawl, but report
                # it instead of hiding it (and let Ctrl-C through).
                print("skip %s: %s" % (img_url, e))
        offset += page_size

    print("end>>>>>>>")

    正则抓取网页title

    #!/usr/bin/python  
    # coding:utf-8   
    import httplib2  
    import urllib2  
    import re #正则表达式模块  
    
    class PageClass:
        """Minimal page fetcher built on httplib2."""

        def get_page(self, url, headers):
            """Fetch *url* with a GET request and return the body decoded as UTF-8.

            ``headers`` is passed to the request unchanged.
            """
            client = httplib2.Http()
            # request() yields a (response, body) pair; only the body is used
            _, body = client.request(url, 'GET', headers=headers)
            return body.decode('utf-8')
    
    def main():
        """Fetch the hard-coded page with a cookie header and return its text."""
        target = 'http://v.ktgj.com'
        request_headers = {"cookie": 'your cookie'}
        return PageClass().get_page(target, request_headers)
    
    def extract_title(html):
        """Return the text of the first <title> element in *html*, or None.

        Uses ``re.search`` rather than ``re.match``: ``match`` only succeeds at
        the very start of the string, so it never finds a <title> that follows
        a doctype or <html>/<head> tags. Matching is case-insensitive and lets
        the title span several lines; surrounding whitespace is stripped.
        """
        found = re.search(r'<title\b[^>]*>(.*?)</title>', html, re.I | re.S)
        return found.group(1).strip() if found else None


    if __name__ == "__main__":
        htmltext = main()
        title = extract_title(htmltext)
        if title is not None:
            print(title)
        # Dump the whole page as well, as the original script did.
        print(htmltext)

    下载网页图片

    #! /usr/bin/env python
    from urlparse import urlsplit
    from os.path import basename
    import urllib2
    import re
    import requests
    import os
    import json
    import datetime
    
    # Download every <img> found on one page into ./images, logging a timestamp
    # and file name per image.
    from urlparse import urljoin  # needed to resolve relative <img src> values

    image_dir = 'images'
    if not os.path.exists(image_dir):
        os.mkdir(image_dir)

    print("start>>>>>>>>>>>>>>>>>>>>>>>")

    url = "http://www.ssff66.com/se/jingpintaotu/519271.html"
    response = requests.get(url)
    img_urls = re.findall(r'img .*?src="(.*?)"', response.text)

    for img_src in img_urls:
        # src may be relative or protocol-relative; urlopen() only accepts
        # absolute URLs, so resolve against the page URL first (absolute URLs
        # pass through unchanged).
        img_url = urljoin(url, img_src)
        try:
            img_data = urllib2.urlopen(img_url, timeout=5).read()
            file_name = basename(urlsplit(img_url)[2])
            print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + "  " + file_name)
            # "with" closes the file even if the write fails
            with open(os.path.join(image_dir, file_name), 'wb') as output:
                output.write(img_data)
        except Exception as e:
            # Best effort: report and move on to the next image. Format the
            # exception itself; e.message is deprecated and often empty.
            print("error : %s" % e)

    print("end>>>>>>>>>>>>>>>>>>>>>>>")
  • 相关阅读:
    window.onload方法 和jquery中的$(document).ready()方法区别
    jQuery ligerGrid 打造通用的分页排序查询表格(提供下载)
    ASP.NET MVC 4 移动特性
    javaScript RegExp常用正则表达式
    jQuery解密之执行过程分析
    1.1 jQuery总体框架
    jqMobi指南系列教程是《 jqMobi Javascript Frameworks Cheat Sheet》 的中文版
    网页设计前端页面制作的规范要求和注意事项
    js控制position:fixed 浏览器定位 兼容各浏览器
    通用权限管理设计 之 数据权限
  • 原文地址:https://www.cnblogs.com/jhli/p/5915329.html
Copyright © 2011-2022 走看看