zoukankan      html  css  js  c++  java
  • 初学-BeautifulSoup爬取豆瓣页面

    # -*- coding: utf-8 -*-
    import os
    import urllib
    import urllib2
    from bs4 import BeautifulSoup

    # Request headers sent with every page fetch.
    #
    # HTTP header field names and most of these values must not contain
    # embedded spaces ("Accept - Language", "text / html" are malformed), so
    # they are written in their canonical form here.
    #
    # Content-Length / Content-Type are intentionally absent: they describe a
    # request body, and the request built in htmlContent() carries none, so
    # announcing 125 bytes of form data would be wrong.
    headers = {
        'Accept': ('text/html,application/xhtml+xml,application/xml;q=0.9,'
                   'image/webp,image/apng,*/*;q=0.8'),
        'Accept-Language': 'zh-CN,zh',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # NOTE(review): the X-* entries below look like *response* headers
        # copied from a browser's network panel; they are kept unchanged but
        # are probably unnecessary in a request -- confirm and remove.
        'X-Content-Type-Options': 'nosniff',
        'X-DAE-Node': 'daisy2b',
        'X-Douban-Mobileapp': '0',
        'X-Xss-Protection': '1; mode=block',
    }


    def parse(html, downloader_Function):
        """Walk every rel="nofollow" element in *html*.

        Elements without a ``src`` attribute have their ``href`` printed.
        Elements with a ``src`` attribute are handed to
        ``downloader_Function(src, alt)``.

        Args:
            html: Markup to parse (passed straight to BeautifulSoup).
            downloader_Function: Callable taking ``(path, name)``; invoked
                once per element that has both ``src`` and ``alt``.
        """
        soup = BeautifulSoup(html, 'html.parser')
        for tag in soup.find_all(rel="nofollow"):
            if 'src' not in tag.attrs:
                href = tag.get('href')
                # A nofollow element is not guaranteed to carry an href;
                # skip it instead of aborting the whole run with KeyError.
                if href is not None:
                    print(href)
                continue

            name = tag.get('alt')
            if name is None:
                # Without alt text there is no file name to save under.
                continue
            downloader_Function(tag['src'], name)

    def htmlContent(url, timeout=30):
        """Fetch *url* with the module-level ``headers`` and return the body.

        Args:
            url: Address to request.
            timeout: Seconds to wait on the socket before urllib2 gives up.
                Without a timeout a stalled server blocks the script forever.

        Returns:
            The raw response body as returned by ``read()``.
        """
        req = urllib2.Request(url, headers=headers)
        resp = urllib2.urlopen(req, timeout=timeout)
        try:
            return resp.read()
        finally:
            # Release the connection even if read() raises.
            resp.close()


    def fileDownloader(path, fileName):
        """Download *path* into ``<cwd>/download/<fileName>.png``.

        Args:
            path: URL of the resource to retrieve.
            fileName: Base name for the saved file; ``.png`` is always
                appended, whatever the real format of the resource is.
        """
        download_dir = os.path.join(os.getcwd(), 'download')
        # urlretrieve does not create missing directories.
        if not os.path.isdir(download_dir):
            os.makedirs(download_dir)

        # fileName comes from page markup, so neutralise path separators to
        # keep the file inside download_dir.
        safe_name = ('%s.png' % fileName).replace('/', '_').replace('\\', '_')
        urllib.urlretrieve(path, os.path.join(download_dir, safe_name))

    def start():
        """Fetch the Douban movie front page, print it, then parse it.

        Elements that parse() selects for download are saved through
        fileDownloader.
        """
        page = htmlContent('https://movie.douban.com/')
        print(page)
        parse(page, fileDownloader)

    # Run the crawl only when executed as a script, so importing this module
    # does not trigger network requests and downloads.
    if __name__ == '__main__':
        start()
        # Prints the attribute names available on the BeautifulSoup class.
        print(dir(BeautifulSoup))

  • 相关阅读:
    简练软考知识点整理-项目定义活动过程
    简练软考知识点整理-规划进度管理
    简练软考知识点整理-控制范围
    软考考前注意事项
    简练软考知识点整理-确认范围管理
    数据库之表关系
    数据库引擎
    数据库概念
    IO模型
    异步回调,线程队列,协程
  • 原文地址:https://www.cnblogs.com/air-liyan/p/8422840.html
Copyright © 2011-2022 走看看