zoukankan      html  css  js  c++  java
  • 初学-BeautifulSoup爬取豆瓣页面

    # -*- coding: utf-8 -*-
    import os
    import urllib
    import urllib2
    from bs4 import BeautifulSoup

    # Request headers sent with every page fetch (passed to urllib2.Request).
    # HTTP field names and most values must not contain embedded spaces, so
    # the names/values below are written in their canonical compact form.
    # Content-Length / Content-Type are intentionally not set here: the
    # request built in htmlContent carries no body, so a fixed length of 125
    # would describe data that is never sent.
    headers = {
        'Accept': ('text/html,application/xhtml+xml,application/xml;q=0.9,'
                   'image/webp,image/apng,*/*;q=0.8'),
        'Accept-Language': 'zh-CN,zh',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'X-Content-Type-Options': 'nosniff',
        'X-DAE-Node': 'daisy2b',
        'X-Douban-Mobileapp': '0',
        'X-Xss-Protection': '1; mode=block',
    }


    def parse(html, downloader_Function):
        """Scan *html* for elements marked rel="nofollow" and act on each one.

        Elements without a ``src`` attribute have their ``href`` printed
        (when they have one). Elements with a ``src`` are handed to
        *downloader_Function* as ``downloader_Function(src, name)``, where
        ``name`` is the element's ``alt`` text.

        :param html: markup to parse with BeautifulSoup's 'html.parser'.
        :param downloader_Function: callable taking ``(path, name)``.
        """
        soup = BeautifulSoup(html, 'html.parser')
        for tag in soup.find_all(rel="nofollow"):
            src = tag.get('src')
            if src is None:
                # Link-like element: report its target, if it has one.
                # .get() avoids a KeyError on elements lacking href.
                href = tag.get('href')
                if href is not None:
                    print(href)
                continue

            # Missing or empty alt text would otherwise raise KeyError or
            # yield a nameless file; fall back to the last URL path segment.
            name = tag.get('alt') or src.rstrip('/').rsplit('/', 1)[-1]
            downloader_Function(src, name)

    def htmlContent(url):
        """Fetch *url* with the module-level ``headers`` and return the body.

        :param url: address to request.
        :return: the raw response body as returned by ``resp.read()``.
        :raises urllib2.URLError: propagated from ``urllib2.urlopen``.
        """
        req = urllib2.Request(url, headers=headers)
        resp = urllib2.urlopen(req)
        try:
            return resp.read()
        finally:
            # Release the underlying connection even if read() fails.
            resp.close()


    def fileDownloader(path, fileName):
        """Download *path* into ``<cwd>/download/<fileName>.png``.

        :param path: URL of the resource to retrieve.
        :param fileName: base name (without extension) for the saved file.
        """
        download_dir = os.path.join(os.getcwd(), 'download')
        # urlretrieve does not create missing directories.
        if not os.path.isdir(download_dir):
            os.makedirs(download_dir)

        # fileName originates from page markup; neutralise path separators so
        # it cannot point outside (or into a non-existent subfolder of) the
        # download directory.
        safe_name = fileName.replace('/', '_').replace('\\', '_')
        file_path = os.path.join(download_dir, '%s.png' % safe_name)
        urllib.urlretrieve(path, file_path)

    def start():
        """Fetch the Douban movie front page, print it, then parse it.

        Image-like elements found by ``parse`` are saved via ``fileDownloader``.
        """
        page = htmlContent('https://movie.douban.com/')
        print(page)
        parse(page, fileDownloader)

    # Run the scraper as soon as this file is executed (this also happens on
    # import, since the call is not behind a __main__ guard).
    start()
    # Print the attribute names available on the BeautifulSoup class.
    print(dir(BeautifulSoup))

  • 相关阅读:
    队列
    使用JPype实现Python调用JAVA程序
    Django和Flask对于URL尾斜杠(back slash)的处理
    数据仓库建设中的数据建模方法(转)
    python自定义logger handler
    Eclipse下.project和.classpath作用(转)
    理解python的with语句
    django常见小问题收集(转)
    windows下无法创建django工程的问题
    Excel的python读写
  • 原文地址:https://www.cnblogs.com/air-liyan/p/8422840.html
Copyright © 2011-2022 走看看