初学-BeautifulSoup爬取豆瓣页面 - 走看看

zoukankan html css js c++ java

初学-BeautifulSoup爬取豆瓣页面

# -*- coding: utf-8 -*-
import os
import urllib
import urllib2
from bs4 import BeautifulSoup

# Request headers sent with every page fetch.
#
# Header names and values must not contain spaces around "-", "/", "+" or
# "=": a name such as "Accept - Language" is not a valid HTTP field name and
# will not be recognised as "Accept-Language" by the server.
#
# "Content-Length" and "Content-Type" are intentionally left out: the
# requests built from this dict carry no body, so announcing a 125-byte
# form-encoded payload that is never sent would be wrong.
#
# NOTE(review): the X-* entries look like response headers copied from a
# browser's network panel; they are kept as-is -- confirm whether they are
# needed at all.
# NOTE(review): no User-Agent is set, so the library default is used; some
# sites reject that -- confirm against the target site.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
              'image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'X-Content-Type-Options': 'nosniff',
    'X-DAE-Node': 'daisy2b',
    'X-Douban-Mobileapp': '0',
    'X-Xss-Protection': '1; mode=block',
}

def parse(html, downloader_Function):
    """Walk every ``rel="nofollow"`` element in *html* and act on it.

    Elements that have no ``src`` attribute get their ``href`` printed.
    Elements that do have a ``src`` are passed to *downloader_Function*
    as ``downloader_Function(src, alt)``.

    Elements that lack the attribute their branch needs (``href`` for the
    link branch, ``alt`` for the download branch) are skipped instead of
    aborting the whole scan with ``KeyError``.

    :param html: HTML document text to parse.
    :param downloader_Function: callable taking ``(path, name)``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.find_all(rel="nofollow"):
        if 'src' not in tag.attrs:
            href = tag.get('href')
            if href is not None:
                # Parenthesised single argument prints identically under
                # the Python 2 print statement.
                print(href)
            continue

        alt = tag.get('alt')
        if alt is None:
            # Without alt text there is no name to pass to the callback.
            continue
        downloader_Function(tag['src'], alt)

def htmlContent(url):
    """Fetch *url* using the module-level ``headers`` and return the body.

    :param url: address to request.
    :returns: the raw response body as returned by ``read()``.
    :raises urllib2.URLError: propagated from ``urllib2.urlopen`` on failure.
    """
    req = urllib2.Request(url, headers=headers)
    resp = urllib2.urlopen(req)
    try:
        return resp.read()
    finally:
        # Always release the underlying connection, even if read() fails.
        resp.close()

def fileDownloader(path, fileName):
    """Download the resource at *path* into ``<cwd>/download/<fileName>.png``.

    The ``download`` directory is created on first use; previously a missing
    directory made ``urlretrieve`` fail when opening the target file.

    :param path: URL of the resource to fetch.
    :param fileName: base name (without extension) for the saved file.
    """
    downloadDir = os.path.join(os.getcwd(), 'download')
    if not os.path.isdir(downloadDir):
        os.makedirs(downloadDir)

    filePath = os.path.join(downloadDir, '%s.png' % fileName)
    urllib.urlretrieve(path, filePath)

def start():
    """Fetch the Douban movie front page, echo it, then parse it.

    The raw HTML is printed first and then handed to ``parse`` with
    ``fileDownloader`` as the download callback.
    """
    page = htmlContent('https://movie.douban.com/')
    print(page)
    parse(page, fileDownloader)

# Kick off the scrape; this runs whenever the module is executed or imported.
start()
# Debugging aid: list the attribute names available on the BeautifulSoup class.
print(dir(BeautifulSoup))

查看全文

相关阅读:
简练软考知识点整理-项目定义活动过程
 简练软考知识点整理-规划进度管理
 简练软考知识点整理-控制范围
 软考考前注意事项
 简练软考知识点整理-确认范围管理
 数据库之表关系
 数据库引擎
 数据库概念
 IO模型
 异步回调，线程队列，协程

原文地址：https://www.cnblogs.com/air-liyan/p/8422840.html

Copyright © 2011-2022 走看看