First crawler, scraping web-page text (no downloader added yet):
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
url_begin = 'http://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000'
start_url = requests.get(url_begin, headers=headers)
# print(start_url.text)
soup = BeautifulSoup(start_url.text, 'lxml')
# Every chapter of the tutorial is linked from the left sidebar
link_list = soup.find('div', class_="x-sidebar-left-content").find_all('a')
# print(link_list)

for link in link_list:
    url = 'http://www.liaoxuefeng.com' + link['href']
    html = requests.get(url, headers=headers)
    html_soup = BeautifulSoup(html.text, 'lxml')
    # The page title sits in <h4> tags inside the x-content div
    title_list = html_soup.find('div', class_="x-content").find_all('h4')
    # print(title_list)
    for title in title_list:
        titlereal = title.get_text()
        print(titlereal)

    # The body text is the <p> tags inside the x-wiki-content div
    content_list = html_soup.find("div", class_="x-wiki-content").find_all('p')
    for content in content_list:
        # print(content)
        contentreal = content.get_text()
        print(contentreal)
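As the heading says, this version only prints the text. One minimal way to add the missing downloader is to write each page's text to a local file instead of printing it. The sketch below is just an illustration: the helper name save_text and the output folder liaoxuefeng_text are made up here, not part of the script above.

import os

def save_text(title, paragraphs, out_dir='liaoxuefeng_text'):
    # Hypothetical helper: write one page's paragraphs to
    # <out_dir>/<title>.txt, creating the folder on first use
    os.makedirs(out_dir, exist_ok=True)
    safe_title = title.replace('/', '_')  # titles may contain path separators
    with open(os.path.join(out_dir, safe_title + '.txt'), 'w', encoding='utf-8') as f:
        f.write('\n'.join(paragraphs))

# In the loop above, instead of printing each paragraph:
#     paragraphs = [p.get_text() for p in content_list]
#     save_text(titlereal, paragraphs)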
Second crawler, scraping web-page images (introduces the os module, so the scraped content can be saved to a local folder):
import os
import urllib.request

import requests
from bs4 import BeautifulSoup

x = 0
path = r'/Users/wangxitao/Desktop/douban'
# The target folder must exist before urlretrieve can write into it
os.makedirs(os.path.join(path, 'image'), exist_ok=True)

def crawl(url):
    global x
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
    req = requests.get(url, headers=headers)

    soup = BeautifulSoup(req.text, 'lxml')
    link_list = soup.find_all('img')
    for girl in link_list:
        link = girl.get('src')
        print(link)

        # os.path.join builds the path with the right separator; the
        # hard-coded 'image\...' backslash would break on macOS/Linux
        local = os.path.join(path, 'image', '%s.jpg' % x)
        urllib.request.urlretrieve(link, local)

        x += 1
        print("Downloading image %s" % x)

# Crawl pages 2 through 10
for page in range(2, 11):
    url = 'http://www.dbmeinv.com/?pager_offset=%d' % page
    crawl(url)

print('Done crawling')
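urlretrieve has no error handling, so one dead link, a relative src, or a data: URI will abort the whole run. Below is a sketch of a sturdier downloader using requests, which the script already imports; the function name download_image is made up here, and this is only one way to do it.

import requests

def download_image(link, local, timeout=10):
    # Hypothetical replacement for urllib.request.urlretrieve: skip
    # anything that is not an absolute HTTP(S) URL, and don't let a
    # single broken link kill the crawl
    if not link or not link.startswith(('http://', 'https://')):
        return False
    try:
        resp = requests.get(link, timeout=timeout, stream=True)
        resp.raise_for_status()
    except requests.RequestException as exc:
        print('skipped %s: %s' % (link, exc))
        return False
    with open(local, 'wb') as f:
        # Stream the body in chunks instead of loading it all into memory
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)
    return True

In crawl() above, the line urllib.request.urlretrieve(link, local) could then be replaced by download_image(link, local), and the urllib.request import dropped.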