How to crawl?
Define the goal: crawl Baidu Baike, starting from the entry "python" at the initial URL http://baike.baidu.com/item/Python, targeting about 1000 entries; only the title, the summary, and the URLs linked inside the summary are collected.
How to crawl: use Chrome DevTools to inspect the HTML structure and work out which elements to query and how (see the selector sketch after this list).
How to write it: two directions, procedural and object-oriented.
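Based on the selectors used in the procedural code further down (the class names lemmaWgt-lemmaTitle-title and lemma-summary reflect Baidu Baike's markup at the time of writing and may change), a minimal sketch of what the DevTools inspection boils down to:

import requests
from bs4 import BeautifulSoup

# Fetch the start page; the fuller User-Agent from the main script may be needed
# if Baidu rejects a minimal one.
resp = requests.get('http://baike.baidu.com/item/Python',
                    headers={'User-Agent': 'Mozilla/5.0'})
resp.encoding = resp.apparent_encoding
soup = BeautifulSoup(resp.text, 'html.parser')

# Title: <dd class="lemmaWgt-lemmaTitle-title"> ... <h1>Python</h1>
print(soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1').get_text())

# Summary: <div class="lemma-summary">, which also holds the <a href="/item/..."> links
print(soup.find('div', class_='lemma-summary').get_text()[:80])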
Environment:
Python 3.5
the requests library
the beautifulsoup4 (bs4) library
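A quick way to confirm the environment matches this declaration (note the pip package for BeautifulSoup is beautifulsoup4, imported as bs4):

# If the libraries are missing: pip install requests beautifulsoup4
import sys
import requests
import bs4

print(sys.version)           # should report 3.5.x per the declaration above
print(requests.__version__)  # any recent requests release works
print(bs4.__version__)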
Crawling in the procedural style:
#!/usr/bin/python3
import re

import requests
from bs4 import BeautifulSoup

# Crawl three fields from Baidu Baike: title, summary, and the URLs linked in the summary.
# Start entry: python, start URL: http://baike.baidu.com/item/Python, target about 1000 entries.
# Four modules: URL manager, downloader, parser, data output.
# requests and BeautifulSoup implement the downloader and the parser;
# two sets (new_urls / old_urls) implement the URL manager.
# Relative /item/ links are joined onto the site root http://baike.baidu.com.
# A URL already in old_urls is never crawled again and never re-added to new_urls.


def url_manager(links):
    """Add newly discovered links to the frontier, skipping anything already crawled."""
    if links is not None:
        # Drop URLs that have already been crawled.
        for link in links.difference(old_urls):
            new_urls.add(link)


def download_html(url):
    """Download one page and return its text, or None on failure."""
    headers = {
        'Referer': 'http://baike.baidu.com/item/Python',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        return None


def analysis(page_html, one_url):
    """Parse one page: record its title and summary, return the URLs linked in the summary."""
    links = []
    site_root = 'http://baike.baidu.com'
    soup = BeautifulSoup(page_html, 'html.parser')
    # Title lives in <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1>.
    title = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1').get_text()
    # Summary lives in <div class="lemma-summary">; normalise non-breaking spaces.
    introduction = soup.find('div', class_='lemma-summary').get_text().replace('\xa0', ' ').strip()
    # Only follow links that appear inside the summary, i.e. <a href="/item/...">.
    for link in soup.find('div', class_='lemma-summary').find_all('a', href=re.compile('^/item/')):
        links.append(site_root + link['href'])
    message.append(one_url + ' : ' + title + '_' + introduction)
    return set(links)


def out_data():
    for line in message:
        print(line)


if __name__ == '__main__':
    new_urls = set()
    old_urls = set()
    message = []
    start_url = 'http://baike.baidu.com/item/Python'
    # Crawl the start page first.
    page_html = download_html(start_url)
    old_urls.add(start_url)
    url_manager(analysis(page_html, start_url))
    # Then follow the URLs found in each summary, up to the target of 1000 entries.
    for i in range(1000):
        if not new_urls:
            break
        url = new_urls.pop()
        old_urls.add(url)
        try:
            page_html = download_html(url)
            if not page_html:
                continue
            url_manager(analysis(page_html, url))
        except Exception:
            print('crawl failed:', url)
    # Print everything that was collected.
    out_data()
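The URL manager here is nothing more than the two module-level sets: new_urls is the frontier and old_urls is the visited set, and set.difference is what keeps already-crawled pages out of the frontier. A small illustration of just that dedup step (the URLs are made up for the example):

# Hypothetical URLs, only to show the dedup step in isolation.
old_urls = {'http://baike.baidu.com/item/Python'}
new_urls = set()

links = {'http://baike.baidu.com/item/Python',   # already crawled -> dropped
         'http://baike.baidu.com/item/Guido'}    # not seen yet -> kept
new_urls.update(links.difference(old_urls))
print(new_urls)  # {'http://baike.baidu.com/item/Guido'}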