1.
from bs4 import BeautifulSoup

info = []
with open('D:web1111/new_index.html', 'r') as wb_data:
    Soup = BeautifulSoup(wb_data, 'lxml')
    images = Soup.select('body > div.main-content > ul > li > img')
    titles = Soup.select('body > div.main-content > ul > li > div.article-info > h3 > a')
    descs = Soup.select('body > div.main-content > ul > li > div.article-info > p.description')
    rates = Soup.select('body > div.main-content > ul > li > div.rate > span')
    cates = Soup.select('body > div.main-content > ul > li > div.article-info > p.meta-info')
    # print(images, titles, descs, rates, cates)

for title, image, desc, rate, cate in zip(titles, images, descs, rates, cates):
    data = {
        'title': title.get_text(),
        'rate': rate.get_text(),
        'desc': desc.get_text(),
        'cate': list(cate.stripped_strings),
        'image': image.get('src')
    }
    info.append(data)

for i in info:
    if float(i['rate']) > 3:
        print(i['title'], i['cate'])

'''
Selectors copied from the browser inspector, kept for reference:
body > div.main-content > ul > li:nth-child(1) > div.article-info > h3 > a
body > div.main-content > ul > li:nth-child(1) > div.article-info > p.meta-info > span:nth-child(2)
body > div.main-content > ul > li:nth-child(1) > div.rate > span
body > div.main-content > ul > li:nth-child(1) > div.article-info > p.description
body > div.main-content > ul > li:nth-child(1) > img
'''
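The least obvious call above is list(cate.stripped_strings). A minimal standalone sketch of what it returns; the HTML fragment is made up for illustration:

from bs4 import BeautifulSoup

# Hypothetical fragment shaped like one list item's meta-info paragraph.
html = '<p class="meta-info"><span>fun</span> <span>Wow</span></p>'
tag = BeautifulSoup(html, 'lxml').select('p.meta-info')[0]

# stripped_strings yields every text node with surrounding whitespace removed;
# wrapping it in list() is what lets the dict above store the category labels.
print(list(tag.stripped_strings))  # -> ['fun', 'Wow']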
2.
#!/usr/bin/python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests

url = 'http://bj.xiaozhu.com/fangzi/1508951935.html'
wb_data = requests.get(url)
soup = BeautifulSoup(wb_data.text, 'lxml')

# Even on a single page, select() returns a list, and its first (and only) element
# is the one we want, so take it out with the "[0]" index before calling further
# extraction methods on it -- BeautifulSoup's extraction methods don't work on lists. ;)
title = soup.select('div.pho_info > h4')[0].text
address = soup.select('div.pho_info > p')[0].get('title')
# Same idea as get('href'): both are just tag attributes, and we only need the value.
price = soup.select('div.day_l > span')[0].text
pic = soup.select('#curBigImage')[0].get('src')
# "#" selects by id, i.e. it finds the element that is unique on the page.
host_name = soup.select('a.lorder_name')[0].text
host_gender = soup.select('div.member_pic > div')[0].get('class')[0]

# Print and inspect the results here.
print(title)
print(address)
print(price)
print(pic)
print(host_name)
print(host_gender)

# The output shows each gender uses a different icon style (class name),
# so write a function to convert it.  '女' = female, '男' = male.
def print_gender(class_name):
    if class_name == 'member_ico1':
        return '女'
    if class_name == 'member_ico':
        return '男'

data = {
    'title': title,
    'address': address,
    'price': price,
    'pic': pic,
    'host_name': host_name,
    'host_gender': print_gender(host_gender)
}
print(data)

# ------------------- Extra -------------------
# How to collect links in bulk
page_link = []  # <- every detail-page link is stored here; to parse the details,
                #    just iterate over this list and fetch each URL~

def get_page_link(page_number):
    for each_number in range(1, page_number):  # 24 links per page; the argument is the page count (range stops before it)
        full_url = 'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(each_number)
        wb_data = requests.get(full_url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        for link in soup.select('a.resule_img_a'):  # just find the <a> tags whose class is resule_img_a
            page_link.append(link.get('href'))      # store the href itself, not the whole tag
# ----------------------------------------------
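To tie the extra section back to the detail parser above, here is a minimal sketch of how page_link would feed it; get_detail, the page count of 4, and the 2-second pause are illustrative assumptions, not part of the original:

import time
import requests
from bs4 import BeautifulSoup

def get_detail(detail_url):
    # Condensed version of the detail parsing above; returns part of the listing data.
    detail_soup = BeautifulSoup(requests.get(detail_url).text, 'lxml')
    return {
        'title': detail_soup.select('div.pho_info > h4')[0].text,
        'price': detail_soup.select('div.day_l > span')[0].text,
    }

get_page_link(4)            # collect links from pages 1-3 (range is exclusive)
for link in page_link:
    print(get_detail(link))
    time.sleep(2)           # pause between requests to stay polite to the site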
3.
from bs4 import BeautifulSoup

data = []
path = './web/new_index.html'
with open(path, 'r') as f:
    Soup = BeautifulSoup(f.read(), 'lxml')
    titles = Soup.select('ul > li > div.article-info > h3 > a')
    pics = Soup.select('ul > li > img')
    descs = Soup.select('ul > li > div.article-info > p.description')
    rates = Soup.select('ul > li > div.rate > span')
    cates = Soup.select('ul > li > div.article-info > p.meta-info')

for title, pic, desc, rate, cate in zip(titles, pics, descs, rates, cates):
    info = {
        'title': title.get_text(),
        'pic': pic.get('src'),
        'descs': desc.get_text(),
        'rate': rate.get_text(),
        'cate': list(cate.stripped_strings)
    }
    data.append(info)

for i in data:
    if float(i['rate']) >= 3:  # compare the parsed rating, not the string length
        print(i['title'], i['cate'])
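The filter originally compared len(i['rate']) >= 3, which only works by accident when a rating happens to be three characters long (e.g. '4.5'). A quick standalone check, with made-up sample strings, of why the parsed float is the right comparison:

for raw in ['4.5', '5', '2.0']:
    print(raw, float(raw) >= 3, len(raw) >= 3)
# 4.5 True True
# 5 True False    <- len() would wrongly drop an integer rating
# 2.0 False True  <- and wrongly keep a low one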
4.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

def fit(url):
    base_url = url
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
    html = requests.get(base_url, headers=headers)
    soup = BeautifulSoup(html.text, 'lxml')
    data = []

    url_first = 'https://www.zhihu.com'

    def url_handle(url_last):
        # Filter out zhuanlan (column) links: question links come back as relative
        # paths and need the site prefix, while column links are already absolute.
        if url_last.startswith('/question'):
            return url_first + url_last
        else:
            return url_last

    titles = soup.select('div.zg-wrap.zu-main.clearfix > div.zu-main-content > div > ul > li > div.title > a')
    writes = soup.select('body > div.zg-wrap.zu-main.clearfix > div.zu-main-content > div > ul > li > div.content > div > div.entry-body > div.entry-meta > div > span.author-link-line > a')
    prizes = soup.select('body > div.zg-wrap.zu-main.clearfix > div.zu-main-content > div > ul > li > div.content > div > div.entry-left.hidden-phone > a')
    contents = soup.select('body > div.zg-wrap.zu-main.clearfix > div.zu-main-content > div > ul > li > div.title > a')

    for title, write, prize, content in zip(titles, writes, prizes, contents):
        url_last = content.get('href')
        url_context = url_handle(url_last)
        html_content = requests.get(url_context, headers=headers)
        answer_soup = BeautifulSoup(html_content.text, 'lxml')  # don't clobber the outer soup
        if url_context.startswith('https://www.zhihu.com'):
            first_answer = answer_soup.select('#zh-question-answer-wrap > div:nth-of-type(1) > div.zm-item-rich-text.expandable.js-collapse-body > div.zm-editable-content.clearfix')[0].get_text()
        else:
            first_answer = 'no answer'  # column articles have no answers to scrape
        info = {
            'title': title.get_text(),
            'write': write.get_text(),
            'prize': prize.get_text(),
            'content': first_answer
        }
        data.append(info)

    for a in data:
        print('title:' + a['title'])
        print('write:' + a['write'])
        print('prize:' + a['prize'])
        print('content:' + a['content'])

url = 'https://www.zhihu.com/search?type=content&q=健身'
fit(url)
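Since the question-vs-column split is the trick that makes this solution work, here is a quick standalone check of the url_handle logic; the sample paths are made up:

url_first = 'https://www.zhihu.com'

def url_handle(url_last):
    # Relative question paths get the site prefix; absolute column URLs pass through.
    return url_first + url_last if url_last.startswith('/question') else url_last

print(url_handle('/question/12345678'))              # -> https://www.zhihu.com/question/12345678
print(url_handle('https://zhuanlan.zhihu.com/p/1'))  # unchanged; later treated as 'no answer'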