[本文出自天外归云的博客园]
1. 在安卓网上对热门机型进行爬网,取前五十:
# -*- coding: utf-8 -*-
# Crawl the popular-phone listing on product.hiapk.com and print the top
# non-iPhone model names, one per line.
import re

import requests
from bs4 import BeautifulSoup

# Compiled once; used to drop iPhone entries from the listing.
_IPHONE_RE = re.compile('iphone', re.IGNORECASE)


def get_rank_list(limit=50, max_pages=9):
    """Return the names of the top-ranked non-iPhone phones.

    Listing pages ``p1`` .. ``p<max_pages>`` are fetched in order and the
    ``title`` of the first link inside every ``<dt>`` of the ``#content``
    element is collected until ``limit`` names have been gathered.

    Args:
        limit: Maximum number of phone names to return.
        max_pages: Highest listing page number to request.

    Returns:
        A list of at most ``limit`` phone names, in page order.
    """
    session = requests.Session()
    rank_list = []
    for page_num in range(1, max_pages + 1):
        if len(rank_list) >= limit:
            break
        url = "http://product.hiapk.com/mobile/p" + str(page_num) + "-s1-list.html"
        response = session.get(url)
        soup = BeautifulSoup(response.content, "lxml")
        content = soup.find(id='content')
        if content is None:
            # No listing container on this page; nothing to collect from it.
            continue
        for item in content.find_all('dt'):
            if len(rank_list) >= limit:
                break
            link = item.find('a')
            phone_name = link.get('title') if link is not None else None
            if not phone_name:
                # Entry without a titled link carries no usable name.
                continue
            if _IPHONE_RE.search(phone_name):
                # Skip only this entry; the rest of the page is still wanted.
                continue
            rank_list.append(phone_name)
    return rank_list


if __name__ == '__main__':
    for phone in get_rank_list():
        print(phone)
2. 在talkingdata上对安卓手机统计数据进行分类爬取:
# -*- coding: utf-8 -*-
# Crawl one of the Android device-statistics rankings published on
# mi.talkingdata.com and print its entries, one per line.
import re
import sys

import requests
from bs4 import BeautifulSoup

# Command-line category name -> value of the ``terminalType`` query parameter.
_RANK_TYPES = {
    'brand': 1,
    'model': 2,
    'resolution': 3,
    'system': 4,
    'operator': 5,
    'network': 6,
}


def rank_crawl(type):
    """Return the entry titles of one ranking page.

    Args:
        type: Ranking category id, sent as ``terminalType``:
            1 - by brand, 2 - by model, 3 - by resolution,
            4 - by operating system, 5 - by carrier, 6 - by network.

    Returns:
        A list with the ``title`` attribute of every titled link found
        inside the ``#list-content`` element, in document order; an empty
        list when the page has no such element.
    """
    session = requests.Session()
    url = 'http://mi.talkingdata.com/terminals.html?terminalType=' + str(type)
    response = session.get(url)
    soup = BeautifulSoup(response.content, "lxml")
    list_content = soup.find(id='list-content')
    if list_content is None:
        return []
    titles = (link.get('title') for link in list_content.find_all('a'))
    return [title for title in titles if title is not None]


if __name__ == '__main__':
    # Validate the argument first so a typo costs no network round-trips.
    if len(sys.argv) < 2 or sys.argv[1] not in _RANK_TYPES:
        sys.exit("usage: %s <%s>" % (sys.argv[0], "|".join(sorted(_RANK_TYPES))))
    # Only the requested category is fetched.
    # Emit raw GBK bytes: Python 3 exposes the byte stream as
    # ``sys.stdout.buffer``; on Python 2 ``sys.stdout`` already accepts bytes.
    byte_out = getattr(sys.stdout, "buffer", sys.stdout)
    for one in rank_crawl(_RANK_TYPES[sys.argv[1]]):
        byte_out.write(one.encode("gbk") + b"\n")