收集到非常多易迅网的商品ID,于是想把这些ID相应的商品信息爬下来。通过简单分析发现,易迅网的各类信息都直接放在HTML页面上,所以解析一个页面就可以了。
最后返回每一个ID相应的商品URL、标题、易迅价、促销价、类目。
以下是python代码:
#!/usr/bin/env python #coding:utf-8 ''' Created on 2015年03月11日 @author: zhaohf ''' import urllib2 from bs4 import BeautifulSoup def get_yixun(id): price_origin,price_sale,category = '0','0','' url = 'http://item.yixun.com/item-' + id + '.html' html = urllib2.urlopen(url).read().decode('utf-8') soup = BeautifulSoup(html) title = unicode(soup.title.text.strip().strip(u'【价格_报价_图片_行情】-易迅网').replace(u'】','')).encode('utf-8').decode('utf-8') try: soup_origin = soup.find("dl", { "class" : "xbase_item xprice xprice_origin" }) price_origin = soup_origin.find("span", { "class" : "mod_price xprice_val" }).contents[1].text #易迅价 print 'price_origin: ' + price_origin except: pass try: soup_sale= soup.find('dl',{'class':'xbase_item xprice'}) price_sale = soup_sale.find("span", { "class" : "mod_price xprice_val" }).contents[1] #促销价 print 'price_sale: '+ price_sale except: pass try: category = unicode(soup.find('div',{'class','mod_crumb'}).text).encode('utf-8').decode('utf-8').replace(' ','') #所属类目 except: pass if not (price_origin is None or price_origin =='0'): print url + ' '+ price_origin + ' ' + price_sale + ' '+ category return url + ' ' + title +' '+price_origin+' '+price_sale+ ' '+ category else: print url + ' ' + price_sale+ ' ' + price_sale + ' ' + category return url + ' ' + title +' '+price_sale+' '+price_sale+ ' '+ category return None