今天根据正则表达式简单的爬了一下大众点评,把北京的美食爬了爬,(店铺名,人均消费,地址)
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
import re import urllib.request from urllib.request import urlopen def getPage(url): headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/51.0.2704.63 Safari/537.36'} req = urllib.request.Request(url=url, headers=headers) res = urllib.request.urlopen(req) return res.read().decode('utf-8') def parsePage(s): ret = com.finditer(s) for i in ret: ret = { "店铺名": i.group("shop_name"), "人均价格": i.group("per_capita"), "地址": i.group("address"), } yield ret def main(num): url = "http://www.dianping.com/beijing/ch10/p%s?aid=92020785%%2C102284990&cpt=92020785%%2C102284990" % num response_html = getPage(url) ret = parsePage(response_html) print(ret) f = open("eat_info", "a", encoding="utf-8") for obj in ret: print(obj) data = str(obj) f.write(data + " ") com = re.compile( '<div class="txt">.*?<h4>(?P<shop_name>.*?)</h4>' '.*?<b>¥(?P<per_capita>d+)</b>.*?<span class="addr">(?P<address>.*?)</span>', re.S) count = 1 for i in range(50): main(count) count += 1