思路:
先用正则表达式把需要获取的数据匹配出来,然后用"永真"循环(即 while True:)让 URL 中的 pn 参数不断自增(在百度点击"下一页"时,URL 的 pn 参数就增加 10);pn 每增加 10,就请求一次新的 URL 并提取一次数据。
# -*- coding: UTF-8 -*-
"""Page through Baidu search results and print each hit's title and URL.

The script asks for a search term, fetches one result page at a time
(the ``pn`` query parameter advances by 10 per page) and, after every
page, asks whether to continue; answering ``q`` quits.
"""
import json
import re
import sys
# NOTE(review): Tkinter, graphics and easygui are not used anywhere in this
# script; they are kept only because the original imported them.
import Tkinter

import requests
import graphics
import easygui as gui

SEARCH_URL = "http://www.baidu.com/s"
PAGE_STEP = 10          # Baidu moves `pn` forward by 10 for each next page
REQUEST_TIMEOUT = 10    # seconds; without a timeout a stalled request hangs

# Each hit carries its metadata as JSON in a single-quoted attribute, e.g.
#   <div class="c-tools" id="tools_..." data-tools='{"title":"...",
#        "url":"http://www.baidu.com/link?url=..."}'>...</div>
# The JSON object is captured directly instead of "everything up to the next
# '>'", so a title that contains '>' no longer truncates the match.
DATA_TOOLS_RE = re.compile(r"data-tools='(\{.*?\})'")

query = raw_input("string is :")
offset = 0
while True:
    # Passing the term through `params` lets requests URL-encode it, so
    # spaces, '&' or non-ASCII characters cannot corrupt the query string.
    response = requests.get(
        SEARCH_URL,
        params={"wd": query, "pn": offset},
        timeout=REQUEST_TIMEOUT,
    )
    offset += PAGE_STEP

    for raw in DATA_TOOLS_RE.findall(response.text):
        # The attribute is JSON from a remote page: parse it with json.loads
        # rather than eval(), which would execute whatever the page contains.
        try:
            entry = json.loads(raw)
        except ValueError:
            continue  # malformed attribute; skip this hit only
        if not isinstance(entry, dict):
            continue
        print("title:" + entry.get("title", "") + " " + entry.get("url", ""))

    answer = raw_input(u"e or q:")
    if answer == "q":
        sys.exit()
后期又修改了一下.
#!/usr/bin/env python
# encoding: utf-8
# by i3ekr
"""Walk Baidu search results for a term and print each hit's final URL.

For every result page the script extracts the ``data-tools`` JSON of each
hit, follows the Baidu redirect link found under its ``url`` key and prints
the page counter together with the URL the redirect ends on. It stops when a
result page contains no hits.
"""
import json
import re
import sys
import time

import requests

# NOTE(review): the banner's leading whitespace was lost in the listing this
# script was recovered from, so the art is left-aligned here -- restore the
# original indentation if it is available.
print("""

#G
#K
.Et
:#
: ##
##Dj K
.####G###
E;#####f;
########
#######.
.i#L#,t
DEDECMS

""")

SEARCH_URL = "http://www.baidu.com/s"
PAGE_STEP = 10          # `pn` advances by 10 for each result page
REQUEST_TIMEOUT = 10    # seconds; without a timeout a stalled request hangs
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko'
}

# Capture the JSON object inside the single-quoted data-tools attribute.
# Matching the object itself (not "up to the next '>'") keeps titles that
# contain '>' intact and drops the surrounding quotes.
DATA_TOOLS_RE = re.compile(r"data-tools='(\{.*?\})'")

query = raw_input("string is :")
offset = 0
page = 0
session = requests.session()

while True:
    # `params` makes requests URL-encode the search term.
    html = session.get(
        SEARCH_URL,
        params={"wd": query, "pn": offset},
        headers=HEADERS,
        timeout=REQUEST_TIMEOUT,
    ).text
    hits = DATA_TOOLS_RE.findall(html)
    if not hits:
        # The original looped forever; stop once a page yields nothing.
        sys.stderr.write("no results on this page, stopping\n")
        break

    offset += PAGE_STEP
    page += 1

    for raw in hits:
        # Handle failures per hit so one bad entry does not silently drop
        # the rest of the page, and say what was skipped.
        try:
            # JSON from a remote page: json.loads, never eval().
            entry = json.loads(raw)
            target = session.get(
                entry["url"], headers=HEADERS, timeout=REQUEST_TIMEOUT
            )
        except (ValueError, KeyError, TypeError,
                requests.RequestException) as err:
            sys.stderr.write("skipped result: %s\n" % err)
            continue
        # target.url is the address after redirects have been followed.
        print("[%s] %s" % (page, target.url))