糗事百科网站段子爬取：糗事百科是我见过的最简单的网站了！
# -*- coding: utf-8 -*-
"""Scrape the text of hot posts from qiushibaike.com into ``happy.txt``.

Pages 1 through 9 of the "hot" listing are downloaded in order.  The
content of every ``<span>...</span>`` element that does not itself start
with markup is printed and written to the output file, each entry followed
by a single space.
"""

import io
import re
import sys

BASE_URL = 'http://www.qiushibaike.com/hot/page/'
OUTPUT_PATH = "happy.txt"
FIRST_PAGE = 1
LAST_PAGE = 9
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled server hangs the run

# re.S lets "." match newlines, so multi-line posts are captured whole.
SPAN_PATTERN = re.compile(r'<span>(.*?)</span>', re.S)


def build_page_urls(first=FIRST_PAGE, last=LAST_PAGE, base=BASE_URL):
    """Return the listing URLs for pages ``first`` to ``last`` (inclusive).

    Each URL has the form ``<base><page number>/``.
    """
    return [base + str(number) + '/' for number in range(first, last + 1)]


def extract_jokes(html_text):
    """Return the text of each ``<span>`` in *html_text* that holds plain text.

    A span is kept when its content is non-empty and does not begin with
    ``<`` (i.e. it does not start with a nested tag).  Content is returned
    exactly as captured, including any surrounding whitespace.
    """
    return [
        item
        for item in SPAN_PATTERN.findall(html_text)
        # The emptiness check matters: an empty "<span></span>" has no first
        # character to inspect.
        if item and not item.startswith('<')
    ]


def fetch_page(page_url, timeout=REQUEST_TIMEOUT):
    """Download *page_url* and return the decoded response body.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status.
    """
    # Imported here so the pure helpers above stay importable without the
    # third-party dependency installed.
    import requests

    response = requests.get(page_url, timeout=timeout)
    # Fail loudly rather than scraping the body of an HTTP error page.
    response.raise_for_status()
    return response.text


def main():
    """Fetch every listing page and save the extracted posts.

    Returns:
        0 on success, for use as the process exit status.
    """
    # An explicit encoding keeps the output independent of the interpreter's
    # default encoding; the context manager closes the file even if a
    # download fails part-way through.
    with io.open(OUTPUT_PATH, 'w', encoding='utf-8') as out:
        for page_url in build_page_urls():
            for joke in extract_jokes(fetch_page(page_url)):
                print(joke)
                out.write(joke + u' ')
    return 0


if __name__ == "__main__":
    sys.exit(main())