Python(2.x)内部处理文本时使用 unicode 对象,而网页内容通常是 utf-8 编码的字节串,抓取网页时需要在两者之间转换。
判断类型:
>>> s = "你好"      # utf-8编码
>>> l1 = [s]
>>> print l1
['\xe4\xbd\xa0\xe5\xa5\xbd']
>>> s = u"你好"     # unicode编码
>>> l1 = [s]
>>> print l1
[u'\u4f60\u597d']
蜘蛛爬虫
1.由一个或几个起始网址分析其内容,目的是分析出新的网址(种子)。然后对新的种子进行分析。
2.获取需要的内容,读取需要的内容。
用chrome浏览器打开指定页面
google-chrome ***.html
抓取百度搜索数据
#coding:utf-8
"""Fetch the Baidu result page for a keyword read from stdin.

Prints every chunk of the page that lies between the ``content_left``
container and the trailing ``clear:both`` spacer div.  Python 2 script
(uses ``urllib2`` and ``raw_input``).
"""
import re
import urllib
import urllib2

# Search keyword typed by the user.
myinput = raw_input()

# Quote the keyword so spaces and non-ASCII bytes still yield a valid URL.
httpaddr = "http://www.baidu.com/s?wd=%s" % urllib.quote(myinput)

f = urllib2.urlopen(httpaddr)
try:
    # Strip newlines (not spaces): '.' in the pattern below does not match
    # '\n', and the pattern itself contains spaces that must survive.
    buf = f.read().replace("\n", "")
finally:
    f.close()

all_buf = re.findall(
    '''<div id="content_left">.*<div style="clear:both;height:19px;">''',
    buf)
for i in all_buf:
    print(i)
循环抓取指定网站的链接存入数据库
#coding:utf-8
"""Minimal link crawler: harvest http:// links and queue them in MySQL.

Usage: python <script> <start-url>

Starting from the URL given on the command line, each fetched page is
scanned for absolute ``http://`` links.  New links are stored in the
``spider_url`` table with status "0" (pending); a link is flipped to
status "1" when it is picked as the next page to fetch.  Python 2 script
(uses ``urllib2``) built on the uliweb ORM.
"""
import urllib2,re
import zlib
import sys
from uliweb.orm import *

# Absolute http:// targets of <a ... href=...> tags; compiled once at import.
LINK_RE = re.compile(r'''<a.*?href.*?=["'](http://.*?)["'][> ]''')


def get_url(httpaddr):
    """Download *httpaddr* and return the list of http:// links found in it.

    The body is gunzipped when the response carries
    ``Content-Encoding: gzip``.  Errors raised by ``urllib2`` or ``zlib``
    propagate to the caller.
    """
    f = urllib2.urlopen(httpaddr)
    try:
        buf = f.read()
        is_gzip = f.headers.get('Content-Encoding') == 'gzip'
    finally:
        f.close()
    if is_gzip:
        # 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
        buf = zlib.decompress(buf, 16+zlib.MAX_WBITS)
    # Strip newlines so '.' in LINK_RE can span tags broken across lines.
    buf = buf.replace("\n","")
    return LINK_RE.findall(buf)


def saved_url(url):
    """Insert *url* as a pending row (status "0") unless it is already stored."""
    if spider_url.get(spider_url.c.url == url):
        # Already known: keep its current status.
        return
    newtab = spider_url()
    newtab.url = url
    newtab.status = "0"
    newtab.save()


def update_url(url):
    """Set the stored row for *url* to status "1"; do nothing if no row exists."""
    newtab = spider_url.get(spider_url.c.url == url)
    if newtab is None:
        return
    newtab.status = "1"
    newtab.save()


def get_new_task():
    """Return the url of one pending (status "0") row, or None if none is left."""
    eachurl = spider_url.get(spider_url.c.status == "0")
    if eachurl:
        return eachurl.url
    return None


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s <start-url>\n" % sys.argv[0])
        sys.exit(1)
    url = sys.argv[1]
    db = get_connection("mysql://root:mysql@localhost/sprider?charset=utf8")

    # The model has to be declared after get_connection() and before
    # create_all(); it cannot be moved elsewhere in the file.
    class spider_url(Model):
        url = Field(str)
        status = Field(str)

    # NOTE: drop_all() runs before create_all(), so every run starts by
    # dropping the tables registered on this metadata.
    db.metadata.drop_all()
    db.metadata.create_all()

    while True:
        try:
            for link in get_url(url):
                saved_url(link)
        except Exception as e:
            # Best effort: one bad page must not stop the crawl, but report
            # it instead of hiding it.  Ctrl-C is no longer swallowed.
            sys.stderr.write("skipping %s: %s\n" % (url, e))
        url = get_new_task()
        if url is None:
            break
        # Marked as read here; the page itself is fetched on the next pass.
        update_url(url)