博客园 (cnblogs) has its own lucene.net search engine, and there is Google site search as well, but even Google never indexes my content completely; it picks and chooses. And even for the posts it does index, recall is poor, because the computer-related terms I search for tend to be obscure. So I have long wanted a full-text index of my own blog.
I originally wanted to build something scalable on top of rake + hbase + whenever + massive_record. Halfway through I realized the whole project would take too long, so I set it aside, dug out some old code, and patched it up just to get something usable first.
The scripts here are taken from my earlier 15-1688 small-lot wholesale search engine, where crawl-script templates were customized through a web UI; I simply reuse them as-is.
The whole crawl is split into four steps, one script per step:

A. generate the list-page links
B. fetch the list pages
C. extract the detail-page links from the list pages
D. fetch the detail pages

Here is the code.
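Before the individual scripts, here is a rough sketch of how the four steps chain together. The file names a.py through d.py are placeholders for wherever you save each script; the intermediate files (list_links.txt, lists/, extract_detail_links.txt, details/) are the ones the scripts actually read and write.

# Hypothetical driver; a.py .. d.py stand for the four scripts below.
import subprocess

subprocess.call(["python", "a.py", "1", "154"])      # writes list_links.txt
subprocess.call(["python", "b.py"])                  # fetches pages into lists/, logs lists_infos/success.txt
subprocess.call(["python", "c.py"])                  # writes extract_detail_links.txt
subprocess.call(["python", "d.py", "loop"])          # fetches details/ and retries until nothing is left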
A
#!/usr/bin/env python
#encoding=utf-8
"""
2010.10.15,v0.2
2010.10.07,v0.1
Batch-generate the list-page links.
"""
import sys,os,time

list_url_template = "http://www.cnblogs.com/lexus/default.html?page=%s"
list_url_start = 1
list_url_end = 154
list_links_file = os.path.join("./","list_links.txt")
g_step = 1

def cost_time(func):
    "decorator: print start/end time and how long the call took"
    def newFunc(*args, **args2):
        t0 = time.time()
        print "@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__)
        back = func(*args, **args2)
        print "@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__)
        print "@%.3fs taken for {%s}" % (time.time() - t0, func.__name__)
        return back
    return newFunc

@cost_time
def show(list_url_start=list_url_start,
         list_url_end=list_url_end,
         list_url_template=list_url_template):
    lines = []
    for i in xrange(list_url_start, list_url_end + 1):
        line = "%s\n" % (list_url_template % (i * g_step))
        print line.rstrip()
        lines.append(line)
    open(list_links_file, "w").writelines(lines)
    print "total count:%s" % len(lines)
    print "done!"

if __name__ == "__main__":
    l = len(sys.argv)
    if l == 1:
        show()
    elif l == 2:
        show(list_url_end=int(sys.argv[1]))
    elif l == 3:
        show(list_url_start=int(sys.argv[1]), list_url_end=int(sys.argv[2]))
    elif l == 4:
        show(list_url_start=int(sys.argv[1]), list_url_end=int(sys.argv[2]),
             list_url_template=sys.argv[3])
B
#!/usr/bin/env python
#encoding=utf-8
"""
2010.10.16,v0.3
2010.10.09,v0.2
2010.10.07,v0.1
Batch-fetch the list pages.
"""
from __future__ import with_statement
from __future__ import division

import socket as original_socket
original_socket.setdefaulttimeout(10)

from eventlet.timeout import with_timeout
from eventlet.green import urllib2   # non-blocking urllib2

import sys
import eventlet
from eventlet import pools
#httplib2 = eventlet.import_patched('httplib2')
#httppool = pools.Pool(create=lambda: httplib2.Http(timeout=90),max_size=20)
import time
import os
import os.path
import stat
import select
import shutil
import re
import gzip
import StringIO

list_list_folder = os.path.join("./","lists")
list_info_folder = os.path.join("./","lists_infos")
status_file = os.path.join("./","lists_infos/status.txt")
error_file = os.path.join("./","lists_infos/error.txt")
error_file_bak = os.path.join("./","lists_infos/error.txt.bak")
success_file = os.path.join("./","lists_infos/success.txt")
list_links_file = os.path.join("./","list_links.txt")
g_headers = {}
g_pool_num = 5

def init():
    if not os.path.exists(list_list_folder):
        os.mkdir(list_list_folder)
    if not os.path.exists(list_info_folder):
        os.mkdir(list_info_folder)
    print "init done"

def delete(src):
    '''delete files and folders recursively'''
    permission(src)
    if os.path.isfile(src):
        try:
            os.remove(src)
        except:
            pass
    elif os.path.isdir(src):
        for item in os.listdir(src):
            itemsrc = os.path.join(src,item)
            delete(itemsrc)
        try:
            os.rmdir(src)
        except:
            pass

def permission(src):
    os.chmod(src,stat.S_IRWXU|stat.S_IRWXO|stat.S_IRWXG)

def clear():
    delete(list_list_folder)
    delete(list_info_folder)
    print "reset to initial state"

def size(src):
    "report the size of a file or folder"
    r = 0L
    if os.path.isfile(src):
        r = os.path.getsize(src)
    else:
        for root, dirs, files in os.walk(src):
            r += sum([os.path.getsize(os.path.join(root, name)) for name in files])
    l = len(str(r))
    if l > 9:
        r = r/1024/1024/1024
        r = "%.2f GiB" % r
    elif l > 6:
        r = r/1024/1024
        r = "%.2f MiB" % r
    elif l > 3:
        r = r/1024
        r = "%.2f KiB" % r
    print "size of %s: %s" % (src,r)

def status(str):
    "running/stop"
    f = open(status_file,"w")
    f.write(str)
    f.close()

def error(url,ex):
    f = open(error_file,"a")
    f.write("%s\n" % (url,))
    f.close()

def success(url):
    f = open(success_file,"a")
    f.write("%s\n" % url)
    f.close()

def url2filename(url):
    import base64
    return base64.urlsafe_b64encode(url)

def url2filename2(url):
    url = url.strip()
    idx = url.rfind("/")
    r = url[idx+1:]
    if idx == -1 or len(r) == 0:
        #raise ValueError("url2filename function parser error")
        print "falling back to special url2filename"
        r = re.findall(r"\d+", url)[-1]
    return r

def cost_time(func):
    "decorator: print start/end time and how long the call took"
    def newFunc(*args, **args2):
        t0 = time.time()
        print "@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__)
        back = func(*args, **args2)
        print "@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__)
        print "@%.3fs taken for {%s}" % (time.time() - t0, func.__name__)
        return back
    return newFunc

def statistics(func):
    "decorator: print total/successed/left line counts before and after the call"
    def tongji():
        total,successed = 0,0
        if os.path.exists(list_links_file):
            total = len(set(open(list_links_file,"r").readlines()))
            print "total lines:%s" % total
        if os.path.exists(success_file):
            successed = len(set(open(success_file,"r").readlines()))
            print "successed lines:%s" % successed
        print "left lines:%s" % (total-successed)
    def newFunc(*args,**args2):
        tongji()
        back = func(*args, **args2)
        tongji()
        return back
    return newFunc

def get_html(url):
    def do(url):
        try:
            req = urllib2.Request(url = url, headers = g_headers)
            html = urllib2.urlopen(req).read()
            return html
        except Exception,e:
            print url,"error",e
            error(url,e)
            return None
    rr = with_timeout(10, do, url, timeout_value=None)
    return rr

def get_html22(url):
    "unused variant that tries to gunzip the response when the plain read fails"
    import types
    def do(url):
        html = ""
        try:
            req = urllib2.Request(url = url, headers = g_headers)
            html = urllib2.urlopen(req).read()
            if isinstance(html, types.StringTypes):
                return html
            else:
                print url,"error======"
                return ""
        except Exception,e1:
            # fall back to treating whatever was read as gzipped data
            pdata = StringIO.StringIO(html)
            gzipper = gzip.GzipFile(fileobj = pdata)
            try:
                html = gzipper.read()
                return html
            except Exception,e2:
                print url,e1,e2
                error(url,e1)
                return ""
    rr = with_timeout(10, do, url, timeout_value="")
    return rr

def get_html2(url):
    "gzipped pages make this variant fail; it also needs the httplib2 pool commented out above"
    with httppool.item() as http:
        #eventlet.sleep(0)
        resp, content = http.request(url)
        print content
        return content

def save_html2file(filename,html):
    f = open(filename,"w")
    f.write(html)
    f.close()

def save_url2file(url):
    html = get_html(url)
    if html is not None and html <> "":
        filename = os.path.join(list_list_folder,url2filename(url))
        save_html2file(filename,html)
        if os.path.getsize(filename) < 1024*20:
            error(url,"size less than %s" % (1024*20))
            print url,"error"
            return
        success(url)  # only successes are recorded; everything else is failed or not yet run
        print url,"success"
    else:
        print url,"error"
        error(url,"html is None or empty")

@cost_time
@statistics
def batch_get_html(urls):
    print "batch-downloading pages"
    pool = eventlet.GreenPool(g_pool_num)
    for url in urls:
        pool.spawn_n(save_url2file,url)
    pool.waitall()
    print "done!"

def process_continue():
    "resume: fetch whatever is not in the success file yet"
    # diff the full link list against the finished part
    done = set(open(success_file,"r").read().split("\n"))
    all = set(open(list_links_file,"r").read().split("\n"))
    left = all - done
    batch_get_html(left)

if __name__ == "__main__":
    init()
    l = len(sys.argv)
    if l == 1:
        content = ""
        if not select.select([sys.stdin,],[],[],0.0)[0]:
            print "load from %s" % list_links_file
            content = open(list_links_file,"r").read()
        else:
            print "load from stdin"
            content = sys.stdin.read()
        urls = content.strip().split("\n")
        batch_get_html(urls)
        size(list_list_folder)
    elif l == 2:
        argv = sys.argv[1]
        if argv == "clear":
            clear()
        if argv == "continue":
            process_continue()
    elif l == 3:
        argv = sys.argv[1]
        if argv == "load":
            url = sys.argv[2]
            print url
            save_url2file(url)
            print "done!"
C
#!/usr/bin/env python
#encoding=utf-8
"""
2010.10.16,v0.22
2010.10.11,v0.21
2010.10.09,v0.2
2010.10.07,v0.1
Extract the detail-page links (and originally the thumbnail links) from the fetched list pages.
"""
import sys
import re
import os.path

list_list_folder = os.path.join("./","lists")
success_file = os.path.join("./","lists_infos/success.txt")
detail_links_file = os.path.join("./","extract_detail_links.txt")

# NOTE: the original pattern was partially eaten by the blog's HTML escaping;
# this is a reconstruction. It matches <a href="..."> tags pointing at archive
# posts and captures the URL as group "link" (the "title" group is unused).
g_pattern = r"""<a[\s\S]*?href=(?P<quote>["'])(?P<link>http://www.cnblogs.com/lexus/archive/\d{4}/\d{1,2}/\d{1,2}/\d{1,}\.html)\1[\s\S]*?>(?P<title>[\s\S]*?)<[\s\S]*?/[\s\S]*?a[\s\S]*?> """
# the raw string carries one padding character at the end: escape a trailing
# quote if there is one, otherwise just drop the padding
if g_pattern[-2] == '"':
    g_pattern = g_pattern[:-2]+'\\"'
else:
    g_pattern = g_pattern[:-1]

def url2filename(url):
    import base64
    return base64.urlsafe_b64encode(url)

def url2filename2(url):
    url = url.strip()
    idx = url.rfind("/")
    r = url[idx+1:]
    if idx == -1 or len(r) == 0:
        #raise ValueError("url2filename function parser error")
        print "falling back to special url2filename"
        r = re.findall(r"\d+", url)[-1]
    return r

def delete(src):
    '''delete files and folders recursively'''
    if os.path.isfile(src):
        try:
            os.remove(src)
            print "deleted file %s" % src
        except:
            pass
    elif os.path.isdir(src):
        for item in os.listdir(src):
            itemsrc = os.path.join(src,item)
            delete(itemsrc)
        try:
            os.rmdir(src)
            print "deleted folder %s" % src
        except:
            pass

def clear():
    delete(detail_links_file)

def extract_detail_link(url):
    "run the pattern over the saved list page for this url and return 'link,' lines"
    lines = []
    regex = re.compile(g_pattern)
    file = os.path.join(list_list_folder,url2filename(url))
    subject = open(file,"r").read()
    for match in regex.finditer(subject):
        line = "%s,\n" % (match.group("link").replace("&amp;","&"),)
        lines.append(line)
    return lines

def batch_extract_detail_links():
    f = open(detail_links_file,"w")
    urls = open(success_file,"r").read().strip().split("\n")
    total = []
    for url in urls:
        lines = extract_detail_link(url)
        total.extend(lines)
        print "%s,%s" % (url,len(lines))
    s = set(total)
    f.writelines(s)
    f.close()
    print "done!"
    print "repeat count:%s" % (len(total)-len(s))
    print "total lines:%s" % len(s)

if __name__ == "__main__":
    l = len(sys.argv)
    if l == 1:
        batch_extract_detail_links()
    elif l == 2:
        if sys.argv[1] == "clear":
            clear()
D
#!/usr/bin/env python
#encoding=utf-8
"""
2010.10.16,v0.13
2010.10.15,v0.12
2010.10.13,v0.11
2010.10.07,v0.1
Batch-fetch the detail pages.
"""
from __future__ import with_statement
from __future__ import division

import socket as original_socket
original_socket.setdefaulttimeout(10)

from eventlet.timeout import with_timeout
from eventlet.green import urllib2   # non-blocking urllib2
from urlparse import urljoin

import sys
import eventlet
from eventlet import pools
#httplib2 = eventlet.import_patched('httplib2')
#httppool = pools.Pool(create=lambda: httplib2.Http(timeout=90),max_size=20)
import time
import os
import os.path
import stat
import select
import re   # needed by url2filename2

g_host = "http://www.cnblogs.com/lexus"
g_data_folder = os.path.join("./","details")
g_info_folder = os.path.join("./","details_infos")
g_status_file = os.path.join("./","details_infos/status.txt")
g_error_file = os.path.join("./","details_infos/error.txt")
g_success_file = os.path.join("./","details_infos/success.txt")
g_result_links_file = os.path.join("./","extract_detail_links.txt")
g_pool_num = 1

# raw request headers can be pasted into the triple-quoted string below,
# one "Name: value" pair per line
g_headers = {}
headers = """"""
headers = headers.strip().replace("\r\n","\n")
if headers <> "":
    for elem in headers.split("\n"):
        if elem.strip() == "":
            continue
        a,b = elem.split(":",1)
        a = a.strip()
        b = b.strip()
        g_headers[a] = b

def init():
    if not os.path.exists(g_data_folder):
        os.mkdir(g_data_folder)
    if not os.path.exists(g_info_folder):
        os.mkdir(g_info_folder)
    print "init done"

def delete(src):
    '''delete files and folders recursively'''
    permission(src)
    if os.path.isfile(src):
        try:
            os.remove(src)
        except:
            pass
    elif os.path.isdir(src):
        for item in os.listdir(src):
            itemsrc = os.path.join(src,item)
            delete(itemsrc)
        try:
            os.rmdir(src)
        except:
            pass

def permission(src):
    os.chmod(src,stat.S_IRWXU|stat.S_IRWXO|stat.S_IRWXG)

def clear():
    delete(g_data_folder)
    delete(g_info_folder)
    print "reset to initial state"

def size(src):
    "report the size of a file or folder"
    r = 0L
    if os.path.isfile(src):
        r = os.path.getsize(src)
    else:
        for root, dirs, files in os.walk(src):
            r += sum([os.path.getsize(os.path.join(root, name)) for name in files])
    l = len(str(r))
    if l > 9:
        r = r/1024/1024/1024
        r = "%.2f GiB" % r
    elif l > 6:
        r = r/1024/1024
        r = "%.2f MiB" % r
    elif l > 3:
        r = r/1024
        r = "%.2f KiB" % r
    print "size of %s: %s" % (src,r)

def status(str):
    "running/stop"
    f = open(g_status_file,"w")
    f.write(str)
    f.close()

def error(url,ex):
    f = open(g_error_file,"a")
    f.write("%s\n" % (url,))
    f.close()

def success(url):
    f = open(g_success_file,"a")
    f.write("%s\n" % url)
    f.close()

def url2filename(url):
    import base64
    return base64.urlsafe_b64encode(url)

def url2filename2(url):
    url = url.strip()
    idx = url.rfind("/")
    r = url[idx+1:]
    if idx == -1 or len(r) == 0:
        #raise ValueError("url2filename function parser error")
        print "falling back to special url2filename"
        r = re.findall(r"\d+", url)[-1]
    return r

def statistics(func):
    "decorator: print total/successed/left line counts before and after the call"
    def tongji():
        total,successed = 0,0
        if os.path.exists(g_result_links_file):
            total = len(set(open(g_result_links_file,"r").readlines()))
            print "total lines:%s" % total
        if os.path.exists(g_success_file):
            successed = len(set(open(g_success_file,"r").readlines()))
            print "successed lines:%s" % successed
        print "left lines:%s" % (total-successed)
    def newFunc(*args,**args2):
        tongji()
        back = func(*args, **args2)
        tongji()
        return back
    return newFunc

def cost_time(func):
    "decorator: print start/end time and how long the call took"
    def newFunc(*args, **args2):
        t0 = time.time()
        print "@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__)
        back = func(*args, **args2)
        print "@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__)
        print "@%.3fs taken for {%s}" % (time.time() - t0, func.__name__)
        return back
    return newFunc

def get_html(url):
    def do(url):
        try:
            req = urllib2.Request(url = url, headers = g_headers)
            html = urllib2.urlopen(req).read()
            return html
        except Exception,e:
            print url,"error",e
            error(url,e)
            return None
    rr = with_timeout(10, do, url, timeout_value=None)
    return rr

def get_html2(url):
    "unused variant; needs the httplib2 pool commented out above"
    with httppool.item() as http:
        #eventlet.sleep(0)
        resp, content = http.request(url,'GET',headers=g_headers)
        #resp, content = http.request(url)
        return content

def save_html2file(filename,html):
    f = open(filename,"w")
    f.write(html)
    f.close()

def save_url2file(url):
    # expects a "link," line as written by the extraction script
    a,b = url.strip().split(",")
    if not a.startswith("http://"):
        a = urljoin(g_host,a)
    html = get_html(a)
    if html is not None and html <> "":
        filename = os.path.join(g_data_folder,url2filename(a))
        save_html2file(filename,html)
        if os.path.getsize(filename) < 1024*10:
            error(url,"size less than %s" % (1024*10))
            print url,"error"
            return
        success(url)  # only successes are recorded; everything else is failed or not yet run
        print url,"success"
    else:
        print url,"error"
        error(url,"html is None or empty")

def save_url2file2(url):
    "unused variant with its own try/except around the fetch"
    a,b = url.strip().split(",")
    if not a.startswith("http://"):
        a = urljoin(g_host,a)
    html = ""
    try:
        html = get_html(a)
    except Exception,e:
        print url,e,"fetch error"
        error(url,e)
        return
    if html <> "":
        filename = os.path.join(g_data_folder,url2filename(a))
        save_html2file(filename,html)
        if os.path.getsize(filename) < 1024*10:
            error(url,"size less than %s" % (1024*10))
            print url,"error"
            return
        success(url)
        print url,"success"

@cost_time
@statistics
def batch_get_html(urls):
    print "batch-downloading pages"
    pool = eventlet.GreenPool(g_pool_num)
    for url in urls:
        pool.spawn_n(save_url2file,url)
    pool.waitall()
    size(g_data_folder)
    print "done!"

def count():
    total,successed = set(),set()
    if os.path.exists(g_success_file):
        successed = set(open(g_success_file,"r").read().strip().split("\n"))
    if os.path.exists(g_result_links_file):
        total = set(open(g_result_links_file,"r").read().strip().split("\n"))
    left = total - successed
    return total,successed,left

def process_continue():
    "resume: fetch whatever is not in the success file yet"
    total,successed,left = count()
    batch_get_html(left)

def process_forever():
    "loop until everything has been fetched"
    total,successed,left = count()
    print "left:%s" % len(left)
    while len(left) > 0:
        print "some pages are still unfetched, looping again"
        process_continue()
        total,successed,left = count()

if __name__ == "__main__":
    init()
    l = len(sys.argv)
    if l == 1:
        content = ""
        if not select.select([sys.stdin,],[],[],0.0)[0]:
            print "load from %s" % g_result_links_file
            content = open(g_result_links_file,"r").read()
        else:
            print "load from stdin"
            content = sys.stdin.read()
        urls = content.strip().split("\n")
        batch_get_html(urls)
    elif l == 2:
        argv = sys.argv[1]
        if argv == "clear":
            clear()
        if argv == "continue":
            process_continue()
        if argv == "loop":
            process_forever()
    elif l == 3:
        if sys.argv[1] == "load":
            url = sys.argv[2]   # must be a "link," line, like those in extract_detail_links.txt
            save_url2file(url)
            print "done!"
The code blocks are marked up with pre tags, so viewed directly in the browser the lines may run together. I wanted to use a syntax highlighter, but my blog client doesn't have that feature, so never mind. If you want to copy the code, just view the page source and find the pre tags; the line breaks are all there and the scripts work without modification.
Over the past few years I have written roughly 3060 posts, about three quarters of them probably reposts with some annotations of my own. Crawling all of it comes to about 265 MB.
The scripts use coroutines to speed up the downloads.
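In case it gets buried in the long listings, this is the concurrency pattern all of the fetch scripts share, distilled into a few lines: eventlet's green (non-blocking) urllib2 plus a GreenPool that caps the number of simultaneous downloads. The fetch() helper and the pool size here are just for illustration.

import eventlet
from eventlet.green import urllib2   # cooperative, non-blocking drop-in for urllib2

def fetch(url):
    url = url.strip()
    try:
        return url, urllib2.urlopen(url).read()
    except Exception, e:
        return url, None

pool = eventlet.GreenPool(5)          # at most 5 downloads in flight at once
for url, body in pool.imap(fetch, open("list_links.txt")):
    print url, "ok" if body is not None else "failed"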
There used to be an overarching fabric script that pushed everything to a remote server, did the parsing there to take advantage of the server's I/O, and shipped the results back to my local machine, but I can't find it; it probably got lost in my last machine crash. I'll keep looking. I'm in the habit of keeping backups, so it should turn up.
That's it for now. In the next post I'll describe how to index the data.
PS: I have no idea how 博客园 computes its ranking points. I've written quite a few posts, yet I never make the front-page ranking; I'm always stuck somewhere around 470th.
One problem with this crawler is that it cannot discover new links or notice content changes, which is exactly why I wanted the hbase + whenever approach. For now, though, this run is just a backup; I'll keep a local copy of the data first.
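Until that happens, the same scripts can at least pick up newly published posts by brute force: re-run steps A through C and then let D's continue mode download only the detail pages that are not yet in its success list. A sketch, using the same placeholder file names as above:

import subprocess

subprocess.call(["python", "a.py"])              # regenerate list_links.txt
subprocess.call(["python", "b.py", "clear"])     # drop the cached list pages so new posts show up
subprocess.call(["python", "b.py"])              # refetch the list pages
subprocess.call(["python", "c.py"])              # rebuild extract_detail_links.txt
subprocess.call(["python", "d.py", "continue"])  # fetch only the not-yet-successful detail pages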