zoukankan      html  css  js  c++  java
  • 构建自己的博客专用搜索引擎--抓数据

    博客园有自己的lucene.net搜索引擎,还有google的站内搜索,不过即使是google搜索,也不会完全索引我的内容,它也挑三捡四的,即使索引了,由于计算机方面的搜索的词典的冷门性,也没办法有很高的查全率。所以我一直希望做一个自己的博客的全文索引。

    本来想搞一个能用的基于rake+hbase+whenever+massive_record的方案来实现可扩展,做了一半,感觉整个工程周期太长,还是放了一旁,拿起以前的代码,改吧改吧先能用起来再说

    我使用的是以前15-1688小额批发搜索引擎的部分脚本,之前使用web ui的方式来定制抓取的脚本模板,这里就直接拿来用了。

    整个抓取数据的过程分为4步,对应4个脚本:

    A生成列表页链接

    B抓取列表页

    C抽取详细页链接

    D抓取详细页

    我就直接上代码了

    A

    #!/usr/bin/env python
    #encoding=utf-8
    """
    2010.10.15,v0.2
    2010.10.07,v0.1
    批量生成列表页链接
    """
    import sys,os,time
    # Default URL template for one list page; it has a single "%s" placeholder
    # and is the default value of show()'s list_url_template parameter.
    list_url_template = "http://www.cnblogs.com/lexus/default.html?page=%s"
    # Default first and last page index (the range is inclusive on both ends).
    list_url_start    = 1
    list_url_end      = 154
    # Output file that receives the generated URLs, one per line.
    list_links_file   = os.path.join("./","list_links.txt")
    # Multiplier applied to every page index before it is put into the URL.
    g_step=1
    
    def cost_time(func):
        """Decorator that prints start/end timestamps and the elapsed time of *func*.

        The wrapper forwards all positional and keyword arguments unchanged and
        returns whatever *func* returns.  If *func* raises, the exception
        propagates and the "end"/"taken" lines are not printed.
        """
        import functools  # local import so the block does not rely on file-level imports

        @functools.wraps(func)  # keep func.__name__ / __doc__ visible on the wrapper
        def newFunc(*args, **args2):
            t0 = time.time()
            print("@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__))
            back = func(*args, **args2)
            print("@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__))
            print("@%.3fs taken for {%s}" % (time.time() - t0, func.__name__))
            return back
        return newFunc
    
    @cost_time
    def show(list_url_start=list_url_start,\
             list_url_end=list_url_end,\
             list_url_template=list_url_template):
        """Generate the list-page URLs and write them to list_links_file.

        list_url_start    -- first page index (inclusive)
        list_url_end      -- last page index (inclusive)
        list_url_template -- URL template with one "%s" placeholder; it is
                             filled with the page index multiplied by g_step

        Every URL is printed, then all of them are written one per line
        (the file is overwritten), followed by a summary of the count.
        """
        lines=[]
        for i in xrange(list_url_start,list_url_end+1):
            # Honour the template argument; previously the URL was hard-coded
            # here, so the third command-line argument had no effect.
            line=list_url_template%(i*g_step)+"\n"
            print(line.rstrip())
            lines.append(line)
        f=open(list_links_file,"w")
        try:
            f.writelines(lines)
        finally:
            f.close()  # do not leave the handle to the garbage collector
        print("total count:%s"%len(lines))
        print("done!")
    
    #import os.path
    #print os.path.abspath(".")
    if __name__=="__main__":
        # Accepted command lines: no argument | END | START END | START END TEMPLATE.
        # Any other number of arguments does nothing.
        argc=len(sys.argv)
        if 1<=argc<=4:
            options={}
            if argc==2:
                options["list_url_end"]=int(sys.argv[1])
            elif argc>=3:
                options["list_url_start"]=int(sys.argv[1])
                options["list_url_end"]=int(sys.argv[2])
                if argc==4:
                    options["list_url_template"]=sys.argv[3]
            show(**options)
    

    B

    #!/usr/bin/env python
    #encoding=utf-8
    """
    2010.10.16,v0.3
    2010.10.09,v0.2
    2010.10.07,v0.1
    批量抓取列表页
    """
    from __future__ import with_statement
    from __future__ import division
    
    import socket as original_socket
    original_socket.setdefaulttimeout(10)
    from eventlet.timeout import with_timeout
    from eventlet.green import urllib2
    
    import sys
    ####reload(sys)
    ####sys.setdefaultencoding('utf-8')
    
    import eventlet
    from eventlet import pools
    #httplib2 = eventlet.import_patched('httplib2')
    #httppool = pools.Pool(create=lambda: httplib2.Http(timeout=90),max_size=20)
    
    import time
    
    import os
    
    import os.path
    
    import stat
    
    import select
    
    import shutil
    
    import re
    
    import gzip
    import StringIO
    
    # Folder that receives the downloaded list pages.
    list_list_folder    = os.path.join("./","lists")
    # Folder that holds the bookkeeping files below.
    list_info_folder    = os.path.join("./","lists_infos")
    # Written by status().
    status_file         = os.path.join("./","lists_infos/status.txt")
    # One failed URL per line, appended by error().
    error_file          = os.path.join("./","lists_infos/error.txt")
    # NOTE(review): not referenced anywhere in the visible part of this script.
    error_file_bak      = os.path.join("./","lists_infos/error.txt.bak")
    # One successfully saved URL per line, appended by success().
    success_file        = os.path.join("./","lists_infos/success.txt")
    # Input: the list-page URLs to download, one per line.
    list_links_file     = os.path.join("./","list_links.txt")
    # Extra HTTP headers passed to urllib2.Request for every fetch.
    g_headers={}
    # Size of the eventlet GreenPool used by batch_get_html().
    g_pool_num          = 5
    
    def init():
        """Create the download folder and the bookkeeping folder if missing."""
        for folder in (list_list_folder,list_info_folder):
            if not os.path.exists(folder):
                os.mkdir(folder)
        print("完成初始化")
    
    def delete(src):
        """Recursively delete the file or directory tree at *src* (best effort).

        A path that does not exist is ignored.  Permissions are opened up with
        permission() first so read-only entries can be removed.  Individual
        OS-level failures are skipped so the rest of the tree is still
        processed; nothing is returned.
        """
        if not os.path.exists(src):
            return  # nothing to do; also keeps chmod from failing on a missing path
        permission(src)
        if os.path.isfile(src):
            try:
                os.remove(src)
            except OSError:
                pass  # best effort: leave what cannot be removed
        elif os.path.isdir(src):
            for item in os.listdir(src):
                delete(os.path.join(src,item))
            try:
                os.rmdir(src)
            except OSError:
                pass  # e.g. directory not empty because a child could not be removed
    
    def permission(src):
        """Grant read/write/execute on *src* to owner, group and others."""
        full_access=stat.S_IRWXU|stat.S_IRWXG|stat.S_IRWXO
        os.chmod(src,full_access)
    
    def clear():
        """Remove the download folder and the bookkeeping folder, then report it."""
        for folder in (list_list_folder,list_info_folder):
            delete(folder)
        print("还原为初始")
    
    def size(src):
        """Print the size of the file or directory tree *src*.

        The unit is picked from the number of decimal digits of the byte count
        (more than 9 -> GiB, more than 6 -> MiB, more than 3 -> KiB, otherwise
        the plain byte count).  The divisions rely on true division, which this
        script enables with `from __future__ import division`.
        """
        if os.path.isfile(src):
            total=os.path.getsize(src)
        else:
            total=0
            for root, dirs, files in os.walk(src):
                for name in files:
                    total+=os.path.getsize(os.path.join(root, name))
        digits=len(str(total))
        if digits>9:
            shown="%.2f GiB"%(total/1024/1024/1024)
        elif digits>6:
            shown="%.2f MiB"%(total/1024/1024)
        elif digits>3:
            shown="%.2f KiB"%(total/1024)
        else:
            shown=total
        print("%s 大小为:%s"%(src,shown))
    
    def status(str):
        """Overwrite the status file with *str* (the original note lists running/stop)."""
        with open(status_file,"w") as out:
            out.write(str)
    
    def error(url,ex):
        """Append *url* as one line to the error file; *ex* is accepted but not recorded."""
        with open(error_file,"a") as log:
            log.write("%s\n"%(url,))
    
    def success(url):
        """Append *url* as one line to the success file."""
        with open(success_file,"a") as log:
            log.write("%s\n"%url)
    
    def url2filename(url):
        """Return the URL-safe base64 encoding of *url*, used as the local file name."""
        from base64 import urlsafe_b64encode
        return urlsafe_b64encode(url)
    
    def url2filename2(url):
        """Derive a file name from *url*: the text after its last "/".

        When the URL has no "/" or ends with one, a notice is printed and the
        last run of digits in the URL is used instead (IndexError if the URL
        contains no digit).
        """
        head,slash,tail=url.strip().rpartition("/")
        if slash and tail:
            return tail
        print("启用特殊url2filename")
        return re.findall(r"\d+", url.strip())[-1]
    
    def cost_time(func):
        """Decorator that prints start/end timestamps and the elapsed time of *func*.

        Arguments are forwarded unchanged and the result of *func* is returned.
        If *func* raises, the exception propagates and no "end" line is printed.
        """
        import functools  # local import so the block does not rely on file-level imports

        @functools.wraps(func)  # expose the wrapped function's name and docstring
        def newFunc(*args, **args2):
            t0 = time.time()
            print("@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__))
            back = func(*args, **args2)
            print("@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__))
            print("@%.3fs taken for {%s}" % (time.time() - t0, func.__name__))
            return back
        return newFunc
    
    def statistics(func):
        """Decorator that prints progress counters before and after *func* runs.

        The counters are the number of distinct lines in list_links_file
        (total) and in success_file (succeeded) plus their difference; a
        missing file counts as 0.  The result of *func* is returned unchanged.
        """
        import functools  # local import so the block does not rely on file-level imports

        def count_distinct_lines(path):
            # Close the file deterministically instead of leaking the handle.
            with open(path,"r") as f:
                return len(set(f.readlines()))

        def tongji():
            total,successed=0,0
            if os.path.exists(list_links_file):
                total=count_distinct_lines(list_links_file)
                print("total lines:%s"%total)
            if os.path.exists(success_file):
                successed=count_distinct_lines(success_file)
                print("successed lines:%s"%successed)
            print("left lines:%s"%(total-successed))

        # Without wraps an outer decorator that prints func.__name__ would
        # report "newFunc" instead of the decorated function's real name.
        @functools.wraps(func)
        def newFunc(*args,**args2):
            tongji()
            back = func(*args, **args2)
            tongji()
            return back
        return newFunc
    
    def get_html(url):
        def do(url):
            html=""
            try:
                req = urllib2.Request(url = url,headers = g_headers)
                html = urllib2.urlopen(req).read()
                return html
            except Exception,e:
                print url,"error",e
                error(url,e)
                return None
        rr = with_timeout(10, do, url, timeout_value=None)
        return rr
    
    def get_html22(url):
        import types
        def do(url):
            html=""
            try:
                req = urllib2.Request(url = url,headers = g_headers)
                html = urllib2.urlopen(req).read()
                t=type(html)
                if t==types.StringTypes or t==types.UnicodeType:
                    return html
                else:
                    print url,"error======"
                    return ""
            except Exception,e1:
                pdata = StringIO.StringIO(rr)#下面6行是实现解压缩
                gzipper = gzip.GzipFile(fileobj = pdata)
                try:
                    html = gzipper.read()
                    return html
                except Exception,e2:
                    print url,e1,e2
                    error(url,e1)
                return ""
        rr = with_timeout(10, do, url, timeout_value="")
        return rr
    
    def get_html2(url):
        """Fetch *url* through a client taken from `httppool` and return the body.

        Original note: when use gzipped page will get fetch error.

        NOTE(review): `httppool` is only created in a commented-out line near
        the imports of this script (presumably an httplib2 pool), so as the
        file stands calling this function raises NameError.
        """
        #print url
        with httppool.item() as http:
            #eventlet.sleep(0)
            resp, content = http.request(url)
            # Prints the whole response body before returning it.
            print content
            return content
    
    def save_html2file(filename,html):
        """Write *html* to *filename*, replacing any existing content."""
        with open(filename,"w") as out:
            out.write(html)
    
    def save_url2file(url):
        #html=""
        #try:
        #    html=get_html(url)
        #except Exception,e:
        #    print url,"fetch error",e
        #    error(url,e)
        #    return
        html=get_html(url)
        if html is not None and html<>"":
            filename=os.path.join(list_list_folder,url2filename(url))
            save_html2file(filename,html)
            if os.path.getsize(filename)<1024*20:
                error(url,"size小于%s"%(1024*20))
                print url,"error"
                return
            success(url)#以成功的为基准,剩下的都是不成功的或未执行的
            print url,"success"
        else:
            print url,"error"
            error(url,"html为None或为空")
    
    @cost_time
    @statistics
    def batch_get_html(urls):
        """Download every URL in *urls* through a green pool of g_pool_num workers."""
        print("执行批量下载网页工作")
        workers=eventlet.GreenPool(g_pool_num)
        for target in urls:
            workers.spawn_n(save_url2file,target)
        # Block until every spawned download has finished.
        workers.waitall()
        print("done!")
    
    def process_continue():
        """Resume a run: download the listed URLs that are not yet in the success file.

        The work list is the set difference between the non-blank lines of
        list_links_file and those of success_file.  A missing success file
        means nothing has succeeded yet.
        """
        def read_lines(path):
            # Blank lines (e.g. from the trailing newline) are not URLs.
            with open(path,"r") as f:
                return set(line for line in f.read().split("\n") if line.strip())

        done=set()
        if os.path.exists(success_file):
            done=read_lines(success_file)
        left=read_lines(list_links_file)-done
        batch_get_html(left)
    
    if __name__=="__main__":
        # Command line:
        #   (no argument)   download the URLs from stdin if data is waiting
        #                   there, otherwise from list_links_file
        #   clear           delete the working folders
        #   continue        download only what is not in the success file
        #   load URL        download a single URL
        init()
        argc=len(sys.argv)
        if argc==1:
            # Zero-timeout select: is there already something to read on stdin?
            stdin_ready=select.select([sys.stdin,],[],[],0.0)[0]
            if stdin_ready:
                print("load from stdin")
                content=sys.stdin.read()
            else:
                print("load from %s"%list_links_file)
                content=open(list_links_file,"r").read()
            batch_get_html(content.strip().split("\n"))
            size(list_list_folder)
        elif argc==2:
            command=sys.argv[1]
            if command=="clear":
                clear()
            elif command=="continue":
                process_continue()
        elif argc==3 and sys.argv[1]=="load":
            url=sys.argv[2]
            print(url)
            save_url2file(url)
        print("done!")
    

    C

    #!/usr/bin/env python
    #encoding=utf-8
    """
    2010.10.16,v0.22
    2010.10.11,v0.21
    2010.10.09,v0.2
    2010.10.07,v0.1
    从列表页抽取详细页的链接和缩略图链接的脚本
    """
    import sys
    import re
    import os.path
    
    # Folder holding the downloaded list pages (file name = url2filename(url)).
    list_list_folder      = os.path.join("./","lists")
    # URLs of the list pages that were downloaded successfully, one per line.
    success_file        = os.path.join("./","lists_infos/success.txt")
    # Output file: one "link," entry per line.
    detail_links_file   = os.path.join("./","extract_detail_links.txt")

    # NOTE(review): the published copy of this script had everything that looks
    # like an HTML tag stripped out of both patterns below.  The leading
    # '<a[\s\S]*?href=(["'])(?P<link>' part and the group name "title" are
    # reconstructed from what survived (the \1 back-reference and the
    # group("link") lookup) -- confirm against the original script.
    #g_pattern=r"""<a[\s\S]*?href=(["'])(?P<link>[^"]*?)\1[\s\S]*?<img[\s\S]*?src=(["'])(?P<img>[^"]*?)\3 """
    g_pattern=r"""<a[\s\S]*?href=(["'])(?P<link>http://www.cnblogs.com/lexus/archive/\d{4}/\d{1,2}/\d{1,2}/\d{1,}\.html)\1[\s\S]*?>(?P<title>[\s\S]*?)<[\s\S]*?/[\s\S]*?a[\s\S]*?>"""
    # Template post-processing kept as published: a pattern ending in '"' plus
    # one padding character gets that ending replaced by an escaped quote,
    # otherwise the last character is dropped.  With the literal above this
    # removes the final ">", which does not change what the pattern matches
    # up to the link.
    if g_pattern[-2]=='"':
        g_pattern=g_pattern[:-2]+'\\"'
    else:
        g_pattern=g_pattern[:-1]

    def url2filename(url):
        """Return the URL-safe base64 encoding of *url*, used as the local file name."""
        import base64
        return base64.urlsafe_b64encode(url)

    def url2filename2(url):
        """Return the text after the last "/" of *url*.

        Falls back to the last run of digits in the URL when there is no "/"
        or nothing follows it.
        """
        url=url.strip()
        idx=url.rfind("/")
        r=url[idx+1:]
        if idx==-1 or len(r)==0:
            print("启用特殊url2filename")
            r = re.findall(r"\d+", url)[-1]
        return r

    def delete(src):
        """Recursively delete the file or directory *src* (best effort), reporting each removal."""
        if os.path.isfile(src):
            try:
                os.remove(src)
            except OSError:
                pass  # best effort: leave what cannot be removed
            else:
                print("删除文件%s"%src)
        elif os.path.isdir(src):
            for item in os.listdir(src):
                delete(os.path.join(src,item))
            try:
                os.rmdir(src)
            except OSError:
                pass  # e.g. directory not empty
            else:
                print("删除文件夹%s"%src)

    def clear():
        """Delete the extracted-links output file."""
        delete(detail_links_file)

    def extract_detail_link(url):
        """Return one "link,\\n" line per g_pattern match in the saved list page of *url*."""
        regex=re.compile(g_pattern)
        page_file=os.path.join(list_list_folder,url2filename(url))
        f=open(page_file,"r")
        try:
            subject=f.read()
        finally:
            f.close()
        lines=[]
        for match in regex.finditer(subject):
            # NOTE(review): the published copy showed replace("&","&"), a no-op
            # that is presumably the HTML-decoded form of replace("&amp;","&").
            #line="%s,%s\n"%(match.group("link").replace("&amp;","&"),match.group("img").replace("http:/www","http://www").replace(","," "))
            line="%s,\n"%(match.group("link").replace("&amp;","&"),)
            lines.append(line)
        return lines

    def batch_extract_detail_links():
        """Extract the detail links of every successfully saved list page.

        Reads the URLs from success_file, collects the links of each page,
        removes duplicates and writes the result to detail_links_file in
        sorted order, so repeated runs produce the same file.
        """
        # Read the input first so a missing success file cannot leave behind
        # a truncated output file.
        src=open(success_file,"r")
        try:
            urls=[u for u in src.read().strip().split("\n") if u.strip()]
        finally:
            src.close()
        total=[]
        for url in urls:
            lines=extract_detail_link(url)
            total.extend(lines)
            print("%s,%s"%(url,len(lines)))
        s=set(total)
        f=open(detail_links_file,"w")
        try:
            f.writelines(sorted(s))
        finally:
            f.close()
        print("done!")
        print("repeat count:%s"%(len(total)-len(s)))
        print("total lines:%s"%len(s))

    if __name__=="__main__":
        l=len(sys.argv)
        if l==1:
            batch_extract_detail_links()
        elif l==2:
            if sys.argv[1]=="clear":
                clear()

    D

    #!/usr/bin/env python
    #encoding=utf-8
    """
    2010.10.16,v0.13
    2010.10.15,v0.12
    2010.10.13,v0.11
    2010.10.07,v0.1
    批量抓取详细页
    """
    from __future__ import with_statement
    from __future__ import division
    
    import socket as original_socket
    original_socket.setdefaulttimeout(10)
    from eventlet.timeout import with_timeout
    from eventlet.green import urllib2
    
    from urlparse import urljoin
    import sys
    ####reload(sys)
    ####sys.setdefaultencoding('utf-8')
    
    import eventlet
    from eventlet import pools
    #httplib2 = eventlet.import_patched('httplib2')
    #httppool = pools.Pool(create=lambda: httplib2.Http(timeout=90),max_size=20)
    
    import time
    
    import os
    
    import os.path
    
    import stat
    
    import select
    
    # Base URL that relative detail links are joined onto.
    g_host                  = "http://www.cnblogs.com/lexus"
    # Folder for the downloaded detail pages.
    g_data_folder           = os.path.join("./","details")
    # Folder for the bookkeeping files (status / error / success).
    g_info_folder           = os.path.join("./","details_infos")
    g_status_file           = os.path.join("./","details_infos/status.txt")
    g_error_file            = os.path.join("./","details_infos/error.txt")
    g_success_file          = os.path.join("./","details_infos/success.txt")
    # Input: the detail links to download, one "link,..." entry per line.
    g_result_links_file     = os.path.join("./","extract_detail_links.txt")
    # Number of concurrent green threads used for downloading.
    g_pool_num              = 1
    # Request headers sent with every fetch.  Raw "Name: value" lines can be
    # pasted between the triple quotes; they are parsed into g_headers below.
    g_headers={}
    headers                 = """"""
    headers                 = headers.strip().replace("\r\n","\n")
    if headers:
        for elem in headers.split("\n"):
            if elem.strip()=="":
                continue
            # Split on the first colon only, so values may contain ":".
            a,b=elem.split(":",1)
            a,b=a.strip(),b.strip()
            g_headers[a]=b
    
    def init():
        """Create the data folder and the bookkeeping folder if missing."""
        for folder in (g_data_folder,g_info_folder):
            if not os.path.exists(folder):
                os.mkdir(folder)
        print("完成初始化")
    
    def delete(src):
        """Recursively delete the file or directory tree at *src* (best effort).

        A path that does not exist is ignored.  Permissions are opened up with
        permission() first so read-only entries can be removed.  Individual
        OS-level failures are skipped so the rest of the tree is still
        processed; nothing is returned.
        """
        if not os.path.exists(src):
            return  # nothing to do; also keeps chmod from failing on a missing path
        permission(src)
        if os.path.isfile(src):
            try:
                os.remove(src)
            except OSError:
                pass  # best effort: leave what cannot be removed
        elif os.path.isdir(src):
            for item in os.listdir(src):
                delete(os.path.join(src,item))
            try:
                os.rmdir(src)
            except OSError:
                pass  # e.g. directory not empty because a child could not be removed
    
    def permission(src):
        """Grant read/write/execute on *src* to owner, group and others."""
        full_access=stat.S_IRWXU|stat.S_IRWXG|stat.S_IRWXO
        os.chmod(src,full_access)
    
    def clear():
        """Remove the data folder and the bookkeeping folder, then report it."""
        for folder in (g_data_folder,g_info_folder):
            delete(folder)
        print("还原为初始")
    
    def size(src):
        """Print the size of the file or directory tree *src*.

        The unit is picked from the number of decimal digits of the byte count
        (more than 9 -> GiB, more than 6 -> MiB, more than 3 -> KiB, otherwise
        the plain byte count).  The divisions rely on true division, which this
        script enables with `from __future__ import division`.
        """
        if os.path.isfile(src):
            total=os.path.getsize(src)
        else:
            total=0
            for root, dirs, files in os.walk(src):
                for name in files:
                    total+=os.path.getsize(os.path.join(root, name))
        digits=len(str(total))
        if digits>9:
            shown="%.2f GiB"%(total/1024/1024/1024)
        elif digits>6:
            shown="%.2f MiB"%(total/1024/1024)
        elif digits>3:
            shown="%.2f KiB"%(total/1024)
        else:
            shown=total
        print("%s 大小为:%s"%(src,shown))
    
    def status(str):
        """Overwrite the status file with *str* (the original note lists running/stop)."""
        with open(g_status_file,"w") as out:
            out.write(str)
    
    def error(url,ex):
        """Append *url* as one line to the error file; *ex* is accepted but not recorded."""
        with open(g_error_file,"a") as log:
            log.write("%s\n"%(url,))
    
    def success(url):
        """Append *url* as one line to the success file."""
        with open(g_success_file,"a") as log:
            log.write("%s\n"%url)
    
    def url2filename(url):
        """Return the URL-safe base64 encoding of *url*, used as the local file name."""
        from base64 import urlsafe_b64encode
        return urlsafe_b64encode(url)
    
    def url2filename2(url):
        """Derive a file name from *url*: the text after its last "/".

        When the URL has no "/" or ends with one, a notice is printed and the
        last run of digits in the URL is used instead (IndexError if the URL
        contains no digit).
        """
        # This script never imports re at file level, so the fallback branch
        # used to fail with NameError; import it where it is needed.
        import re
        url=url.strip()
        idx=url.rfind("/")
        r=url[idx+1:]
        if idx==-1 or len(r)==0:
            print("启用特殊url2filename")
            r = re.findall(r"\d+", url)[-1]
        return r
    
    def statistics(func):
        """Decorator that prints progress counters before and after *func* runs.

        The counters are the number of distinct lines in g_result_links_file
        (total) and in g_success_file (succeeded) plus their difference; a
        missing file counts as 0.  The result of *func* is returned unchanged.
        """
        import functools  # local import so the block does not rely on file-level imports

        def count_distinct_lines(path):
            # Close the file deterministically instead of leaking the handle.
            with open(path,"r") as f:
                return len(set(f.readlines()))

        def tongji():
            total,successed=0,0
            if os.path.exists(g_result_links_file):
                total=count_distinct_lines(g_result_links_file)
                print("total lines:%s"%total)
            if os.path.exists(g_success_file):
                successed=count_distinct_lines(g_success_file)
                print("successed lines:%s"%successed)
            print("left lines:%s"%(total-successed))

        # Without wraps an outer decorator that prints func.__name__ would
        # report "newFunc" instead of the decorated function's real name.
        @functools.wraps(func)
        def newFunc(*args,**args2):
            tongji()
            back = func(*args, **args2)
            tongji()
            return back
        return newFunc
    
    def cost_time(func):
        """Decorator that prints start/end timestamps and the elapsed time of *func*.

        Arguments are forwarded unchanged and the result of *func* is returned.
        If *func* raises, the exception propagates and no "end" line is printed.
        """
        import functools  # local import so the block does not rely on file-level imports

        @functools.wraps(func)  # expose the wrapped function's name and docstring
        def newFunc(*args, **args2):
            t0 = time.time()
            print("@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__))
            back = func(*args, **args2)
            print("@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__))
            print("@%.3fs taken for {%s}" % (time.time() - t0, func.__name__))
            return back
        return newFunc
    
    def get_html(url):
        def do(url):
            html=""
            try:
                req = urllib2.Request(url = url,headers = g_headers)
                html = urllib2.urlopen(req).read()
                return html
            except Exception,e:
                print url,"error",e
                error(url,e)
                return None
        rr = with_timeout(10, do, url, timeout_value=None)
        return rr
    
    def get_html2(url):
        """Fetch *url* with a GET request through a client taken from `httppool`.

        Sends the g_headers request headers and returns the response body.

        NOTE(review): `httppool` is only created in a commented-out line near
        the imports of this script (presumably an httplib2 pool), so as the
        file stands calling this function raises NameError.
        """
        #print url
        with httppool.item() as http:
            #eventlet.sleep(0)
            resp, content = http.request(url,'GET',headers=g_headers)
            #resp, content = http.request(url)
            return content
    
    def save_html2file(filename,html):
        """Write *html* to *filename*, replacing any existing content."""
        with open(filename,"w") as out:
            out.write(html)
    
    def save_url2file(url):
        a,b=url.strip().split(",")
        if not a.startswith("http://"):
            a=urljoin(g_host,a)
        #a=a.replace("&","&")
        html=get_html(a)
        if html is not None and html<>"":
            filename=os.path.join(g_data_folder,url2filename(a))
            save_html2file(filename,html)
            if os.path.getsize(filename)<1024*10:
                error(url,"size小于%s"%(1024*10))
                print url,"error"
                return
            success(url)#以成功的为基准,剩下的都是不成功的或未执行的
            print url,"success"
        else:
            print url,"error"
            error(url,"html为None或为空")
    
    def save_url2file2(url):
        a,b=url.strip().split(",")
        if not a.startswith("http://"):
            a=urljoin(g_host,a)
        html=""
        try:
            html=get_html(a)
        except Exception,e:
            print url,e,"fetch error"
            error(url,e)
            return
    
        if html<>"":
            filename=os.path.join(g_data_folder,url2filename(a))
            save_html2file(filename,html)
            if os.path.getsize(filename)<1024*10:             error(url,"size小于%s"%(1024*10))             print url,"error"             return         success(url)#以成功的为基准,剩下的都是不成功的或未执行的         print url,"success" @cost_time @statistics def batch_get_html(urls):     print "执行批量下载网页工作"     pool=eventlet.GreenPool(g_pool_num)     for url in urls:         pool.spawn_n(save_url2file,url)     pool.waitall()     size(g_data_folder)     print "done!" def count():     total,successed=set(),set()     if os.path.exists(g_success_file):         successed=set(open(g_success_file,"r").read().strip().split("\n"))     if os.path.exists(g_result_links_file):         total=set(open(g_result_links_file,"r").read().strip().split("\n"))     left=total-successed     return total,successed,left def process_continue():     "接着success抓取剩下的部分"     #读取完整的部分和已完成的部分进行取非交集合     total,successed,left=count()     batch_get_html(left) def process_forever():     "循环处理,直到全部完成"     total,successed,left=count()     print "left"     while len(left)>0:
            print "由于还没未完成页面,再次循环执行"
            process_continue()
            total,successed,left=count()
    
    if __name__=="__main__":
        # Command line:
        #   (no argument)   download the entries from stdin if data is waiting
        #                   there, otherwise from g_result_links_file
        #   clear           delete the working folders
        #   continue        download only what is not in the success file
        #   loop            repeat "continue" until nothing is left
        #   load ENTRY      download a single entry
        init()
        argc=len(sys.argv)
        if argc==1:
            # Zero-timeout select: is there already something to read on stdin?
            stdin_ready=select.select([sys.stdin,],[],[],0.0)[0]
            if stdin_ready:
                print("load from stdin")
                content=sys.stdin.read()
            else:
                print("load from %s"%g_result_links_file)
                content=open(g_result_links_file,"r").read()
            batch_get_html(content.strip().split("\n"))
        elif argc==2:
            command=sys.argv[1]
            if command=="clear":
                clear()
            elif command=="continue":
                process_continue()
            elif command=="loop":
                process_forever()
        elif argc==3 and sys.argv[1]=="load":
            save_url2file(sys.argv[2])
        print("done!")
    

    代码我使用pre标签进行标识,直接在browser下查看可能有连行的问题,本来想找高亮工具,但是我的blog client没有这个feature,就算了,copy代码的话,查看源代码定位pre标签换行什么的都有的,不需要改动就可以用了,
    这几年写了大概3060篇blog,大概3/4应该是转载加一些自己的注解吧,整个抓下来大概265M的样子。
    代码使用了协程coroutine来加快下载,
    本来有一个fabric的统领性的脚本可以传到远程服务器上利用服务器IO再解析再返回本地的脚本,找不到了,大概是上次电脑crash的时候丢了,我再努力找找,我有备份的好习惯应该能找到,呵呵
    先这样,下一篇我再写一下,如何来索引数据
    PS:也不知道博客园的积分排名是怎么算的,我写的blog也不少,怎么就排不进首页呢,总是在470左右的样子

    这个爬虫的一个问题是没有办法发现新的链接,及内容的变化,这也是我为什么想用hbase+whenever来做的原因,不过这次先做为备份数据,先保留一份到本地吧。

  • 相关阅读:
    计算机网络 3.* 数据通信技术基础 .1
    计算机网络3.2&3.3(第二节介质&第三节多路复用)
    python mooc 3维可视化<第一周第二&三单元>
    python mooc 3维可视化<第一周第一单元>
    conda 下配置环境
    第二周<岭回归>
    第二周<线性回归>
    第二周<导学/分类>
    iOS UILabel详解
    开始iOS 7中自动布局教程(一)
  • 原文地址:https://www.cnblogs.com/lexus/p/2285393.html
Copyright © 2011-2022 走看看