  • Building a dedicated search engine for my own blog -- scraping the data

    cnblogs has its own lucene.net search engine, and there is also Google site search, but even Google does not fully index my content; it picks and chooses. And even for the pages it does index, the obscurity of computing-related search terms means recall is never very high. So I have long wanted a full-text index of my own blog.

    I originally wanted to build a workable, scalable solution based on rake + hbase + whenever + massive_record. Halfway through, the project clearly had too long a timeline, so I set it aside, picked up some old code, and patched it up just enough to get something usable first.

    I reused some scripts from my earlier 15-1688 small-wholesale search engine; back then the crawl script templates were customized through a web UI, and here I simply took them and used them as-is.

    The whole scraping process breaks down into 4 steps, one script per step (a driver sketch tying the four scripts together follows the list):

    A. Generate the list-page links

    B. Fetch the list pages

    C. Extract the detail-page links

    D. Fetch the detail pages
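
    To make the flow concrete, here is a minimal driver sketch of how the four scripts can be chained end to end. It is not part of the original tool chain, and the file names a.py through d.py are assumptions, since the post never names the script files:

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # Hypothetical driver for the four-step pipeline; a.py..d.py are assumed names.
    import subprocess

    def run(args):
        print "running:", " ".join(args)
        subprocess.check_call(args)          # abort the pipeline on the first failure

    if __name__ == "__main__":
        run(["python", "a.py"])              # step A: write list_links.txt (list-page URLs)
        run(["python", "b.py"])              # step B: fetch each list page into ./lists/
        run(["python", "c.py"])              # step C: extract detail links into extract_detail_links.txt
        run(["python", "d.py"])              # step D: fetch each detail page into ./details/
        run(["python", "d.py", "loop"])      # keep retrying until every detail page is fetched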

    I'll go straight to the code.

    A

    #!/usr/bin/env python
    #encoding=utf-8
    """
    2010.10.15,v0.2
    2010.10.07,v0.1
    Batch-generate the list-page links
    """
    import sys,os,time
    list_url_template = "http://www.cnblogs.com/lexus/default.html?page=%s"
    list_url_start    = 1
    list_url_end      = 154
    list_links_file   = os.path.join("./","list_links.txt")
    g_step=1
    
    def cost_time(func):
        def newFunc(*args, **args2):
            t0 = time.time()
            print "@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__)
            back = func(*args, **args2)
            print "@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__)
            print "@%.3fs taken for {%s}" % (time.time() - t0, func.__name__)
            return back
        return newFunc
    
    @cost_time
    def show(list_url_start=list_url_start,\
             list_url_end=list_url_end,\
             list_url_template=list_url_template):
        lines=[]
        for i in xrange(list_url_start,list_url_end+1):
            line="http://www.cnblogs.com/lexus/default.html?page=%s\n"%(i*g_step)
            print line.rstrip()
            lines.append(line)
        open(list_links_file,"w").writelines(lines)
        print "total count:%s"%len(lines)
        print "done!"
    
    #import os.path
    #print os.path.abspath(".")
    if __name__=="__main__":
        l=len(sys.argv)
        if l==1:
            show()
        elif l==2:
            show(list_url_end=int(sys.argv[1]))
        elif l==3:
            show(list_url_start=int(sys.argv[1]),list_url_end=int(sys.argv[2]))
        elif l==4:
            show(list_url_start=int(sys.argv[1]),list_url_end=int(sys.argv[2]),list_url_template=sys.argv[3])
    

    B

    #!/usr/bin/env python
    #encoding=utf-8
    """
    2010.10.16,v0.3
    2010.10.09,v0.2
    2010.10.07,v0.1
    Batch-fetch the list pages
    """
    from __future__ import with_statement
    from __future__ import division
    
    import socket as original_socket
    original_socket.setdefaulttimeout(10)
    from eventlet.timeout import with_timeout
    from eventlet.green import urllib2
    
    import sys
    ####reload(sys)
    ####sys.setdefaultencoding('utf-8')
    
    import eventlet
    from eventlet import pools
    #httplib2 = eventlet.import_patched('httplib2')
    #httppool = pools.Pool(create=lambda: httplib2.Http(timeout=90),max_size=20)
    
    import time
    
    import os
    
    import os.path
    
    import stat
    
    import select
    
    import shutil
    
    import re
    
    import gzip
    import StringIO
    
    list_list_folder    = os.path.join("./","lists")
    list_info_folder    = os.path.join("./","lists_infos")
    status_file         = os.path.join("./","lists_infos/status.txt")
    error_file          = os.path.join("./","lists_infos/error.txt")
    error_file_bak      = os.path.join("./","lists_infos/error.txt.bak")
    success_file        = os.path.join("./","lists_infos/success.txt")
    list_links_file     = os.path.join("./","list_links.txt")
    g_headers={}
    g_pool_num          = 5
    
    def init():
        if not os.path.exists(list_list_folder):
            os.mkdir(list_list_folder)
        if not os.path.exists(list_info_folder):
            os.mkdir(list_info_folder)
        print "完成初始化"
    
    def delete(src):
        '''delete files and folders'''
        permission(src)
        if os.path.isfile(src):
            try:
                os.remove(src)
            except:
                pass
        elif os.path.isdir(src):
            for item in os.listdir(src):
                itemsrc=os.path.join(src,item)
                delete(itemsrc)
            try:
                os.rmdir(src)
            except:
                pass
    
    def permission(src):
        os.chmod(src,stat.S_IRWXU|stat.S_IRWXO|stat.S_IRWXG)    
    
    def clear():
        delete(list_list_folder)
        delete(list_info_folder)
        print "还原为初始"
    
    def size(src):
        "检查文件或文件夹大小"
        r = 0L
        if os.path.isfile(src):
            r=os.path.getsize(src)
        else:
            for root, dirs, files in os.walk(src):
               r += sum([os.path.getsize(os.path.join(root, name)) for name in files])
        l=len(str(r))
    
        if l>9:
            r=r/1024/1024/1024
            r="%.2f GiB"%r
        elif l>6:
            r=r/1024/1024
            r="%.2f MiB"%r
        elif l>3:
            r=r/1024
            r="%.2f KiB"%r
        print "%s 大小为:%s"%(src,r)
    
    def status(str):
        "running/stop"
        f=open(status_file,"w")
        f.write(str)
        f.close()    
    
    def error(url,ex):
        f=open(error_file,"a")
        f.write("%s\n"%(url,))
        f.close()
    
    def success(url):
        f=open(success_file,"a")
        f.write("%s\n"%url)
        f.close()
    
    def url2filename(url):
        import base64
        return base64.urlsafe_b64encode(url)
    
    def url2filename2(url):
        url=url.strip()
        idx=url.rfind("/")
        r=url[idx+1:]
        if idx==-1 or len(r)==0:
    #       raise ValueError("url2filename function parser error")
            print "启用特殊url2filename"
            r = re.findall(r"\d+", url)[-1]
        return r
    
    def cost_time(func):
        def newFunc(*args, **args2):
            t0 = time.time()
            print "@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__)
            back = func(*args, **args2)
            print "@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__)
            print "@%.3fs taken for {%s}" % (time.time() - t0, func.__name__)
            return back
        return newFunc
    
    def statistics(func):
        def tongji():
            total,successed=0,0
            if os.path.exists(list_links_file):
                total=len(set(open(list_links_file,"r").readlines()))
                print "total lines:%s"%total
            if os.path.exists(success_file):
                successed=len(set(open(success_file,"r").readlines()))
                print "successed lines:%s"%successed
            print "left lines:%s"%(total-successed)
        def newFunc(*args,**args2):
            tongji()
            back = func(*args, **args2)
            tongji()
            return back
        return newFunc
    
    def get_html(url):
        def do(url):
            html=""
            try:
                req = urllib2.Request(url = url,headers = g_headers)
                html = urllib2.urlopen(req).read()
                return html
            except Exception,e:
                print url,"error",e
                error(url,e)
                return None
        rr = with_timeout(10, do, url, timeout_value=None)
        return rr
    
    def get_html22(url):
        import types
        def do(url):
            html=""
            try:
                req = urllib2.Request(url = url,headers = g_headers)
                html = urllib2.urlopen(req).read()
                t=type(html)
                if t==types.StringType or t==types.UnicodeType:
                    return html
                else:
                    print url,"error======"
                    return ""
            except Exception,e1:
                pdata = StringIO.StringIO(html)# the lines below gunzip the response
                gzipper = gzip.GzipFile(fileobj = pdata)
                try:
                    html = gzipper.read()
                    return html
                except Exception,e2:
                    print url,e1,e2
                    error(url,e1)
                return ""
        rr = with_timeout(10, do, url, timeout_value="")
        return rr
    
    def get_html2(url):
        "when use gzipped page will get fetch error"
        #print url
        with httppool.item() as http:
            #eventlet.sleep(0)
            resp, content = http.request(url)
            print content
            return content
    
    def save_html2file(filename,html):
        f=open(filename,"w")
        f.write(html)
        f.close()
    
    def save_url2file(url):
        #html=""
        #try:
        #    html=get_html(url)
        #except Exception,e:
        #    print url,"fetch error",e
        #    error(url,e)
        #    return
        html=get_html(url)
        if html is not None and html<>"":
            filename=os.path.join(list_list_folder,url2filename(url))
            save_html2file(filename,html)
            if os.path.getsize(filename)<1024*20:
                error(url,"size less than %s"%(1024*20))
                print url,"error"
                return
            success(url)# successes are the baseline; everything else failed or never ran
            print url,"success"
        else:
            print url,"error"
            error(url,"html is None or empty")
    
    @cost_time
    @statistics
    def batch_get_html(urls):
        print "执行批量下载网页工作"
        pool=eventlet.GreenPool(g_pool_num)
        for url in urls:
            pool.spawn_n(save_url2file,url)
        pool.waitall()
        print "done!"
    
    def process_continue():
        "接着success抓取剩下的部分"
        #读取完整的部分和已完成的部分进行取非交集合
        done=set(open(success_file,"r").read().split("\n"))
        all=set(open(list_links_file,"r").read().split("\n"))
        left=all-done
        batch_get_html(left)
    
    if __name__=="__main__":
        init()
        l=len(sys.argv)
        if l==1:
            content=""
            if not select.select([sys.stdin,],[],[],0.0)[0]:
                print "load from %s"%list_links_file
                content=open(list_links_file,"r").read()
            else:
                print "load from stdin"
                content=sys.stdin.read()
            urls=content.strip().split("\n")
            #print urls
            batch_get_html(urls)
            size(list_list_folder)
        elif l==2:
            argv=sys.argv[1]
            if argv=="clear":
                clear()
            if argv=="continue":
                process_continue()
        elif l==3:
            argv=sys.argv[1]
            if argv=="load":
                url=sys.argv[2]
                print url
                save_url2file(url)
        print "done!"
    

    C

    #!/usr/bin/env python
    #encoding=utf-8
    """
    2010.10.16,v0.22
    2010.10.11,v0.21
    2010.10.09,v0.2
    2010.10.07,v0.1
    Extract detail-page links and thumbnail links from the list pages
    """
    import sys
    import re
    import os.path
    
    list_list_folder      = os.path.join("./","lists")
    success_file        = os.path.join("./","lists_infos/success.txt")
    detail_links_file   = os.path.join("./","extract_detail_links.txt")
    
    #g_pattern=r"""
    
    [^"]*?)\1[\s\S]*?[^"]*?)\3 """ g_pattern=r"""http://www.cnblogs.com/lexus/archive/\d{4}/\d{1,2}/\d{1,2}/\d{1,}\.html)\1[\s\S]*?>(?P[\s\S]*?)<[\s\S]*?/[\s\S]*?a[\s\S]*?>""" if g_pattern[-2]=='"': g_pattern=g_pattern[:-2]+'\\"' else: g_pattern=g_pattern[:-1] def url2filename(url): import base64 return base64.urlsafe_b64encode(url) def url2filename2(url): url=url.strip() idx=url.rfind("/") r=url[idx+1:] if idx==-1 or len(r)==0: # raise ValueError("url2filename function parser error") print "启用特殊url2filename" r = re.findall(r"\d+", url)[-1] return r def delete(src): '''delete files and folders''' #permission(src) if os.path.isfile(src): try: os.remove(src) print "删除文件%s"%src except: pass elif os.path.isdir(src): for item in os.listdir(src): itemsrc=os.path.join(src,item) delete(itemsrc) try: os.rmdir(src) print "删除文件夹%s"%src except: pass def clear(): delete(detail_links_file) def extract_detail_link(url): lines=[] regex=re.compile(g_pattern) file=os.path.join(list_list_folder,url2filename(url)) subject=open(file,"r").read() for match in regex.finditer(subject): #line="%s,%s\n"%(match.group("link").replace("&","&"),match.group("img").replace("http:/www","http://www").replace(","," ")) line="%s,\n"%(match.group("link").replace("&","&"),) lines.append(line) return lines def batch_extract_detail_links(): f=open(detail_links_file,"w") urls=open(success_file,"r").read().strip().split("\n") total=[] for url in urls: lines=extract_detail_link(url) total.extend(lines) print "%s,%s"%(url,len(lines)) s=set(total) f.writelines(s) f.close() print "done!" print "repeat count:%s"%(len(total)-len(s)) print "total lines:%s"%len(s) if __name__=="__main__": l=len(sys.argv) if l==1: batch_extract_detail_links() elif l==2: if sys.argv[1]=="clear": clear()

    D

    #!/usr/bin/env python
    #encoding=utf-8
    """
    2010.10.16,v0.13
    2010.10.15,v0.12
    2010.10.13,v0.11
    2010.10.07,v0.1
    Batch-fetch the detail pages
    """
    from __future__ import with_statement
    from __future__ import division
    
    import socket as original_socket
    original_socket.setdefaulttimeout(10)
    from eventlet.timeout import with_timeout
    from eventlet.green import urllib2
    
    from urlparse import urljoin
    import sys
    ####reload(sys)
    ####sys.setdefaultencoding('utf-8')
    
    import eventlet
    from eventlet import pools
    #httplib2 = eventlet.import_patched('httplib2')
    #httppool = pools.Pool(create=lambda: httplib2.Http(timeout=90),max_size=20)
    
    import time
    
    import os
    
    import os.path
    
    import stat
    
    import select
    
    import re
    
    g_host                  = "http://www.cnblogs.com/lexus"
    g_data_folder           = os.path.join("./","details")
    g_info_folder           = os.path.join("./","details_infos")
    g_status_file           = os.path.join("./","details_infos/status.txt")
    g_error_file            = os.path.join("./","details_infos/error.txt")
    g_success_file          = os.path.join("./","details_infos/success.txt")
    g_result_links_file     = os.path.join("./","extract_detail_links.txt")
    g_pool_num              = 1
    g_headers={}
    headers                 = """"""
    headers                 = headers.strip().replace("\r\n","\n")
    if headers<>"":
        for elem in headers.split("\n"):
            if elem.strip()=="":
                continue
            a,b=elem.split(":",1)
            a=a.strip()
            b=b.strip()
            g_headers[a]=b
    
    def init():
        if not os.path.exists(g_data_folder):
            os.mkdir(g_data_folder)
        if not os.path.exists(g_info_folder):
            os.mkdir(g_info_folder)
        print "完成初始化"
    
    def delete(src):
        '''delete files and folders'''
        permission(src)
        if os.path.isfile(src):
            try:
                os.remove(src)
            except:
                pass
        elif os.path.isdir(src):
            for item in os.listdir(src):
                itemsrc=os.path.join(src,item)
                delete(itemsrc)
            try:
                os.rmdir(src)
            except:
                pass
    
    def permission(src):
        os.chmod(src,stat.S_IRWXU|stat.S_IRWXO|stat.S_IRWXG)    
    
    def clear():
        delete(g_data_folder)
        delete(g_info_folder)
        print "还原为初始"
    
    def size(src):
        "检查文件或文件夹大小"
        r = 0L
        if os.path.isfile(src):
            r=os.path.getsize(src)
        else:
            for root, dirs, files in os.walk(src):
               r += sum([os.path.getsize(os.path.join(root, name)) for name in files])
        l=len(str(r))
    
        if l>9:
            r=r/1024/1024/1024
            r="%.2f GiB"%r
        elif l>6:
            r=r/1024/1024
            r="%.2f MiB"%r
        elif l>3:
            r=r/1024
            r="%.2f KiB"%r
        print "%s 大小为:%s"%(src,r)
    
    def status(str):
        "running/stop"
        f=open(g_status_file,"w")
        f.write(str)
        f.close()    
    
    def error(url,ex):
        f=open(g_error_file,"a")
        f.write("%s\n"%(url,))
        f.close()
    
    def success(url):
        f=open(g_success_file,"a")
        f.write("%s\n"%url)
        f.close()
    
    def url2filename(url):
        import base64
        return base64.urlsafe_b64encode(url)
    
    def url2filename2(url):
        url=url.strip()
        idx=url.rfind("/")
        r=url[idx+1:]
        if idx==-1 or len(r)==0:
    #       raise ValueError("url2filename function parser error")
            print "启用特殊url2filename"
            r = re.findall(r"\d+", url)[-1]
        return r
    
    def statistics(func):
        def tongji():
            total,successed=0,0
            if os.path.exists(g_result_links_file):
                total=len(set(open(g_result_links_file,"r").readlines()))
                print "total lines:%s"%total
            if os.path.exists(g_success_file):
                successed=len(set(open(g_success_file,"r").readlines()))
                print "successed lines:%s"%successed
            print "left lines:%s"%(total-successed)
        def newFunc(*args,**args2):
            tongji()
            back = func(*args, **args2)
            tongji()
            return back
        return newFunc
    
    def cost_time(func):
        def newFunc(*args, **args2):
            t0 = time.time()
            print "@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__)
            back = func(*args, **args2)
            print "@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__)
            print "@%.3fs taken for {%s}" % (time.time() - t0, func.__name__)
            return back
        return newFunc
    
    def get_html(url):
        def do(url):
            html=""
            try:
                req = urllib2.Request(url = url,headers = g_headers)
                html = urllib2.urlopen(req).read()
                return html
            except Exception,e:
                print url,"error",e
                error(url,e)
                return None
        rr = with_timeout(10, do, url, timeout_value=None)
        return rr
    
    def get_html2(url):
        #print url
        with httppool.item() as http:
            #eventlet.sleep(0)
            resp, content = http.request(url,'GET',headers=g_headers)
            #resp, content = http.request(url)
            return content
    
    def save_html2file(filename,html):
        f=open(filename,"w")
        f.write(html)
        f.close()
    
    def save_url2file(url):
        a,b=url.strip().split(",")
        if not a.startswith("http://"):
            a=urljoin(g_host,a)
        #a=a.replace("&amp;","&")
        html=get_html(a)
        if html is not None and html<>"":
            filename=os.path.join(g_data_folder,url2filename(a))
            save_html2file(filename,html)
            if os.path.getsize(filename)<1024*10:
                error(url,"size小于%s"%(1024*10))
                print url,"error"
                return
            success(url)#以成功的为基准,剩下的都是不成功的或未执行的
            print url,"success"
        else:
            print url,"error"
            error(url,"html为None或为空")
    
    def save_url2file2(url):
        a,b=url.strip().split(",")
        if not a.startswith("http://"):
            a=urljoin(g_host,a)
        html=""
        try:
            html=get_html(a)
        except Exception,e:
            print url,e,"fetch error"
            error(url,e)
            return
    
        if html<>"":
            filename=os.path.join(g_data_folder,url2filename(a))
            save_html2file(filename,html)
            if os.path.getsize(filename)<1024*10:
                error(url,"size less than %s"%(1024*10))
                print url,"error"
                return
            success(url)# successes are the baseline; everything else failed or never ran
            print url,"success"
    
    @cost_time
    @statistics
    def batch_get_html(urls):
        print "starting the batch page download"
        pool=eventlet.GreenPool(g_pool_num)
        for url in urls:
            pool.spawn_n(save_url2file,url)
        pool.waitall()
        size(g_data_folder)
        print "done!"
    
    def count():
        total,successed=set(),set()
        if os.path.exists(g_success_file):
            successed=set(open(g_success_file,"r").read().strip().split("\n"))
        if os.path.exists(g_result_links_file):
            total=set(open(g_result_links_file,"r").read().strip().split("\n"))
        left=total-successed
        return total,successed,left
    
    def process_continue():
        "fetch what is left, picking up from the success file"
        #diff the complete link list against the finished part
        total,successed,left=count()
        batch_get_html(left)
    
    def process_forever():
        "loop until everything is fetched"
        total,successed,left=count()
        print "left"
        while len(left)>0:
            print "由于还没未完成页面,再次循环执行"
            process_continue()
            total,successed,left=count()
    
    if __name__=="__main__":
        init()
        l=len(sys.argv)
        if l==1:
            content=""
            if not select.select([sys.stdin,],[],[],0.0)[0]:
                print "load from %s"%g_result_links_file
                content=open(g_result_links_file,"r").read()
            else:
                print "load from stdin"
                content=sys.stdin.read()
            urls=content.strip().split("\n")
            #print urls
            batch_get_html(urls)
        elif l==2:
            argv=sys.argv[1]
            if argv=="clear":
                clear()
            if argv=="continue":
                process_continue()
            if argv=="loop":
                process_forever()
        elif l==3:
            if sys.argv[1]=="load":
                url=sys.argv[2]
                save_url2file(url)
        print "done!"
    

    The code is marked up with pre tags, so viewing it directly in the browser may show lines running together. I wanted to find a highlighting tool, but my blog client doesn't have that feature, so never mind. If you want to copy the code, view the page source and locate the pre tags; the line breaks are all there, and it can be used without changes.
    Over the past few years I have written roughly 3060 posts, about 3/4 of which are probably reposts plus some notes of my own. The whole crawl comes to about 265 MB.
    The code uses coroutines to speed up the downloads.
    There used to be an overarching fabric script that could push these to a remote server, use the server's I/O to fetch and parse, and then send the results back to my machine, but I can't find it; it was probably lost the last time my computer crashed. I'll keep looking. I have a good habit of backing things up, so it should turn up.
    That's it for now. In the next post I'll cover how to index the data.
    PS: I have no idea how cnblogs calculates its points ranking. I've written quite a few posts, yet I can never make it onto the front page; I always hover around 470 or so.

    One problem with this crawler is that it has no way to discover new links or changes to existing content, which is exactly why I wanted to build this on hbase + whenever. For now, though, it serves as a backup of the data; I'll just keep a local copy first.
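
    Until that hbase + whenever pipeline exists, a cheap approximation of change detection is to re-run steps A through C periodically and then fetch only what is new. The sketch below mirrors the count() logic already inside script D; the only extra piece is that it is meant to run after a fresh A-C pass, and its output can be piped straight into script D, which already accepts its input on stdin:

    # -*- coding: utf-8 -*-
    # Incremental-fetch sketch (not from the original post): print only the
    # extracted detail-link lines that are not yet recorded in the success file.
    import os

    g_result_links_file = os.path.join("./", "extract_detail_links.txt")
    g_success_file      = os.path.join("./", "details_infos/success.txt")

    def new_links():
        all_links, done = set(), set()
        if os.path.exists(g_result_links_file):
            all_links = set(open(g_result_links_file).read().strip().split("\n"))
        if os.path.exists(g_success_file):
            done = set(open(g_success_file).read().strip().split("\n"))
        return all_links - done

    if __name__ == "__main__":
        for link in sorted(new_links()):
            print link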
