  • Building a dedicated search engine for my own blog -- scraping the data

    cnblogs has its own lucene.net search engine, and there is also Google site search, but even Google does not fully index my content; it picks and chooses. And even for the pages it does index, the obscurity of computing-related search terms means recall is never very high. So I have long wanted a full-text index of my own blog.

    I originally wanted to build a workable, scalable solution based on rake + hbase + whenever + massive_record. Halfway through, the project clearly had too long a timeline, so I set it aside, picked up some old code, and patched it up just enough to get something usable first.

    I reused some scripts from my earlier 15-1688 small-wholesale search engine; back then the crawl script templates were customized through a web UI, and here I simply took them and used them as-is.

    The whole scraping process breaks down into 4 steps, one script per step (a driver sketch tying the four scripts together follows the list):

    A. Generate the list-page links

    B. Fetch the list pages

    C. Extract the detail-page links

    D. Fetch the detail pages
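
    To make the flow concrete, here is a minimal driver sketch of how the four scripts can be chained end to end. It is not part of the original tool chain, and the file names a.py through d.py are assumptions, since the post never names the script files:

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # Hypothetical driver for the four-step pipeline; a.py..d.py are assumed names.
    import subprocess

    def run(args):
        print "running:", " ".join(args)
        subprocess.check_call(args)          # abort the pipeline on the first failure

    if __name__ == "__main__":
        run(["python", "a.py"])              # step A: write list_links.txt (list-page URLs)
        run(["python", "b.py"])              # step B: fetch each list page into ./lists/
        run(["python", "c.py"])              # step C: extract detail links into extract_detail_links.txt
        run(["python", "d.py"])              # step D: fetch each detail page into ./details/
        run(["python", "d.py", "loop"])      # keep retrying until every detail page is fetched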

    I'll go straight to the code.

    A

    #!/usr/bin/env python
    #encoding=utf-8
    """
    2010.10.15,v0.2
    2010.10.07,v0.1
    Batch-generate the list-page links
    """
    import sys,os,time
    list_url_template = "http://www.cnblogs.com/lexus/default.html?page=%s"
    list_url_start    = 1
    list_url_end      = 154
    list_links_file   = os.path.join("./","list_links.txt")
    g_step=1
    
    def cost_time(func):
        def newFunc(*args, **args2):
            t0 = time.time()
            print "@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__)
            back = func(*args, **args2)
            print "@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__)
            print "@%.3fs taken for {%s}" % (time.time() - t0, func.__name__)
            return back
        return newFunc
    
    @cost_time
    def show(list_url_start=list_url_start,\
             list_url_end=list_url_end,\
             list_url_template=list_url_template):
        lines=[]
        for i in xrange(list_url_start,list_url_end+1):
            line="http://www.cnblogs.com/lexus/default.html?page=%s\n"%(i*g_step)
            print line.rstrip()
            lines.append(line)
        open(list_links_file,"w").writelines(lines)
        print "total count:%s"%len(lines)
        print "done!"
    
    #import os.path
    #print os.path.abspath(".")
    if __name__=="__main__":
        l=len(sys.argv)
        if l==1:
            show()
        elif l==2:
            show(list_url_end=int(sys.argv[1]))
        elif l==3:
            show(list_url_start=int(sys.argv[1]),list_url_end=int(sys.argv[2]))
        elif l==4:
            show(list_url_start=int(sys.argv[1]),list_url_end=int(sys.argv[2]),list_url_template=sys.argv[3])
    

    B

    #!/usr/bin/env python
    #encoding=utf-8
    """
    2010.10.16,v0.3
    2010.10.09,v0.2
    2010.10.07,v0.1
    Batch-fetch the list pages
    """
    from __future__ import with_statement
    from __future__ import division
    
    import socket as original_socket
    original_socket.setdefaulttimeout(10)
    from eventlet.timeout import with_timeout
    from eventlet.green import urllib2
    
    import sys
    ####reload(sys)
    ####sys.setdefaultencoding('utf-8')
    
    import eventlet
    from eventlet import pools
    #httplib2 = eventlet.import_patched('httplib2')
    #httppool = pools.Pool(create=lambda: httplib2.Http(timeout=90),max_size=20)
    
    import time
    
    import os
    
    import os.path
    
    import stat
    
    import select
    
    import shutil
    
    import re
    
    import gzip
    import StringIO
    
    list_list_folder    = os.path.join("./","lists")
    list_info_folder    = os.path.join("./","lists_infos")
    status_file         = os.path.join("./","lists_infos/status.txt")
    error_file          = os.path.join("./","lists_infos/error.txt")
    error_file_bak      = os.path.join("./","lists_infos/error.txt.bak")
    success_file        = os.path.join("./","lists_infos/success.txt")
    list_links_file     = os.path.join("./","list_links.txt")
    g_headers={}
    g_pool_num          = 5
    
    def init():
        if not os.path.exists(list_list_folder):
            os.mkdir(list_list_folder)
        if not os.path.exists(list_info_folder):
            os.mkdir(list_info_folder)
        print "完成初始化"
    
    def delete(src):
        '''delete files and folders'''
        permission(src)
        if os.path.isfile(src):
            try:
                os.remove(src)
            except:
                pass
        elif os.path.isdir(src):
            for item in os.listdir(src):
                itemsrc=os.path.join(src,item)
                delete(itemsrc)
            try:
                os.rmdir(src)
            except:
                pass
    
    def permission(src):
        os.chmod(src,stat.S_IRWXU|stat.S_IRWXO|stat.S_IRWXG)    
    
    def clear():
        delete(list_list_folder)
        delete(list_info_folder)
        print "还原为初始"
    
    def size(src):
        "检查文件或文件夹大小"
        r = 0L
        if os.path.isfile(src):
            r=os.path.getsize(src)
        else:
            for root, dirs, files in os.walk(src):
               r += sum([os.path.getsize(os.path.join(root, name)) for name in files])
        l=len(str(r))
    
        if l>9:
            r=r/1024/1024/1024
            r="%.2f GiB"%r
        elif l>6:
            r=r/1024/1024
            r="%.2f MiB"%r
        elif l>3:
            r=r/1024
            r="%.2f KiB"%r
        print "%s 大小为:%s"%(src,r)
    
    def status(str):
        "running/stop"
        f=open(status_file,"w")
        f.write(str)
        f.close()    
    
    def error(url,ex):
        f=open(error_file,"a")
        f.write("%s\n"%(url,))
        f.close()
    
    def success(url):
        f=open(success_file,"a")
        f.write("%s\n"%url)
        f.close()
    
    def url2filename(url):
        import base64
        return base64.urlsafe_b64encode(url)
    
    def url2filename2(url):
        url=url.strip()
        idx=url.rfind("/")
        r=url[idx+1:]
        if idx==-1 or len(r)==0:
    #       raise ValueError("url2filename function parser error")
            print "启用特殊url2filename"
            r = re.findall(r"\d+", url)[-1]
        return r
    
    def cost_time(func):
        def newFunc(*args, **args2):
            t0 = time.time()
            print "@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__)
            back = func(*args, **args2)
            print "@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__)
            print "@%.3fs taken for {%s}" % (time.time() - t0, func.__name__)
            return back
        return newFunc
    
    def statistics(func):
        def tongji():
            total,successed=0,0
            if os.path.exists(list_links_file):
                total=len(set(open(list_links_file,"r").readlines()))
                print "total lines:%s"%total
            if os.path.exists(success_file):
                successed=len(set(open(success_file,"r").readlines()))
                print "successed lines:%s"%successed
            print "left lines:%s"%(total-successed)
        def newFunc(*args,**args2):
            tongji()
            back = func(*args, **args2)
            tongji()
            return back
        return newFunc
    
    def get_html(url):
        def do(url):
            html=""
            try:
                req = urllib2.Request(url = url,headers = g_headers)
                html = urllib2.urlopen(req).read()
                return html
            except Exception,e:
                print url,"error",e
                error(url,e)
                return None
        rr = with_timeout(10, do, url, timeout_value=None)
        return rr
    
    def get_html22(url):
        import types
        def do(url):
            html=""
            try:
                req = urllib2.Request(url = url,headers = g_headers)
                html = urllib2.urlopen(req).read()
                t=type(html)
                if t==types.StringType or t==types.UnicodeType:
                    return html
                else:
                    print url,"error======"
                    return ""
            except Exception,e1:
                pdata = StringIO.StringIO(html)# the lines below gunzip the response
                gzipper = gzip.GzipFile(fileobj = pdata)
                try:
                    html = gzipper.read()
                    return html
                except Exception,e2:
                    print url,e1,e2
                    error(url,e1)
                return ""
        rr = with_timeout(10, do, url, timeout_value="")
        return rr
    
    def get_html2(url):
        "when use gzipped page will get fetch error"
        #print url
        with httppool.item() as http:
            #eventlet.sleep(0)
            resp, content = http.request(url)
            print content
            return content
    
    def save_html2file(filename,html):
        f=open(filename,"w")
        f.write(html)
        f.close()
    
    def save_url2file(url):
        #html=""
        #try:
        #    html=get_html(url)
        #except Exception,e:
        #    print url,"fetch error",e
        #    error(url,e)
        #    return
        html=get_html(url)
        if html is not None and html<>"":
            filename=os.path.join(list_list_folder,url2filename(url))
            save_html2file(filename,html)
            if os.path.getsize(filename)<1024*20:
                error(url,"size less than %s"%(1024*20))
                print url,"error"
                return
            success(url)# successes are the baseline; everything else failed or never ran
            print url,"success"
        else:
            print url,"error"
            error(url,"html is None or empty")
    
    @cost_time
    @statistics
    def batch_get_html(urls):
        print "执行批量下载网页工作"
        pool=eventlet.GreenPool(g_pool_num)
        for url in urls:
            pool.spawn_n(save_url2file,url)
        pool.waitall()
        print "done!"
    
    def process_continue():
        "接着success抓取剩下的部分"
        #读取完整的部分和已完成的部分进行取非交集合
        done=set(open(success_file,"r").read().split("\n"))
        all=set(open(list_links_file,"r").read().split("\n"))
        left=all-done
        batch_get_html(left)
    
    if __name__=="__main__":
        init()
        l=len(sys.argv)
        if l==1:
            content=""
            if not select.select([sys.stdin,],[],[],0.0)[0]:
                print "load from %s"%list_links_file
                content=open(list_links_file,"r").read()
            else:
                print "load from stdin"
                content=sys.stdin.read()
            urls=content.strip().split("\n")
            #print urls
            batch_get_html(urls)
            size(list_list_folder)
        elif l==2:
            argv=sys.argv[1]
            if argv=="clear":
                clear()
            if argv=="continue":
                process_continue()
        elif l==3:
            argv=sys.argv[1]
            if argv=="load":
                url=sys.argv[2]
                print url
                save_url2file(url)
        print "done!"
    

    C

    #!/usr/bin/env python
    #encoding=utf-8
    """
    2010.10.16,v0.22
    2010.10.11,v0.21
    2010.10.09,v0.2
    2010.10.07,v0.1
    Extract detail-page links and thumbnail links from the list pages
    """
    import sys
    import re
    import os.path
    
    list_list_folder      = os.path.join("./","lists")
    success_file        = os.path.join("./","lists_infos/success.txt")
    detail_links_file   = os.path.join("./","extract_detail_links.txt")
    
    #g_pattern=r"""
    
    [^"]*?)\1[\s\S]*?[^"]*?)\3 """ g_pattern=r"""http://www.cnblogs.com/lexus/archive/\d{4}/\d{1,2}/\d{1,2}/\d{1,}\.html)\1[\s\S]*?>(?P[\s\S]*?)<[\s\S]*?/[\s\S]*?a[\s\S]*?>""" if g_pattern[-2]=='"': g_pattern=g_pattern[:-2]+'\\"' else: g_pattern=g_pattern[:-1] def url2filename(url): import base64 return base64.urlsafe_b64encode(url) def url2filename2(url): url=url.strip() idx=url.rfind("/") r=url[idx+1:] if idx==-1 or len(r)==0: # raise ValueError("url2filename function parser error") print "启用特殊url2filename" r = re.findall(r"\d+", url)[-1] return r def delete(src): '''delete files and folders''' #permission(src) if os.path.isfile(src): try: os.remove(src) print "删除文件%s"%src except: pass elif os.path.isdir(src): for item in os.listdir(src): itemsrc=os.path.join(src,item) delete(itemsrc) try: os.rmdir(src) print "删除文件夹%s"%src except: pass def clear(): delete(detail_links_file) def extract_detail_link(url): lines=[] regex=re.compile(g_pattern) file=os.path.join(list_list_folder,url2filename(url)) subject=open(file,"r").read() for match in regex.finditer(subject): #line="%s,%s\n"%(match.group("link").replace("&","&"),match.group("img").replace("http:/www","http://www").replace(","," ")) line="%s,\n"%(match.group("link").replace("&","&"),) lines.append(line) return lines def batch_extract_detail_links(): f=open(detail_links_file,"w") urls=open(success_file,"r").read().strip().split("\n") total=[] for url in urls: lines=extract_detail_link(url) total.extend(lines) print "%s,%s"%(url,len(lines)) s=set(total) f.writelines(s) f.close() print "done!" print "repeat count:%s"%(len(total)-len(s)) print "total lines:%s"%len(s) if __name__=="__main__": l=len(sys.argv) if l==1: batch_extract_detail_links() elif l==2: if sys.argv[1]=="clear": clear()

    D

    #!/usr/bin/env python
    #encoding=utf-8
    """
    2010.10.16,v0.13
    2010.10.15,v0.12
    2010.10.13,v0.11
    2010.10.07,v0.1
    Batch-fetch the detail pages
    """
    from __future__ import with_statement
    from __future__ import division
    
    import socket as original_socket
    original_socket.setdefaulttimeout(10)
    from eventlet.timeout import with_timeout
    from eventlet.green import urllib2
    
    from urlparse import urljoin
    import sys
    ####reload(sys)
    ####sys.setdefaultencoding('utf-8')
    
    import eventlet
    from eventlet import pools
    #httplib2 = eventlet.import_patched('httplib2')
    #httppool = pools.Pool(create=lambda: httplib2.Http(timeout=90),max_size=20)
    
    import time
    
    import os
    
    import os.path
    
    import stat
    
    import select
    
    import re
    
    g_host                  = "http://www.cnblogs.com/lexus"
    g_data_folder           = os.path.join("./","details")
    g_info_folder           = os.path.join("./","details_infos")
    g_status_file           = os.path.join("./","details_infos/status.txt")
    g_error_file            = os.path.join("./","details_infos/error.txt")
    g_success_file          = os.path.join("./","details_infos/success.txt")
    g_result_links_file     = os.path.join("./","extract_detail_links.txt")
    g_pool_num              = 1
    g_headers={}
    headers                 = """"""
    headers                 = headers.strip().replace("\r\n","\n")
    if headers<>"":
        for elem in headers.split("\n"):
            if elem.strip()=="":
                continue
            a,b=elem.split(":",1)
            a=a.strip()
            b=b.strip()
            g_headers[a]=b
    
    def init():
        if not os.path.exists(g_data_folder):
            os.mkdir(g_data_folder)
        if not os.path.exists(g_info_folder):
            os.mkdir(g_info_folder)
        print "完成初始化"
    
    def delete(src):
        '''delete files and folders'''
        permission(src)
        if os.path.isfile(src):
            try:
                os.remove(src)
            except:
                pass
        elif os.path.isdir(src):
            for item in os.listdir(src):
                itemsrc=os.path.join(src,item)
                delete(itemsrc)
            try:
                os.rmdir(src)
            except:
                pass
    
    def permission(src):
        os.chmod(src,stat.S_IRWXU|stat.S_IRWXO|stat.S_IRWXG)    
    
    def clear():
        delete(g_data_folder)
        delete(g_info_folder)
        print "还原为初始"
    
    def size(src):
        "检查文件或文件夹大小"
        r = 0L
        if os.path.isfile(src):
            r=os.path.getsize(src)
        else:
            for root, dirs, files in os.walk(src):
               r += sum([os.path.getsize(os.path.join(root, name)) for name in files])
        l=len(str(r))
    
        if l>9:
            r=r/1024/1024/1024
            r="%.2f GiB"%r
        elif l>6:
            r=r/1024/1024
            r="%.2f MiB"%r
        elif l>3:
            r=r/1024
            r="%.2f KiB"%r
        print "%s 大小为:%s"%(src,r)
    
    def status(str):
        "running/stop"
        f=open(g_status_file,"w")
        f.write(str)
        f.close()    
    
    def error(url,ex):
        f=open(g_error_file,"a")
        f.write("%s\n"%(url,))
        f.close()
    
    def success(url):
        f=open(g_success_file,"a")
        f.write("%s\n"%url)
        f.close()
    
    def url2filename(url):
        import base64
        return base64.urlsafe_b64encode(url)
    
    def url2filename2(url):
        url=url.strip()
        idx=url.rfind("/")
        r=url[idx+1:]
        if idx==-1 or len(r)==0:
    #       raise ValueError("url2filename function parser error")
            print "启用特殊url2filename"
            r = re.findall(r"\d+", url)[-1]
        return r
    
    def statistics(func):
        def tongji():
            total,successed=0,0
            if os.path.exists(g_result_links_file):
                total=len(set(open(g_result_links_file,"r").readlines()))
                print "total lines:%s"%total
            if os.path.exists(g_success_file):
                successed=len(set(open(g_success_file,"r").readlines()))
                print "successed lines:%s"%successed
            print "left lines:%s"%(total-successed)
        def newFunc(*args,**args2):
            tongji()
            back = func(*args, **args2)
            tongji()
            return back
        return newFunc
    
    def cost_time(func):
        def newFunc(*args, **args2):
            t0 = time.time()
            print "@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__)
            back = func(*args, **args2)
            print "@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__)
            print "@%.3fs taken for {%s}" % (time.time() - t0, func.__name__)
            return back
        return newFunc
    
    def get_html(url):
        def do(url):
            html=""
            try:
                req = urllib2.Request(url = url,headers = g_headers)
                html = urllib2.urlopen(req).read()
                return html
            except Exception,e:
                print url,"error",e
                error(url,e)
                return None
        rr = with_timeout(10, do, url, timeout_value=None)
        return rr
    
    def get_html2(url):
        #print url
        with httppool.item() as http:
            #eventlet.sleep(0)
            resp, content = http.request(url,'GET',headers=g_headers)
            #resp, content = http.request(url)
            return content
    
    def save_html2file(filename,html):
        f=open(filename,"w")
        f.write(html)
        f.close()
    
    def save_url2file(url):
        a,b=url.strip().split(",")
        if not a.startswith("http://"):
            a=urljoin(g_host,a)
        #a=a.replace("&amp;","&")
        html=get_html(a)
        if html is not None and html<>"":
            filename=os.path.join(g_data_folder,url2filename(a))
            save_html2file(filename,html)
            if os.path.getsize(filename)<1024*10:
                error(url,"size小于%s"%(1024*10))
                print url,"error"
                return
            success(url)#以成功的为基准,剩下的都是不成功的或未执行的
            print url,"success"
        else:
            print url,"error"
            error(url,"html为None或为空")
    
    def save_url2file2(url):
        a,b=url.strip().split(",")
        if not a.startswith("http://"):
            a=urljoin(g_host,a)
        html=""
        try:
            html=get_html(a)
        except Exception,e:
            print url,e,"fetch error"
            error(url,e)
            return
    
        if html<>"":
            filename=os.path.join(g_data_folder,url2filename(a))
            save_html2file(filename,html)
            if os.path.getsize(filename)<1024*10:
                error(url,"size less than %s"%(1024*10))
                print url,"error"
                return
            success(url)# successes are the baseline; everything else failed or never ran
            print url,"success"
    
    @cost_time
    @statistics
    def batch_get_html(urls):
        print "starting the batch page download"
        pool=eventlet.GreenPool(g_pool_num)
        for url in urls:
            pool.spawn_n(save_url2file,url)
        pool.waitall()
        size(g_data_folder)
        print "done!"
    
    def count():
        total,successed=set(),set()
        if os.path.exists(g_success_file):
            successed=set(open(g_success_file,"r").read().strip().split("\n"))
        if os.path.exists(g_result_links_file):
            total=set(open(g_result_links_file,"r").read().strip().split("\n"))
        left=total-successed
        return total,successed,left
    
    def process_continue():
        "fetch what is left, picking up from the success file"
        #diff the complete link list against the finished part
        total,successed,left=count()
        batch_get_html(left)
    
    def process_forever():
        "loop until everything is fetched"
        total,successed,left=count()
        print "left"
        while len(left)>0:
            print "由于还没未完成页面,再次循环执行"
            process_continue()
            total,successed,left=count()
    
    if __name__=="__main__":
        init()
        l=len(sys.argv)
        if l==1:
            content=""
            if not select.select([sys.stdin,],[],[],0.0)[0]:
                print "load from %s"%g_result_links_file
                content=open(g_result_links_file,"r").read()
            else:
                print "load from stdin"
                content=sys.stdin.read()
            urls=content.strip().split("\n")
            #print urls
            batch_get_html(urls)
        elif l==2:
            argv=sys.argv[1]
            if argv=="clear":
                clear()
            if argv=="continue":
                process_continue()
            if argv=="loop":
                process_forever()
        elif l==3:
            if sys.argv[1]=="load":
                url=sys.argv[2]
                save_url2file(url)
        print "done!"
    

    The code is marked up with pre tags, so viewing it directly in the browser may show lines running together. I wanted to find a highlighting tool, but my blog client doesn't have that feature, so never mind. If you want to copy the code, view the page source and locate the pre tags; the line breaks are all there, and it can be used without changes.
    Over the past few years I have written roughly 3060 posts, about 3/4 of which are probably reposts plus some notes of my own. The whole crawl comes to about 265 MB.
    The code uses coroutines to speed up the downloads.
    There used to be an overarching fabric script that could push these to a remote server, use the server's I/O to fetch and parse, and then send the results back to my machine, but I can't find it; it was probably lost the last time my computer crashed. I'll keep looking. I have a good habit of backing things up, so it should turn up.
    That's it for now. In the next post I'll cover how to index the data.
    PS: I have no idea how cnblogs calculates its points ranking. I've written quite a few posts, yet I can never make it onto the front page; I always hover around 470 or so.

    One problem with this crawler is that it has no way to discover new links or changes to existing content, which is exactly why I wanted to build this on hbase + whenever. For now, though, it serves as a backup of the data; I'll just keep a local copy first.
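
    Until that hbase + whenever pipeline exists, a cheap approximation of change detection is to re-run steps A through C periodically and then fetch only what is new. The sketch below mirrors the count() logic already inside script D; the only extra piece is that it is meant to run after a fresh A-C pass, and its output can be piped straight into script D, which already accepts its input on stdin:

    # -*- coding: utf-8 -*-
    # Incremental-fetch sketch (not from the original post): print only the
    # extracted detail-link lines that are not yet recorded in the success file.
    import os

    g_result_links_file = os.path.join("./", "extract_detail_links.txt")
    g_success_file      = os.path.join("./", "details_infos/success.txt")

    def new_links():
        all_links, done = set(), set()
        if os.path.exists(g_result_links_file):
            all_links = set(open(g_result_links_file).read().strip().split("\n"))
        if os.path.exists(g_success_file):
            done = set(open(g_success_file).read().strip().split("\n"))
        return all_links - done

    if __name__ == "__main__":
        for link in sorted(new_links()):
            print link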
